[
  {
    "path": ".dockerignore",
    "content": "notebooks\n.venv/\n"
  },
  {
    "path": ".git-blame-ignore-revs",
    "content": "# Format with black\n6c03f96a9335e548685ece233474125fe453c262"
  },
  {
    "path": ".gitattributes",
    "content": "perception/_version.py export-subst\n"
  },
  {
    "path": ".github/dependabot.yaml",
    "content": "version: 2\nupdates:\n  - package-ecosystem: \"github-actions\"\n    directory: \"/\"\n    schedule:\n      # Check for updates to GitHub Actions every week.\n      interval: \"weekly\"\n"
  },
  {
    "path": ".github/workflows/ci.yaml",
    "content": "name: ci\non:\n  push:\n    branches:\n      - \"**\"\n    tags-ignore:\n      - v*\njobs:\n  test:\n    strategy:\n      matrix:\n        python-version: [\"3.10\", \"3.11\", \"3.12\", \"3.13\"]\n        os: [\"ubuntu-latest\", \"windows-latest\", \"macos-latest\"]\n    runs-on: ${{ matrix.os }}\n    steps:\n      - name: checkout\n        uses: actions/checkout@v6\n      - name: Setup Poetry\n        uses: abatilo/actions-poetry@v4\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v6\n        with:\n          python-version: ${{ matrix.python-version }}\n          cache: poetry\n          cache-dependency-path: poetry.lock\n      - name: Setup FFMPEG\n        uses: FedericoCarboni/setup-ffmpeg@v3\n        if: ${{ ! startsWith(matrix.os, 'macos') }}\n      - name: Setup Dependencies with Homebrew\n        if: startsWith(matrix.os, 'macos')\n        run: |\n          brew install llvm ffmpeg\n          echo \"CC=$(brew --prefix)/opt/llvm/bin/clang\" >> $GITHUB_ENV\n          echo \"CXX=$(brew --prefix)/opt/llvm/bin/clang++\" >> $GITHUB_ENV\n      - name: Setup Project\n        run: make init-project\n      - name: Normalize OpenCV package\n        run: |\n          poetry run python -m pip uninstall -y opencv-python-headless\n          poetry run python -m pip install --no-deps --force-reinstall opencv-contrib-python-headless\n      - name: Run precommit\n        run: make precommit\n"
  },
  {
    "path": ".github/workflows/gh-pages.yaml",
    "content": "name: Deploy Sphinx documentation to Pages\n\non:\n  push:\n    branches:\n      - dunnack/sphinx-to-github-pages\n      - main\n    paths:\n      - .github/workflows/gh-pages.yaml\n      - docs/**\n\njobs:\n  pages:\n    runs-on: ubuntu-latest\n    environment:\n      name: github-pages\n      url: ${{ steps.deployment.outputs.page_url }}\n    permissions:\n      contents: read\n      pages: write\n      id-token: write\n    steps:\n      - uses: actions/checkout@v6\n        with:\n          fetch-depth: 0\n      - id: deployment\n        uses: sphinx-notes/pages@v3\n        with:\n          checkout: false\n          documentation_path: docs\n          requirements_path: docs/requirements.txt\n"
  },
  {
    "path": ".github/workflows/release.yaml",
    "content": "name: release\non:\n  release:\n    types: [published]\n  workflow_dispatch:\n\njobs:\n  build-wheels:\n    runs-on: ${{ matrix.os }}\n    strategy:\n      matrix:\n        python-version: [\"3.10\", \"3.11\", \"3.12\", \"3.13\"]\n        os: [\"ubuntu-latest\", \"windows-latest\", \"macos-latest\"]\n    name: Build for ${{ matrix.os }} on Python ${{ matrix.python-version }}\n    steps:\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v6\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Setup Poetry\n        uses: abatilo/actions-poetry@v4\n      - name: Setup FFMPEG\n        uses: FedericoCarboni/setup-ffmpeg@v3\n        if: ${{ ! startsWith(matrix.os, 'macos') }}\n      - name: Setup Dependencies with Homebrew\n        if: startsWith(matrix.os, 'macos')\n        run: |\n          brew install llvm ffmpeg\n          echo \"CC=$(brew --prefix)/opt/llvm/bin/clang\" >> $GITHUB_ENV\n          echo \"CXX=$(brew --prefix)/opt/llvm/bin/clang++\" >> $GITHUB_ENV\n      - uses: actions/checkout@v6\n        with:\n          # Full clone for version calculation\n          fetch-depth: 0\n          fetch-tags: true\n          ref: ${{ github.event_name == 'release' && format('refs/tags/{0}', github.event.release.tag_name) || github.ref }}\n      - name: Build Project\n        run: make build-wheel\n      - uses: actions/upload-artifact@v7\n        with:\n          name: package-wheels-${{ matrix.os }}-${{ matrix.python-version }}\n          path: dist/*\n\n  build-sdist:\n    runs-on: ubuntu-latest\n    name: Build sdist\n    steps:\n      - name: Set up Python\n        uses: actions/setup-python@v6\n        with:\n          python-version: \"3.13\"\n      - name: Setup Poetry\n        uses: abatilo/actions-poetry@v4\n      - uses: actions/checkout@v6\n        with:\n          # Full clone for version calculation\n          fetch-depth: 0\n          fetch-tags: true\n          ref: ${{ github.event_name == 'release' && format('refs/tags/{0}', github.event.release.tag_name) || github.ref }}\n      - name: Build Project\n        run: make build-sdist\n      - uses: actions/upload-artifact@v7\n        with:\n          name: package-sdist\n          path: dist/*\n\n  publish:\n    needs: [build-wheels, build-sdist]\n    runs-on: ubuntu-latest\n    if: ${{ github.repository_owner == 'thorn-oss' && github.event_name == 'release' }}\n    steps:\n      - uses: actions/checkout@v6\n        with:\n          # Full clone for version calculation\n          fetch-depth: 0\n          fetch-tags: true\n          ref: refs/tags/${{ github.event.release.tag_name }}\n      - uses: actions/setup-python@v6\n        with:\n          python-version: \"3.13\"\n      - name: Setup Poetry\n        uses: abatilo/actions-poetry@v4\n      - name: Setup Dynamic Versioning\n        run: poetry self add \"poetry-dynamic-versioning[plugin]\"\n      - name: Download wheels\n        uses: actions/download-artifact@v8\n        with:\n          path: dist\n          pattern: package-*\n          merge-multiple: true\n      - name: Load PyPI Token\n        uses: 1password/load-secrets-action@v4\n        with:\n          # Export loaded secrets as environment variables\n          export-env: true\n        env:\n          OP_SERVICE_ACCOUNT_TOKEN: ${{ secrets.DATA_SCIENCE_OP_SERVICE_ACCOUNT_TOKEN }}\n          POETRY_PYPI_TOKEN_PYPI: op://data-science-oss/perception-pypi-api-key/secret/value\n      - name: Verify artifacts\n        run: |\n     
     mapfile -t artifacts < <(find dist -type f \\( -name \"*.whl\" -o -name \"*.tar.gz\" \\))\n          if [ ${#artifacts[@]} -eq 0 ]; then\n            echo \"No artifacts found in dist\"\n            exit 1\n          fi\n          printf '%s\\n' \"${artifacts[@]}\"\n          if printf '%s\\n' \"${artifacts[@]}\" | grep -E -- '-0\\.0\\.0([.-]|$)'; then\n            echo \"Refusing to publish placeholder version 0.0.0 artifacts\"\n            exit 1\n          fi\n      - name: Publish package\n        run: poetry publish -n\n"
  },
  {
    "path": ".gitignore",
    "content": "# MacOS stuff\n.DS_Store\n\n# Python artifacts\n*.egg-info\n\n# Cache\n.mypy_cache\n.pytest_cache\n__pycache__\n.ipynb_checkpoints\ndist\n\n# Any temporary images or CSV files\nnotebooks\n\n# Local environment\n.venv\n.python-version\n\n# Coverage file\n.coverage\n\n# Versioneer artifacts\n/versioneer.pyc\n\n# Build artifacts\n/build\n\n# Docs build artifacts\n/docs/_build\n\n# Remove .vscode folder\n.vscode\n\n# Extension artifacts\n*.c\n*.cpp\n*.so\ndebug-image*\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "# See https://pre-commit.com for more information\n# See https://pre-commit.com/hooks.html for more hooks\nrepos:\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v4.5.0\n    hooks:\n      - id: trailing-whitespace\n      - id: end-of-file-fixer\n      - id: check-yaml\n      - id: check-added-large-files\n  - repo: https://github.com/psf/black\n    rev: 26.3.1\n    hooks:\n      - id: black\n        language_version: python3\n  - repo: https://github.com/astral-sh/ruff-pre-commit\n    # Ruff version.\n    rev: v0.11.13\n    hooks:\n      # Run the linter.\n      - id: ruff\n        args: [ --fix ]\n  - repo: https://github.com/pre-commit/mirrors-mypy\n    rev: v1.8.0\n    hooks:\n      - id: mypy\n"
  },
  {
    "path": ".readthedocs.yaml",
    "content": "version: 2\n\n# Build documentation in the docs/ directory with Sphinx\nsphinx:\n  configuration: docs/conf.py\n\nformats: all\n\n# Installs the package and the docs requirements.\npython:\n   version: 3.9\n   install:\n      - requirements: docs/requirements.txt\n      - method: pip\n        path: .\n   system_packages: true\n"
  },
  {
    "path": "CHANGELOG.md",
    "content": "# Changelog\nAll notable changes to this project will be documented in this file.\n\nThe format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),\nand this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).\n\n## [0.4.0] - 2020-10-17\nThis release switches from using false positive rates in benchmarking to reporting precision, which is more intuitive.\n\n### Breaking changes\nAll references to fpr_threshold now refer to precision_threshold.\n\n### Bug fixes\nThe PDQHash hasher now correctly returns the hash vector instead of the (vector, quality) tuple.\n\n## [0.3.0] - 2020-04-27\nThis release adds significantly more support for video.\n\n### Breaking changes\n- Previously, `read_video` returned `(frame, index, timestamp)` tuples where `index` reflected the index of the yielded frame (i.e., it always increased by exactly 1). It now reflects the index of the frame in the original video. This means that, if the requested framerate is higher than the encoded video framerate, this index may repeat the same value, indicating that we have repeated the same frame.\n\n### Enhancements\n- We now include a `SimpleSceneDetection` hasher that can wrap other video hashers using scene detection.\n- `compute_metrics` is much faster now for integer-valued hashes that use a euclidean distance metric.\n- We now include an unsigned 8-bit integer version of `PHash`, called `PHashU8`. This provides a useful framewise hasher for averaging across frames (e.g., using TMK) while being more compact than `PHashF`.\n- We include more thorough support for benchmarking video hashes.\n\n### Bug fixes\n- When using `hasher.vector_to_string` with hashers that return multiple hashes, the `hash_format` argument was not respected.\n- The `compute_threshold_recall` and `show_histograms` functions did not work properly when `grouping=[]`.\n\n## [0.2.0] - 2019-12-20\nThis release adds more support for hashing videos (including TMK L2 and TMK L2). As part of that, it also includes a re-factor to separate `benchmarking.BenchmarkDataset` and `benchmarking.BenchmarkTransforms` into image and video variants.\n\n## [0.1.0] - 2019-11-04\nInitial release"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "content": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nIn the interest of fostering an open and welcoming environment, we as\ncontributors and maintainers pledge to make participation in our project and\nour community a harassment-free experience for everyone, regardless of age, body\nsize, disability, ethnicity, sex characteristics, gender identity and expression,\nlevel of experience, education, socio-economic status, nationality, personal\nappearance, race, religion, or sexual identity and orientation.\n\n## Our Standards\n\nExamples of behavior that contributes to creating a positive environment\ninclude:\n\n* Using welcoming and inclusive language\n* Being respectful of differing viewpoints and experiences\n* Gracefully accepting constructive criticism\n* Focusing on what is best for the community\n* Showing empathy towards other community members\n\nExamples of unacceptable behavior by participants include:\n\n* The use of sexualized language or imagery and unwelcome sexual attention or\n  advances\n* Trolling, insulting/derogatory comments, and personal or political attacks\n* Public or private harassment\n* Publishing others' private information, such as a physical or electronic\n  address, without explicit permission\n* Other conduct which could reasonably be considered inappropriate in a\n  professional setting\n\n## Our Responsibilities\n\nProject maintainers are responsible for clarifying the standards of acceptable\nbehavior and are expected to take appropriate and fair corrective action in\nresponse to any instances of unacceptable behavior.\n\nProject maintainers have the right and responsibility to remove, edit, or\nreject comments, commits, code, wiki edits, issues, and other contributions\nthat are not aligned to this Code of Conduct, or to ban temporarily or\npermanently any contributor for other behaviors that they deem inappropriate,\nthreatening, offensive, or harmful.\n\n## Scope\n\nThis Code of Conduct applies within all project spaces, and it also applies when\nan individual is representing the project or its community in public spaces.\nExamples of representing a project or community include using an official\nproject e-mail address, posting via an official social media account, or acting\nas an appointed representative at an online or offline event. Representation of\na project may be further defined and clarified by project maintainers.\n\n## Enforcement\n\nInstances of abusive, harassing, or otherwise unacceptable behavior may be\nreported by contacting the project team at conduct@thorn.org. All\ncomplaints will be reviewed and investigated and will result in a response that\nis deemed necessary and appropriate to the circumstances. The project team is\nobligated to maintain confidentiality with regard to the reporter of an incident.\nFurther details of specific enforcement policies may be posted separately.\n\nProject maintainers who do not follow or enforce the Code of Conduct in good\nfaith may face temporary or permanent repercussions as determined by other\nmembers of the project's leadership.\n\n## Attribution\n\nThis Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,\navailable at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html\n\n[homepage]: https://www.contributor-covenant.org\n\nFor answers to common questions about this code of conduct, see\nhttps://www.contributor-covenant.org/faq\n"
  },
  {
    "path": "LICENSE",
    "content": "\n                                 Apache License\n                           Version 2.0, January 2004\n                        https://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. 
For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. 
Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   Copyright 2019 Thorn\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       https://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License."
  },
  {
    "path": "MANIFEST.in",
    "content": "include perception/testing/images/*\ninclude perception/testing/videos/*\ninclude perception/testing/logos/*\ninclude perception/**/*.pyx\ninclude perception/*.pyx\ninclude perception/py.typed\nexclude tests/*\n"
  },
  {
    "path": "Makefile",
    "content": "TEST_SCOPE?=tests/\n\n.PHONY: build build-wheel build-sdist verify-version init-project init test lint_check type_check format format_check precommit\n\ninit-project:\n\tpoetry install --all-extras\n\ninit: init-project\n\tpoetry run pre-commit install\n\ntest:\n\tpoetry run pytest $(TEST_SCOPE)\n\nlint_check:\n\tpoetry run ruff check perception tests\n\ntype_check:\n\tpoetry run mypy perception\n\nformat:\n\tpoetry run black .\n\nformat_check:\n\tpoetry run black --check . || (echo '\\nUnexpected format.' && exit 1)\n\nprecommit:\n\tpoetry check\n\tmake lint_check\n\tmake type_check\n\tmake format_check\n\tmake test\n\nverify-version:\n\t@echo \"Poetry: $$(poetry --version)\"\n\t@echo \"Poetry plugins:\"\n\tpoetry self show plugins\n\t@echo \"Git describe: $$(git describe --tags --always)\"\n\t@poetry self show plugins | grep -q \"poetry-dynamic-versioning\"\n\nbuild-wheel:\n\tpoetry run pip -q install repairwheel\n\tpoetry self add \"poetry-dynamic-versioning[plugin]\"\n\t$(MAKE) verify-version\n\tpoetry build --format=\"wheel\" --output=\"dist-tmp\"\n\tpoetry run repairwheel -o dist dist-tmp/*.whl\n\t@find dist -name \"*.whl\" -type f | sed -n \"s/\\(.*\\)\\.linux.*\\.whl$$/& \\1.whl/p\" | xargs -r -n 2 mv # Fix wheel name\n\t@rm -rf dist-tmp\n\nbuild-sdist:\n\tpoetry self add \"poetry-dynamic-versioning[plugin]\"\n\t$(MAKE) verify-version\n\tpoetry build --format=\"sdist\" --output=\"dist\"\n\nbuild: build-wheel build-sdist\n"
  },
  {
    "path": "README.md",
    "content": "# perception ![ci](https://github.com/thorn-oss/perception/workflows/ci/badge.svg)\n\n`perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use. See [the documentation](https://perception.thorn.engineering/en/latest/) for details.\n\n## Background\n\n`perception` was initially developed at [Thorn](https://www.thorn.org) as part of our work to eliminate child sexual abuse material from the internet. For more information on the issue, check out [our CEO's TED talk](https://www.thorn.org/blog/time-is-now-eliminate-csam/).\n\n## Getting Started\n\n### Installation\n\n`pip install perception`\n\n### Hashing\n\nHashing with different functions is simple with `perception`.\n\n```python\nfrom perception import hashers\n\nfile1, file2 = 'test1.jpg', 'test2.jpg'\nhasher = hashers.PHash()\nhash1, hash2 = hasher.compute(file1), hasher.compute(file2)\ndistance = hasher.compute_distance(hash1, hash2)\n```\n\n### Examples\n\nSee below for end-to-end examples for common use cases for perceptual hashes.\n\n- [Detecting child sexual abuse material](https://perception.thorn.engineering/en/latest/examples/detecting_csam.html)\n- [Deduplicating media](https://perception.thorn.engineering/en/latest/examples/deduplication.html)\n- [Benchmarking perceptual hashes](https://perception.thorn.engineering/en/latest/examples/benchmarking.html)\n\n## Supported Hashing Algorithms\n\n`perception` currently ships with:\n\n- pHash (DCT hash) (`perception.hashers.PHash`)\n- Facebook's PDQ Hash (`perception.hashers.PDQ`)\n- dHash (difference hash) (`perception.hashers.DHash`)\n- aHash (average hash) (`perception.hashers.AverageHash`)\n- Marr-Hildreth (`perception.hashers.MarrHildreth`)\n- Color Moment (`perception.hashers.ColorMoment`)\n- Block Mean (`perception.hashers.BlockMean`)\n- wHash (wavelet hash) (`perception.hashers.WaveletHash`)\n\n## Contributing\n\nTo work on the project, start by doing the following.\n\n```bash\n# Install local dependencies for\n# code completion, etc.\nmake init\n\n- To do a (close to) comprehensive check before committing code, you can use `make precommit`.\n\nTo implement new features, please first file an issue proposing your change for discussion.\n\nTo report problems, please file an issue with sample code, expected results, actual results, and a complete traceback.\n\n## Alternatives\n\nThere are other packages worth checking out to see if they meet your needs for perceptual hashing. Here are some\nexamples.\n\n- [dedupe](https://github.com/dedupeio/dedupe)\n- [imagededup](https://idealo.github.io/imagededup/)\n- [ImageHash](https://github.com/JohannesBuchner/imagehash)\n- [PhotoHash](https://github.com/bunchesofdonald/photohash)\n```\n"
  },
  {
    "path": "build.py",
    "content": "from Cython.Build import cythonize\nimport numpy as np\n\ncompiler_directives = {\"language_level\": 3, \"embedsignature\": True}\n\n\ndef build(setup_kwargs):\n    setup_kwargs.update(\n        {\n            \"ext_modules\": cythonize(\n                \"perception/**/extensions.pyx\", compiler_directives=compiler_directives\n            ),\n            \"include_dirs\": [np.get_include()],\n        }\n    )\n"
  },
  {
    "path": "docs/api/benchmarking.rst",
    "content": "Benchmarking\n************\n\n.. autoclass:: perception.benchmarking.BenchmarkImageDataset\n        :members:\n        :inherited-members:\n\n.. autoclass:: perception.benchmarking.BenchmarkImageTransforms\n        :members:\n        :inherited-members:\n\n.. autoclass:: perception.benchmarking.BenchmarkVideoDataset\n        :members:\n        :inherited-members:\n\n.. autoclass:: perception.benchmarking.BenchmarkVideoTransforms\n        :members:\n        :inherited-members:\n\n.. autoclass:: perception.benchmarking.BenchmarkHashes\n        :members:\n        :inherited-members:\n\nVideo Transforms\n================\n\nTransforming videos can be more complex, so we provide the following\ntools for transforming videos.\n\n.. automodule:: perception.benchmarking.video_transforms\n        :members: get_simple_transform, get_black_frame_padding_transform, get_slideshow_transform\n"
  },
  {
    "path": "docs/api/hashers.rst",
    "content": "Hashers\n*******\n\nAll hashers from the :code:`Hasher` class.\n\n.. autoclass:: perception.hashers.hasher.Hasher\n        :members:\n\nImages\n~~~~~~\n\nAll image hashers inherit from the :code:`ImageHasher` class. \n\n.. autoclass:: perception.hashers.hasher.ImageHasher\n        :members:\n\nThe following image hash functions are included in the package.\n\n.. automodule:: perception.hashers.image\n        :members:\n        :imported-members:\n\n\nVideos\n~~~~~~\n\nAll video hashers inherit from the :code:`VideoHasher` class. \n\n.. autoclass:: perception.hashers.hasher.VideoHasher\n        :members:\n\nThe following video hash functions are included in the package.\n\n.. automodule:: perception.hashers.video\n        :members:\n        :imported-members:\n\nTools\n~~~~~\n\nThese utility functions are only used by the hashers but are documented\nhere for completeness.\n\n.. automodule:: perception.hashers.tools\n    :members:"
  },
  {
    "path": "docs/api/index.rst",
    "content": "API\n***\n\n.. toctree::\n   :maxdepth: 2\n   :caption: Contents:\n\n   hashers\n   benchmarking\n   tools\n"
  },
  {
    "path": "docs/api/tools.rst",
    "content": "\nTools\n*****\n\n\n.. automodule:: perception.tools\n    :members:"
  },
  {
    "path": "docs/conf.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Configuration file for the Sphinx documentation builder.\n#\n# This file does only contain a selection of the most common options. For a\n# full list see the documentation:\n# http://www.sphinx-doc.org/en/master/config\n\n# -- Path setup --------------------------------------------------------------\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n#\n\n# -- Project information -----------------------------------------------------\nproject = \"perception\"\ncopyright = \"2019, thorn\"\nauthor = \"thorn\"\n\n# The short X.Y version\nversion = \"\"\n# The full version, including alpha/beta/rc tags\nrelease = \"\"\n\n# -- General configuration ---------------------------------------------------\n\n# Add any Sphinx extension module names here, as strings. They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\nextensions = [\n    \"sphinx.ext.autodoc\",\n    \"sphinx.ext.imgmath\",\n    \"sphinx.ext.napoleon\",\n    \"sphinx_autodoc_typehints\",\n    \"m2r\",\n]\n\n# The suffix(es) of source filenames.\n# You can specify multiple suffix as a list of string:\n#\n# source_suffix = ['.rst', '.md']\nsource_suffix = \".rst\"\n\n# The master toctree document.\nmaster_doc = \"index\"\n\n# The language for content autogenerated by Sphinx. Refer to documentation\n# for a list of supported languages.\n#\n# This is also used if you do content translation via gettext catalogs.\n# Usually you set \"language\" from the command line for these cases.\nlanguage = None\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\n# This pattern also affects html_static_path and html_extra_path.\nexclude_patterns = [\"_build\", \"Thumbs.db\", \".DS_Store\"]\n\n# The name of the Pygments (syntax highlighting) style to use.\npygments_style = None\n\nhtml_theme = \"sphinx_rtd_theme\"\n\nhtml_theme_options = {\"navigation_depth\": 4, \"collapse_navigation\": False}\n"
  },
  {
    "path": "docs/examples/benchmarking.rst",
    "content": "Benchmarking\n************\n\nThis package provides a fair amount of infrastructure for benchmarking different hashers to evaluate their performance.\n\nImage Hashing\n=============\n\nThe below example does the following:\n\n- Download a benchmarking dataset (we provide a dataset with images that have compatible licensing for this example)\n- Load the dataset. If you are using your own datasets, you may wish to call `deduplicate` on it to ensure no duplicates are included.\n- Transform the dataset to generate synthetic images.\n- Define a new custom hasher that we want to evaluate.\n  It's not very good -- but demonstrates how you can evaluate your own custom hash functions.\n- Compute all the hashes.\n- Report metrics for each image category / hasher / transformation combination.\n\n.. code-block:: python\n\n    import os\n    import glob\n    import zipfile\n    import urllib.request\n\n    import cv2\n    import albumentations\n    import tabulate # Optional: Only used for generating tables for the Sphinx documentation\n    import numpy as np\n\n    from perception import benchmarking, hashers\n    from perception.hashers.image.pdq import PDQHash\n\n    urllib.request.urlretrieve(\n        \"https://thorn-perception.s3.amazonaws.com/thorn-perceptual-benchmark-v0.zip\",\n        \"thorn-perceptual-benchmark-v0.zip\"\n    )\n\n    with zipfile.ZipFile('thorn-perceptual-benchmark-v0.zip') as f:\n        f.extractall('.')\n\n    # Load the dataset\n    dataset = benchmarking.BenchmarkImageDataset.from_tuples(files=[\n        (filepath, filepath.split(os.path.sep)[-2]) for filepath in glob.glob(\n            os.path.join('thorn-perceptual-benchmark-v0', '**', '*.jpg')\n        )\n    ])\n\n    # Define the transforms we want to use for\n    # evaluation hash quality.\n    def watermark(image):\n        fontScale = 5\n        thickness = 5\n        text = \"TEXT\"\n        fontFace = cv2.FONT_HERSHEY_SIMPLEX\n        targetWidth = 0.2*image.shape[1]\n        (textWidth, textHeight), _ = cv2.getTextSize(\n            text=\"TEST\",\n            fontFace=fontFace,\n            fontScale=fontScale,\n            thickness=thickness\n        )\n        fontScaleCorr = targetWidth / textWidth\n        textHeight *= fontScaleCorr\n        textWidth *= fontScaleCorr\n        fontScale *= fontScaleCorr\n\n        org = ( textHeight, image.shape[0] - textHeight )\n        org = tuple(map(int, org))\n        color = (0, 0, 0, 200)\n        placeholder = cv2.putText(\n            img=np.zeros(image.shape[:2] + (4, ), dtype='uint8'),\n            text=\"TEST\",\n            org=org,\n            color=color,\n            fontFace=fontFace,\n            fontScale=fontScale,\n            thickness=thickness\n        ).astype('float32')\n        augmented = (\n            (image.astype('float32')[..., :3]*(255 - placeholder[..., 3:]) + placeholder[..., :3]*placeholder[..., 3:])\n        ) / 255\n        return augmented.astype('uint8')\n\n    def vignette(image):\n        height, width = image.shape[:2]\n        a = cv2.getGaussianKernel(height, height/2)\n        b = cv2.getGaussianKernel(width, width/2)\n        c = (b.T*a)[..., np.newaxis]\n        d = c/c.max()\n        e = image*d\n        return e.astype('uint8')\n\n    transforms={\n        'watermark': watermark,\n        'blur2': albumentations.GaussianBlur(sigma_limit=2.0, p=1),\n        'vignette': vignette,\n        'gamma2': albumentations.RandomGamma(gamma_limit=2, p=1),\n        'jpeg95': 
albumentations.ImageCompression(quality=95, p=1),\n        'pad0.2': albumentations.CropAndPad(percent=(0.2, 2), p=1),\n        'crop0.05': albumentations.CropAndPad(percent=-0.05, p=1),\n        'noise0.2': albumentations.GaussNoise(noise_scale_factor=0.2, p=1),\n        'rotate4': albumentations.Affine(rotate=4, p=1),\n        'noop': albumentations.NoOp(p=1),\n    }\n\n    # Compute the transformed versions of the images.\n    # This takes a while but you can reload the\n    # generated dataset without recomputing it (see next line).\n    transformed = dataset.transform(\n        transforms=transforms,\n        storage_dir='transformed',\n        errors=\"raise\"\n    )\n    # We don't actually have to do this, but it shows\n    # how to reload the transformed dataset later.\n    transformed = benchmarking.BenchmarkImageTransforms.load(\n        path_to_zip_or_directory='transformed', verify_md5=False\n    )\n\n    # Create a new hash that we want to evaluate.\n    # perception will handle most of the plumbing but\n    # we do have to specify a few things.\n    class ShrinkHash(hashers.ImageHasher):\n        \"\"\"This is a simple hash to demonstrate how you\n        can create your own hasher and compare it to others.\n        It just shrinks images to 8x8 pixels and then flattens\n        the result.\n        \"\"\"\n\n        # We have to let perception know\n        # the shape and type of our hash.\n        hash_length = 64\n        dtype = 'uint8'\n\n        # We need to specify how distance is\n        # computed between hashes.\n        distance_metric = 'euclidean'\n\n        def _compute(self, image):\n            gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)\n            resized = cv2.resize(gray, dsize=(8, 8))\n            return resized.flatten()\n\n    hashers_dict = {\n        'ahash': hashers.AverageHash(hash_size=16),\n        'dhash': hashers.DHash(hash_size=16),\n        'pdq': PDQHash(),\n        'phash': hashers.PHash(hash_size=16),\n        'marrhildreth': hashers.MarrHildreth(),\n        'wavelet': hashers.WaveletHash(hash_size=16),\n        'blockmean': hashers.BlockMean(),\n        'shrinkhash': ShrinkHash()\n    }\n\n    # Compute the hashes\n    hashes = transformed.compute_hashes(hashers=hashers_dict)\n\n    # Get performance metrics (i.e., recall) for each hash function based on\n    # a minimum precision threshold. Here we use 99.99%.\n    precision_threshold = 99.99\n\n    # The metrics are just pandas dataframes. 
We use tabulate here to obtain the tables\n    # formatted for the documentation.\n    metrics = hashes.compute_threshold_recall(precision_threshold=precision_threshold).reset_index()\n    print(tabulate.tabulate(metrics, showindex=False, headers=metrics.columns, tablefmt='rst'))\n\n    metrics_by_transform = hashes.compute_threshold_recall(grouping=['transform_name'], precision_threshold=precision_threshold).reset_index()\n    print(tabulate.tabulate(metrics_by_transform, showindex=False, headers=metrics_by_transform.columns, tablefmt='rst'))\n\n    metrics_simple = hashes.compute_threshold_recall(grouping=[], precision_threshold=precision_threshold).reset_index()\n    print(tabulate.tabulate(metrics_simple, showindex=False, headers=metrics_simple.columns, tablefmt='rst'))\n\n\n\n===========  ================  =============  ============  ========  ===========  =============\ncategory     transform_name    hasher_name       threshold    recall    precision    n_exemplars\n===========  ================  =============  ============  ========  ===========  =============\npaintings    blur2             ahash            0.0078125     51.724          100           2204\npaintings    blur2             blockmean        0.0123967     85.753          100           2204\npaintings    blur2             dhash            0.105469     100              100           2204\npaintings    blur2             marrhildreth     0.0989583    100              100           2204\npaintings    blur2             pdq              0.117188     100              100           2204\npaintings    blur2             phash            0.0390625    100              100           2204\npaintings    blur2             shrinkhash      60.8112        43.33           100           2204\npaintings    blur2             wavelet          0.0117188     66.379          100           2204\npaintings    crop0.05          ahash            0.00390625     0.045          100           2204\npaintings    crop0.05          blockmean        0.0123967      0.227          100           2204\npaintings    crop0.05          dhash            0.210938       7.577          100           2204\npaintings    crop0.05          marrhildreth     0.213542       3.584          100           2204\npaintings    crop0.05          pdq              0.257812       8.439          100           2204\npaintings    crop0.05          phash            0.226562       6.76           100           2204\npaintings    crop0.05          shrinkhash      95.0053         2.269          100           2204\npaintings    crop0.05          wavelet          0.0078125      0              nan           2204\npaintings    gamma2            ahash            0.00390625     0.998          100           2204\npaintings    gamma2            blockmean        0.0072314      1.724          100           2204\npaintings    gamma2            dhash            0.167969      98.639          100           2204\npaintings    gamma2            marrhildreth     0.159722      99.41           100           2204\npaintings    gamma2            pdq              0.164062     100              100           2204\npaintings    gamma2            phash            0.164062     100              100           2204\npaintings    gamma2            shrinkhash      46.5296         0              nan           2204\npaintings    gamma2            wavelet          0.0117188     18.512          100           2204\npaintings    jpeg95            ahash            0.00390625     4.22           100           2204\npaintings    
jpeg95            blockmean        0.0134298     28.811          100           2204\npaintings    jpeg95            dhash            0.191406      94.782          100           2204\npaintings    jpeg95            marrhildreth     0.168403      82.985          100           2204\npaintings    jpeg95            pdq              0.257812     100              100           2204\npaintings    jpeg95            phash            0.234375     100              100           2204\npaintings    jpeg95            shrinkhash      66.053         55.172          100           2204\npaintings    jpeg95            wavelet          0              0              nan           2204\npaintings    noise0.2          ahash            0.00390625     2.677          100           2204\npaintings    noise0.2          blockmean        0.00826446     6.987          100           2204\npaintings    noise0.2          dhash            0.25          93.648          100           2204\npaintings    noise0.2          marrhildreth     0.170139      73.911          100           2204\npaintings    noise0.2          pdq              0.257812      99.229          100           2204\npaintings    noise0.2          phash            0.257812     100              100           2204\npaintings    noise0.2          shrinkhash     169.387          3.312          100           2204\npaintings    noise0.2          wavelet          0.0078125      1.407          100           2204\npaintings    noop              ahash            0            100              100           2204\npaintings    noop              blockmean        0            100              100           2204\npaintings    noop              dhash            0            100              100           2204\npaintings    noop              marrhildreth     0            100              100           2204\npaintings    noop              pdq              0            100              100           2204\npaintings    noop              phash            0            100              100           2204\npaintings    noop              shrinkhash       0            100              100           2204\npaintings    noop              wavelet          0            100              100           2204\npaintings    pad0.2            ahash            0.0703125      0              nan           2204\npaintings    pad0.2            blockmean        0.0795455      0              nan           2204\npaintings    pad0.2            dhash            0.210938       1.089          100           2204\npaintings    pad0.2            marrhildreth     0.177083       0              nan           2204\npaintings    pad0.2            pdq              0.289062       1.86           100           2204\npaintings    pad0.2            phash            0.273438       2.541          100           2204\npaintings    pad0.2            shrinkhash     146.325          0.181          100           2204\npaintings    pad0.2            wavelet          0.109375       0              nan           2204\npaintings    resize0.5         ahash            0.0078125     76.089          100           2204\npaintings    resize0.5         blockmean        0.0144628     98.185          100           2204\npaintings    resize0.5         dhash            0.0976562    100              100           2204\npaintings    resize0.5         marrhildreth     0.154514      99.819          100           2204\npaintings    resize0.5         pdq              0.1875       100              100           2204\npaintings    resize0.5         phash    
        0.09375      100              100           2204\npaintings    resize0.5         shrinkhash      56.9034        76.27           100           2204\npaintings    resize0.5         wavelet          0.0117188     84.71           100           2204\npaintings    rotate4           ahash            0.0390625      2.949          100           2204\npaintings    rotate4           blockmean        0.0382231      2.949          100           2204\npaintings    rotate4           dhash            0.207031      36.298          100           2204\npaintings    rotate4           marrhildreth     0.227431      61.978          100           2204\npaintings    rotate4           pdq              0.273438      56.08           100           2204\npaintings    rotate4           phash            0.257812      61.615          100           2204\npaintings    rotate4           shrinkhash      69.1737         2.813          100           2204\npaintings    rotate4           wavelet          0.03125        0.136          100           2204\npaintings    vignette          ahash            0.0429688      6.171          100           2204\npaintings    vignette          blockmean        0.0475207      8.122          100           2204\npaintings    vignette          dhash            0.121094      32.305          100           2204\npaintings    vignette          marrhildreth     0.177083      77.904          100           2204\npaintings    vignette          pdq              0.132812     100              100           2204\npaintings    vignette          phash            0.132812     100              100           2204\npaintings    vignette          shrinkhash     102.186          3.267          100           2204\npaintings    vignette          wavelet          0.046875       3.085          100           2204\npaintings    watermark         ahash            0.00390625    20.054          100           2204\npaintings    watermark         blockmean        0.0123967     45.145          100           2204\npaintings    watermark         dhash            0.0585938    100              100           2204\npaintings    watermark         marrhildreth     0.0625       100              100           2204\npaintings    watermark         pdq              0.273438      98.866          100           2204\npaintings    watermark         phash            0.28125       99.456          100           2204\npaintings    watermark         shrinkhash     104.398         75.998          100           2204\npaintings    watermark         wavelet          0.0117188     51.27           100           2204\nphotographs  blur2             ahash            0.015625      76.727          100           1650\nphotographs  blur2             blockmean        0.0330579     98              100           1650\nphotographs  blur2             dhash            0.0859375     98.97           100           1650\nphotographs  blur2             marrhildreth     0.107639      97.576          100           1650\nphotographs  blur2             pdq              0.304688     100              100           1650\nphotographs  blur2             phash            0.179688     100              100           1650\nphotographs  blur2             shrinkhash     117.627         44              100           1650\nphotographs  blur2             wavelet          0.0195312     79.879          100           1650\nphotographs  crop0.05          ahash            0.0078125      0.182          100           1650\nphotographs  crop0.05          blockmean        0.0258264      
0.788          100           1650\nphotographs  crop0.05          dhash            0.0976562      1.091          100           1650\nphotographs  crop0.05          marrhildreth     0.173611       3.152          100           1650\nphotographs  crop0.05          pdq              0.304688      30.606          100           1650\nphotographs  crop0.05          phash            0.320312      63.697          100           1650\nphotographs  crop0.05          shrinkhash     125.94           1.152          100           1650\nphotographs  crop0.05          wavelet          0.015625       0.182          100           1650\nphotographs  gamma2            ahash            0.015625       8.182          100           1650\nphotographs  gamma2            blockmean        0.0268595     17.212          100           1650\nphotographs  gamma2            dhash            0.101562      90.303          100           1650\nphotographs  gamma2            marrhildreth     0.105903      90.909          100           1650\nphotographs  gamma2            pdq              0.210938     100              100           1650\nphotographs  gamma2            phash            0.234375     100              100           1650\nphotographs  gamma2            shrinkhash     119.683          0.545          100           1650\nphotographs  gamma2            wavelet          0.0195312     18.424          100           1650\nphotographs  jpeg95            ahash            0.0117188     29.879          100           1650\nphotographs  jpeg95            blockmean        0.0278926     76.788          100           1650\nphotographs  jpeg95            dhash            0.121094      84.182          100           1650\nphotographs  jpeg95            marrhildreth     0.104167      69.576          100           1650\nphotographs  jpeg95            pdq              0.296875      99.879          100           1650\nphotographs  jpeg95            phash            0.28125       99.879          100           1650\nphotographs  jpeg95            shrinkhash     131.031         89.212          100           1650\nphotographs  jpeg95            wavelet          0.0195312     40.242          100           1650\nphotographs  noise0.2          ahash            0.015625      27.636          100           1650\nphotographs  noise0.2          blockmean        0.036157      75.091          100           1650\nphotographs  noise0.2          dhash            0.121094      54.121          100           1650\nphotographs  noise0.2          marrhildreth     0.0989583     46.364          100           1650\nphotographs  noise0.2          pdq              0.296875      99.697          100           1650\nphotographs  noise0.2          phash            0.304688      99.818          100           1650\nphotographs  noise0.2          shrinkhash     210.661         57.576          100           1650\nphotographs  noise0.2          wavelet          0.0234375     27.03           100           1650\nphotographs  noop              ahash            0            100              100           1650\nphotographs  noop              blockmean        0            100              100           1650\nphotographs  noop              dhash            0            100              100           1650\nphotographs  noop              marrhildreth     0            100              100           1650\nphotographs  noop              pdq              0            100              100           1650\nphotographs  noop              phash            0            100              100         
  1650\nphotographs  noop              shrinkhash       0            100              100           1650\nphotographs  noop              wavelet          0            100              100           1650\nphotographs  pad0.2            ahash            0.0429688      0.061          100           1650\nphotographs  pad0.2            blockmean        0.0320248      0              nan           1650\nphotographs  pad0.2            dhash            0.105469       0.545          100           1650\nphotographs  pad0.2            marrhildreth     0.177083       0.121          100           1650\nphotographs  pad0.2            pdq              0.28125        1.455          100           1650\nphotographs  pad0.2            phash            0.289062       3.515          100           1650\nphotographs  pad0.2            shrinkhash     114.721          0.061          100           1650\nphotographs  pad0.2            wavelet          0.0820312      0              nan           1650\nphotographs  resize0.5         ahash            0.015625      87.697          100           1650\nphotographs  resize0.5         blockmean        0.0330579     99.152          100           1650\nphotographs  resize0.5         dhash            0.0898438     98.485          100           1650\nphotographs  resize0.5         marrhildreth     0.111111      95.394          100           1650\nphotographs  resize0.5         pdq              0.328125      99.818          100           1650\nphotographs  resize0.5         phash            0.234375     100              100           1650\nphotographs  resize0.5         shrinkhash     132.117         80.242          100           1650\nphotographs  resize0.5         wavelet          0.0195312     88.97           100           1650\nphotographs  rotate4           ahash            0.0273438      1.818          100           1650\nphotographs  rotate4           blockmean        0.0371901      3.879          100           1650\nphotographs  rotate4           dhash            0.09375        2.97           100           1650\nphotographs  rotate4           marrhildreth     0.149306       4.606          100           1650\nphotographs  rotate4           pdq              0.304688      73.394          100           1650\nphotographs  rotate4           phash            0.3125        89.818          100           1650\nphotographs  rotate4           shrinkhash     130.211          4.424          100           1650\nphotographs  rotate4           wavelet          0.0078125      0.061          100           1650\nphotographs  vignette          ahash            0.0273438      8.242          100           1650\nphotographs  vignette          blockmean        0.0320248     10              100           1650\nphotographs  vignette          dhash            0.0703125     22              100           1650\nphotographs  vignette          marrhildreth     0.0954861     38.727          100           1650\nphotographs  vignette          pdq              0.117188     100              100           1650\nphotographs  vignette          phash            0.125        100              100           1650\nphotographs  vignette          shrinkhash     138.989         11.939          100           1650\nphotographs  vignette          wavelet          0.0195312      4.242          100           1650\nphotographs  watermark         ahash            0.015625      42.667          100           1650\nphotographs  watermark         blockmean        0.0247934     60.788          100           1650\nphotographs  
watermark         dhash            0.078125     100              100           1650\nphotographs  watermark         marrhildreth     0.112847      98.727          100           1650\nphotographs  watermark         pdq              0.3125        99.818          100           1650\nphotographs  watermark         phash            0.3125        99.758          100           1650\nphotographs  watermark         shrinkhash     142.046         79.576          100           1650\nphotographs  watermark         wavelet          0.0195312     53.455          100           1650\n===========  ================  =============  ============  ========  ===========  =============\n\n================  =============  ============  ========  ===========  =============\ntransform_name    hasher_name       threshold    recall    precision    n_exemplars\n================  =============  ============  ========  ===========  =============\nblur2             ahash            0.0078125     49.014          100           3854\nblur2             blockmean        0.0123967     80.773          100           3854\nblur2             dhash            0.0859375     99.196          100           3854\nblur2             marrhildreth     0.107639      98.962          100           3854\nblur2             pdq              0.234375      99.948          100           3854\nblur2             phash            0.179688     100              100           3854\nblur2             shrinkhash      60.8112        28.412          100           3854\nblur2             wavelet          0.0117188     62.247          100           3854\ncrop0.05          ahash            0.00390625     0.052          100           3854\ncrop0.05          blockmean        0.0123967      0.208          100           3854\ncrop0.05          dhash            0.0976562      0.493          100           3854\ncrop0.05          marrhildreth     0.173611       1.635          100           3854\ncrop0.05          pdq              0.257812       9.03           100           3854\ncrop0.05          phash            0.226562       7.058          100           3854\ncrop0.05          shrinkhash      95.0053         1.427          100           3854\ncrop0.05          wavelet          0.0078125      0              nan           3854\ngamma2            ahash            0.00390625     0.934          100           3854\ngamma2            blockmean        0.0072314      1.713          100           3854\ngamma2            dhash            0.101562      90.036          100           3854\ngamma2            marrhildreth     0.105903      94.24           100           3854\ngamma2            pdq              0.210938     100              100           3854\ngamma2            phash            0.234375     100              100           3854\ngamma2            shrinkhash     108.457          0.156          100           3854\ngamma2            wavelet          0.0117188     14.997          100           3854\njpeg95            ahash            0.00390625     5.319          100           3854\njpeg95            blockmean        0.0134298     32.045          100           3854\njpeg95            dhash            0.121094      74.079          100           3854\njpeg95            marrhildreth     0.104167      59.263          100           3854\njpeg95            pdq              0.257812      99.896          100           3854\njpeg95            phash            0.234375      99.896          100           3854\njpeg95            shrinkhash      66.053         40.296          100       
    3854\njpeg95            wavelet          0.00390625     3.71           100           3854\nnoise0.2          ahash            0.00390625     2.984          100           3854\nnoise0.2          blockmean        0.00826446     8.563          100           3854\nnoise0.2          dhash            0.121094      40.088          100           3854\nnoise0.2          marrhildreth     0.0989583     33.083          100           3854\nnoise0.2          pdq              0.257812      99.222          100           3854\nnoise0.2          phash            0.273438      99.896          100           3854\nnoise0.2          shrinkhash     169.387          4.385          100           3854\nnoise0.2          wavelet          0.0078125      1.894          100           3854\nnoop              ahash            0            100              100           3854\nnoop              blockmean        0            100              100           3854\nnoop              dhash            0            100              100           3854\nnoop              marrhildreth     0            100              100           3854\nnoop              pdq              0            100              100           3854\nnoop              phash            0            100              100           3854\nnoop              shrinkhash       0            100              100           3854\nnoop              wavelet          0            100              100           3854\npad0.2            ahash            0.0429688      0.026          100           3854\npad0.2            blockmean        0.0320248      0              nan           3854\npad0.2            dhash            0.105469       0.234          100           3854\npad0.2            marrhildreth     0.177083       0.052          100           3854\npad0.2            pdq              0.28125        1.349          100           3854\npad0.2            phash            0.273438       2.387          100           3854\npad0.2            shrinkhash     114.721          0.052          100           3854\npad0.2            wavelet          0.0820312      0              nan           3854\nresize0.5         ahash            0.0078125     70.784          100           3854\nresize0.5         blockmean        0.0144628     95.226          100           3854\nresize0.5         dhash            0.0898438     99.299          100           3854\nresize0.5         marrhildreth     0.112847      97.846          100           3854\nresize0.5         pdq              0.265625      99.844          100           3854\nresize0.5         phash            0.234375     100              100           3854\nresize0.5         shrinkhash      56.9034        51.453          100           3854\nresize0.5         wavelet          0.0117188     80.747          100           3854\nrotate4           ahash            0.0273438      1.297          100           3854\nrotate4           blockmean        0.0371901      3.036          100           3854\nrotate4           dhash            0.09375        1.401          100           3854\nrotate4           marrhildreth     0.149306       3.762          100           3854\nrotate4           pdq              0.273438      54.489          100           3854\nrotate4           phash            0.257812      59.626          100           3854\nrotate4           shrinkhash      69.1737         1.894          100           3854\nrotate4           wavelet          0.0078125      0.026          100           3854\nvignette          ahash            0.0273438      4.67      
     100           3854\nvignette          blockmean        0.0320248      6.098          100           3854\nvignette          dhash            0.0703125     12.195          100           3854\nvignette          marrhildreth     0.0954861     30.54           100           3854\nvignette          pdq              0.132812     100              100           3854\nvignette          phash            0.132812     100              100           3854\nvignette          shrinkhash     103.005          4.541          100           3854\nvignette          wavelet          0.0195312      1.946          100           3854\nwatermark         ahash            0.00390625    18.5            100           3854\nwatermark         blockmean        0.0123967     41.593          100           3854\nwatermark         dhash            0.078125     100              100           3854\nwatermark         marrhildreth     0.112847      99.455          100           3854\nwatermark         pdq              0.273438      99.014          100           3854\nwatermark         phash            0.28125       99.377          100           3854\nwatermark         shrinkhash     104.398         71.199          100           3854\nwatermark         wavelet          0.0117188     46.912          100           3854\n================  =============  ============  ========  ===========  =============\n\n=============  ===========  ========  ===========  =============\nhasher_name      threshold    recall    precision    n_exemplars\n=============  ===========  ========  ===========  =============\nahash           0.00390625    17.578     100               42394\nblockmean       0.00826446    27.714     100               42394\ndhash           0.0859375     51.981      99.9952          42394\nmarrhildreth    0.100694      55.942      99.9957          42394\npdq             0.257812      77.181      99.9969          42394\nphash           0.273438      81.967      99.9942          42394\nshrinkhash     56.9034        22.378     100               42394\nwavelet         0.00390625    18.467     100               42394\n=============  ===========  ========  ===========  =============\n\nVideo Hashing\n=============\n\nThe example below does the following:\n\n- Download a benchmarking dataset. Here we use the `Charades <https://prior.allenai.org/projects/charades>`_ dataset, which contains over 9,000 videos.\n- Load the dataset.\n- Transform the dataset to generate synthetically altered videos. Our hashers are responsible for\n  matching the altered videos with the originals.\n- Define some hashers we want to evaluate.\n- Compute all the hashes.\n- Report metrics for each video category / hasher / transformation combination to see how well our hashers\n  can match the altered videos to the original (\"no-op\" videos).\n\n.. code-block:: python\n\n    import os\n    import zipfile\n    import urllib.request\n\n\n    import pandas as pd\n\n    import perception.benchmarking\n    import perception.hashers\n\n    if not os.path.isdir('Charades_v1_480'):\n        # Download the dataset since it appears we do not have it. 
Note that\n        # these are large files (> 13GB).\n        urllib.request.urlretrieve(\n            url='http://ai2-website.s3.amazonaws.com/data/Charades_v1_480.zip',\n            filename='Charades_v1_480.zip'\n        )\n        with zipfile.ZipFile('Charades_v1_480.zip') as zfile:\n            zfile.extractall('.')\n        urllib.request.urlretrieve(\n            url='http://ai2-website.s3.amazonaws.com/data/Charades.zip',\n            filename='Charades.zip'\n        )\n        with zipfile.ZipFile('Charades.zip') as zfile:\n            zfile.extractall('.')\n\n\n    # These are files that we've identified as having identical subsequences, typically\n    # when a person is out of frame and the backgrounds are the same.\n    duplicates = [\n        ('0HVVN.mp4', 'UZRQD.mp4'), ('ZIOET.mp4', 'YGXX6.mp4'), ('82XPD.mp4', 'E7QDZ.mp4'),\n        ('FQDS1.mp4', 'AIOTI.mp4'), ('PBV4T.mp4', 'XXYWL.mp4'), ('M0P0H.mp4', 'STY6W.mp4'),\n        ('3Q92U.mp4', 'GHPO3.mp4'), ('NFIQM.mp4', 'I2DHG.mp4'), ('PIRMO.mp4', '0GFE8.mp4'),\n        ('LRPBA.mp4', '9VK0J.mp4'), ('UI0QG.mp4', 'FHXKQ.mp4'), ('Y05U8.mp4', '4RVZB.mp4'),\n        ('J6TVB.mp4', '2ZBL5.mp4'), ('A8T8V.mp4', 'IGOQK.mp4'), ('H8QM1.mp4', 'QYMWC.mp4'),\n        ('O45BC.mp4', 'ZS7X6.mp4'), ('NOP6W.mp4', 'F7KFE.mp4'), ('4MPPQ.mp4', 'A3M94.mp4'),\n        ('L8FFR.mp4', 'M8MP0.mp4'), ('EHYXP.mp4', 'O8PO3.mp4'), ('MGBLJ.mp4', 'RIEG6.mp4'),\n        ('53FPM.mp4', 'BLFEV.mp4'), ('UIIF3.mp4', 'TKEKQ.mp4'), ('GVX7E.mp4', '7GPSY.mp4'),\n        ('T7HZB.mp4', '6KGZA.mp4'), ('65M4K.mp4', 'UDGP2.mp4'), ('6SS4H.mp4', 'CK6OL.mp4'),\n        ('OVHFT.mp4', 'GG1X2.mp4'), ('VEHER.mp4', 'XBPEJ.mp4'), ('WN38A.mp4', '2QI8F.mp4'),\n        ('UMXKN.mp4', 'EOKJ0.mp4'), ('OSIKP.mp4', 'WT2C0.mp4'), ('H5V2Y.mp4', 'ZXN6A.mp4'),\n        ('XS6PF.mp4', '1WJ6O.mp4'), ('S2XJW.mp4', 'YH0BX.mp4'), ('UO607.mp4', 'Z5JZD.mp4'),\n        ('XN64E.mp4', 'CSRZM.mp4'), ('YXI7M.mp4', 'IKQLJ.mp4'), ('1B9C8.mp4', '004QE.mp4'),\n        ('V1SQH.mp4', '48WOM.mp4'), ('107YZ.mp4', 'I049A.mp4'), ('3S6WL.mp4', 'SC5YW.mp4'),\n        ('OY50Q.mp4', '5T607.mp4'), ('XKH7W.mp4', '028CE.mp4'), ('X8XQE.mp4', 'J0VXY.mp4'),\n        ('STB0G.mp4', 'J0VXY.mp4'), ('UNXLF.mp4', 'J0VXY.mp4'), ('56PK0.mp4', 'M1TZR.mp4'),\n        ('FVITB.mp4', 'R0M34.mp4'), ('BPZE3.mp4', 'R0M34.mp4'), ('VS7DA.mp4', '1X0M3.mp4'),\n        ('I7MEA.mp4', 'YMM1Z.mp4'), ('9N76L.mp4', '0LDP7.mp4'), ('AXS82.mp4', 'W8WRK.mp4'),\n        ('8TSU4.mp4', 'MXATD.mp4'), ('80FWF.mp4', '18HFG.mp4'), ('RO3A2.mp4', 'V4HY4.mp4'),\n        ('HU409.mp4', 'BDWIX.mp4'), ('3YY88.mp4', 'EHHRS.mp4'), ('65RS3.mp4', 'SLIH4.mp4'),\n        ('LR0L8.mp4', 'Y665P.mp4'), ('DVPL2.mp4', 'EI5M3.mp4'), ('0EGNU.mp4', 'CU3JE.mp4'),\n        ('94KP4.mp4', '94KP4.mp4'), ('79QDP.mp4', '79QDP.mp4'), ('GKBX9.mp4', 'GKBX9.mp4'),\n        ('RX6R8.mp4', 'RX6R8.mp4'), ('PMVT7.mp4', 'PMVT7.mp4'), ('XNXW6.mp4', 'XNXW6.mp4'),\n        ('I005F.mp4', 'I005F.mp4'), ('TF95Y.mp4', 'TF95Y.mp4'), ('79QDP.mp4', '79QDP.mp4'),\n        ('LQGMM.mp4', 'LQGMM.mp4'), ('QCAUL.mp4', 'QCAUL.mp4'), ('GFVSV.mp4', 'GFVSV.mp4'),\n        ('4UYGY.mp4', '4UYGY.mp4'), ('BYDSE.mp4', 'BYDSE.mp4'), ('PV3KQ.mp4', 'PV3KQ.mp4'),\n        ('1X0M3.mp4', '1X0M3.mp4'), ('T5FHD.mp4', 'T5FHD.mp4'), ('QRHJJ.mp4', 'QRHJJ.mp4'),\n        ('JYBGS.mp4', 'JYBGS.mp4'), ('N2XCF.mp4', 'N2XCF.mp4'), ('OZPA9.mp4', 'OZPA9.mp4'),\n        ('297S4.mp4', '297S4.mp4'), ('LHU7D.mp4', 'LHU7D.mp4'), ('TSKZL.mp4', 'TSKZL.mp4'),\n        ('BCONW.mp4', 'BCONW.mp4'), ('KBPDM.mp4', 'KBPDM.mp4'), ('7FTBS.mp4', '7FTBS.mp4'),\n      
  ('099Y1.mp4', '099Y1.mp4'), ('S2RIQ.mp4', 'S2RIQ.mp4'), ('22FJU.mp4', '22FJU.mp4'),\n        ('99UA6.mp4', '99UA6.mp4'), ('WJ13E.mp4', 'WJ13E.mp4'), ('5OLVC.mp4', '5OLVC.mp4'),\n        ('YQ6Z6.mp4', 'YQ6Z6.mp4'), ('T5MLJ.mp4', 'T5MLJ.mp4'), ('0VOQC.mp4', '0VOQC.mp4'),\n        ('S2RIQ.mp4', 'S2RIQ.mp4'), ('2VNXF.mp4', '2VNXF.mp4'), ('G87XG.mp4', 'G87XG.mp4'),\n        ('RRS54.mp4', 'RRS54.mp4'), ('TXJK7.mp4', 'TXJK7.mp4'), ('G4KE3.mp4', 'G4KE3.mp4'),\n        ('3SNSC.mp4', '3SNSC.mp4'), ('U2FA5.mp4', 'U2FA5.mp4'), ('9AFQ7.mp4', '9AFQ7.mp4')\n    ]\n\n    blacklist = [fp1 for fp1, fp2 in duplicates]\n    df = pd.concat([pd.read_csv('Charades/Charades_v1_test.csv'), pd.read_csv('Charades/Charades_v1_train.csv')])\n    df = df[~(df['id'] + '.mp4').isin(blacklist)]\n    df['filepath'] = df['id'].apply(lambda video_id: os.path.join('Charades_v1_480', video_id + '.mp4'))\n    assert df['filepath'].apply(os.path.isfile).all(), 'Some video files are missing.'\n    dataset = perception.benchmarking.BenchmarkVideoDataset.from_tuples(\n        files=df[['filepath', 'scene']].itertuples(index=False)\n    )\n\n    if not os.path.isdir('benchmarking_videos'):\n        # We haven't computed the transforms yet, so we do that\n        # now. Below, we create the following files for each of\n        # the videos in our dataset. Note that the only required\n        # transform is `noop` (see documentation for\n        # perception.benchmarking.BenchmarkVideoDataset.transform).\n        #\n        # noop: This is the base video we'll actually use in benchmarking, rather\n        #       than using the raw video. It is the same as the raw video but downsampled\n        #       to a size that is reasonable for hashing (240p). This is because all\n        #       of our hashers downsample to a size smaller than this anyway, so there\n        #       is no benefit to a higher resolution. 
Also, we limit the length to the\n        #       first five minutes of the video, which speeds everything up significantly.\n        # shrink: Shrink the noop video down to 70% of its original size.\n        # clip0.2: Clip the first 20% and last 20% of the noop video off.\n        # slideshow: Create a slideshow version of the video that grabs frames periodically\n        #            from the original.\n        # black_frames: Add black frames before the start and after the end of the video.\n        # gif: Create a GIF from the video (similar to slideshow but with re-encoding).\n        # black_padding: Add black bars to the top and bottom of the video.\n        pad_width = 240\n        pad_height = 320\n        transforms = {\n            'noop': perception.benchmarking.video_transforms.get_simple_transform(\n                width='ceil(min(240/max(iw, ih), 1)*iw/2)*2',\n                height='ceil(min(240/max(iw, ih), 1)*ih/2)*2',\n                codec='h264',\n                output_ext='.m4v',\n                sar='1/1',\n                clip_s=(None, 60*5)\n            ),\n            'shrink': perception.benchmarking.video_transforms.get_simple_transform(\n                width='ceil(0.7*iw/2)*2',\n                height='ceil(0.7*ih/2)*2'\n            ),\n            'clip0.2': perception.benchmarking.video_transforms.get_simple_transform(clip_pct=(0.2, 0.8)),\n            'slideshow': perception.benchmarking.video_transforms.get_slideshow_transform(\n                frame_input_rate=1/2.5, frame_output_rate=0.5, max_frames=10, offset=1.3),\n            'black_frames': perception.benchmarking.video_transforms.get_black_frame_padding_transform(0.5, 0.05),\n            'gif': perception.benchmarking.video_transforms.get_simple_transform(\n                output_ext='.gif', codec='gif', clip_s=(1.2, 10.2), fps=1/2.5\n            ),\n            'black_padding': perception.benchmarking.video_transforms.get_simple_transform(\n                width=f'(iw*sar)*min({pad_width}/(iw*sar),{pad_height}/ih)', height=f'ih*min({pad_width}/(iw*sar),{pad_height}/ih)',\n                pad=f'{pad_width}:{pad_height}:({pad_width}-iw*min({pad_width}/iw,{pad_height}/ih))/2:({pad_height}-ih*min({pad_width}/iw,{pad_height}/ih))/2'\n            )\n        }\n\n        # Save the transforms for later.\n        transformed = dataset.transform(transforms=transforms, storage_dir='benchmarking_videos')\n\n    transformed = perception.benchmarking.BenchmarkVideoTransforms.load('benchmarking_videos', verify_md5=False)\n\n    phashu8 = perception.hashers.PHashU8(exclude_first_term=False, freq_shift=1, hash_size=12)\n    hashers = {\n        'phashu8_framewise': perception.hashers.FramewiseHasher(\n            frames_per_second=1, frame_hasher=phashu8, interframe_threshold=50, quality_threshold=90),\n        'phashu8_tmkl1': perception.hashers.SimpleSceneDetection(\n            base_hasher=perception.hashers.TMKL1(\n                frames_per_second=5, frame_hasher=phashu8,\n                distance_metric='euclidean', dtype='uint8',\n                norm=None, quality_threshold=90),\n            max_scene_length=1\n        )\n    }\n    if not os.path.isfile('hashes.csv'):\n        # We haven't computed the hashes, so we do that now.\n        hashes = transformed.compute_hashes(hashers=hashers, max_workers=5)\n        # Save the hashes for later. 
It took a long time after all!\n        hashes.save('hashes.csv')\n\n    hashes = perception.benchmarking.BenchmarkHashes.load('hashes.csv')\n\n    hashes.compute_threshold_recall(precision_threshold=99.9, grouping=['transform_name'])\n\n\n================  =================  ===========  ========  ===========  =============\ntransform_name    hasher_name          threshold    recall    precision    n_exemplars\n================  =================  ===========  ========  ===========  =============\nblack_frames      phashu8_framewise      51.0979    88.12       99.9069         278644\nblack_frames      phashu8_tmkl1          55.7584    99.918      99.9079         403768\nblack_padding     phashu8_framewise      74.6391     7.662     100              277399\nblack_padding     phashu8_tmkl1          53.8702    99.898      99.9079         406899\nclip0.2           phashu8_framewise      54.8635    90.741      99.9098         224264\nclip0.2           phashu8_tmkl1          59.0424    99.724      99.9077         324251\ngif               phashu8_framewise      55.4437    68.21       99.9088          82232\ngif               phashu8_tmkl1          55.4887    81.029      99.9103          39757\nnoop              phashu8_framewise       0        100         100              282658\nnoop              phashu8_tmkl1           0        100         100              408871\nshrink            phashu8_framewise      24.7184   100         100              281731\nshrink            phashu8_tmkl1          49.8999    99.836      99.9078         400650\nslideshow         phashu8_framewise      56.9825    99.713      99.9076         172829\nslideshow         phashu8_tmkl1          56.8683    95.934      99.9035          90684\n================  =================  ===========  ========  ===========  =============\n"
  },
  {
    "path": "docs/examples/deduplication.rst",
    "content": "Media Deduplication\n*******************\n\nPerceptual hashes can be used to deduplicate sets of images. Below we provide two examples (one simple, one larger scale).\n\n**For most use cases, we recommend using PHash with** :code:`hash_size=16` **and\nwith 0.2 as the distance threshold as in the example below.** You may wish to adjust\nthis threshold up or down based on your tolerance for false negatives / positives.\n\nIn practice, deduplicating in memory on your machine by the methods below may be impractical.\nFor larger-scale applications, you may wish to use tools like\n`FAISS <https://github.com/facebookresearch/faiss>`_,\n`Annoy <https://github.com/spotify/annoy>`_, or databases with\nfunctionality for querying based on distance such as\n`MemSQL <https://docs.memsql.com/sql-reference/v6.8/euclidean_distance/>`_.\n\nFor the supported hashers, below are our recommended thresholds with expected false positive rates of <1%.\n\n======================  ===========\nhasher                  threshold\n======================  ===========\nahash (hash_size=16)    0.008\nblockmean               0.008\ndhash (hash_size=16)    0.07\nmarrhildreth            0.1\npdq                     0.2\nphash (hash_size=16)    0.2\nwavelet (hash_size=16)  0.02\n======================  ===========\n\nSimple example\n==============\n\nIn this example, we download a ZIP file containing 18 images. One of the images is duplicated\ntwice and another image is duplicated once.\n\n.. code-block:: python\n\n    import os\n    import glob\n    import zipfile\n    import urllib.request\n\n    import tabulate\n    import pandas as pd\n\n    from perception import tools, hashers\n\n    urllib.request.urlretrieve(\n        \"https://thorn-perception.s3.amazonaws.com/thorn-perceptual-deduplication-example.zip\",\n        \"thorn-perceptual-deduplication-example.zip\"\n    )\n\n    with zipfile.ZipFile('thorn-perceptual-deduplication-example.zip') as f:\n        f.extractall('.')\n        \n    filepaths = glob.glob('thorn-perceptual-deduplication-example/*.jpg')\n    duplicate_pairs = tools.deduplicate(files=filepaths, hashers=[(hashers.PHash(hash_size=16), 0.2)])\n    print(tabulate.tabulate(pd.DataFrame(duplicate_pairs), showindex=False, headers=['file1', 'file2'], tablefmt='rst'))\n    \n    # Now we can do whatever we want with the duplicates. We could just delete\n    # the first entry in each pair or manually verify the pairs to ensure they\n    # are, in fact duplicates.\n\n\n===============================================  ===============================================\nfile1                                            file2\n===============================================  ===============================================\nthorn-perceptual-deduplication-example/309b.jpg  thorn-perceptual-deduplication-example/309.jpg\nthorn-perceptual-deduplication-example/309b.jpg  thorn-perceptual-deduplication-example/309a.jpg\nthorn-perceptual-deduplication-example/309a.jpg  thorn-perceptual-deduplication-example/309.jpg\nthorn-perceptual-deduplication-example/315a.jpg  thorn-perceptual-deduplication-example/315.jpg\n===============================================  ===============================================\n\nReal-world example\n==================\n\nIn the example below, we use the \n`Caltech 256 Categories <http://www.vision.caltech.edu/Image_Datasets/Caltech256>`_ dataset. Like\nmost other public image datasets, it contains a handful of duplicates in some categories.\n\nThe code below will:\n\n1. 
Download the dataset\n2. Group all the filepaths by category (the dataset is provided in folders)\n3. Within each group, find duplicates using PHash. We will compare not just the\n   original images, but also the 8 isometric transformations for each image.\n\n.. code-block:: python\n\n    import os\n    import tarfile\n    from glob import glob\n    import urllib.request\n\n    import tqdm\n\n    from perception import hashers, tools\n\n    urllib.request.urlretrieve(\n        \"http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar\",\n        \"256_ObjectCategories.tar\"\n    )\n    with tarfile.open('256_ObjectCategories.tar') as tfile:\n        tfile.extractall()\n\n    files = glob('256_ObjectCategories/**/*.jpg')\n\n    # To reduce the number of pairwise comparisons,\n    # we can deduplicate within each image category\n    # (i.e., we don't need to compare images of\n    # butterflies with images of chess boards).\n    filepath_group = [\n        (\n            filepath,\n            os.path.normpath(filepath).split(os.sep)[-2]\n        ) for filepath in files\n    ]\n    groups = list(set([group for _, group in filepath_group]))\n\n    # We consider any pair of images with a PHash distance of < 0.2\n    # as a duplicate.\n    comparison_hashers = [(hashers.PHash(hash_size=16), 0.2)]\n\n    duplicate_pairs = []\n\n    for current_group in groups:\n        current_filepaths = [\n            filepath for filepath, group in filepath_group if group == current_group\n        ]\n        current_duplicate_pairs = tools.deduplicate(\n            files=current_filepaths,\n            hashers=comparison_hashers,\n            isometric=True,\n            progress=tqdm.tqdm\n        )\n        duplicate_pairs.extend(current_duplicate_pairs)\n\n    # Now we can do whatever we want with the duplicates. We could just delete\n    # the first entry in each pair or manually verify the pairs to ensure they\n    # are, in fact, duplicates.\n\nVideo deduplication\n===================\n\nVideo deduplication requires more thought depending on your tolerance for false positives and\nhow important temporal relationships are. Below is one example approach for deduplicating a\ngroup of videos by taking frames from each video that are sufficiently different from each other\n(to avoid keeping too many) and then using them all to find\npairs of videos that have matching frames.\n\n.. code-block:: python\n\n    import glob\n    import urllib.request\n    import zipfile\n\n    import tqdm\n\n    import perception.hashers\n    import perception.tools\n\n    # Download some example videos.\n    urllib.request.urlretrieve(\n        \"https://thorn-perception.s3.amazonaws.com/thorn-perceptual-video-deduplication-example.zip\",\n        \"thorn-perceptual-video-deduplication-example.zip\"\n    )\n\n    with zipfile.ZipFile('thorn-perceptual-video-deduplication-example.zip') as f:\n        f.extractall('.')\n\n    # Hash individual frames with PHash, setting a threshold for matching\n    # frames within videos and across videos.\n    frame_hasher = perception.hashers.PHash(hash_size=16)\n\n    hasher = perception.hashers.FramewiseHasher(\n        frames_per_second=1,\n        frame_hasher=frame_hasher,\n        interframe_threshold=50,\n        quality_threshold=90)\n\n    filepaths = glob.glob('thorn-perceptual-video-deduplication-example/*.m4v') + \\\n                glob.glob('thorn-perceptual-video-deduplication-example/*.gif')\n\n    # Returns a list of dicts with a \"filepath\" and \"hash\" key. 
\"hash\" contains a\n    # list of hashes.\n    hashes = hasher.compute_parallel(filepaths=filepaths, progress=tqdm.tqdm)\n\n\n    # Flatten the hashes into a list of (filepath, hash) tuples.\n    hashes_flattened = perception.tools.flatten([\n        [(hash_group['filepath'], hash_string) for hash_string in hash_group['hash']]\n        for hash_group in hashes\n    ])\n\n    duplicates = perception.tools.deduplicate_hashes(\n        hashes=hashes_flattened,\n        threshold=50,\n        hasher=hasher\n    )"
  },
  {
    "path": "docs/examples/detecting_csam.rst",
    "content": "Detecting Child Sexual Abuse Material\n*************************************\n\nUsing `perception` and a subscription to Thorn's Safer service,\nyou can easily check for child sexual abuse material against a database of known bad content\n**without** having to send any images to a third party. You do this by sending compact, irreversible\nimage hashes to get matches with a high degree of precision. We support matching using\n16x16 PHash hashes and md5 hashes.\n\nSee usage example below. Please contact info@getsafer.io to discuss Thorn's Safer service\nand subscription options and visit `getsafer.io <https://getsafer.io/>`_ to learn more.\n\n.. code-block:: python\n\n    from perception import tools\n    matcher = tools.SaferMatcher(\n        api_key='YOUR_API_KEY',\n        url='MATCHING_SERVICE_URL'\n    )\n    matches = matcher.match(['myfile.jpg'])\n\nIn some cases, you may have a username/password instead of an API key, in which case\nyou can pass those instead (see API documentation for details)."
  },
  {
    "path": "docs/examples/index.rst",
    "content": "Examples\n********\n\n.. toctree::\n   :maxdepth: 2\n   :caption: Contents:\n\n   deduplication\n   detecting_csam\n   benchmarking"
  },
  {
    "path": "docs/index.rst",
    "content": "perception\n==========\n\n:code:`perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing\nresearch, development, and production use. It provides a common wrapper around existing, popular perceptual hashes\n(such as those implemented by `ImageHash <https://pypi.org/project/ImageHash/>`_)\nalong with tools to compare their performance and use them for common tasks.\n\nPerceptual hashes are used to create compact image \"fingerprints\" which are invariant to small alterations to\nthe original image. Typically, the representations are compact enough that they are irreversible, which makes\nthem useful for deduplication and detecting abusive content while preserving the privacy of content owners.\n\nInstallation\n************\n\nYou can install :code:`perception` using pip. You must install OpenCV separately (e.g., with :code:`pip install opencv-python`).\n\n.. code-block:: bash\n    \n    # Install from PyPi\n    pip install perception\n\n    # Install from GitHub\n    pip install git+https://github.com/thorn-oss/perception.git#egg=perception\n\nTo install with the necessary dependencies for benchmarking, use:\n\n.. code-block:: bash\n\n    # Install from PyPi\n    pip install perception[benchmarking]\n\n    # Install from GitHub\n    pip install opencv-python git+https://github.com/thorn-oss/perception.git#egg=perception[benchmarking]\n\nGetting Started\n***************\n\nPlease see the examples for code snippets for common use cases.\n\n\n.. toctree::\n   :maxdepth: 2\n   :caption: Contents:\n\n   examples/index\n   api/index\n    \n"
  },
  {
    "path": "docs/requirements.txt",
    "content": "sphinx-autodoc-typehints==3.2.0\n# sphinx-autobuild==3.0.2\n# sphinx==1.8.3\nsphinx_rtd_theme==3.0.2\nm2r==0.3.1\nopencv-contrib-python-headless\ntqdm\nalbumentations\nffmpeg-python\ntyping-extensions\nfaiss-cpu\naiohttp\npython-json-logger\nnetworkit\n"
  },
  {
    "path": "perception/__init__.py",
    "content": "from importlib import metadata\n\n__version__ = metadata.version(\"perception\")\n"
  },
  {
    "path": "perception/approximate_deduplication/__init__.py",
    "content": "import logging\nimport math\nimport os.path as op\nimport typing\n\nimport faiss\nimport numpy as np\nimport tqdm\nimport typing_extensions\n\nfrom ._graph_backend import get_graph_backend\n\nLOGGER = logging.getLogger(__name__)\nDEFAULT_PCT_PROBE = 0\n\n\n# For faiss training on datasets larger than 50,000 vectors, we take a random sub-sample.\nTRAIN_LARGE_SIZE: int = 50_000\n\n\nclass ClusterAssignment(typing_extensions.TypedDict):\n    cluster: int\n    id: typing.Any\n\n\ndef build_index(\n    X: np.ndarray,\n    pct_probe: float = DEFAULT_PCT_PROBE,\n    approximate: bool = True,\n    use_gpu: bool = True,\n):\n    \"\"\"Buid a FAISS index from a reference dataframe.\n\n    Args:\n        X: The vectors to add to the index.\n        pct_probe: The minimum fraction of nearest lists to search. If\n            the product of pct_probe and the number of lists is less\n            than 1, one list will be searched.\n        approximate: Whether to build an approximate or exact index.\n\n    Returns:\n        An (index, lookup) tuple where the lookup returns the filepath\n        for a given entry in the index.\n    \"\"\"\n    if X is None:\n        return None\n    X = X.astype(\"float32\")\n    d = X.shape[1]\n    if approximate:\n        ntotal = X.shape[0]\n        nlist = int(max(min(4 * np.sqrt(ntotal), ntotal / 39), 1))\n        quantizer = faiss.IndexFlatL2(d)\n        index = faiss.IndexIVFFlat(quantizer, d, nlist)\n        gpu = False\n        if use_gpu:\n            try:\n                res = faiss.StandardGpuResources()\n                index = faiss.index_cpu_to_gpu(res, 0, index)\n                gpu = True\n            except AttributeError:\n                LOGGER.info(\"Building approximate FAISS index on CPU.\")\n\n        if X.shape[0] > TRAIN_LARGE_SIZE:\n            # Take random sample of 50,000 or 39 points per centroid.\n            # 39 points per centroid is the min for for not getting warnings.\n            # https://github.com/facebookresearch/faiss/wiki/FAQ#can-i-ignore-warning-clustering-xxx-points-to-yyy-centroids\n            sample_size = max(39 * nlist, TRAIN_LARGE_SIZE)\n            index.train(X[np.random.choice(X.shape[0], sample_size, replace=False)])\n        else:\n            index.train(X)\n\n        batch_size = 10_000\n        for i in range(0, X.shape[0], batch_size):\n            index.add(X[i : i + batch_size])\n        if gpu:\n            index = faiss.index_gpu_to_cpu(index)\n        nprobe = max(math.ceil(pct_probe * nlist), 1)\n        faiss.ParameterSpace().set_index_parameter(index, \"nprobe\", nprobe)\n    else:\n        index = faiss.IndexFlat(d)\n        index.add(X)\n    return index\n\n\ndef compute_euclidean_pairwise_duplicates_approx(\n    X,\n    counts,\n    threshold,\n    minimum_overlap,\n    Y=None,\n    y_counts=None,\n    pct_probe=0.1,\n    use_gpu: bool = True,\n    faiss_cache_path: str | None = None,\n    show_progress: bool = False,\n):\n    \"\"\"Provides the same result as perception.extensions.compute_pairwise_duplicates_simple\n    but uses an approximate search instead of an exhaustive search, which can dramatically reduce\n    processing time.\n\n    Args:\n        X: An array of vectors to compute pairs for.\n        Y: if provided we search in X for Y vectors.\n        counts: A list of counts of vectors for separate files in the\n            in the vectors (should add up to the length of X)\n        threshold: The threshold for a match as a euclidean distance.\n        minimum_overlap: The 
\n        minimum_overlap: The minimum overlap between two files to qualify as a match.\n        pct_probe: The minimum percentage of sublists to search for matches. The larger the\n            value, the more exhaustive the search.\n        faiss_cache_path: If provided, load any existing FAISS index from this path, and if\n            it does not exist then save the generated FAISS index to the path.\n        show_progress: Whether or not to show a progress bar while computing pairs.\n    Returns:\n        A list of pairs of matching file indexes.\n    \"\"\"\n    assert (\n        counts.sum() == X.shape[0]\n    ), \"Length of counts incompatible with vectors shape.\"\n    assert (Y is None) == (\n        y_counts is None\n    ), \"Must provide both or neither for y, y_counts.\"\n    if X.dtype != \"float32\":\n        # Only make the copy if we have to.\n        X = X.astype(\"float32\")\n\n    if Y is not None and Y.dtype != \"float32\":\n        # Only make the copy if we have to.\n        Y = Y.astype(\"float32\")\n\n    lookup_ = []\n    for idx, count in enumerate(counts):\n        lookup_.extend([idx] * count)\n    lookup = np.array(lookup_)\n\n    if faiss_cache_path is not None and op.exists(faiss_cache_path):\n        LOGGER.debug(\"Loading cached FAISS index from %s\", faiss_cache_path)\n        index = faiss.read_index(faiss_cache_path)\n        assert (\n            X.shape[0] == index.ntotal\n        ), \"Cached FAISS index does not match provided X.\"\n    else:\n        LOGGER.debug(\"Building FAISS index.\")\n        index = build_index(X=X, pct_probe=pct_probe, approximate=True, use_gpu=use_gpu)\n        if faiss_cache_path is not None:\n            faiss.write_index(index, faiss_cache_path)\n\n    LOGGER.debug(\"FAISS index ready, starting approximate search.\")\n    pairs = []\n\n    # Only use y_counts if present.\n    if y_counts is None:\n        iterator_counts = counts\n        M = X\n    else:\n        iterator_counts = y_counts\n        M = Y\n\n    for end, length, query in tqdm.tqdm(\n        zip(iterator_counts.cumsum(), iterator_counts, range(len(iterator_counts))),\n        total=len(iterator_counts),\n        disable=not show_progress,\n        desc=\"Vectors\",\n    ):\n        if length == 0:\n            continue\n        Xq = M[end - length : end]\n        lims, _, idxs = index.range_search(Xq, threshold**2)\n        lims = lims.astype(\"int32\")\n        matched = [\n            match\n            for match in np.unique(lookup[list(set(idxs))])  # type: ignore\n            if match != query\n            or Y is not None  # Exclude self matches only when Y is not provided.\n        ]\n        query_in_match: typing.Mapping[int, set] = {m: set() for m in matched}\n        match_in_query: typing.Mapping[int, set] = {m: set() for m in matched}\n        for query_idx in range(length):\n            for match_idx in idxs[lims[query_idx] : lims[query_idx + 1]]:\n                match = lookup[match_idx]\n                if (\n                    match == query and Y is None\n                ):  # Exclude self matches only when Y is not provided.\n                    continue\n                match_in_query[match].add(match_idx)\n                query_in_match[match].add(query_idx)\n        for match in matched:\n            overlap = min(\n                [\n                    len(query_in_match[match]) / length,\n                    len(match_in_query[match]) / counts[match],\n                ]\n            )\n            if overlap >= minimum_overlap and overlap > 0:\n                if Y is None:\n                    pairs.append(tuple(sorted([query, match])))\n                else:\n                    pairs.append(tuple([query, match]))\n    return list(set(pairs))\n\n\ndef pairs_to_clusters(\n    ids: typing.Iterable[str],\n    pairs: typing.Iterable[tuple[str, str]],\n    strictness: typing_extensions.Literal[\n        \"clique\", \"community\", \"component\"\n    ] = \"clique\",\n    max_clique_batch_size: int = 1000,\n) -> list[ClusterAssignment]:\n    \"\"\"Given a list of pairs of matching files, compute clusters of\n    connected files at the desired level of strictness.\n\n    Args:\n        ids: A list of node ids (e.g., filepaths).\n        pairs: A list of pairs of node ids; each pair is assumed to have an edge.\n        strictness: The level at which groups will be clustered. \"component\"\n            means that all clusters will be connected components. \"community\"\n            will select clusters of files within components that are clustered\n            together. \"clique\" will result in clusters where every file is\n            connected to every other file.\n        max_clique_batch_size: The maximum batch size for identifying\n            cliques.\n\n    Returns:\n        A list of cluster assignments (dicts with id and cluster\n        entries).\n    \"\"\"\n    assert strictness in [\"component\", \"community\", \"clique\"], \"Invalid strictness.\"\n    list_ids = list(ids)\n    id_to_node_map = {v: i for i, v in enumerate(list_ids)}\n    node_to_id_map = {v: k for k, v in id_to_node_map.items()}\n\n    LOGGER.debug(\"Building graph.\")\n    node_pairs = {(id_to_node_map[pair[0]], id_to_node_map[pair[1]]) for pair in pairs}\n    backend = get_graph_backend()\n    graph = backend.build_graph(len(list_ids), node_pairs)\n\n    assignments: list[ClusterAssignment] = []\n    cluster_index = 0\n    components = backend.connected_components(graph)\n\n    for component in components:\n        LOGGER.debug(\"Got component with size: %s\", len(component))\n        if strictness == \"component\":\n            assignments.extend(\n                [{\"id\": node_to_id_map[n], \"cluster\": cluster_index} for n in component]\n            )\n            cluster_index += 1\n            continue\n        communities = backend.communities(graph, component)\n        for community_members in communities:\n            LOGGER.debug(\"Got community with size: %s\", len(community_members))\n            if strictness == \"community\":\n                assignments.extend(\n                    [\n                        {\"id\": node_to_id_map[n], \"cluster\": cluster_index}\n                        for n in community_members\n                    ]\n                )\n                cluster_index += 1\n                continue\n\n            for clique_members in backend.maximal_cliques(\n                graph,\n                community_members,\n                max_clique_batch_size=max_clique_batch_size,\n            ):\n                assignments.extend(\n                    [\n                        {\n                            \"id\": node_to_id_map[n],\n                            \"cluster\": cluster_index,\n                        }\n                        for n in clique_members\n                    ]\n                )\n                cluster_index += 1\n\n    return assignments\n"
  },
  {
    "path": "perception/approximate_deduplication/_graph_backend.py",
    "content": "import sys\nimport typing\nfrom abc import ABC, abstractmethod\n\n\nclass GraphBackend(ABC):\n    @abstractmethod\n    def build_graph(\n        self, node_count: int, edges: typing.Iterable[tuple[int, int]]\n    ) -> typing.Any: ...\n\n    @abstractmethod\n    def connected_components(self, graph: typing.Any) -> list[list[int]]: ...\n\n    @abstractmethod\n    def communities(\n        self, graph: typing.Any, component: list[int]\n    ) -> list[list[int]]: ...\n\n    @abstractmethod\n    def maximal_cliques(\n        self,\n        graph: typing.Any,\n        community_nodes: list[int],\n        max_clique_batch_size: int,\n    ) -> list[list[int]]: ...\n\n\nclass NetworkitGraphBackend(GraphBackend):\n    def __init__(self):\n        import networkit as nk\n\n        self.nk = nk\n\n    def build_graph(\n        self, node_count: int, edges: typing.Iterable[tuple[int, int]]\n    ) -> typing.Any:\n        graph = self.nk.Graph(node_count)\n        for start, end in edges:\n            graph.addEdge(start, end)\n        return graph\n\n    def connected_components(self, graph: typing.Any) -> list[list[int]]:\n        cc_query = self.nk.components.ConnectedComponents(graph)\n        cc_query.run()\n        return cc_query.getComponents()\n\n    def communities(self, graph: typing.Any, component: list[int]) -> list[list[int]]:\n        component_node_map = dict(enumerate(component))\n        subgraph = self.nk.graphtools.subgraphFromNodes(graph, component, compact=True)\n        algo = self.nk.community.PLP(subgraph, maxIterations=32)\n        algo.run()\n        communities = algo.getPartition()\n        return [\n            [component_node_map[node] for node in communities.getMembers(community)]\n            for community in communities.subsetSizeMap().keys()\n        ]\n\n    def maximal_cliques(\n        self,\n        graph: typing.Any,\n        community_nodes: list[int],\n        max_clique_batch_size: int,\n    ) -> list[list[int]]:\n        cliques: list[list[int]] = []\n        for start in range(0, len(community_nodes), max_clique_batch_size):\n            batch_nodes = community_nodes[start : start + max_clique_batch_size]\n            community_node_map = dict(enumerate(batch_nodes))\n            subgraph = self.nk.graphtools.subgraphFromNodes(\n                graph, batch_nodes, compact=True\n            )\n\n            while subgraph.numberOfNodes() > 0:\n                clique = self.nk.clique.MaximalCliques(subgraph, maximumOnly=True)\n                clique.run()\n                clique_members = clique.getCliques()[0]\n                cliques.append([community_node_map[node] for node in clique_members])\n                for node in clique_members:\n                    subgraph.removeNode(node)\n\n        return cliques\n\n\nclass NetworkxGraphBackend(GraphBackend):\n    def __init__(self):\n        import networkx as nx\n\n        self.nx = nx\n\n    def build_graph(\n        self, node_count: int, edges: typing.Iterable[tuple[int, int]]\n    ) -> typing.Any:\n        graph = self.nx.Graph()\n        graph.add_nodes_from(range(node_count))\n        graph.add_edges_from(edges)\n        return graph\n\n    def connected_components(self, graph: typing.Any) -> list[list[int]]:\n        return [list(component) for component in self.nx.connected_components(graph)]\n\n    def communities(self, graph: typing.Any, component: list[int]) -> list[list[int]]:\n        subgraph = graph.subgraph(component)\n        return [\n            list(community)\n            for 
community in self.nx.algorithms.community.asyn_lpa_communities(\n                subgraph, seed=0\n            )\n        ]\n\n    def maximal_cliques(\n        self,\n        graph: typing.Any,\n        community_nodes: list[int],\n        max_clique_batch_size: int,\n    ) -> list[list[int]]:\n        cliques: list[list[int]] = []\n        for start in range(0, len(community_nodes), max_clique_batch_size):\n            batch_nodes = community_nodes[start : start + max_clique_batch_size]\n            subgraph = graph.subgraph(batch_nodes).copy()\n\n            while subgraph.number_of_nodes() > 0:\n                clique_members = max(\n                    self.nx.find_cliques(subgraph),\n                    key=lambda clique: (\n                        len(clique),\n                        tuple(sorted(clique)),\n                    ),\n                )\n                cliques.append(list(clique_members))\n                subgraph.remove_nodes_from(clique_members)\n\n        return cliques\n\n\ndef get_graph_backend() -> GraphBackend:\n    if sys.platform == \"darwin\":\n        return NetworkxGraphBackend()\n    return NetworkitGraphBackend()\n"
  },
  {
    "path": "perception/approximate_deduplication/debug.py",
    "content": "import logging\nimport random\n\nimport cv2\nimport numpy as np\n\nimport perception.local_descriptor_deduplication as ldd\n\nLOGGER = logging.getLogger(__name__)\n\n# Set a fixed size for drawing, we don't have the real descriptor size.\nKEYPOINT_SIZE: int = 8\n\n\ndef vizualize_pair(\n    features_1,\n    features_2,\n    ratio: float,\n    match_metadata=None,\n    local_path_col: str | None = None,\n    sanitized: bool = False,\n    include_all_points=False,\n    circle_size=KEYPOINT_SIZE,\n):\n    \"\"\"Given two rows from a reference df vizualize their overlap.\n\n    Currently recalcs overlap using cv2 default logic.\n\n    Args:\n        features_1: The row from a reference df for one image.\n        features_2: The row from a reference df for the other image.\n        ratio: Value for ratio test, suggest re-using value from matching.\n        match_metadata: metadata returned from matching, if None will redo brute force matching.\n        local_path_col: column in df with path to the image. If None will\n            use the index: features_1.name and features_2.name\n        sanitized: if True images themselves will not be rendered, only the points.\n        include_all_points: if True will draw all points, not just matched points.\n        circle_size: size of the circle to draw around keypoints.\n    Returns:\n        An image of the two images concatted together and matching keypoints drawn.\n    \"\"\"\n    # Set a fixed size for drawing, we don't have the real descriptor size.\n    if local_path_col is not None:\n        features_1_path = features_1[local_path_col]\n        features_2_path = features_2[local_path_col]\n    else:\n        features_1_path = features_1.name\n        features_2_path = features_2.name\n\n    img1 = np.zeros(\n        (features_1.dimensions[1], features_1.dimensions[0], 1), dtype=\"uint8\"\n    )\n    img2 = np.zeros(\n        (features_2.dimensions[1], features_2.dimensions[0], 1), dtype=\"uint8\"\n    )\n\n    if not sanitized:\n        try:\n            img1 = ldd.load_and_preprocess(\n                features_1_path, max_size=max(features_1.dimensions), grayscale=False\n            )\n        except Exception:\n            LOGGER.warning(\"Failed to load image %s\", features_1_path)\n        try:\n            img2 = ldd.load_and_preprocess(\n                features_2_path, max_size=max(features_2.dimensions), grayscale=False\n            )\n        except Exception:\n            LOGGER.warning(\"Failed to load image %s\", features_2_path)\n\n    if match_metadata is not None:\n        img_matched = viz_match_data(\n            features_1,\n            features_2,\n            img1,\n            img2,\n            match_metadata,\n            include_all_points=include_all_points,\n            circle_size=circle_size,\n        )\n    else:\n        LOGGER.warning(\"\"\"No match_metadata provided, recalculating match points,\n            won't match perception match points.\"\"\")\n        img_matched = viz_brute_force(features_1, features_2, img1, img2, ratio=ratio)\n\n    return img_matched\n\n\ndef viz_match_data(\n    features_1,\n    features_2,\n    img1,\n    img2,\n    match_metadata,\n    include_all_points=False,\n    circle_size=KEYPOINT_SIZE,\n):\n    \"\"\"Given match data viz matching points.\n\n    Args:\n        features_1: The row from a reference df for one image.\n        features_2: The row from a reference df for the other image.\n        img1: cv2 of first image\n        img2: cv2 of second image\n        
\n        match_metadata: metadata returned from matching; if None, will redo\n            brute force matching.\n        include_all_points: if True, will draw all points, not just matched points.\n        circle_size: size of the circle to draw around keypoints.\n\n    Returns:\n        cv2 img with matching keypoints drawn.\n    \"\"\"\n    # NOTE: could refactor to put matches into the correct format and use\n    # cv2.drawMatchesKnn, but the Python docs on the necessary class are not clear.\n\n    # Pad img1 or img2 vertically with black pixels to match the height of the other image.\n    if img1.shape[0] > img2.shape[0]:\n        img2 = np.pad(\n            img2,\n            ((0, img1.shape[0] - img2.shape[0]), (0, 0), (0, 0)),\n            mode=\"constant\",\n            constant_values=0,\n        )\n    elif img1.shape[0] < img2.shape[0]:\n        img1 = np.pad(\n            img1,\n            ((0, img2.shape[0] - img1.shape[0]), (0, 0), (0, 0)),\n            mode=\"constant\",\n            constant_values=0,\n        )\n    # Draw the two images concatenated horizontally.\n    img_matched = np.concatenate((img1, img2), axis=1)\n\n    overlay = img_matched.copy()\n\n    if include_all_points:\n        # Draw all keypoints from features_1.\n        for k in features_1[\"keypoints\"]:\n            new_color = (\n                random.randint(0, 255),\n                random.randint(0, 255),\n                random.randint(0, 255),\n            )\n            # Draw a semi-transparent circle.\n            cv2.circle(img_matched, (int(k[0]), int(k[1])), circle_size, new_color, 1)\n\n        # Draw all keypoints from features_2.\n        for k in features_2[\"keypoints\"]:\n            new_color = (\n                random.randint(0, 255),\n                random.randint(0, 255),\n                random.randint(0, 255),\n            )\n            cv2.circle(\n                img_matched,\n                (int(k[0] + features_1.dimensions[0]), int(k[1])),\n                circle_size,\n                new_color,\n                1,\n            )\n\n    # Draw lines between matching points.\n    for i in range(len(match_metadata[\"final_matched_b_pts\"])):\n        new_color = (\n            random.randint(0, 255),\n            random.randint(0, 255),\n            random.randint(0, 255),\n        )\n        a_pt = (\n            int(match_metadata[\"final_matched_a_pts\"][i][0]),\n            int(match_metadata[\"final_matched_a_pts\"][i][1]),\n        )\n        b_pt = (\n            int(match_metadata[\"final_matched_b_pts\"][i][0] + features_1.dimensions[0]),\n            int(match_metadata[\"final_matched_b_pts\"][i][1]),\n        )\n        cv2.circle(img_matched, a_pt, circle_size, new_color, 1)\n        cv2.circle(img_matched, b_pt, circle_size, new_color, 1)\n        cv2.line(\n            img_matched,\n            a_pt,\n            b_pt,\n            new_color,\n            1,\n        )\n\n    # Blend the unannotated overlay with the annotated image to make the\n    # lines and circles semi-transparent.\n    alpha = 0.4  # Transparency factor.\n    img_matched = cv2.addWeighted(overlay, alpha, img_matched, 1 - alpha, 0)\n\n    return img_matched\n\n\ndef viz_brute_force(features_1, features_2, img1, img2, ratio: float):\n    \"\"\"\n    Given two rows from a reference df, visualize their overlap.\n\n    NOTE: It redoes matching using cv2 brute force, so it will not match the same\n        as the perception matching code.\n\n    Args:\n        features_1: The row from a reference df for one image.
\n        features_2: The row from a reference df for the other image.\n        img1: cv2 of first image\n        img2: cv2 of second image\n        ratio: Value for the ratio test; suggest re-using the value from matching.\n\n    Returns:\n        An image of the two images concatenated together with matching keypoints drawn.\n    \"\"\"\n    # Convert numpy keypoints to cv2.KeyPoints.\n    kp1_fixed = []\n    for k in features_1[\"keypoints\"]:\n        kp1_fixed.append(cv2.KeyPoint(k[0], k[1], KEYPOINT_SIZE))\n\n    kp2_fixed = []\n    for k in features_2[\"keypoints\"]:\n        kp2_fixed.append(cv2.KeyPoint(k[0], k[1], KEYPOINT_SIZE))\n    brute_force_matcher = cv2.BFMatcher()\n    kn_matches = brute_force_matcher.knnMatch(\n        features_1[\"descriptors\"], features_2[\"descriptors\"], k=2\n    )\n    # Apply the ratio test.\n    good = []\n    for nearest_match, next_nearest_match in kn_matches:\n        if nearest_match.distance < ratio * next_nearest_match.distance:\n            good.append([nearest_match])\n    img_matched = cv2.drawMatchesKnn(  # type: ignore[call-overload]\n        img1,\n        kp1_fixed,\n        img2,\n        kp2_fixed,\n        good,\n        None,\n        flags=cv2.DrawMatchesFlags_DRAW_RICH_KEYPOINTS,\n    )\n    return img_matched\n"
  },
  {
    "path": "perception/approximate_deduplication/index.py",
    "content": "import time\nimport typing\nimport warnings\n\nimport faiss\nimport numpy as np\nimport pandas as pd\nimport typing_extensions\n\nimport perception.hashers.tools as pht\n\n\nclass QueryInput(typing_extensions.TypedDict):\n    id: str\n    hash: str\n\n\nclass QueryMatch(typing_extensions.TypedDict):\n    id: typing.Any\n    matches: list[dict]\n\n\nclass TuningFailure(Exception):\n    pass\n\n\nclass QueryDecodingFailure(Exception):\n    pass\n\n\ndef build_query(table, ids, paramstyle, columns):\n    query = \"SELECT {} FROM {} WHERE id in {}\"\n    if paramstyle == \"pyformat\":\n        sql = query.format(\",\".join(columns), table, \"%(ids)s\")\n        params = {\"ids\": tuple(ids)}\n    elif paramstyle == \"qmark\":\n        params = ids\n        sql = query.format(\",\".join(columns), table, f\"({','.join('?' * len(ids))})\")\n    else:\n        raise NotImplementedError(\"Unsupported paramstyle.\")\n    return sql, params\n\n\ndef query_by_id(con, table, ids, paramstyle, extra_columns=None) -> pd.DataFrame:\n    \"\"\"Get data from the database.\n\n    Args:\n        con: A connection to the database\n        table: The table in which to look up hashes\n        ids: The list of IDs to pull\n        paramstyle: The paramstyle for the database\n        extra_columns: A list of additional (non-ID) columns to pull.\n    \"\"\"\n    columns = [\"id\"]\n    if extra_columns is not None:\n        columns += extra_columns\n    if isinstance(ids, np.ndarray):\n        # If it's a numpy array, coerce to a list.\n        ids = ids.tolist()\n    dfs = []\n    batch_size = 1000\n    for start in range(0, len(ids), batch_size):\n        sql, params = build_query(\n            table=table,\n            ids=ids[start : start + batch_size],\n            paramstyle=paramstyle,\n            columns=columns,\n        )\n        dfs.append(pd.read_sql(con=con, sql=sql, params=params))\n    return pd.concat(dfs, ignore_index=True).set_index(\"id\")\n\n\nclass ApproximateNearestNeighbors:\n    \"\"\"A wrapper for a FAISS index.\n\n    Args:\n        con: A database connection from which to obtain metadata for\n            matched hashes.\n        table: The table in the database that we should query for metadata.\n        paramstyle: The parameter style for the given database\n        index: A FAISS index (or filepath to a FAISS index)\n        hash_length: The length of the hash that is being matched against.\n        metadata_columns: The metadata that should be returned for queries.\n        dtype: The data type for the vectors\n        distance_metric: The distance metric for the vectors\n    \"\"\"\n\n    def __init__(\n        self,\n        con,\n        table,\n        paramstyle,\n        index,\n        hash_length,\n        metadata_columns=None,\n        dtype=\"uint8\",\n        distance_metric=\"euclidean\",\n    ):\n        assert (\n            dtype == \"uint8\"\n        ), \"Only unsigned 8-bit integer hashes are supported at this time.\"\n        assert (\n            distance_metric == \"euclidean\"\n        ), \"Only euclidean distance is supported at this time.\"\n        if isinstance(index, str):\n            index = faiss.read_index(index)\n        self.con = con\n        self.index = index\n        self.distance_metric = distance_metric\n        self.hash_length = hash_length\n        self.dtype = dtype\n        self.table = table\n        self.metadata_columns = metadata_columns\n        self.paramstyle = paramstyle\n        assert (\n            self.index.d == 
self.hash_length\n        ), \"Index is incompatible with hash length.\"\n\n    @classmethod\n    def from_database(\n        cls,\n        con,\n        table,\n        paramstyle,\n        hash_length,\n        ids_train=None,\n        train_size=None,\n        chunksize=100000,\n        metadata_columns=None,\n        index=None,\n        gpu=False,\n        dtype=\"uint8\",\n        distance_metric=\"euclidean\",\n    ):\n        \"\"\"Train and build a FAISS index from a database connection.\n\n        Args:\n            con: A database connection from which to obtain metadata for\n                matched hashes.\n            table: The table in the database that we should query for metadata.\n            paramstyle: The parameter style for the given database\n            hash_length: The length of the hash that is being matched against.\n            ids_train: The IDs for the vectors to train on.\n            train_size: The number of vectors to use for training. Will be\n                randomly selected from 1 to the number of vectors in the database.\n                Ignored if ids_train is not None.\n            chunksize: The chunks of data to draw from the database at a time\n                when adding vectors to the index.\n            metadata_columns: The metadata that should be returned for queries.\n            index: If a pretrained index is provided, training will be skipped,\n                any existing vectors will be discarded, and the index will be\n                repopulated with the current contents of the database.\n            gpu: If true, will attempt to carry out training on a GPU.\n            dtype: The data type for the vectors\n            distance_metric: The distance metric for the vectors\n        \"\"\"\n        assert (\n            dtype == \"uint8\"\n        ), \"Only unsigned 8-bit integer hashes are supported at this time.\"\n        assert (\n            distance_metric == \"euclidean\"\n        ), \"Only euclidean distance is supported at this time.\"\n        if index is None:\n            # Train the index using the practices from\n            # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#if-below-1m-vectors-ivfx\n            ntotal = pd.read_sql(\n                sql=f\"select count(*) as count from {table}\", con=con\n            ).iloc[0][\"count\"]\n            nlist = int(min(4 * np.sqrt(ntotal), ntotal / 39))\n            min_train_size = 39 * nlist\n            if ids_train is not None:\n                train_size = len(ids_train)\n            if train_size is None:\n                train_size = min_train_size\n            assert (\n                train_size <= ntotal\n            ), \"Cannot train on more hashes than are available.\"\n            assert (\n                train_size >= min_train_size\n            ), f\"Training an index used for {ntotal} hashes requires at least {min_train_size} training hashes.\"\n            if ids_train is None:\n                ids_train = np.random.choice(\n                    np.arange(ntotal), size=train_size, replace=False\n                )\n            df_train = query_by_id(\n                con=con,\n                table=table,\n                ids=ids_train,\n                paramstyle=paramstyle,\n                extra_columns=[\"hash\"],\n            )\n            x_train = np.array(\n                [np.frombuffer(h, dtype=dtype) for h in df_train[\"hash\"]]\n            ).astype(\"float32\")\n            assert x_train.shape[1] == hash_length, \"Hashes are of incorrect length.\"\n\n            index = faiss.IndexIVFFlat(\n                faiss.IndexFlatL2(hash_length), hash_length, nlist\n            )\n            if gpu:\n                res = faiss.StandardGpuResources()\n                gpu_index = faiss.index_cpu_to_gpu(res, 0, index)\n                gpu_index.train(x_train)\n                index = faiss.index_gpu_to_cpu(gpu_index)\n            else:\n                index.train(x_train)\n        else:\n            index.reset()\n\n        # Add hashes to the index in chunks.\n        for df_add in pd.read_sql(\n            sql=f\"SELECT id, hash FROM {table}\", con=con, chunksize=chunksize\n        ):\n            x_add = np.array(\n                [np.frombuffer(h, dtype=dtype) for h in df_add[\"hash\"]]\n            ).astype(\"float32\")\n            index.add_with_ids(x_add, df_add[\"id\"].values)\n        return cls(\n            con=con,\n            index=index,\n            hash_length=hash_length,\n            distance_metric=distance_metric,\n            dtype=dtype,\n            table=table,\n            paramstyle=paramstyle,\n            metadata_columns=metadata_columns,\n        )\n\n    def query_by_id(\n        self, ids, include_metadata=True, include_hashes=False\n    ) -> pd.DataFrame:\n        \"\"\"Get data from the database.\n\n        Args:\n            ids: The hash IDs to get from the database.\n            include_metadata: Whether to include metadata columns.\n            include_hashes: Whether to include the hashes.\n        \"\"\"\n        if not self.metadata_columns and include_metadata and not include_hashes:\n            # There won't be anything to return.\n            return pd.DataFrame()\n        extra_columns = []\n        if self.metadata_columns and include_metadata:\n            extra_columns += self.metadata_columns\n        if include_hashes:\n            extra_columns += [\"hash\"]\n        return query_by_id(\n            con=self.con,\n            table=self.table,\n            ids=ids,\n            paramstyle=self.paramstyle,\n            extra_columns=extra_columns,\n        )\n\n    def string_to_vector(self, s: str, hash_format=\"base64\") -> np.ndarray:\n        \"\"\"Convert a string to vector form.\n\n        Args:\n            s: The hash string\n            hash_format: The format for the hash string\n        \"\"\"\n        return pht.string_to_vector(\n            s, hash_format=hash_format, dtype=self.dtype, hash_length=self.hash_length\n        )\n\n    def vector_to_string(self, vector, hash_format=\"base64\") -> str | None:\n        \"\"\"Convert a vector back to a string.\n\n        Args:\n            vector: The hash vector\n            hash_format: The format for the hash\n        \"\"\"\n        return pht.vector_to_string(vector, dtype=self.dtype, hash_format=hash_format)\n\n    def search(\n        self,\n        queries: list[QueryInput],\n        threshold: int | None = None,\n        threshold_func: typing.Callable[[np.ndarray], np.ndarray] | None = None,\n        hash_format=\"base64\",\n        k=1,\n    ):\n        \"\"\"Search the index and return matches.\n\n        Args:\n            queries: A list of queries in the form of {\"id\": <id>, \"hash\": \"<hash_string>\"}\n            threshold: The threshold to use for matching. Takes precedence over threshold_func.\n            threshold_func: A function that, given a query vector, returns the desired match threshold for that query.\n            hash_format: The hash format used for the strings in the query.\n            k: The number of nearest neighbors to return.\n\n        Returns:\n            Matches in the form of a list of dicts of the form:\n            { \"id\": <query ID>, \"matches\": [{\"distance\": <distance>, \"id\": <match ID>, \"metadata\": {}}]}\n\n            The metadata consists of the contents of the metadata columns specified for this matching\n            instance.\n        \"\"\"\n        try:\n            xq = np.array(\n                [\n                    self.string_to_vector(h[\"hash\"], hash_format=hash_format)\n                    for h in queries\n                ]\n            ).astype(\"float32\")\n        except Exception as exc:\n            raise QueryDecodingFailure(\"Failed to parse hash query.\") from exc\n\n        # An explicit threshold takes precedence over threshold_func; with\n        # neither, all matches are accepted.\n        thresholds: np.ndarray = np.ones((len(xq), 1)) * np.inf\n        if threshold:\n            thresholds = np.ones((len(xq), 1)) * threshold\n        elif threshold_func:\n            thresholds = threshold_func(xq)\n        distances, indices = self.index.search(xq, k=k)\n        distances = np.sqrt(distances)\n        metadata = (\n            None\n            if not self.metadata_columns\n            else self.query_by_id(ids=np.unique(indices[distances < thresholds]))\n        )\n        matches: list[QueryMatch] = []\n        for match_distances, match_ids, q, q_threshold in zip(\n            distances, indices, queries, thresholds\n        ):\n            match_filter = match_distances < q_threshold\n            match_ids = match_ids[match_filter]\n            match_distances = match_distances[match_filter]\n            match: QueryMatch = {\"id\": q[\"id\"], \"matches\": []}\n            for match_id, distance in zip(match_ids, match_distances):\n                # Cast numpy types so the entry is JSON-serializable.\n                entry = {\"distance\": float(distance), \"id\": int(match_id)}\n                if metadata is not None:\n                    entry[\"metadata\"] = metadata.loc[match_id].to_dict()\n                match[\"matches\"].append(entry)\n            matches.append(match)\n        return matches\n\n    def tune(self, n_query=100, min_recall=99, max_noise=3):\n        \"\"\"Obtain the minimum value for nprobe that achieves a target level of recall.\n\n        Args:\n            n_query: The number of hashes to use as test hashes.\n            min_recall: The minimum desired recall for the index.\n            max_noise: The maximum amount of noise to add to each test hash.\n\n        Returns:\n            A tuple of recall, latency (in ms), and nprobe where the nprobe\n            value is the one that achieved the resulting recall.\n\n        Raises:\n            TuningFailure if no suitable nprobe value is found.\n        \"\"\"\n        assert (\n            n_query <= self.ntotal\n        ), \"Cannot use a test larger than ntotal (total number of hashes).\"\n\n        # Pick a random set of query hashes\n        ids = np.random.choice(\n            np.arange(1, self.ntotal + 1), size=n_query, replace=False\n        )\n        df = self.query_by_id(ids, include_metadata=False, include_hashes=True)\n        xq = np.array(\n            [np.frombuffer(v, dtype=self.dtype) for v in df[\"hash\"]], dtype=np.uint8\n        )\n\n        # Perturb each test hash while keeping values within the uint8 range;\n        # randint excludes the high endpoint, hence the + 1.\n        noise = np.random.randint(\n            low=(-xq.astype(\"int32\")).clip(-max_noise, max_noise),\n            high=(255 - xq.astype(\"int32\")).clip(-max_noise, max_noise) + 1,\n        )\n        xq = (xq.astype(\"int32\") + noise).astype(\"uint8\").astype(\"float32\")\n\n        if min_recall == 100:\n            warnings.warn(\n                \"100% recall can only be ensured with exhaustive search.\", UserWarning\n            )\n            self.set_nprobe(self.nlist)\n            start = time.time()\n            self.index.search(xq, k=1)\n            latency = time.time() - start\n            return (100, 1000 * latency, self.nlist)\n\n        # Make the search exhaustive so we get ground truth.\n        self.set_nprobe(self.nlist)\n        _, expected = self.index.search(xq, k=1)\n\n        for nprobe in range(1, self.nlist):\n            self.set_nprobe(nprobe)\n            start = time.time()\n            _, actual = self.index.search(xq, k=1)\n            latency = time.time() - start\n            recall = 100 * (actual[:, 0] == expected[:, 0]).sum() / xq.shape[0]\n            if recall >= min_recall:\n                break\n        else:\n            # If we never break, it means we never reached the target recall\n            # for this query.\n            raise TuningFailure(\n                \"Failed to find suitable parameters for selected recall.\"\n            )\n        return recall, 1000 * latency, nprobe\n\n    def save(self, filepath):\n        \"\"\"Save an index to disk.\n\n        Args:\n            filepath: Where to save the index.\n        \"\"\"\n        faiss.write_index(self.index, filepath)\n\n    def set_nprobe(self, nprobe) -> int:\n        \"\"\"Set the value of nprobe.\n\n        Args:\n            nprobe: The new value for nprobe\n        \"\"\"\n        faiss.ParameterSpace().set_index_parameter(self.index, \"nprobe\", nprobe)\n        return faiss.downcast_index(self.index).nprobe\n\n    @property\n    def nlist(self):\n        \"\"\"The number of lists in the index.\"\"\"\n        return faiss.downcast_index(self.index).nlist\n\n    @property\n    def nprobe(self):\n        \"\"\"The current value of nprobe.\"\"\"\n        return faiss.downcast_index(self.index).nprobe\n\n    @property\n    def ntotal(self):\n        \"\"\"The number of vectors in the index.\"\"\"\n        return self.index.ntotal\n"
  },
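  {
    "path": "perception/approximate_deduplication/index_example_sketch.py",
    "content": "\"\"\"Illustrative sketch only (hypothetical file, not part of the library):\nbuilding and querying an ApproximateNearestNeighbors index from a SQLite\ndatabase. Assumes a table named `hashes` with integer `id` and blob `hash`\ncolumns, already populated with enough uint8 hashes to satisfy the minimum\ntraining size enforced by from_database.\"\"\"\nimport sqlite3\n\nimport numpy as np\n\nimport perception.hashers.tools as pht\nfrom perception.approximate_deduplication.index import ApproximateNearestNeighbors\n\nHASH_LENGTH = 144  # Assumed length; must match the stored hashes.\n\ncon = sqlite3.connect(\"hashes.db\")  # Hypothetical database file.\nindex = ApproximateNearestNeighbors.from_database(\n    con=con,\n    table=\"hashes\",\n    paramstyle=\"qmark\",  # sqlite3 uses qmark-style parameters.\n    hash_length=HASH_LENGTH,\n)\n\n# Query with a base64-encoded hash; the threshold is a euclidean distance.\nquery_hash = pht.vector_to_string(\n    np.random.randint(0, 256, HASH_LENGTH).astype(\"uint8\"),\n    dtype=\"uint8\",\n    hash_format=\"base64\",\n)\nmatches = index.search(\n    queries=[{\"id\": \"example\", \"hash\": query_hash}], threshold=1200, k=5\n)\nprint(matches)\n\n# Optionally pick the smallest nprobe that reaches 99% recall and persist.\nrecall, latency_ms, nprobe = index.tune(n_query=100, min_recall=99)\nindex.save(\"index.faiss\")\n"
  },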
  {
    "path": "perception/approximate_deduplication/serve.py",
    "content": "import asyncio\nimport functools\nimport json\nimport logging\nimport typing\n\nimport aiohttp.web\nimport numpy as np\nfrom pythonjsonlogger import jsonlogger\n\nimport perception.hashers.tools as pht\n\nfrom .index import ApproximateNearestNeighbors\n\n\ndef is_similarity_valid(data, index: ApproximateNearestNeighbors):\n    \"\"\"Validates input to the similarity endpoint.\"\"\"\n    hash_format = data.get(\"hash_format\", \"base64\")\n    expected_string_length = pht.get_string_length(\n        hash_length=index.hash_length, dtype=index.dtype, hash_format=hash_format\n    )\n    return (\n        isinstance(data, dict)\n        and \"queries\" in data\n        and isinstance(data[\"queries\"], list)\n        and all(isinstance(x.get(\"hash\", None), str) for x in data[\"queries\"])\n        and hash_format in [\"hex\", \"base64\"]\n        and all(\n            len(x.get(\"hash\", None)) == expected_string_length for x in data[\"queries\"]\n        )\n    )\n\n\nasync def similarity(request):\n    \"\"\"Responds to a vector similarity query of the form:\n\n    ```\n    {\n        \"queries\": [{\"id\": str, \"hash\": \"base64_encoded_hash1\"}, ...],\n        \"k\": int,\n        \"threshold\": float,\n        \"hash_format\": \"base64\"\n    }\n    ```\n\n    with information about similar vectors in the index in the form:\n\n    ```\n    {\n      \"queries\": [{\"id\": str, \"matches\": [{\"metadata\": {json metadata}, \"distance\": float},...],...]\n    }\n    ```\n    \"\"\"\n    try:\n        request_data = await request.json()\n    except json.JSONDecodeError:\n        return aiohttp.web.json_response({\"reason\": \"Malformed JSON\"}, status=400)\n\n    index = request.app[\"index\"]\n    try:\n        assert is_similarity_valid(request_data, index)\n    except Exception:\n        return aiohttp.web.json_response({\"reason\": \"Invalid JSON request\"}, status=400)\n\n    async with request.app[\"query_semaphore\"]:\n        matches = await asyncio.get_event_loop().run_in_executor(\n            None,\n            functools.partial(\n                index.search,\n                queries=request_data[\"queries\"],\n                threshold=request_data.get(\n                    \"threshold\", request.app[\"default_threshold\"]\n                ),\n                threshold_func=request.app[\"default_threshold_func\"],\n                k=request_data.get(\"k\", request.app[\"default_k\"]),\n                hash_format=request_data.get(\"hash_format\", \"base64\"),\n            ),\n        )\n        matches = json.loads(json.dumps({\"queries\": matches}))\n\n    return aiohttp.web.json_response(matches)\n\n\ndef get_logger(name, log_level):\n    logger = logging.Logger(name=name, level=log_level)\n    handler = logging.StreamHandler()\n    handler.setFormatter(\n        jsonlogger.JsonFormatter(\n            \"%(asctime)s:%(levelname)s:%(name)s:%(message)s%(exc_info)\"\n        )\n    )\n    logger.addHandler(handler)\n    return logger\n\n\nasync def serve(\n    index: ApproximateNearestNeighbors,\n    default_threshold: int | None = None,\n    default_threshold_func: typing.Callable[[np.ndarray], np.ndarray] | None = None,\n    default_k: int = 1,\n    concurrency: int = 2,\n    log_level=logging.INFO,\n    host=\"localhost\",\n    port=8080,\n):\n    \"\"\"Serve an index as a web API. This function does not block.\n    If you wish to use the function in a blocking manner, you can\n    do something like\n\n    .. 
code-block:: python\n\n        loop = asyncio.get_event_loop()\n        loop.run_until_complete(serve(...))\n        loop.run_forever()\n\n    You can query the API with something like:\n\n    .. code-block:: bash\n\n        curl --header \"Content-Type: application/json\" \\\\\n             --request POST \\\\\n             --data '{\"queries\": [{\"hash\": \"<hash string>\", \"id\": \"bar\"}], \"threshold\": 1200}' \\\\\n             http://localhost:8080/v1/similarity\n\n    Args:\n        index: The underlying index\n        default_threshold: The default threshold for matches\n        default_k: The default number of nearest neighbors to look for\n        concurrency: The number of concurrent requests served\n        log_level: The log level to use for the logger\n        host: The host for the servoce\n        port: The port for the service\n    \"\"\"\n    logger = get_logger(name=\"serve\", log_level=log_level)\n    logger.info(\"Initializing web service\")\n    app = aiohttp.web.Application()\n    app.router.add_post(\"/v1/similarity\", similarity, name=\"similarity\")\n\n    # Store globals in the application object\n    app[\"default_threshold\"] = default_threshold\n    app[\"logger\"] = logger\n    app[\"default_k\"] = default_k\n    app[\"default_threshold_func\"] = default_threshold_func\n    app[\"index\"] = index\n    app[\"query_semaphore\"] = asyncio.Semaphore(concurrency)\n    logger.info(\"Entering web service listener loop.\")\n    runner = aiohttp.web.AppRunner(app, logger=logger)\n    await runner.setup()\n    site = aiohttp.web.TCPSite(runner, host, port)\n    await site.start()\n    return site\n"
  },
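  {
    "path": "perception/approximate_deduplication/serve_example_sketch.py",
    "content": "\"\"\"Illustrative sketch only (hypothetical file, not part of the library):\nrunning the similarity service in a blocking manner, following the pattern\nfrom the serve() docstring. Assumes `index` is an already-built\nApproximateNearestNeighbors instance (see index_example_sketch.py).\"\"\"\nimport asyncio\n\nfrom perception.approximate_deduplication.serve import serve\n\n\ndef run_blocking(index):\n    # serve() starts the site and returns; keep the loop alive ourselves.\n    loop = asyncio.get_event_loop()\n    loop.run_until_complete(\n        serve(index, default_threshold=1200, concurrency=2, port=8080)\n    )\n    loop.run_forever()\n\n\n# The service can then be queried with any HTTP client, e.g.:\n#   POST http://localhost:8080/v1/similarity\n#   {\"queries\": [{\"id\": \"q1\", \"hash\": \"<base64 hash>\"}], \"k\": 5}\n"
  },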
  {
    "path": "perception/benchmarking/__init__.py",
    "content": "from perception.benchmarking import video_transforms\nfrom perception.benchmarking import video\nfrom perception.benchmarking import image\nfrom perception.benchmarking.image import (\n    BenchmarkImageDataset,\n    BenchmarkImageTransforms,\n)\nfrom perception.benchmarking.video import (\n    BenchmarkVideoDataset,\n    BenchmarkVideoTransforms,\n)\nfrom perception.benchmarking.common import BenchmarkHashes\n\n__all__ = [\n    \"BenchmarkImageDataset\",\n    \"BenchmarkImageTransforms\",\n    \"BenchmarkVideoDataset\",\n    \"BenchmarkVideoTransforms\",\n    \"BenchmarkHashes\",\n    \"video_transforms\",\n    \"video\",\n    \"image\",\n]\n"
  },
  {
    "path": "perception/benchmarking/common.py",
    "content": "import itertools\nimport logging\nimport os\nimport shutil\nimport tempfile\nimport uuid\nimport warnings\nimport zipfile\nfrom abc import ABC\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport tqdm\nfrom scipy import spatial, stats\n\nfrom ..hashers.tools import compute_md5, string_to_vector\n\ntry:\n    from . import extensions  # type: ignore\nexcept ImportError:\n    warnings.warn(\n        \"C extensions were not built. Some metrics will be computed more slowly. \"\n        \"Please install from wheels or set up a compiler prior to installation \"\n        \"from source to use extensions.\"\n    )\n    extensions = None\n\nlog = logging.getLogger(__name__)\n\n\ndef create_mask(transformed_guids, noop_guids):\n    \"\"\"Given a list of transformed guids and noop guids,\n    computes an MxN array indicating whether noop n has the same guid\n    as transform m. Used for applying a mask to a distance matrix\n    for efficient computation of recall at different thresholds.\n\n    Args:\n        transformed_guids: An iterable of transformed guids\n        noop: An iterable of noop guids\n\n    Returns:\n        An boolean array of shape\n        `(len(transformed_guids), len(transformed_noops))`\n    \"\"\"\n    n_noops = len(noop_guids)\n    previous_guid = None\n    start = None\n    end = 0\n    mask = np.zeros((len(transformed_guids), len(noop_guids)), dtype=\"bool\")\n    for current_guid, row in zip(transformed_guids, mask):\n        if previous_guid is None or current_guid != previous_guid:\n            start = end\n            end = start + next(\n                (\n                    other_index\n                    for other_index, guid in enumerate(noop_guids[start:])\n                    if guid != current_guid\n                ),\n                n_noops,\n            )\n            previous_guid = current_guid\n        row[start:end] = True\n    return mask\n\n\ndef compute_threshold_precision_recall(pos, neg, precision_threshold=99.9):\n    # Sort both arrays according to the positive distance\n    neg = neg[pos.argsort()]\n    pos = pos[pos.argsort()]\n\n    # Compute false-positive rate for every value in pos\n    tp = np.arange(1, len(pos) + 1)\n    fp = np.array([(neg <= t).sum() for t in pos])\n    precision = 100 * tp / (tp + fp)\n\n    # Choose the optimal threshold\n    bad_threshold_idxs = np.where(precision < precision_threshold)[0]\n\n    if len(bad_threshold_idxs) > 0 and bad_threshold_idxs[0] > 0:\n        optimal_threshold = pos[bad_threshold_idxs[0] - 1]\n        recovered = (pos <= optimal_threshold).sum()\n        if recovered == 0:\n            optimal_precision = np.nan\n        else:\n            optimal_precision = precision[pos <= optimal_threshold].min()\n        optimal_recall = round(100 * recovered / len(pos), 3)\n    elif len(bad_threshold_idxs) > 0:\n        # The closest hash was a false positive.\n        optimal_threshold = pos[0]\n        optimal_recall = 0\n        optimal_precision = np.nan\n    else:\n        optimal_precision = 100\n        optimal_threshold = pos.max()\n        optimal_recall = 100\n    return optimal_threshold, optimal_precision, optimal_recall\n\n\nclass Filterable(ABC):\n    _df: pd.DataFrame\n    expected_columns: list\n\n    def __init__(self, df):\n        assert sorted(df.columns) == sorted(\n            self.expected_columns\n        ), f\"Column mismatch: Expected {sorted(self.expected_columns)}, found {sorted(df.columns)}.\"\n        self._df = df\n\n    
@property\n    def categories(self):\n        \"\"\"The categories included in the dataset\"\"\"\n        return self._df[\"category\"].unique()\n\n    def filter(self, **kwargs):\n        \"\"\"Obtain a new dataset filtered with the given\n        keyword arguments.\"\"\"\n        df = self._df.copy()\n        for field, included in kwargs.items():\n            existing = self._df[field].unique()\n            if not all(inc in existing for inc in included):\n                missing = \", \".join(\n                    [str(inc) for inc in included if inc not in existing]\n                )\n                message = f\"Did not find {missing} in column {field} dataset.\"\n                warnings.warn(message, UserWarning)\n            df = df[df[field].isin(included)]\n        return self.__class__(df.copy())\n\n\nclass Saveable(Filterable):\n    @classmethod\n    def load(\n        cls,\n        path_to_zip_or_directory: str,\n        storage_dir: str | None = None,\n        verify_md5=True,\n    ):\n        \"\"\"Load a dataset from a ZIP file or directory.\n\n        Args:\n            path_to_zip_or_directory: Pretty self-explanatory\n            storage_dir: If providing a ZIP file, where to extract\n                the contents. If None, contents will be extracted to\n                a folder with the same name as the ZIP file in the\n                same directory as the ZIP file.\n            verify_md5: Verify md5s when loading\n        \"\"\"\n\n        # Load index whether from inside ZIP file or from directory.\n        if os.path.splitext(path_to_zip_or_directory)[1] == \".zip\":\n            if storage_dir is None:\n                storage_dir = os.path.join(\n                    os.path.dirname(os.path.abspath(path_to_zip_or_directory)),\n                    os.path.splitext(os.path.basename(path_to_zip_or_directory))[0],\n                )\n                os.makedirs(storage_dir, exist_ok=True)\n            with zipfile.ZipFile(path_to_zip_or_directory, \"r\") as z:\n                # Try extracting only the index at first so we can\n                # compare md5.\n                z.extract(\"index.csv\", os.path.join(storage_dir))\n                index: pd.DataFrame = pd.read_csv(\n                    os.path.join(storage_dir, \"index.csv\")\n                )\n                index[\"filepath\"] = index[\"filename\"].apply(\n                    lambda fn: (\n                        os.path.join(storage_dir, fn) if not pd.isnull(fn) else None\n                    )\n                )\n                do_zip_extraction = True\n                if index[\"filepath\"].apply(os.path.isfile).all():\n                    if verify_md5:\n                        do_zip_extraction = not all(\n                            row[\"md5\"] == compute_md5(row[\"filepath\"])\n                            for _, row in tqdm.tqdm(\n                                index.iterrows(), desc=\"Checking cache\"\n                            )\n                        )\n                    else:\n                        do_zip_extraction = False\n                if do_zip_extraction:\n                    z.extractall(storage_dir)\n                else:\n                    log.info(\"Found all files already extracted. 
Skipping extraction.\")\n                    verify_md5 = False\n        else:\n            assert (\n                storage_dir is None\n            ), \"Storage directory only valid if path is to ZIP file.\"\n            index = pd.read_csv(os.path.join(path_to_zip_or_directory, \"index.csv\"))\n            index[\"filepath\"] = index[\"filename\"].apply(\n                lambda fn: (\n                    os.path.join(path_to_zip_or_directory, fn)\n                    if not pd.isnull(fn)\n                    else None\n                )\n            )\n\n        if verify_md5:\n            assert all(\n                row[\"md5\"] == compute_md5(row[\"filepath\"])\n                for _, row in tqdm.tqdm(\n                    index.iterrows(),\n                    desc=\"Performing final md5 integrity check.\",\n                    total=len(index.index),\n                )\n            ), \"An md5 mismatch has occurred.\"\n        return cls(index.drop([\"filename\", \"md5\"], axis=1))\n\n    def save(self, path_to_zip_or_directory):\n        \"\"\"Save a dataset to a directory or ZIP file.\n\n        Args:\n            path_to_zip_or_directory: Pretty self-explanatory\n        \"\"\"\n        df = self._df\n        assert \"filepath\" in df.columns, \"Index dataframe must contain filepath.\"\n\n        # Build index using filename instead of filepath.\n        index = df.copy()\n        index[\"filename\"] = df[\"filepath\"].apply(\n            lambda filepath: (\n                os.path.basename(filepath) if not pd.isnull(filepath) else None\n            )\n        )\n        if index[\"filename\"].dropna().duplicated().sum() > 0:\n            warnings.warn(\"Changing filenames to UUID due to duplicates.\", UserWarning)\n\n            index[\"filename\"] = [\n                (\n                    str(uuid.uuid4()) + os.path.splitext(row[\"filename\"])[1]\n                    if not pd.isnull(row[\"filename\"])\n                    else None\n                )\n                for _, row in index.iterrows()\n            ]\n        index[\"md5\"] = [\n            compute_md5(filepath) if not pd.isnull(filepath) else None\n            for filepath in tqdm.tqdm(index[\"filepath\"], desc=\"Computing md5s.\")\n        ]\n\n        # Add all files as well as the dataframe index to\n        # a ZIP file if path is to ZIP file or to the directory if it is\n        # not a ZIP file.\n        if os.path.splitext(path_to_zip_or_directory)[1] == \".zip\":\n            with zipfile.ZipFile(path_to_zip_or_directory, \"w\") as f:\n                with tempfile.TemporaryFile(mode=\"w+\") as index_file:\n                    index.drop(\"filepath\", axis=1).to_csv(index_file, index=False)\n                    index_file.seek(0)\n                    f.writestr(\"index.csv\", index_file.read())\n                for _, row in tqdm.tqdm(\n                    index.iterrows(), desc=\"Saving files\", total=len(df)\n                ):\n                    if pd.isnull(row[\"filepath\"]):\n                        #  There was an error associated with this file.\n                        continue\n                    f.write(row[\"filepath\"], row[\"filename\"])\n        else:\n            os.makedirs(path_to_zip_or_directory, exist_ok=True)\n            index.drop(\"filepath\", axis=1).to_csv(\n                os.path.join(path_to_zip_or_directory, \"index.csv\"), index=False\n            )\n            for _, row in tqdm.tqdm(\n                index.iterrows(), desc=\"Saving files\", total=len(df)\n          
  ):\n                if pd.isnull(row[\"filepath\"]):\n                    # There was an error associated with this file.\n                    continue\n                if row[\"filepath\"] == os.path.join(\n                    path_to_zip_or_directory, row[\"filename\"]\n                ):\n                    # The source file is the same as the target file.\n                    continue\n                shutil.copy(\n                    row[\"filepath\"],\n                    os.path.join(path_to_zip_or_directory, row[\"filename\"]),\n                )\n\n\nclass BenchmarkHashes(Filterable):\n    \"\"\"A dataset of hashes for transformed images. It is essentially\n    a wrapper around a `pandas.DataFrame` with the following columns:\n\n    - guid\n    - error\n    - filepath\n    - category\n    - transform_name\n    - hasher_name\n    - hasher_dtype\n    - hasher_distance_metric\n    - hasher_hash_length\n    - hash\n    \"\"\"\n\n    expected_columns = [\n        \"error\",\n        \"filepath\",\n        \"hash\",\n        \"hasher_name\",\n        \"hasher_dtype\",\n        \"hasher_distance_metric\",\n        \"category\",\n        \"guid\",\n        \"input_filepath\",\n        \"transform_name\",\n        \"hasher_hash_length\",\n    ]\n\n    def __init__(self, df: pd.DataFrame):\n        super().__init__(df)\n        self._metrics: pd.DataFrame | None = None\n\n    def __add__(self, other):\n        return BenchmarkHashes(df=pd.concat([self._df, other._df]).drop_duplicates())\n\n    def __radd__(self, other):\n        return self.__add__(other)\n\n    @classmethod\n    def load(cls, filepath: str):\n        return cls(pd.read_csv(filepath))\n\n    def save(self, filepath):\n        self._df.to_csv(filepath, index=False)\n\n    def compute_metrics(\n        self, custom_distance_metrics: dict | None = None\n    ) -> pd.DataFrame:\n        if self._metrics is not None:\n            return self._metrics\n        metrics = []\n        hashsets = self._df.sort_values(\"guid\")\n        n_dropped = hashsets[\"hash\"].isnull().sum()\n        if n_dropped > 0:\n            hashsets = hashsets.dropna(subset=[\"hash\"])\n            warnings.warn(f\"Dropping {n_dropped} invalid / empty hashes.\", UserWarning)\n        for (hasher_name, transform_name, category), hashset in tqdm.tqdm(\n            hashsets.groupby([\"hasher_name\", \"transform_name\", \"category\"]),\n            desc=\"Computing metrics.\",\n        ):\n            # Note the guid filtering below. We need to include only guids\n            # for which we have the transform *and* the guid. 
One of them\n            # may have been dropped due to being invalid.\n            noops = hashsets[\n                (hashsets[\"transform_name\"] == \"noop\")\n                & (hashsets[\"hasher_name\"] == hasher_name)\n                & (hashsets[\"guid\"].isin(hashset[\"guid\"]))\n            ]\n            valid_hashset = hashset[hashset[\"guid\"].isin(noops[\"guid\"])]\n            dtype, distance_metric, hash_length = valid_hashset.iloc[0][\n                [\"hasher_dtype\", \"hasher_distance_metric\", \"hasher_hash_length\"]\n            ]\n            n_noops = len(noops.guid)\n            n_hashset = len(valid_hashset.guid)\n            noop_guids = noops.guid.values\n            mask = create_mask(valid_hashset.guid.values, noops.guid.values)\n            if distance_metric != \"custom\":\n                X_trans = np.array(\n                    valid_hashset.hash.apply(\n                        string_to_vector,  # type: ignore[arg-type]\n                        hash_length=int(hash_length),\n                        dtype=dtype,\n                        hash_format=\"base64\",\n                    ).tolist()\n                )\n                X_noop = np.array(\n                    noops.hash.apply(\n                        string_to_vector,  # type: ignore[arg-type]\n                        dtype=dtype,\n                        hash_format=\"base64\",\n                        hash_length=int(hash_length),\n                    ).tolist()\n                )\n                if (\n                    distance_metric != \"euclidean\"\n                    or \"int\" not in dtype\n                    or extensions is None\n                ):\n                    distance_matrix = spatial.distance.cdist(\n                        XA=X_trans, XB=X_noop, metric=distance_metric\n                    )\n                    distance_to_closest_image = distance_matrix.min(axis=1)\n                    distance_to_correct_image = np.ma.masked_array(\n                        distance_matrix, np.logical_not(mask)\n                    ).min(axis=1)\n                    distance_matrix_incorrect_image: np.ndarray = np.ma.masked_array(\n                        distance_matrix, mask\n                    )\n                    distance_to_incorrect_image = distance_matrix_incorrect_image.min(\n                        axis=1\n                    )\n                    closest_incorrect_guid = noop_guids[\n                        distance_matrix_incorrect_image.argmin(axis=1)\n                    ]\n                else:\n                    distances, indexes = extensions.compute_euclidean_metrics(\n                        X_noop.astype(\"int32\"), X_trans.astype(\"int32\"), mask\n                    )\n                    distance_to_correct_image = distances[:, 1]\n                    distance_to_incorrect_image = distances[:, 0]\n                    distance_to_closest_image = distances.min(axis=1)\n                    closest_incorrect_guid = [noop_guids[idx] for idx in indexes[:, 0]]\n            else:\n                assert (\n                    custom_distance_metrics is not None\n                    and hasher_name in custom_distance_metrics\n                ), f\"You must provide a custom distance metric for {hasher_name}.\"\n                noops_hash_values = noops.hash.values\n                hashset_hash_values = valid_hashset.hash.values\n                distance_matrix = np.zeros((n_hashset, n_noops))\n                distance_function = custom_distance_metrics[hasher_name]\n     
           for i1, i2 in itertools.product(range(n_hashset), range(n_noops)):\n                    distance_matrix[i1, i2] = distance_function(\n                        hashset_hash_values[i1], noops_hash_values[i2]\n                    )\n                distance_to_closest_image = distance_matrix.min(axis=1)\n                distance_to_correct_image = np.ma.masked_array(\n                    distance_matrix, np.logical_not(mask)\n                ).min(axis=1)\n                distance_matrix_incorrect_image = np.ma.masked_array(\n                    distance_matrix, mask\n                )\n                distance_to_incorrect_image = distance_matrix_incorrect_image.min(\n                    axis=1\n                )\n                closest_incorrect_guid = noop_guids[\n                    distance_matrix_incorrect_image.argmin(axis=1)\n                ]\n\n            metrics.append(\n                pd.DataFrame(\n                    {\n                        \"guid\": valid_hashset[\"guid\"].values,\n                        \"transform_name\": transform_name,\n                        \"hasher_name\": hasher_name,\n                        \"category\": category,\n                        \"distance_to_closest_correct_image\": distance_to_correct_image,\n                        \"distance_to_closest_incorrect_image\": distance_to_incorrect_image,\n                        \"distance_to_closest_image\": distance_to_closest_image,\n                        \"closest_incorrect_guid\": closest_incorrect_guid,\n                    }\n                )\n            )\n        metrics_df = pd.concat(metrics)\n        self._metrics = metrics_df\n        return metrics_df\n\n    def show_histograms(self, grouping=None, precision_threshold=99.9, **kwargs):\n        \"\"\"Plot histograms for true and false positives, similar\n        to https://tech.okcupid.com/evaluating-perceptual-image-hashes-okcupid/\n        Additional arguments passed to compute_metrics.\n\n        Args:\n            grouping: List of fields to group by. 
By default, all fields are used\n                (category, and transform_name).\n        \"\"\"\n        if grouping is None:\n            grouping = [\"category\", \"transform_name\"]\n\n        metrics = self.compute_metrics(**kwargs)\n\n        hasher_names = metrics[\"hasher_name\"].unique().tolist()\n        bounds = (\n            metrics.groupby(\"hasher_name\")[\n                [\"distance_to_closest_image\", \"distance_to_closest_incorrect_image\"]\n            ]\n            .max()\n            .max(axis=1)\n        )\n        if grouping:\n            group_names = [\n                \":\".join(map(str, row.values))\n                for idx, row in metrics[grouping].drop_duplicates().iterrows()\n            ]\n        else:\n            group_names = [\"\"]\n        ncols = len(hasher_names)\n        nrows = len(group_names)\n\n        fig, axs = plt.subplots(\n            ncols=ncols, nrows=nrows, figsize=(ncols * 4, nrows * 3), sharey=True\n        )\n\n        for group_name, subset in metrics.groupby([\"hasher_name\"] + grouping):\n            # Get names of group and hasher\n            if grouping:\n                hasher_name = group_name[0]\n                group_name = \":\".join(map(str, group_name[1:]))\n            else:\n                hasher_name = group_name\n                group_name = \"\"\n\n            # Get the correct axis.\n            colIdx = hasher_names.index(hasher_name)\n            rowIdx = group_names.index(group_name)\n            if ncols > 1 and nrows > 1:\n                ax = axs[rowIdx, colIdx]\n            elif ncols == 1 and nrows == 1:\n                ax = axs\n            else:\n                ax = axs[rowIdx if nrows > 1 else colIdx]\n\n            # Plot the charts\n            inner_keys = [\"guid\"] + (\n                [\"transform_name\"] if \"transform_name\" in subset.columns else []\n            )\n            pos, neg = (\n                subset.groupby(inner_keys)[\n                    [\n                        \"distance_to_closest_correct_image\",\n                        \"distance_to_closest_incorrect_image\",\n                    ]\n                ]\n                .min()\n                .values.T\n            )\n            optimal_threshold, _, optimal_recall = compute_threshold_precision_recall(\n                pos=pos, neg=neg, precision_threshold=precision_threshold\n            )\n            optimal_threshold = optimal_threshold.round(3)\n            emd = stats.wasserstein_distance(pos, neg).round(2)\n            ax.hist(neg, label=\"neg\", bins=10)\n            ax.hist(pos, label=\"pos\", bins=10)\n            ax.text(\n                0.5,\n                0.5,\n                f\"Recall: {optimal_recall:.0f}% @ {optimal_threshold}\\nemd: {emd:.2f}\",\n                horizontalalignment=\"center\",\n                color=\"black\",\n                verticalalignment=\"center\",\n                transform=ax.transAxes,\n                fontsize=12,\n                fontweight=1000,\n            )\n            ax.set_xlim(-0.05 * bounds[hasher_name], bounds[hasher_name])\n            if rowIdx == 0:\n                ax.set_title(hasher_name)\n                ax.legend()\n            if colIdx == 0:\n                ax.set_ylabel(group_name)\n        fig.tight_layout()\n\n    def compute_threshold_recall(\n        self, precision_threshold=99.9, grouping=None, **kwargs\n    ) -> pd.DataFrame:\n        \"\"\"Compute a table for threshold and recall for each category, hasher,\n        and transformation 
combinations. Additional arguments passed to compute_metrics.\n\n        Args:\n            precision_threshold: The precision threshold to use\n                for choosing a distance threshold for each hasher.\n            grouping: List of fields to group by. By default, all fields are used\n                (category, and transform_name).\n\n        Returns:\n            A pandas DataFrame with 7 columns. The key columns are threshold\n            (The optimal distance threshold for detecting a match for this\n            combination), recall (the number of correct matches divided by\n            the number of possible matches), and precision (the number correct\n            matches divided by the total number of matches whether correct\n            or incorrect).\n        \"\"\"\n        if grouping is None:\n            grouping = [\"category\", \"transform_name\"]\n\n        def group_func(subset):\n            inner_keys = [\"guid\"] + (\n                [\"transform_name\"] if \"transform_name\" in subset.columns else []\n            )\n            pos, neg = (\n                subset.groupby(inner_keys)[\n                    [\n                        \"distance_to_closest_correct_image\",\n                        \"distance_to_closest_incorrect_image\",\n                    ]\n                ]\n                .min()\n                .values.T\n            )\n\n            (\n                optimal_threshold,\n                optimal_precision,\n                optimal_recall,\n            ) = compute_threshold_precision_recall(\n                pos=pos, neg=neg, precision_threshold=precision_threshold\n            )\n            return pd.Series(\n                {\n                    \"threshold\": optimal_threshold,\n                    \"recall\": optimal_recall,\n                    \"precision\": optimal_precision,\n                    \"n_exemplars\": len(subset),\n                }\n            )\n\n        return (\n            self.compute_metrics(**kwargs)\n            .groupby(grouping + [\"hasher_name\"])\n            .apply(group_func)\n        )\n\n\nclass BenchmarkDataset(Saveable):\n    \"\"\"A dataset of images separated into\n    categories. It is essentially a wrapper around a pandas\n    dataframe with the following columns:\n\n    - filepath\n    - category\n    \"\"\"\n\n    expected_columns = [\"filepath\", \"category\"]\n\n    @classmethod\n    def from_tuples(cls, files: list[tuple[str, str]]):\n        \"\"\"Build dataset from a set of files.\n\n        Args:\n            files: A list of tuples where each entry is a pair\n                filepath and category.\n        \"\"\"\n        df = pd.DataFrame.from_records(\n            [{\"filepath\": f, \"category\": c} for f, c in files]\n        )\n        return cls(df)\n\n    def transform(self, transforms, storage_dir, errors):\n        raise NotImplementedError()\n\n\nclass BenchmarkTransforms(Saveable):\n    \"\"\"A dataset of transformed images. Essentially wraps a DataFrame with the\n    following columns:\n\n    - guid\n    - filepath\n    - category\n    - transform_name\n    - input_filepath (for memo purposes only)\n    \"\"\"\n\n    expected_columns = [\n        \"filepath\",\n        \"category\",\n        \"transform_name\",\n        \"input_filepath\",\n        \"guid\",\n    ]\n\n    def compute_hashes(self, hashers, max_workers):\n        raise NotImplementedError()\n"
  },
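  {
    "path": "perception/benchmarking/common_example_sketch.py",
    "content": "\"\"\"Illustrative sketch only (hypothetical file, not part of the library):\nhow compute_threshold_precision_recall selects a distance threshold from\npositive (correct-match) and negative (incorrect-match) distances. The\ndistances below are synthetic.\"\"\"\nimport numpy as np\n\nfrom perception.benchmarking.common import compute_threshold_precision_recall\n\n# Distance from each transformed image to its correct source (pos) and to\n# the closest incorrect source (neg); smaller means more similar.\nrng = np.random.default_rng(0)\npos = rng.normal(loc=5, scale=2, size=1000).clip(min=0)\nneg = rng.normal(loc=50, scale=10, size=1000).clip(min=0)\n\nthreshold, precision, recall = compute_threshold_precision_recall(\n    pos=pos, neg=neg, precision_threshold=99.9\n)\nprint(f\"match if distance <= {threshold:.2f} \"\n      f\"(precision={precision}, recall={recall})\")\n"
  },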
  {
    "path": "perception/benchmarking/extensions.pyx",
    "content": "# cython: language_level=3\n\nimport cython\nimport numpy as np\nfrom cython.parallel import parallel, prange\n\ncimport numpy as np\nfrom libc.math cimport sqrt\nfrom libc.stdlib cimport abort, free, malloc\n\n\ncdef extern from \"limits.h\":\n    int INT_MAX\n\nctypedef np.uint8_t uint8\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ndef compute_euclidean_metrics(int[:, :] X_noop, int[:, :] X_tran, uint8[:, :] mask):\n    \"\"\"Compute the positive / negative distance metrics between two sets of vectors\n    using euclidean distance. This function obtains the necessary metrics roughly\n    10x faster than using scipy.spatial.distance.cdist and numpy functions.\n\n    Args:\n        X_noop: The vectors for the noop hashes with shape (N, K)\n        X_tran: The vectors for the transformed instances with shape (M, K)\n        mask: A (M, N) array indicating whether noop n corresponds to transform m\n\n    Returns:\n        distances: An M by 2 array with the closest false positive and closest\n            true positive for each transform.\n        indexes: An M by 2 array with the index for the closest false positive\n            noop and the closest true positive noop.\n    \"\"\"\n\n    cdef Py_ssize_t n_noop = X_noop.shape[0]\n    cdef Py_ssize_t d_noop = X_noop.shape[1]\n    cdef Py_ssize_t n_tran = X_tran.shape[0]\n    cdef Py_ssize_t d_tran = X_tran.shape[1]\n    cdef Py_ssize_t n_mask_tran = mask.shape[0]\n    cdef Py_ssize_t n_mask_noop = mask.shape[1]\n    cdef Py_ssize_t i_mask_tran\n    cdef Py_ssize_t i_mask_noop\n    cdef int n_pos\n\n    cdef int current_distance\n    cdef int current_closest_fp\n    cdef int current_closest_tp\n    cdef int[:] x\n    cdef int[:] y\n    cdef uint8 is_pos\n    cdef Py_ssize_t i_noop, i_tran, i_d\n    cdef Py_ssize_t i_closest_fp = 0\n    cdef Py_ssize_t i_closest_tp = 1\n    cdef Py_ssize_t i_closest_fp_idx = 0\n    cdef Py_ssize_t i_closest_tp_idx = 1\n    cdef int * local_buf\n    cdef size_t size = 5\n    cdef float NAN\n    NAN = float(\"NaN\")\n\n    assert d_noop == d_tran, \"Dimensionality of vectors must match.\"\n    assert n_mask_tran == n_tran, \"Dimension 0 of mask must correspond to n_transforms.\"\n    assert n_mask_noop == n_noop, \"Dimension 1 of mask must correspond to n_noops.\"\n    for i_mask_tran in range(n_mask_tran):\n        n_pos = 0\n        for i_mask_noop in range(n_mask_noop):\n            if mask[i_mask_tran, i_mask_noop] == True:\n                n_pos += 1\n        assert n_pos > 0, \"All transforms must have at least one positive noop.\"\n        assert n_pos < n_mask_noop, \"All transforms must have at least one negative noop.\"\n\n    distances = np.zeros((n_tran, 2), dtype=np.float32)\n    indexes = np.zeros((n_tran, 2), dtype=np.int32)\n\n    cdef np.float32_t[:, :] distances_view = distances\n    cdef int[:, :] indexes_view = indexes\n\n    with nogil, parallel():\n        local_buf = <int *> malloc(sizeof(int) * size)\n        if local_buf is NULL:\n            abort()\n        for i_tran in prange(n_tran):\n            local_buf[1] = INT_MAX  # Smallest false positive distance\n            local_buf[2] = INT_MAX  # Smallest true positive distance\n            local_buf[3] = 0        # Smallest false positive index\n            local_buf[4] = 0        # Smallest true positive index\n            for i_noop in range(n_noop):\n                local_buf[0] = 0    # Current distance\n                is_pos = mask[i_tran, i_noop] == True\n                for i_d in range(d_noop):\n      
              local_buf[0] += (X_noop[i_noop, i_d] - X_tran[i_tran, i_d]) ** 2\n                if is_pos and (local_buf[0] < local_buf[2]):\n                    local_buf[2] = local_buf[0]\n                    local_buf[4] = i_noop\n                if not is_pos and (local_buf[0] < local_buf[1]):\n                    local_buf[1] = local_buf[0]\n                    local_buf[3] = i_noop\n            # I do not think that an <int *> can ever actually be\n            # greater than INT_MAX but we'll leave the check in.\n            if local_buf[1] < INT_MAX:\n                distances_view[i_tran, i_closest_fp] = sqrt(local_buf[1])\n            else:\n                distances_view[i_tran, i_closest_fp] = NAN\n            if local_buf[2] < INT_MAX:\n                distances_view[i_tran, i_closest_tp] = sqrt(local_buf[2])\n            else:\n                distances_view[i_tran, i_closest_tp] = NAN\n            indexes_view[i_tran, i_closest_fp_idx] = local_buf[3]\n            indexes_view[i_tran, i_closest_tp_idx] = local_buf[4]\n        free(local_buf)\n    return distances, indexes\n"
  },
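  {
    "path": "perception/benchmarking/extensions_example_sketch.py",
    "content": "\"\"\"Illustrative sketch only (hypothetical file, not part of the library):\nchecking extensions.compute_euclidean_metrics against a scipy baseline on\nrandom data. Requires the compiled Cython extension; shapes are arbitrary.\"\"\"\nimport numpy as np\nfrom scipy import spatial\n\nfrom perception.benchmarking import extensions\n\nrng = np.random.default_rng(0)\nX_noop = rng.integers(0, 256, size=(10, 64)).astype(\"int32\")\nX_tran = rng.integers(0, 256, size=(30, 64)).astype(\"int32\")\n\n# Mark noop (i % 10) as the single correct match for transform i, so every\n# transform has at least one positive and one negative noop.\nmask = np.zeros((30, 10), dtype=\"uint8\")\nmask[np.arange(30), np.arange(30) % 10] = 1\n\ndistances, indexes = extensions.compute_euclidean_metrics(X_noop, X_tran, mask)\nfull = spatial.distance.cdist(X_tran, X_noop)\n# Column 1 holds the closest true-positive distance, column 0 the closest\n# false-positive distance.\nassert np.allclose(distances[:, 1], np.where(mask, full, np.inf).min(axis=1))\nassert np.allclose(distances[:, 0], np.where(mask, np.inf, full).min(axis=1))\n"
  },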
  {
    "path": "perception/benchmarking/image.py",
    "content": "import logging\nimport os\nimport uuid\nimport warnings\n\nimport cv2\nimport albumentations\nimport pandas as pd\nfrom tqdm import tqdm\n\nfrom ..hashers import tools\nfrom ..hashers.hasher import ImageHasher\nfrom ..tools import deduplicate, flatten\nfrom .common import BenchmarkDataset, BenchmarkHashes, BenchmarkTransforms\n\nlog = logging.getLogger(__name__)\n\n\nclass BenchmarkImageTransforms(BenchmarkTransforms):\n    def compute_hashes(\n        self, hashers: dict[str, ImageHasher], max_workers: int = 5\n    ) -> BenchmarkHashes:\n        \"\"\"Compute hashes for a series of files given some set of hashers.\n\n        Args:\n            hashers: A dictionary of hashers.\n            max_workers: Maximum number of workers for parallel hash\n                computation.\n\n        Returns:\n            metrics: A BenchmarkHashes object.\n        \"\"\"\n        hashsets = []\n        filepaths = self._df[\"filepath\"]\n        for hasher_name, hasher in hashers.items():\n            hash_dicts = hasher.compute_parallel(\n                filepaths,\n                progress=tqdm,\n                progress_desc=f\"Computing hashes for {hasher_name}\",\n                max_workers=max_workers,\n            )\n            if not hasher.returns_multiple:\n                hashes_df = pd.DataFrame.from_records(hash_dicts)\n            else:\n                hash_groups = [\n                    hash_dict[\"hash\"] if hash_dict[\"error\"] is None else [None]\n                    for hash_dict in hash_dicts\n                ]\n                hash_group_sizes = [len(hash_group) for hash_group in hash_groups]\n                current_hashes = flatten(hash_groups)\n                current_filepaths = flatten(\n                    [\n                        [hash_dict[\"filepath\"]] * hash_group_size\n                        for hash_dict, hash_group_size in zip(\n                            hash_dicts, hash_group_sizes\n                        )\n                    ]\n                )\n                current_errors = flatten(\n                    [\n                        [hash_dict[\"error\"]] * hash_group_size\n                        for hash_dict, hash_group_size in zip(\n                            hash_dicts, hash_group_sizes\n                        )\n                    ]\n                )\n                hashes_df = pd.DataFrame(\n                    {\n                        \"error\": current_errors,\n                        \"filepath\": current_filepaths,\n                        \"hash\": current_hashes,\n                    }\n                )\n            hashset = hashes_df.assign(\n                hasher_name=hasher_name,\n                hasher_hash_length=hasher.hash_length,\n                hasher_dtype=hasher.dtype,\n                hasher_distance_metric=hasher.distance_metric,\n            )\n            hashset = hashset.merge(self._df, on=\"filepath\")\n            hashsets.append(hashset)\n        return BenchmarkHashes(pd.concat(hashsets, sort=True))\n\n\nclass BenchmarkImageDataset(BenchmarkDataset):\n    def deduplicate(\n        self, hasher: ImageHasher, threshold=0.001, isometric=False\n    ) -> tuple[\"BenchmarkImageDataset\", set[tuple[str, str]]]:\n        \"\"\"Remove duplicate files from dataset.\n\n        Args:\n            files: A list of file paths\n            hasher: A hasher to use for finding a duplicate\n            threshold: The threshold required for a match\n            isometric: Whether to compute the rotated versions 
of the images\n\n        Returns:\n            A list where each entry is a list of files that are\n            duplicates of each other. We keep only the last entry.\n        \"\"\"\n        pairs: set[tuple[str, str]] = set()\n        for _, group in tqdm(\n            self._df.groupby([\"category\"]), desc=\"Deduplicating categories.\"\n        ):\n            pairs = pairs.union(\n                set(\n                    deduplicate(\n                        files=group[\"filepath\"].tolist(),\n                        hashers=[(hasher, threshold)],\n                        isometric=isometric,\n                    )\n                )\n            )\n        removed = [pair[0] for pair in pairs]\n        return (\n            BenchmarkImageDataset(self._df[~self._df[\"filepath\"].isin(removed)].copy()),\n            pairs,\n        )\n\n    def transform(\n        self,\n        transforms: dict[str, albumentations.BasicTransform],\n        storage_dir: str,\n        errors: str = \"raise\",\n    ) -> BenchmarkImageTransforms:\n        \"\"\"Prepare files to be used as part of benchmarking run.\n\n        Args:\n            transforms: A dictionary of transformations. The only required\n                key is `noop` which determines how the original, untransformed\n                image is saved. For a true copy, simply make the `noop` key\n                `albumentations.NoOp`\n            storage_dir: A directory to store all the images along with\n                their transformed counterparts.\n            errors: How to handle errors reading files. If \"raise\", exceptions are\n                raised. If \"warn\", the error is printed as a warning.\n\n        Returns:\n            transforms: A BenchmarkImageTransforms object\n        \"\"\"\n        assert (\n            \"noop\" in transforms\n        ), \"You must provide a no-op transform such as `lambda img: img`.\"\n\n        os.makedirs(storage_dir, exist_ok=True)\n\n        files = self._df.copy()\n        files[\"guid\"] = [str(uuid.uuid4()) for n in range(len(files))]\n\n        def apply_transform(files, transform_name):\n            transform = transforms[transform_name]\n            transformed_arr = []\n            for _, row in tqdm(\n                files.iterrows(),\n                desc=f\"Creating files for {transform_name}\",\n                total=len(files),\n            ):\n                filepath, guid, category = row[[\"filepath\", \"guid\", \"category\"]]\n                try:\n                    image = tools.read(filepath)\n                except Exception as exception:\n                    message = f\"An error occurred reading {filepath}.\"\n                    if errors == \"raise\":\n                        raise exception\n                    warnings.warn(message, UserWarning)\n                    continue\n                try:\n                    transformed = transform(image=image)\n                    # If albumentations, output is a dict with 'image' key\n                    if isinstance(transformed, dict) and \"image\" in transformed:\n                        transformed = transformed[\"image\"]\n                except Exception as e:\n                    raise RuntimeError(\n                        f\"An exception occurred while processing {filepath} \"\n                        f\"with transform {transform_name}.\"\n                    ) from e\n                transformed_path = os.path.join(\n                    storage_dir, f\"{guid}_{transform_name}.jpg\"\n                )\n        
        cv2.imwrite(\n                    transformed_path, cv2.cvtColor(transformed, cv2.COLOR_RGB2BGR)\n                )\n                transformed_arr.append(\n                    {\n                        \"guid\": guid,\n                        \"transform_name\": transform_name,\n                        \"input_filepath\": filepath,\n                        \"filepath\": transformed_path,\n                        \"category\": category,\n                    }\n                )\n            return pd.DataFrame.from_records(transformed_arr)\n\n        results = [apply_transform(files, transform_name=\"noop\")]\n\n        for transform_name in transforms.keys():\n            if transform_name == \"noop\":\n                continue\n            results.append(apply_transform(results[0], transform_name=transform_name))\n        benchmark_transforms = BenchmarkImageTransforms(\n            df=pd.concat(results, axis=0, ignore_index=True)\n        )\n        benchmark_transforms.save(storage_dir)\n        return benchmark_transforms\n"
  },
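A minimal usage sketch for the image benchmarking flow above. The `from_tuples` constructor call and the file paths are assumptions for illustration; the required `noop` key and the albumentations-style transforms follow the `transform` docstring.

```python
import albumentations as A

from perception.benchmarking import BenchmarkImageDataset

# Hypothetical dataset of (filepath, category) pairs.
dataset = BenchmarkImageDataset.from_tuples(
    files=[("images/cat.jpg", "animals"), ("images/car.jpg", "vehicles")]
)
transformed = dataset.transform(
    transforms={
        "noop": A.NoOp(),  # required: how the untransformed image is saved
        "blur": A.GaussianBlur(blur_limit=(5, 5), p=1.0),
        "gray": A.ToGray(p=1.0),
    },
    storage_dir="./transformed",
)
```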
  {
    "path": "perception/benchmarking/image_transforms.py",
    "content": "import cv2\nimport numpy as np\n\n\ndef apply_watermark(watermark, alpha: float = 1.0, size: float = 1.0):\n    \"\"\"Apply a watermark to the bottom right of\n    images. Based on the work provided at\n    https://www.pyimagesearch.com/2016/04/25/watermarking-images-with-opencv-and-python/\n\n    Args:\n        watermark: The watermark to overlay\n        alpha: The strength of the overlay\n        size: The maximum proportion of the image\n            taken by the watermark.\n    \"\"\"\n    assert watermark.shape[-1] == 4, \"Watermark must have an alpha channel.\"\n\n    # Why do we have to do this? It's not clear. But the process doesn't work\n    # without it.\n    B, G, R, A = cv2.split(watermark)\n    B = cv2.bitwise_and(B, B, mask=A)\n    G = cv2.bitwise_and(G, G, mask=A)\n    R = cv2.bitwise_and(R, R, mask=A)\n    watermark = cv2.merge([B, G, R, A])\n\n    def transform(image):\n        # Add alpha channel\n        h, w = image.shape[:2]\n        wh, ww = watermark.shape[:2]\n        scale = size * min(h / wh, w / ww)\n        image = np.dstack([image, np.ones((h, w), dtype=\"uint8\") * 255])\n        # Construct an overlay that is the same size as the input.\n        overlay = np.zeros((h, w, 4), dtype=\"uint8\")\n        scaled = cv2.resize(watermark, (int(scale * ww), int(scale * wh)))\n        sh, sw = scaled.shape[:2]\n        overlay[max(h - sh, 0) :, max(w - sw, 0) : w] = scaled\n        # Blend the two images together using transparent overlays\n        output = image.copy()\n        cv2.addWeighted(overlay, alpha, output, 1.0, 0, output)\n        return cv2.cvtColor(output, cv2.COLOR_RGBA2RGB)\n\n    return transform\n"
  },
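A sketch of how `apply_watermark` might be used; the file paths are assumptions. The watermark must be loaded with its alpha channel intact (four channels), which `cv2.IMREAD_UNCHANGED` preserves.

```python
import cv2

from perception.benchmarking.image_transforms import apply_watermark

watermark = cv2.imread("watermark.png", cv2.IMREAD_UNCHANGED)  # keeps alpha
transform = apply_watermark(watermark, alpha=0.5, size=0.25)

# The returned transform expects and returns RGB arrays.
image = cv2.cvtColor(cv2.imread("photo.jpg"), cv2.COLOR_BGR2RGB)
watermarked = transform(image)
```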
  {
    "path": "perception/benchmarking/video.py",
    "content": "import concurrent.futures\nimport os\nimport typing\nimport uuid\n\nimport pandas as pd\nimport tqdm\n\nfrom ..hashers import VideoHasher, tools\nfrom ..tools import flatten\nfrom .common import BenchmarkDataset, BenchmarkHashes, BenchmarkTransforms\n\n\ndef _process_row(row, hashers, framerates):\n    error = None\n    try:\n        assert not pd.isnull(row[\"filepath\"]), \"No filepath provided.\"\n        hashes = tools.compute_synchronized_video_hashes(\n            filepath=row[\"filepath\"],\n            hashers=hashers,\n            framerates=framerates,\n            hash_format=\"base64\",\n        )\n    except Exception as exception:\n        error = str(exception)\n        hashes = {\n            hasher_name: [None] if hasher.returns_multiple else None\n            for hasher_name, hasher in hashers.items()\n        }\n    base_dict = {\n        \"guid\": row[\"guid\"],\n        \"filepath\": row[\"filepath\"],\n        \"error\": error,\n        \"category\": row[\"category\"],\n        \"transform_name\": row[\"transform_name\"],\n        \"input_filepath\": row[\"input_filepath\"],\n    }\n    hash_dicts = []\n    for hasher_name, hasher in hashers.items():\n        base_hash_dict = {\n            \"hasher_name\": hasher_name,\n            \"hasher_dtype\": hasher.dtype,\n            \"hasher_distance_metric\": hasher.distance_metric,\n            \"hasher_hash_length\": hasher.hash_length,\n        }\n        if not hasher.returns_multiple:\n            hash_dicts.append(\n                {\n                    **{\n                        \"hash\": hashes[hasher_name],\n                    },\n                    **base_hash_dict,\n                }\n            )\n        else:\n            for hash_value in hashes[hasher_name]:\n                hash_dicts.append(\n                    {\n                        **{\n                            \"hash\": hash_value,\n                        },\n                        **base_hash_dict,\n                    }\n                )\n    return [{**hash_dict, **base_dict} for hash_dict in hash_dicts]\n\n\nclass BenchmarkVideoDataset(BenchmarkDataset):\n    def transform(\n        self,\n        transforms: dict[str, typing.Callable],\n        storage_dir: str,\n        errors: str = \"raise\",\n    ):\n        \"\"\"Prepare files to be used as part of benchmarking run.\n\n        Args:\n            transforms: A dictionary of transformations. The only required\n                key is `noop` which determines how the original, untransformed\n                video is saved. Each transform should be a callable function with\n                that accepts an `input_filepath` and `output_filepath` argument and\n                it should return the `output_filepath` (which may have a different\n                extension appended by the transform function).\n            storage_dir: A directory to store all the videos along with\n                their transformed counterparts.\n            errors: How to handle errors reading files. If \"raise\", exceptions are\n                raised. 
If \"warn\", the error is printed as a warning.\n\n        Returns:\n            transforms: A BenchmarkVideoTransforms object\n        \"\"\"\n        assert \"noop\" in transforms, \"You must provide a no-op transform.\"\n\n        os.makedirs(storage_dir, exist_ok=True)\n\n        files = self._df.copy()\n        files[\"guid\"] = [str(uuid.uuid4()) for n in range(len(files))]\n\n        def apply_transform_to_file(input_filepath, guid, transform_name, category):\n            if input_filepath is None:\n                # This can happen if the noop transform did not yield\n                # a file. We don't want to drop the records so we\n                # keep them.\n                return {\n                    \"guid\": guid,\n                    \"error\": \"No source file provided\",\n                    \"transform_name\": transform_name,\n                    \"input_filepath\": input_filepath,\n                    \"filepath\": None,\n                    \"category\": category,\n                }\n            try:\n                output_filepath = transforms[transform_name](\n                    input_filepath,\n                    output_filepath=os.path.join(\n                        storage_dir, f\"{guid}_{transform_name}\"\n                    ),\n                )\n                error = None\n            except Exception as e:\n                output_filepath = None\n                error = str(e)\n            return {\n                \"guid\": guid,\n                \"error\": error,\n                \"transform_name\": transform_name,\n                \"input_filepath\": input_filepath,\n                \"filepath\": output_filepath,\n                \"category\": category,\n            }\n\n        def apply_transform_to_files(files, transform_name):\n            return pd.DataFrame.from_records(\n                [\n                    apply_transform_to_file(\n                        input_filepath=row[\"filepath\"],\n                        guid=row[\"guid\"],\n                        transform_name=transform_name,\n                        category=row[\"category\"],\n                    )\n                    for _, row in tqdm.tqdm(\n                        files.iterrows(),\n                        desc=f\"Creating files for {transform_name}\",\n                        total=len(files),\n                    )\n                ]\n            )\n\n        results = [apply_transform_to_files(files, transform_name=\"noop\")]\n        for transform_name in transforms.keys():\n            if transform_name == \"noop\":\n                continue\n            results.append(\n                apply_transform_to_files(results[0], transform_name=transform_name)\n            )\n        benchmark_transforms = BenchmarkVideoTransforms(\n            df=pd.concat(results, axis=0, ignore_index=True)\n        )\n        benchmark_transforms.save(storage_dir)\n        return benchmark_transforms\n\n\nclass BenchmarkVideoTransforms(BenchmarkTransforms):\n    expected_columns = [\n        \"filepath\",\n        \"category\",\n        \"transform_name\",\n        \"input_filepath\",\n        \"guid\",\n        \"error\",\n    ]\n\n    def compute_hashes(\n        self, hashers: dict[str, VideoHasher], max_workers: int = 5\n    ) -> BenchmarkHashes:\n        \"\"\"Compute hashes for a series of files given some set of hashers.\n\n        Args:\n            hashers: A dictionary of hashers.\n            max_workers: Maximum number of workers for parallel hash\n                
computation.\n\n        Returns:\n            hashes: A BenchmarkHashes object.\n        \"\"\"\n        id_rates = {\n            hasher_name: hasher.frames_per_second\n            for hasher_name, hasher in hashers.items()\n            if hasher.frames_per_second is not None\n        }\n        if id_rates:\n            framerates = tools.get_common_framerates(id_rates)\n        else:\n            framerates = {}\n\n        with concurrent.futures.ProcessPoolExecutor(\n            max_workers=max_workers\n        ) as executor:\n            futures = [\n                executor.submit(\n                    _process_row, row=row, framerates=framerates, hashers=hashers\n                )\n                for _, row in self._df.iterrows()\n            ]\n            return BenchmarkHashes(\n                pd.DataFrame.from_records(\n                    flatten(\n                        [\n                            future.result()\n                            for future in tqdm.tqdm(\n                                concurrent.futures.as_completed(futures),\n                                desc=\"Computing hashes.\",\n                                total=len(self._df),\n                            )\n                        ]\n                    )\n                )\n            )\n"
  },
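A sketch of computing hashes for the transformed videos; `benchmark_transforms` is assumed to be the `BenchmarkVideoTransforms` object returned by `BenchmarkVideoDataset.transform`, and the hasher parameter values are illustrative.

```python
from perception.hashers import TMKL1, FramewiseHasher, PHashF

hashes = benchmark_transforms.compute_hashes(
    hashers={
        "framewise": FramewiseHasher(
            frame_hasher=PHashF(), interframe_threshold=50, frames_per_second=2
        ),
        "tmkl1": TMKL1(frames_per_second=5),
    },
    max_workers=2,  # rows are hashed in a process pool
)
```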
  {
    "path": "perception/benchmarking/video_transforms.py",
    "content": "import os\n\nimport cv2\nimport ffmpeg\n\nfrom ..hashers.tools import read_video\n\n\ndef probe(filepath):\n    \"\"\"Get the output of ffprobe.\"\"\"\n    return ffmpeg.probe(filepath)\n\n\ndef sanitize_output_filepath(input_filepath, output_filepath, output_ext=None):\n    \"\"\"Get a suitable output filepath with an extension based on\n    an input filepath.\n\n    Args:\n        input_filepath: The filepath for the source file.\n        output_filepath: The filepath for the output file.\n        output_ext: A new extension to add (e.g., '.gif')\n    \"\"\"\n    _, input_ext = os.path.splitext(input_filepath)\n    if not output_filepath.lower().endswith(output_ext or input_ext):\n        output_filepath += output_ext or input_ext\n    return output_filepath\n\n\ndef get_simple_transform(\n    width: str | int = -1,\n    height: str | int = -1,\n    pad: str | None = None,\n    codec: str | None = None,\n    clip_pct: tuple[float, float] | None = None,\n    clip_s: tuple[float, float] | None = None,\n    sar=None,\n    fps=None,\n    output_ext=None,\n):\n    \"\"\"Resize to a specific size and re-encode.\n\n    Args:\n        width: The target width (-1 to maintain aspect ratio)\n        height: The target height (-1 to maintain aspect ratio)\n        pad: An ffmpeg pad argument provided as a string.\n        codec: The codec for encoding the video.\n        fps: The new frame rate for the video.\n        clip_pct: The video start and end in percentages of video duration.\n        clip_s: The video start and end in seconds (used over clip_pct if both\n            are provided).\n        sar: Whether to make all videos have a common sample aspect\n            ratio (i.e., for all square pixels, set this to '1/1').\n        output_ext: The extension to use when re-encoding (used to select\n            video format). 
It should include the leading '.'.\n    \"\"\"\n\n    def transform(input_filepath, output_filepath):\n        output_filepath = sanitize_output_filepath(\n            input_filepath, output_filepath, output_ext\n        )\n        data = None\n        if codec is None:\n            data = data or probe(input_filepath)\n            output_codec = [s for s in data[\"streams\"] if s[\"codec_type\"] == \"video\"][\n                0\n            ][\"codec_name\"]\n        else:\n            output_codec = codec\n        format_kwargs = {\"codec:v\": output_codec}\n        if clip_pct is not None or clip_s is not None:\n            pct_start, pct_end, pos_start, pos_end = None, None, None, None\n            if clip_pct is not None:\n                pct_start, pct_end = clip_pct\n            if clip_s is not None:\n                pos_start, pos_end = clip_s\n            if pct_start is not None:\n                assert 0 <= pct_start <= 1, \"Start position must be between 0 and 1.\"\n            if pct_end is not None:\n                assert 0 <= pct_end <= 1, \"End position must be between 0 and 1.\"\n            if pct_start is not None and pct_end is not None:\n                assert pct_start < pct_end, \"End must be greater than start.\"\n            if (pct_start is not None and pos_start is None) or (\n                pct_end is not None and pos_end is None\n            ):\n                # We only want to get the duration for the video if we need\n                # it.\n                data = data or probe(input_filepath)\n                duration = float(data[\"streams\"][0][\"duration\"])\n            if pct_start is not None or pos_start is not None:\n                format_kwargs[\"ss\"] = pos_start or pct_start * duration  # type: ignore\n            if pct_end is not None or pos_end is not None:\n                format_kwargs[\"t\"] = pos_end or pct_end * duration  # type: ignore\n        stream = ffmpeg.input(input_filepath)\n        if not (width == -1 and height == -1):\n            stream = stream.filter(\"scale\", width, height)\n        if pad is not None:\n            stream = stream.filter(\"pad\", *pad.split(\":\"))\n        if fps is not None:\n            stream = stream.filter(\"fps\", fps)\n        if sar is not None:\n            stream = stream.filter(\"setsar\", sar)\n        stream = stream.output(output_filepath, **format_kwargs).overwrite_output()\n        ffmpeg.run(stream)\n        if os.path.isfile(output_filepath):\n            return output_filepath\n        return None\n\n    return transform\n\n\ndef get_slideshow_transform(\n    frame_input_rate, frame_output_rate, max_frames=None, offset=0\n):\n    \"\"\"Get a slideshow transform to create slideshows from\n    videos.\n\n    Args:\n        frame_input_rate: The rate at which frames will be sampled\n            from the source video (e.g., a rate of 1 means we collect\n            one frame per second of the input video).\n        frame_output_rate: The rate at which the sampled frames are played\n            in the slideshow (e.g., a rate of 0.5 means each frame will\n            appear for 2 seconds).\n        max_frames: The maximum number of frames to write.\n        offset: The number of seconds to wait before beginning the slide show.\n    \"\"\"\n\n    def transform(input_filepath, output_filepath):\n        output_filepath = sanitize_output_filepath(\n            input_filepath, output_filepath, output_ext=\".avi\"\n        )\n        writer = None\n        frame_count = 0\n        try:\n      
      for frame, _, timestamp in read_video(\n                filepath=input_filepath, frames_per_second=frame_input_rate\n            ):\n                if timestamp < offset:\n                    continue\n                if writer is None:\n                    writer = cv2.VideoWriter(\n                        filename=output_filepath,\n                        fourcc=cv2.VideoWriter_fourcc(*\"MJPG\"),  # type: ignore[attr-defined]\n                        fps=frame_output_rate,\n                        frameSize=tuple(frame.shape[:2][::-1]),\n                        isColor=True,\n                    )\n                writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))\n                frame_count += 1\n                if max_frames is not None and frame_count >= max_frames:\n                    break\n        finally:\n            if writer is not None:\n                writer.release()\n        if os.path.isfile(output_filepath):\n            return output_filepath\n        return None\n\n    return transform\n\n\ndef get_black_frame_padding_transform(duration_s=0, duration_pct=0):\n    \"\"\"Get a transform that adds black frames at the start and end\n    of a video.\n\n    Args:\n        duration_s: The duration of the black frames in seconds.\n        duration_pct: The duration of the black frames\n            as a percentage of video duration. If both duration_s\n            and duration_pct are provided, the maximum value\n            is used.\n    \"\"\"\n\n    def transform(input_filepath, output_filepath):\n        output_filepath = sanitize_output_filepath(input_filepath, output_filepath)\n        stream = next(\n            stream\n            for stream in probe(input_filepath)[\"streams\"]\n            if stream[\"codec_type\"] == \"video\"\n        )\n        assert stream[\"sample_aspect_ratio\"] == \"1:1\", \"SAR is not 1:1.\"\n        width = stream[\"width\"]\n        height = stream[\"height\"]\n        duration = max(duration_s, duration_pct * float(stream[\"duration\"]))\n        ffmpeg.input(input_filepath).output(\n            output_filepath,\n            vf=(\n                \"color=c=black:s={width}x{height}:d={duration} [pre] ; \"\n                \"color=c=black:s={width}x{height}:d={duration} [post] ; \"\n                \"[pre] [in] [post] concat=n=3\"\n            ).format(width=width, height=height, duration=duration),\n            fps_mode=\"vfr\",\n        ).overwrite_output().run()\n        if os.path.isfile(output_filepath):\n            return output_filepath\n        return None\n\n    return transform\n"
  },
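A sketch combining the transform factories above into the dictionary expected by `BenchmarkVideoDataset.transform`; the `dataset` variable and the parameter values are assumptions for illustration.

```python
from perception.benchmarking.video_transforms import (
    get_black_frame_padding_transform,
    get_simple_transform,
    get_slideshow_transform,
)

transforms = {
    "noop": get_simple_transform(),  # required: re-encode without modification
    "shrink": get_simple_transform(width=320),
    "clip": get_simple_transform(clip_pct=(0.1, 0.9)),
    "slideshow": get_slideshow_transform(frame_input_rate=1, frame_output_rate=0.5),
    "padded": get_black_frame_padding_transform(duration_s=2),
}
transformed = dataset.transform(transforms=transforms, storage_dir="./videos")
```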
  {
    "path": "perception/extensions.pyx",
    "content": "# cython: language_level=3\n# cython: language=c++\n\nimport math\nimport sys\n\nimport cython\nimport numpy as np\nfrom cython.parallel import parallel, prange\n\ncimport numpy as np\nfrom libc.stdlib cimport abort, free, malloc\nfrom libcpp cimport bool as cppbool\nfrom libcpp.vector cimport vector\n\n\ncdef extern from \"limits.h\":\n    int INT_MAX\n\nctypedef np.uint8_t uint8\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ndef compute_euclidean_pairwise_duplicates(int[:, :] X, float threshold, counts: np.uint32_t[:] = None, compute_overlap=False):\n    \"\"\"Find the pairwise overlap within an array of vectors, where there may be multiple\n    vectors for the same file. This function is faster than using scipy.spatial.distance\n    because it computes distances in parallel, avoids computing full distances when they're\n    not necessary, skips computing distances for pairs of hashes that are for the\n    same file, and skips computing distances for vectors if both have already been matched.\n\n    Args:\n        X: The vectors with shape (N, D). Vectors for the same file need to be\n            supplied sequentially so that we can use the counts argument\n            to determine which vectors are for the same file.\n        counts: For each file, the number of sequential vectors in X. If not\n            provided, each vector is assumed to be for a different file (i.e.,\n            this is equivalent to `counts = np.ones(N)`).\n        compute_overlap: If True, the values returned will be divided by the number\n            of hashes in each file. If False, the raw duplicate counts will\n            be returned.\n\n    Returns:\n        duplicates: An array of shape (M!/(2*((M-2)!)), 2) indicating\n            the fraction of vectors for each file found in another file.\n            The indexing matches that of scipy.spatial.pdist. M is the number of files.\n            So if M = 4, the array will represent comparisons of the file indexes as follows:\n            [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3)]. 
So (assuming compute_overlap=True),\n            a possible return would be [(1.0, 1.0), (0, 0), (0, 0), (0.66, 1.0), (0, 0), (0.5, 0.25)]\n            which means that:\n\n            - There was 100% overlap between file 0 and file 1\n            - 66% of file 1 was in file 2 and 100% of file 2 was in file 1\n            - 50% of file 2 was in file 3 and 25% of file 3 was in file 2\n    \"\"\"\n    if counts is None:\n        counts = np.ones(X.shape[0], dtype=np.uint32)\n    cdef Py_ssize_t n = X.shape[0]\n    cdef Py_ssize_t m = counts.shape[0]\n    cdef Py_ssize_t d = X.shape[1]\n    n_pairs_python = int(math.factorial(m)/(2*math.factorial(m-2)))\n    assert n_pairs_python < sys.maxsize, 'Too many files were provided for deduplication.'\n    cdef Py_ssize_t n_pairs = n_pairs_python\n    cdef Py_ssize_t max_counts = np.max(counts)\n    cdef int compute_overlap_int = 0\n    if compute_overlap:\n        compute_overlap_int = 1\n    # i_1 is the index of file1, i_2 is the index of file2, i_d is the\n    # index of the vector dimension we're on, i_i is used to compute\n    # the starting index in the flattened vector in the different threads.\n    # i_1_sub is the index of the hash on file1, i_2_sub is\n    # the index of the hash on file2.\n    cdef Py_ssize_t i_1, i_2, i_d, i_i, i_1_sub, i_2_sub, i_1_offset\n    duplicate_arr = np.zeros((n_pairs, 2), dtype=np.double)\n    cdef double[:, :] duplicate = duplicate_arr\n    offsets_arr = np.zeros(m, dtype=np.int32)\n    cdef np.int32_t[:] offsets = offsets_arr\n    for i_1 in range(m):\n        for i_i in range(i_1):\n            offsets[i_1] += counts[i_i]\n    # local_buf will contain distance, flattened array offset, index_offset_1, index_offset_2\n    cdef size_t local_buf_size = 4\n    cdef float threshold2 = threshold ** 2\n    with nogil, parallel():\n        local_buf = <np.uint64_t *> malloc(sizeof(np.uint64_t) * local_buf_size)\n\n        # An array of flags indicating whether a vector in file 1 was\n        # matched.\n        matched_1 = <int *> malloc(sizeof(int) * max_counts)\n\n        # An array of flags indicating whether a vector in file 2 was\n        # matched.\n        matched_2 = <int *> malloc(sizeof(int) * max_counts)\n        if local_buf is NULL or matched_1 is NULL or matched_2 is NULL:\n            abort()\n        # Iterate over all of the files.\n        for i_1 in prange(m-1):\n            local_buf[1] = 0\n            local_buf[2] = offsets[i_1]\n            # Compute the index of the output vector\n            # where we will count the number of duplicates.\n            for i_i in range(i_1):\n                local_buf[1] += m - i_i - 1\n            # Iterate over all the other files to compare.\n            for i_2 in range(i_1 + 1, m):\n                local_buf[3] = offsets[i_2]\n                # Initialize all match flags to zero for\n                # both file 1 and file 2.\n                for i_1_sub in range(counts[i_1]):\n                    matched_1[i_1_sub] = 0\n                for i_2_sub in range(counts[i_2]):\n                    matched_2[i_2_sub] = 0\n                # Iterate over all the hashes in file1\n                for i_1_sub in range(counts[i_1]):\n                    # Iterate over all the hashes in file2\n                    for i_2_sub in range(counts[i_2]):\n                        local_buf[0] = 0\n                        if matched_1[i_1_sub] == 1 and matched_2[i_2_sub] == 1:\n                            # Both the vectors in this pair have already been matched, so\n          
                  # there is nothing to gain from this comparison.\n                            continue\n                        for i_d in range(d):\n                            local_buf[0] += (X[local_buf[2] + i_1_sub, i_d] - X[local_buf[3] + i_2_sub, i_d]) ** 2\n                            if local_buf[0] > threshold2:\n                                # If we're already beyond the distance threshold,\n                                # we don't need to continue computing squared\n                                # distances.\n                                break\n                        if local_buf[0] < threshold2:\n                            # A match was found. Set flags for both vectors\n                            # to 1.\n                            matched_1[i_1_sub] = 1\n                            matched_2[i_2_sub] = 1\n                # Add up the number of matches for file 1.\n                for i_1_sub in range(counts[i_1]):\n                    duplicate[local_buf[1], 0] += matched_1[i_1_sub]\n                # Add up the number of matches for file 2.\n                for i_2_sub in range(counts[i_2]):\n                    duplicate[local_buf[1], 1] += matched_2[i_2_sub]\n                # Divide by the total number of vectors for each file.\n                if compute_overlap_int:\n                    duplicate[local_buf[1], 0] /= counts[i_1]\n                    duplicate[local_buf[1], 1] /= counts[i_2]\n                # Advance to the next pair index.\n                local_buf[1] += 1\n        free(local_buf)\n        free(matched_1)\n        free(matched_2)\n    return duplicate_arr\n\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ndef compute_euclidean_pairwise_duplicates_simple(int[:, :] X, float threshold, np.uint32_t[:] counts = None, float minimum_overlap = 0):\n    \"\"\"Find the pairwise overlap within an array of vectors, where there may be multiple\n    vectors for the same file. This function is similar to compute_euclidean_pairwise_duplicates\n    but uses much less memory.\n\n    Args:\n        X: The vectors with shape (N, D). Vectors for the same file need to be\n            supplied sequentially so that we can use the counts argument\n            to determine which vectors are for the same file.\n        threshold: The maximum distance between two vectors to allow for\n            a match.\n        counts: For each of the M files, the number of sequential vectors in X.\n            If not provided, each vector is assumed to be for a different file (i.e.,\n            this is equivalent to `counts = np.ones(N)` which also implies M == N).\n            Otherwise, assumed to have length M. 
The counts should add up to N.\n        minimum_overlap: The minimum overlap between two groups of hashes to\n            call it a match.\n\n    Returns:\n        pairs: Pairs of indexes that met the matching criteria.\n    \"\"\"\n    if counts is None:\n        counts_arr = np.ones(X.shape[0], dtype=np.uint32)\n        counts = counts_arr\n    cdef Py_ssize_t n = X.shape[0]\n    cdef Py_ssize_t m = counts.shape[0]\n    cdef Py_ssize_t d = X.shape[1]\n    n_pairs_python = int(math.factorial(m)/(2*math.factorial(m-2)))\n    assert n_pairs_python < sys.maxsize, 'Too many files were provided for deduplication.'\n    cdef Py_ssize_t n_pairs = n_pairs_python\n    cdef Py_ssize_t max_counts = np.max(counts)\n    # i_1 is the index of file1, i_2 is the index of file2, i_d is the\n    # index of the vector dimension we're on, i_i is used to compute\n    # the starting index in the flattened vector in the different threads.\n    # i_1_sub is the index of the hash on file1, i_2_sub is\n    # the index of the hash on file2.\n    cdef Py_ssize_t i_1, i_2, i_d, i_i, i_1_sub, i_2_sub\n    cdef vector[cppbool] duplicate\n    duplicate.resize(n_pairs)\n    offsets_arr = np.zeros(m, dtype=np.uint64)\n    cdef np.uint64_t[:] offsets = offsets_arr\n    cdef np.int32_t expected_n = 0\n    for i_1 in range(m):\n        for i_i in range(i_1):\n            offsets[i_1] += counts[i_i]\n        expected_n += counts[i_1]\n    assert expected_n == n, \"Provided value for counts is inconsistent with X.\"\n    # local_buf will contain:\n    # distance, flattened array offset,\n    # index_offset_1, index_offset_2,\n    # and the early-termination flag\n    cdef size_t local_buf_size = 5\n    cdef float threshold2 = threshold ** 2\n    with nogil, parallel():\n        local_buf = <np.uint64_t *> malloc(sizeof(np.uint64_t) * local_buf_size)\n\n        # An array of flags indicating whether a vector in file 1 was\n        # matched.\n        matched_1 = <int *> malloc(sizeof(int) * max_counts)\n\n        # An array of flags indicating whether a vector in file 2 was\n        # matched.\n        matched_2 = <int *> malloc(sizeof(int) * max_counts)\n\n        # Pair overlap and minimum required overlap\n        overlap = <float *> malloc(sizeof(float) * 4)\n\n        if local_buf is NULL or matched_1 is NULL or matched_2 is NULL or overlap is NULL:\n            abort()\n        # Iterate over all of the files.\n        for i_1 in prange(m-1):\n            local_buf[1] = 0\n            local_buf[2] = offsets[i_1]\n            # Compute the index of the output vector\n            # where we will count the number of duplicates.\n            for i_i in range(i_1):\n                local_buf[1] += m - i_i - 1\n            # Iterate over all the other files to compare.\n            for i_2 in range(i_1 + 1, m):\n                # Set the current and minimum overlaps\n                overlap[0] = 0\n                overlap[1] = 0\n                overlap[2] = minimum_overlap * counts[i_1]\n                overlap[3] = minimum_overlap * counts[i_2]\n                local_buf[3] = offsets[i_2]\n\n                # Set early termination flag.\n                local_buf[4] = 0\n\n                # Initialize all match flags to zero for\n                # both file 1 and file 2.\n                for i_1_sub in range(counts[i_1]):\n                    matched_1[i_1_sub] = 0\n                for i_2_sub in range(counts[i_2]):\n                    matched_2[i_2_sub] = 0\n                # Iterate over all the hashes in file1\n                for i_1_sub in 
range(counts[i_1]):\n                    # Stop early if there's no way to get enough\n                    # matches from i1 to i2\n                    if overlap[0] + counts[i_1] - i_1_sub < overlap[2]:\n                        break\n                    # Stop early if we've already reached the minimum overlap\n                    if overlap[0] >= overlap[2] and overlap[1] >= overlap[3] and overlap[0] > 0 and overlap[1] > 0:\n                        break\n\n                    # Iterate over all the hashes in file2\n                    for i_2_sub in range(counts[i_2]):\n                        local_buf[0] = 0\n                        if matched_1[i_1_sub] == 1 and matched_2[i_2_sub] == 1:\n                            # Both the vectors in this pair have already been matched, so\n                            # there is nothing to gain from this comparison.\n                            continue\n                        for i_d in range(d):\n                            local_buf[0] += (X[local_buf[2] + i_1_sub, i_d] - X[local_buf[3] + i_2_sub, i_d]) ** 2\n                            if local_buf[0] > threshold2:\n                                # If we're already beyond the distance threshold,\n                                # we don't need to continue computing squared\n                                # distances.\n                                break\n                        if local_buf[0] < threshold2:\n                            # A match was found. Set flags for both vectors\n                            # to 1 and increment the overlap.\n                            if matched_1[i_1_sub] != 1:\n                                overlap[0] += 1\n                            if matched_2[i_2_sub] != 1:\n                                overlap[1] += 1\n                            matched_1[i_1_sub] = 1\n                            matched_2[i_2_sub] = 1\n                if overlap[0] >= overlap[2] and overlap[1] >= overlap[3] and overlap[0] > 0 and overlap[1] > 0:\n                    duplicate[local_buf[1]] = 1\n                local_buf[1] += 1\n        free(matched_1)\n        free(matched_2)\n        free(overlap)\n        free(local_buf)\n    cdef int n_duplicates = 0\n    cdef Py_ssize_t i_offset = 0\n    for i_offset in range(n_pairs):\n        if duplicate[i_offset] > 0:\n            n_duplicates += 1\n    pairs_arr = np.zeros((n_duplicates, 2), dtype=np.int32)\n    cdef np.int32_t[:, :] pairs = pairs_arr\n    i_offset = 0\n    cdef Py_ssize_t pair_offset = 0\n    for i_1 in range(m-1):\n        # Compute the index of the output vector\n        # where we will count the number of duplicates.\n        for i_2 in range(i_1 + 1, m):\n            if duplicate[i_offset] > 0:\n                pairs[pair_offset][0] = i_1\n                pairs[pair_offset][1] = i_2\n                pair_offset += 1\n            i_offset += 1\n    return pairs_arr\n"
  },
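A worked example of the simple variant above (assuming the compiled `perception.extensions` module). Three files contribute 2, 1, and 1 rows of `X`; with `threshold=2.0`, only file 1's vector falls within distance 2 of one of file 0's vectors, so files 0 and 1 overlap at 50% and 100% respectively and both meet `minimum_overlap=0.5`.

```python
import numpy as np

from perception.extensions import compute_euclidean_pairwise_duplicates_simple

X = np.array(
    [
        [0, 0, 0],  # file 0, first hash
        [9, 9, 9],  # file 0, second hash
        [0, 0, 1],  # file 1: squared distance 1 from file 0's first hash
        [5, 5, 5],  # file 2: far from everything
    ],
    dtype=np.int32,
)
counts = np.array([2, 1, 1], dtype=np.uint32)  # rows of X grouped per file
pairs = compute_euclidean_pairwise_duplicates_simple(
    X, threshold=2.0, counts=counts, minimum_overlap=0.5
)
print(pairs)  # [[0 1]]
```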
  {
    "path": "perception/hashers/__init__.py",
    "content": "from .hasher import ImageHasher, VideoHasher\nfrom .image.average import AverageHash\nfrom .image.dhash import DHash\nfrom .image.opencv import BlockMean, ColorMoment, MarrHildreth\nfrom .image.phash import PHash, PHashF, PHashU8\nfrom .image.wavelet import WaveletHash\nfrom .video.framewise import FramewiseHasher\nfrom .video.tmk import TMKL1, TMKL2\n\n__all__ = [\n    \"ImageHasher\",\n    \"VideoHasher\",\n    \"AverageHash\",\n    \"PHash\",\n    \"WaveletHash\",\n    \"MarrHildreth\",\n    \"BlockMean\",\n    \"ColorMoment\",\n    \"DHash\",\n    \"FramewiseHasher\",\n    \"TMKL1\",\n    \"TMKL2\",\n    \"PHashU8\",\n    \"PHashF\",\n]\n\ntry:\n    from .image.pdq import PDQHash as PDQHash, PDQHashF as PDQHashF\nexcept ImportError:\n    pass\nelse:\n    __all__.extend([\"PDQHash\", \"PDQHashF\"])\n"
  },
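A minimal sketch of the core hashing API exposed by this package; the image paths are assumptions.

```python
from perception.hashers import PHash

hasher = PHash()
hash1 = hasher.compute("images/original.jpg")  # base64 string by default
hash2 = hasher.compute("images/resized.jpg")

# Hamming distance in [0, 1]; small values suggest near-duplicates.
print(hasher.compute_distance(hash1, hash2))
```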
  {
    "path": "perception/hashers/hasher.py",
    "content": "import concurrent.futures\nimport typing\nimport warnings\nfrom abc import ABC, abstractmethod\nfrom logging import warning\n\nimport numpy as np\nimport scipy.spatial\nimport tqdm\n\nfrom perception.hashers import tools\n\n\nclass Hasher(ABC):\n    \"\"\"All hashers implement a common set of methods from\n    the Hasher base class.\n    \"\"\"\n\n    #: The metric to use when computing distance between two hashes. All hashers\n    #: must supply this parameter.\n    distance_metric: str\n\n    #: The numpy type to use when converting from string to array form.\n    #: All hashers must supply this parameter.\n    dtype: str\n\n    #: Indicates the length of the hash vector\n    hash_length: int\n\n    #: Whether or not this hash returns multiple values\n    returns_multiple: bool = False\n\n    #: Indicates whether the hashes can be computed in parallel\n    allow_parallel: bool = True\n\n    def string_to_vector(self, hash_string: str, hash_format: str = \"base64\"):\n        \"\"\"Convert hash string to vector.\n\n        Args:\n            hash_string: The input hash string\n            hash_format: One of 'base64' or 'hex'\n        \"\"\"\n        return tools.string_to_vector(\n            hash_string,\n            dtype=self.dtype,\n            hash_length=self.hash_length,\n            hash_format=hash_format,\n        )\n\n    def vector_to_string(\n        self, vector: np.ndarray, hash_format: str = \"base64\"\n    ) -> str | None:\n        \"\"\"Convert vector to hash string.\n\n        Args:\n            vector: Input vector\n            hash_format: One of 'base64' or 'hex'\n        \"\"\"\n        return tools.vector_to_string(vector, dtype=self.dtype, hash_format=hash_format)\n\n    def compute_distance(\n        self,\n        hash1: np.ndarray | str,\n        hash2: np.ndarray | str,\n        hash_format=\"base64\",\n    ):\n        \"\"\"Compute the distance between two hashes.\n\n        Args:\n            hash1: The first hash or vector\n            hash2: The second hash or vector\n            hash_format: If either or both of the hashes are hash strings,\n                what format the string is encoded in.\n        \"\"\"\n        hash1 = (\n            self.string_to_vector(hash1, hash_format=hash_format)\n            if isinstance(hash1, str)\n            else hash1\n        )  # makes mypy happy\n        hash2 = (\n            self.string_to_vector(hash2, hash_format=hash_format)\n            if isinstance(hash2, str)\n            else hash2\n        )\n\n        if self.distance_metric == \"sqeuclidean\":\n            return scipy.spatial.distance.sqeuclidean(\n                hash1.astype(\"float32\"), hash2.astype(\"float32\")\n            )\n        if self.distance_metric == \"euclidean\":\n            return scipy.spatial.distance.euclidean(\n                hash1.astype(\"float32\"), hash2.astype(\"float32\")\n            )\n        if self.distance_metric == \"hamming\":\n            return scipy.spatial.distance.hamming(hash1, hash2)\n        if self.distance_metric == \"cosine\":\n            return scipy.spatial.distance.cosine(\n                hash1.astype(\"float32\"), hash2.astype(\"float32\")\n            )\n        if self.distance_metric == \"custom\":\n            return self._compute_distance(hash1, hash2)\n        raise NotImplementedError(\n            f\"Distance metric: {self.distance_metric} not supported.\"\n        )\n\n    def _compute_distance(self, vector1, vector2):\n        raise ValueError(\"Called a custom 
distance function but it is not implemented.\")\n\n    @typing.no_type_check\n    def compute_parallel(\n        self,\n        filepaths: list[str],\n        progress: tqdm.tqdm | None = None,\n        progress_desc: str | None = None,\n        max_workers: int = 5,\n        isometric: bool = False,\n    ):\n        \"\"\"Compute hashes in a parallelized fashion.\n\n        Args:\n            filepaths: A list of paths to images or videos (depending on the hasher).\n            progress: A tqdm-like wrapper for reporting progress. If None,\n                progress is not reported.\n            progress_desc: The title of the progress bar.\n            max_workers: The maximum number of workers\n            isometric: Whether to compute all eight isometric transforms for\n                each image.\n        \"\"\"\n        if not self.allow_parallel and max_workers != 1:\n            warnings.warn(\n                message=\"This hash cannot be used in parallel. Setting max_workers to 1.\",\n                category=UserWarning,\n            )\n            max_workers = 1\n        assert all(\n            isinstance(p, str) for p in filepaths\n        ), \"All images should be provided as paths.\"\n\n        if isinstance(self, VideoHasher) and isometric:\n            raise ValueError(\"Computing isometric hashes for videos is not supported.\")\n\n        # We can use a with statement to ensure threads are cleaned up promptly\n        records = []\n        if isinstance(self, VideoHasher):\n            executor_class = concurrent.futures.ProcessPoolExecutor\n        else:\n            executor_class = concurrent.futures.ThreadPoolExecutor\n        with executor_class(max_workers=max_workers) as executor:\n            # Start the load operations and mark each future with its filepath\n            compute: typing.Callable = (\n                self.compute_isometric if isometric else self.compute\n            )\n            future_to_path: dict = {\n                executor.submit(compute, path): path for path in filepaths\n            }\n            generator = concurrent.futures.as_completed(future_to_path)\n            if progress is not None:\n                generator = progress(\n                    generator, total=len(filepaths), desc=progress_desc\n                )\n            for future in generator:\n                path = future_to_path[future]\n                try:\n                    hash_value = future.result()\n                except Exception as exc:\n                    records.append({\"filepath\": path, \"hash\": None, \"error\": str(exc)})\n                else:\n                    records.append(\n                        {\"filepath\": path, \"hash\": hash_value, \"error\": None}\n                    )\n        return records\n\n\nclass ImageHasher(Hasher):\n    @abstractmethod\n    def _compute(self, image: np.ndarray) -> np.ndarray:\n        \"\"\"Compute hash from an image.\n\n        Args:\n            image: A numpy array representing an image as\n                of shape (H, W, 3) where channels are ordered\n                as RGB or a filepath to an image.\n        \"\"\"\n\n    def compute_isometric_from_hash(self, hash_string_or_vector, hash_format=\"base64\"):\n        \"\"\"For supported hashes, obtain the hashes for the dihedral transformations\n        of the original image. 
They are provided in the following order:\n\n        - Vertical flip\n        - Horizontal flip\n        - 180 degree rotation\n        - 90 degree rotation\n        - 90 degree rotation and vertical flip\n        - 90 degree rotation and horizontal flip\n        - 270 degree rotation\n\n        Args:\n            hash_string_or_vector: The hash string or vector\n            hash_format: One 'base64' or 'hex'\n        \"\"\"\n        if not hasattr(self, \"_compute_isometric_from_hash\"):\n            raise NotImplementedError(\"This hasher does not support hash rotation.\")\n        rotations = self._compute_isometric_from_hash(  # type: ignore\n            hash_string_or_vector\n            if isinstance(hash_string_or_vector, np.ndarray)\n            else self.string_to_vector(hash_string_or_vector, hash_format=hash_format)\n        )\n        return {\n            transform_name: self.vector_to_string(vector, hash_format=hash_format)\n            for transform_name, vector in rotations.items()\n        }\n\n    def compute_isometric(self, image: tools.ImageInputType):\n        image = tools.to_image_array(image)\n        if hasattr(self, \"_compute_isometric\"):\n            hashes = self._compute_isometric(image)  # type: ignore\n        elif hasattr(self, \"_compute_isometric_from_hash\"):\n            hashes = self._compute_isometric_from_hash(  # type: ignore\n                self._compute(image)\n            )\n        else:\n            transforms = tools.get_isometric_transforms(image)\n            for name, transform in transforms.items():\n                transforms[name] = self._compute(transform)\n            hashes = transforms\n        return {\n            transform_name: self.vector_to_string(vector)\n            for transform_name, vector in hashes.items()\n        }\n\n    def compute(\n        self, image: tools.ImageInputType, hash_format=\"base64\"\n    ) -> np.ndarray | str | None | list[str | None]:\n        \"\"\"Compute a hash from an image.\n\n        Args:\n            image: An image represented as a filepath, a PIL image object,\n                or as an np.ndarray object. If it is an np.ndarray object,\n                it must be in RGB color order (note the OpenCV default is\n                BGR).\n            hash_format: One 'base64', 'hex', or 'vector'\n        \"\"\"\n        vector = self._compute(tools.to_image_array(image))\n        if hash_format == \"vector\":\n            # Take care of this separately because we took out `vector`\n            # as valid return type to vector_to_string().\n            # The .tolist() might seem unnecessary for the\n            # ndarray `vector` but downstream expects a list and it\n            # stays consistent with original, so keeping for now.\n            # return (vector.tolist() if self.returns_multiple\n            #        else vector)\n            return vector  # should iterate the same as vector.tolist()\n        if self.returns_multiple:\n            return [self.vector_to_string(v, hash_format=hash_format) for v in vector]\n        return self.vector_to_string(vector, hash_format=hash_format)\n\n    def compute_with_quality(\n        self, image: tools.ImageInputType, hash_format=\"base64\"\n    ) -> tuple[\n        (np.ndarray | str | None | list[str | None]),\n        int,\n    ]:\n        \"\"\"Compute hash and hash quality from image.\n\n        Args:\n            image: An image represented as a filepath, a PIL image object,\n                or as an np.ndarray object. 
If it is an np.ndarray object,\n                it must be in RGB color order (note the OpenCV default is\n                BGR).\n            hash_format: One 'base64', 'hex', or 'vector'\n\n        Returns:\n            A tuple of (hash, quality)\n        \"\"\"\n        vector, quality = self._compute_with_quality(tools.to_image_array(image))\n        if hash_format == \"vector\":\n            return vector, quality\n        if self.returns_multiple:\n            return (\n                [self.vector_to_string(v, hash_format=hash_format) for v in vector],\n                quality,\n            )\n        return (self.vector_to_string(vector, hash_format=hash_format), quality)\n\n    def _compute_with_quality(self, image: np.ndarray) -> tuple[np.ndarray, int]:\n        return self._compute(image), tools.compute_quality(image)\n\n\nclass VideoHasher(Hasher):\n\n    #: The frame rate at which videos are read\n    frames_per_second: float = 1\n\n    @abstractmethod\n    def process_frame(\n        self,\n        frame: np.ndarray,\n        frame_index: int | None,\n        frame_timestamp: float | None,\n        state: dict | None = None,\n    ) -> dict:\n        \"\"\"Called for each frame in the video. For all\n        but the first frame, a state is provided recording the state from\n        the previous frame.\n\n        Args:\n            frame: The current frame as an RGB ndarray\n            frame_index: The current frame index\n            frame_timestamp: The current frame timestamp\n            state: The state from the last call to process_frame\n        \"\"\"\n\n    @abstractmethod\n    def hash_from_final_state(self, state: dict) -> np.ndarray:\n        \"\"\"Called after all frames have been processed. Returns the final\n        feature vector.\n\n        Args:\n            state: The state dictionary at the end of processing.\n        \"\"\"\n\n    def compute(\n        self,\n        filepath,\n        errors=\"raise\",\n        hash_format=\"base64\",\n        scenes=None,\n        **kwargs,\n    ):\n        \"\"\"Compute a hash for a video at a given filepath. All\n        other arguments are passed to perception.hashers.tools.read_video.\n\n        Args:\n            filepath: Path to video file\n            errors: One of \"raise\", \"ignore\", or \"warn\". 
Passed\n                to perception.hashers.tools.read_video.\n            hash_format: One of \"vector\", \"base64\", or \"hex\"\n            max_duration: The maximum length of the video to hash.\n            max_size: The maximum size of frames to queue\n            scenes: An array used to pass scene info back to wrapper\n                functions\n        \"\"\"\n        frame_timestamp, state = None, None\n        # Iterate through the video, aggregating scene info in the state\n        # dict\n        for frame, frame_index, frame_timestamp in tools.read_video(\n            filepath=filepath,\n            frames_per_second=self.frames_per_second,\n            errors=errors,\n            **kwargs,\n        ):\n            state = self.process_frame(\n                frame=frame,\n                frame_index=frame_index,\n                frame_timestamp=frame_timestamp,\n                state=state,\n            )\n\n        if state is None:\n            if errors == \"raise\":\n                raise ValueError(\n                    f\"Video processing failed for {filepath}, State is None.\"\n                )\n            if errors == \"warn\":\n                warning(f\"Video processing failed for {filepath}, State is None.\")\n\n            return None\n\n        # Persist the final timestamp in the state to allow us to pass along\n        # duration\n        state[\"end\"] = frame_timestamp\n        vectors = self.hash_from_final_state(state=state)\n        if scenes is not None:\n            scenes += state.get(\"scenes\", [])\n        if hash_format == \"vector\":\n            # Take care of this separately because we took out `vector`\n            # as valid return type to vector_to_string().\n            # The .tolist() might seem unnecessary for the\n            # ndarray `vector` but downstream expects a list and it\n            # stays consistent with original, so keeping for now.\n            # return (vector.tolist() if self.returns_multiple\n            #        else vector)\n            return vectors  # should iterate the same as vector.tolist()\n        if self.returns_multiple:\n            return [self.vector_to_string(v, hash_format=hash_format) for v in vectors]\n        return self.vector_to_string(vectors, hash_format=hash_format)\n"
  },
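A sketch of parallel hashing using the record format produced by `compute_parallel`; the file paths are assumptions, and `tqdm.tqdm` is passed as the progress wrapper.

```python
import tqdm

from perception.hashers import PHash

records = PHash().compute_parallel(
    filepaths=["images/a.jpg", "images/b.jpg"],
    progress=tqdm.tqdm,
    max_workers=2,
)
for record in records:
    # Each record carries filepath, hash, and error keys.
    if record["error"] is not None:
        print(record["filepath"], "failed:", record["error"])
    else:
        print(record["filepath"], record["hash"])
```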
  {
    "path": "perception/hashers/image/__init__.py",
    "content": "from .average import AverageHash\nfrom .dhash import DHash\nfrom .opencv import BlockMean, ColorMoment, MarrHildreth\nfrom .phash import PHash, PHashF, PHashU8\nfrom .wavelet import WaveletHash\n\n__all__ = [\n    \"AverageHash\",\n    \"PHash\",\n    \"WaveletHash\",\n    \"MarrHildreth\",\n    \"BlockMean\",\n    \"ColorMoment\",\n    \"DHash\",\n    \"PHashF\",\n    \"PHashU8\",\n]\n"
  },
  {
    "path": "perception/hashers/image/average.py",
    "content": "import cv2\n\nfrom .. import tools\nfrom ..hasher import ImageHasher\n\n\nclass AverageHash(ImageHasher):\n    \"\"\"Computes a simple hash comparing the intensity of each\n    pixel in a resized version of the image to the mean.\n    Implementation based on that of\n    `ImageHash <https://github.com/JohannesBuchner/imagehash>`_.\"\"\"\n\n    distance_metric = \"hamming\"\n    dtype = \"bool\"\n\n    def __init__(self, hash_size=8):\n        assert hash_size >= 2, \"Hash size must be greater than or equal to 2.\"\n        self.hash_size = hash_size\n        self.hash_length = hash_size * hash_size\n\n    def _compute(self, image):\n        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n        image = cv2.resize(\n            image, dsize=(self.hash_size, self.hash_size), interpolation=cv2.INTER_AREA\n        )\n        diff = image > image.mean()\n        return diff.flatten()\n\n    def _compute_isometric_from_hash(self, vector):\n        return {\n            transform_name: diff.flatten()\n            for transform_name, diff in tools.get_isometric_transforms(\n                vector.reshape(self.hash_size, self.hash_size, 1), require_color=False\n            ).items()\n        }\n"
  },
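A small sketch of the thresholding step behind AverageHash: after the grayscale resize, each pixel is compared against the mean, yielding one bit per pixel (the values below are illustrative).

```python
import numpy as np

# Stand-in for the resized grayscale image (shown as 2x2 instead of 8x8).
resized = np.array([[10, 200], [30, 90]], dtype=np.uint8)
bits = resized > resized.mean()  # mean is 82.5
print(bits.flatten())  # [False  True False  True]
```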
  {
    "path": "perception/hashers/image/dhash.py",
    "content": "import cv2\n\nfrom ..hasher import ImageHasher\n\n\nclass DHash(ImageHasher):\n    \"\"\"A hash based on the differences between adjacent pixels.\n    Implementation based on that of\n    `ImageHash <https://github.com/JohannesBuchner/imagehash>`_.\n    \"\"\"\n\n    dtype = \"bool\"\n    distance_metric = \"hamming\"\n\n    def __init__(self, hash_size=8):\n        assert hash_size > 1, \"Hash size must be greater than 1.\"\n        self.hash_size = hash_size\n        self.hash_length = hash_size * hash_size\n\n    def _compute(self, image):\n        image = cv2.resize(\n            image,\n            dsize=(self.hash_size + 1, self.hash_size),\n            interpolation=cv2.INTER_AREA,\n        )\n        image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)\n        previous = image[:, :-1]\n        current = image[:, 1:]\n        difference = previous > current\n        return difference.flatten()\n"
  },
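A small sketch of the adjacent-pixel comparison behind DHash: the resize adds one extra column so that each of the `hash_size` comparisons per row has both a left and a right pixel (the values below are illustrative).

```python
import numpy as np

row = np.array([[10, 20, 15, 15, 40]], dtype=np.uint8)  # hash_size 4 needs 5 columns
previous, current = row[:, :-1], row[:, 1:]
print((previous > current).flatten())  # [False  True False False]
```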
  {
    "path": "perception/hashers/image/opencv.py",
    "content": "import cv2\nimport numpy as np\n\nfrom ..hasher import ImageHasher\n\n\nclass OpenCVHasher(ImageHasher):\n    allow_parallel = False\n\n    def __init__(self):\n        if not hasattr(cv2, \"img_hash\"):\n            raise RuntimeError(\n                \"You do not appear to have opencv-contrib installed. It is required for pure OpenCV hashers.\"\n            )\n\n\nclass MarrHildreth(OpenCVHasher):\n    \"\"\"A wrapper around OpenCV's Marr-Hildreth hash.\n    See `paper <https://www.phash.org/docs/pubs/thesis_zauner.pdf>`_ for details.\"\"\"\n\n    dtype = \"bool\"\n    distance_metric = \"hamming\"\n    hash_length = 576\n\n    def __init__(self):\n        super().__init__()\n        self.hasher = cv2.img_hash.MarrHildrethHash.create()  # type: ignore[attr-defined]\n\n    def _compute(self, image):\n        return np.unpackbits(self.hasher.compute(image)[0])\n\n\nclass ColorMoment(OpenCVHasher):\n    \"\"\"A wrapper around OpenCV's Color Moments hash.\n    See `paper <https://www.phash.org/docs/pubs/thesis_zauner.pdf>`_ for details.\"\"\"\n\n    dtype = \"float32\"\n    distance_metric = \"euclidean\"\n    hash_length = 42\n\n    def __init__(self):\n        super().__init__()\n        self.hasher = cv2.img_hash.ColorMomentHash.create()  # type: ignore[attr-defined]\n\n    def _compute(self, image):\n        return 10000 * self.hasher.compute(image)[0]\n\n\nclass BlockMean(OpenCVHasher):\n    \"\"\"A wrapper around OpenCV's Block Mean hash.\n    See `paper <https://www.phash.org/docs/pubs/thesis_zauner.pdf>`_ for details.\"\"\"\n\n    dtype = \"bool\"\n    distance_metric = \"hamming\"\n    hash_length = 968\n\n    def __init__(self):\n        super().__init__()\n        self.hasher = cv2.img_hash.BlockMeanHash.create(1)  # type: ignore[attr-defined]\n\n    def _compute(self, image):\n        # https://stackoverflow.com/questions/54762896/why-cv2-norm-hamming-gives-different-value-than-actual-hamming-distance\n        return np.unpackbits(self.hasher.compute(image)[0])\n"
  },
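A sketch of the OpenCV-contrib hashers; instantiating any of them raises a RuntimeError unless `cv2.img_hash` is available (i.e., an opencv-contrib build is installed).

```python
from perception.hashers import BlockMean, ColorMoment, MarrHildreth

for hasher in [MarrHildreth(), BlockMean(), ColorMoment()]:
    print(type(hasher).__name__, hasher.hash_length, hasher.distance_metric)
# MarrHildreth 576 hamming
# BlockMean 968 hamming
# ColorMoment 42 euclidean
```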
  {
    "path": "perception/hashers/image/pdq.py",
    "content": "import pdqhash\n\nfrom ..hasher import ImageHasher\n\n\nclass PDQHash(ImageHasher):\n    \"\"\"The Facebook PDQ hash. Based on the original implementation located at\n    the `official repository <https://github.com/facebook/ThreatExchange>`_.\n    \"\"\"\n\n    distance_metric = \"hamming\"\n    dtype = \"bool\"\n    hash_length = 256\n\n    def _compute(self, image):\n        return pdqhash.compute(image)[0] > 0\n\n    def _compute_with_quality(self, image):\n        hash_vector, quality = pdqhash.compute(image)\n        return hash_vector > 0, quality\n\n    def _compute_isometric(self, image):\n        hash_vectors, _ = pdqhash.compute_dihedral(image)\n        names = [\"r0\", \"r90\", \"r180\", \"r270\", \"fv\", \"fh\", \"r90fv\", \"r90fh\"]\n        return dict(zip(names, hash_vectors))\n\n\nclass PDQHashF(PDQHash):\n    dtype = \"float32\"\n    distance_metric = \"euclidean\"\n    hash_length = 256\n\n    def _compute(self, image):\n        return pdqhash.compute_float(image)[0]\n"
  },
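A sketch of PDQ hashing with its quality score (requires the optional `pdqhash` dependency); the image path and the quality cutoff are assumptions.

```python
from perception.hashers import PDQHash

hasher = PDQHash()
hash_string, quality = hasher.compute_with_quality("images/photo.jpg")
if quality < 50:  # illustrative cutoff; 0-100, low means few image features
    print("Hash may be unreliable:", quality)
```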
  {
    "path": "perception/hashers/image/phash.py",
    "content": "import cv2\nimport numpy as np\nimport scipy.fftpack\n\nfrom .. import tools\nfrom ..hasher import ImageHasher\n\n\nclass PHash(ImageHasher):\n    \"\"\"Also known as the DCT hash, a hash based on discrete cosine transforms of images.\n    See `complete paper <https://www.phash.org/docs/pubs/thesis_zauner.pdf>`_ for\n    details. Implementation based on that of\n    `ImageHash <https://github.com/JohannesBuchner/imagehash>`_.\n\n    Args:\n        hash_size: The number of DCT elements to retain (the hash length\n            will be hash_size * hash_size).\n        highfreq_factor: The multiple of the hash size to resize the input\n            image to before computing the DCT.\n        exclude_first_term: WHether to exclude the first term of the DCT\n        freq_shift: The number of DCT low frequency elements to skip.\n    \"\"\"\n\n    distance_metric = \"hamming\"\n    dtype = \"bool\"\n\n    def __init__(\n        self, hash_size=8, highfreq_factor=4, exclude_first_term=False, freq_shift=0\n    ):\n        assert hash_size >= 2, \"Hash size must be greater than or equal to 2\"\n        assert (\n            freq_shift <= highfreq_factor * hash_size - hash_size\n        ), \"Frequency shift is too large for this hash size / highfreq_factor combination.\"\n        self.hash_size = hash_size\n        self.highfreq_factor = highfreq_factor\n        self.exclude_first_term = exclude_first_term\n        self.hash_length = hash_size * hash_size\n        self.freq_shift = freq_shift\n        if exclude_first_term:\n            self.hash_length -= 1\n\n    def _compute_dct(self, image):\n        img_size = self.hash_size * self.highfreq_factor\n        image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)\n        image = cv2.resize(\n            image, dsize=(img_size, img_size), interpolation=cv2.INTER_AREA\n        )\n        dct = scipy.fftpack.dct(scipy.fftpack.dct(image, axis=0), axis=1)\n        return dct[\n            self.freq_shift : self.hash_size + self.freq_shift,\n            self.freq_shift : self.hash_size + self.freq_shift,\n        ]\n\n    def _dct_to_hash(self, dct):\n        dct = dct.flatten()\n        if self.exclude_first_term:\n            dct = dct[1:]\n        return dct > np.median(dct)\n\n    def _compute(self, image):\n        dct = self._compute_dct(image)\n        return self._dct_to_hash(dct)\n\n    def _compute_isometric(self, image):\n        return {\n            transform_name: self._dct_to_hash(dct)\n            for transform_name, dct in tools.get_isometric_dct_transforms(\n                self._compute_dct(image)\n            ).items()\n        }\n\n\nclass PHashF(PHash):\n    \"\"\"A real-valued version of PHash. It\n    returns the raw 32-bit floats in the DCT.\n    For a more compact approach, see PHashU8.\"\"\"\n\n    dtype = \"float32\"\n    distance_metric = \"euclidean\"\n\n    def _dct_to_hash(self, dct):\n        dct = dct.flatten()\n        if self.exclude_first_term:\n            dct = dct[1:]\n        if (dct == 0).all():\n            return None\n        return dct\n\n\nclass PHashU8(PHash):\n    \"\"\"A real-valued version of PHash. 
It\n    uses minimum / maximum scaling to convert\n    DCT values to unsigned 8-bit integers (more\n    compact than the 32-bit floats used by PHashF at\n    the cost of precision).\"\"\"\n\n    dtype = \"uint8\"\n    distance_metric = \"euclidean\"\n\n    def _dct_to_hash(self, dct):\n        dct = dct.flatten()\n        if self.exclude_first_term:\n            dct = dct[1:]\n        if (dct == 0).all():\n            return None\n        min_value = dct.min()\n        max_value = dct.max()\n        dct = np.uint8(255 * (dct - min_value) / (max_value - min_value))\n        return dct\n"
  },
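  {
    "path": "examples/phash_example.py",
    "content": "\"\"\"Hypothetical sketch; this example file is not part of the package.\nIt shows how hash_size fixes the PHash length and how compute_distance\ncompares two hashes. compute / compute_distance are the ImageHasher methods\nalso used by perception/hashers/video/framewise.py.\"\"\"\nimport numpy as np\n\nfrom perception.hashers.image.phash import PHash\n\nrng = np.random.default_rng(0)\nimage = np.ascontiguousarray(rng.integers(0, 255, (128, 128, 3), dtype=np.uint8))\n# A mild uniform brightness shift: most DCT signs, and hence most bits,\n# should survive.\nbrighter = np.ascontiguousarray(\n    np.clip(image.astype(np.int16) + 10, 0, 255).astype(np.uint8)\n)\n\nhasher = PHash(hash_size=8)  # hash_length == 64\nh1 = hasher.compute(image, hash_format=\"vector\")\nh2 = hasher.compute(brighter, hash_format=\"vector\")\n# Normalized hamming distance in [0, 1]; expected to be small here.\nprint(hasher.compute_distance(h1, h2))\n"
  },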
  {
    "path": "perception/hashers/image/wavelet.py",
    "content": "import cv2\nimport numpy as np\nimport pywt\n\nfrom ..hasher import ImageHasher\n\n\nclass WaveletHash(ImageHasher):\n    \"\"\"Similar to PHash but using wavelets instead of DCT.\n    Implementation based on that of\n    `ImageHash <https://github.com/JohannesBuchner/imagehash>`_.\n    \"\"\"\n\n    distance_metric = \"hamming\"\n    dtype = \"bool\"\n\n    def __init__(self, hash_size=8, image_scale=None, mode=\"haar\"):\n        assert hash_size & (hash_size - 1) == 0, \"Hash size must be a power of 2.\"\n        if image_scale is not None:\n            assert (\n                image_scale & (image_scale - 1) == 0\n            ), \"Image scale must be a power of 2.\"\n            assert (\n                image_scale >= hash_size\n            ), \"Image scale must be greater than or equal to than hash size.\"\n        self.hash_size = hash_size\n        self.image_scale = image_scale\n        self.mode = mode\n        self.hash_length = hash_size * hash_size\n\n    def _compute(self, image):\n        if self.image_scale is None:\n            image_scale = max(2 ** int(np.log2(min(image.shape[:2]))), self.hash_size)\n        else:\n            image_scale = self.image_scale\n        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n        image = cv2.resize(\n            image, dsize=(image_scale, image_scale), interpolation=cv2.INTER_AREA\n        )\n        image = np.float32(image) / 255\n\n        ll_max_level = int(np.log2(image_scale))\n        level = int(np.log2(self.hash_size))\n        dwt_level = ll_max_level - level\n\n        if self.mode == \"haar\":\n            coeffs = pywt.wavedec2(image, \"haar\", level=ll_max_level)\n            coeffs = list(coeffs)\n            coeffs[0] *= 0\n            image = pywt.waverec2(coeffs, \"haar\")\n\n        coeffs = pywt.wavedec2(image, self.mode, level=dwt_level)\n        dwt_low = coeffs[0]\n\n        # Subtract median and compute hash\n        med = np.median(dwt_low)\n        diff = dwt_low > med\n\n        return diff.flatten()\n"
  },
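  {
    "path": "examples/wavelet_example.py",
    "content": "\"\"\"Hypothetical sketch; this example file is not part of the package.\nWaveletHash thresholds the low-frequency wavelet coefficients at their\nmedian, analogous to PHash's treatment of the DCT, giving a\nhash_size * hash_size bit boolean hash.\"\"\"\nimport numpy as np\n\nfrom perception.hashers.image.wavelet import WaveletHash\n\nrng = np.random.default_rng(1)\nimage = np.ascontiguousarray(rng.integers(0, 255, (256, 256, 3), dtype=np.uint8))\n\nhasher = WaveletHash(hash_size=8, mode=\"haar\")\n# 64 bits pack into 8 bytes, i.e. a 16-character hex string.\nprint(hasher.compute(image, hash_format=\"hex\"))\n"
  },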
  {
    "path": "perception/hashers/tools.py",
    "content": "import base64\nimport fractions\nimport functools\nimport hashlib\nimport io\nimport itertools\nimport json\nimport logging\nimport math\nimport os\nimport queue\nimport shlex\nimport subprocess\nimport tempfile\nimport threading\nimport typing\nimport warnings\nfrom collections import Counter\nfrom http import client\nfrom numbers import Number\nfrom urllib import request\n\nimport cv2\nimport numpy as np\nimport PIL\nimport PIL.Image\nimport validators\n\nLOGGER = logging.getLogger(__name__)\n\nImageInputType = typing.Union[\n    str, np.ndarray, \"PIL.Image.Image\", io.BytesIO, tempfile.SpooledTemporaryFile\n]\n\nSIZES = {\"float32\": 32, \"uint8\": 8, \"bool\": 1}\n\n# Map codec names to the CUDA-accelerated version. Obtain\n# from ffmpeg -codecs after building using CUDA.\nCUDA_CODECS = {\n    \"h264\": \"h264_cuvid\",\n    \"hevc\": \"hevc_cuvid\",\n    \"mjpeg\": \"mjpeg_cuvid\",\n    \"mpeg1video\": \"mpeg1_cuvid\",\n    \"mpeg2video\": \"mpeg2_cuvid\",\n    \"mpeg4\": \"mpeg4_cuvid\",\n    \"vc1\": \"vc1_cuvid\",\n    \"vp8\": \"vp8_cuvid\",\n    \"vp9\": \"vp9_cuvid\",\n}\n\nFramesWithIndexesAndTimestamps = typing.Generator[\n    tuple[np.ndarray, int | None, float | None], None, None\n]\n\n\ndef get_ffprobe():\n    return os.environ.get(\"PERCEPTION_FFPROBE_BINARY\", \"ffprobe\")\n\n\ndef get_ffmpeg():\n    return os.environ.get(\"PERCEPTION_FFMPEG_BINARY\", \"ffmpeg\")\n\n\ndef compute_quality(image) -> int:\n    \"\"\"Compute a quality metric, using the calculation proposed by\n    `Facebook <https://github.com/facebook/ThreatExchange/blob/master/hashing/hashing.pdf/>`_\n    for their PDQ hash algorithm.\"\"\"\n    if len(image.shape) == 3:\n        image = cv2.cvtColor(image, code=cv2.COLOR_RGB2GRAY)\n    if image.shape[0] != 64 or image.shape[1] != 64:\n        image = cv2.resize(src=image, dsize=(64, 64)).astype(\"float32\")\n    dx = 100 * np.abs(image[:, 1:] - image[:, :-1]) / 255\n    dy = 100 * np.abs(image[1:] - image[:-1]) / 255\n    dx = dx.astype(\"int\").sum()\n    dy = dy.astype(\"int\").sum()\n    return int(np.clip(a=int((dx + dy) / 90), a_min=0, a_max=100))\n\n\ndef compute_md5(filepath) -> str:\n    \"\"\"Compute the md5 hash for a file at `filepath`.\n\n    Args:\n        filepath: The path to the file\n    \"\"\"\n    with open(filepath, \"rb\") as f:\n        hash_str = hashlib.md5(f.read()).hexdigest()\n    return hash_str\n\n\ndef get_string_length(hash_length: int, dtype: str, hash_format=\"hex\") -> int:\n    \"\"\"Compute the expected length of a hash string.\n\n    Args:\n        hash_length: The length of the hash vector\n        dtype: The dtype of the vector\n        hash_format: One of 'base64' or 'hex'\n\n    Returns:\n        The expected string length\n    \"\"\"\n    hash_bytes = math.ceil(hash_length * SIZES[dtype] / 8)\n\n    if hash_format == \"base64\":\n        return int((4 * hash_bytes / 3) + 3) & ~3\n    if hash_format == \"hex\":\n        return 2 * hash_bytes\n    raise NotImplementedError(\"Unknown hash format: \" + hash_format)\n\n\ndef vector_to_string(vector: np.ndarray, dtype: str, hash_format: str) -> str | None:\n    \"\"\"Convert vector to hash.\n\n    Args:\n        vector: Input vector\n    \"\"\"\n    # At times, a vector returned by a hasher is None (e.g., for hashes\n    # that depend on the image not being featureless). 
In those cases,\n    # we return None so that downstream code can handle the missing hash\n    # explicitly rather than receiving a misleading string.\n    if vector is None:\n        return None\n    if hash_format == \"vector\":\n        # return vector.astype(dtype)  # old behavior\n        raise DeprecationWarning(\"`hash_format` `vector` has been removed.\")\n    if dtype == \"uint8\":\n        vector_bytes = vector.astype(\"uint8\")\n    elif dtype == \"float32\":\n        vector_bytes = vector.astype(\"float32\")\n    elif dtype == \"bool\":\n        vector_bytes = np.packbits(vector.astype(\"bool\"))\n    else:\n        raise NotImplementedError(f\"Cannot convert hash of type {dtype}.\")\n    if hash_format == \"base64\":\n        return base64.b64encode(vector_bytes.tobytes()).decode(\"utf-8\")\n    if hash_format == \"hex\":\n        return vector_bytes.tobytes().hex()\n    raise NotImplementedError(f\"Cannot convert to string format: {hash_format}.\")\n\n\ndef string_to_vector(\n    hash_string: str,\n    dtype: str,\n    hash_length: int,\n    hash_format: str,\n    verify_length: bool = True,\n) -> np.ndarray:\n    \"\"\"Convert hash back to vector.\n\n    Args:\n        hash_string: The input hash string\n        dtype: The data type of the hash\n        hash_length: The length of the hash vector\n        hash_format: The input format of the hash (base64 or hex)\n        verify_length: Whether to verify the string length\n    \"\"\"\n    assert not verify_length or len(hash_string) == get_string_length(\n        hash_length=hash_length, hash_format=hash_format, dtype=dtype\n    ), \"Incorrect string length for this hash format.\"\n    if hash_format == \"base64\":\n        vector_bytes = np.frombuffer(\n            base64.b64decode(hash_string),\n            dtype=\"uint8\" if dtype in [\"bool\", \"uint8\"] else dtype,\n        )\n    elif hash_format == \"hex\":\n        vector_bytes = np.frombuffer(\n            bytearray.fromhex(hash_string),\n            dtype=\"uint8\" if dtype in [\"bool\", \"uint8\"] else dtype,\n        )\n    else:\n        raise NotImplementedError(f\"Cannot convert to string format: {hash_format}\")\n    if dtype == \"uint8\":\n        return vector_bytes[:hash_length]\n    if dtype == \"float32\":\n        return vector_bytes[:hash_length]\n    if dtype == \"bool\":\n        return np.unpackbits(vector_bytes)[:hash_length].astype(\"bool\")\n    raise NotImplementedError(f\"Cannot convert hash of type {dtype}.\")\n\n\ndef hex_to_b64(\n    hash_string: str, dtype: str, hash_length: int, verify_length: bool = True\n):\n    \"\"\"Convert a hex-encoded hash to base64.\n\n    Args:\n        hash_string: The input hex hash string\n        dtype: The data type of the hash\n        hash_length: The length of the hash vector\n        verify_length: Whether to verify the string length\n    \"\"\"\n    return vector_to_string(\n        string_to_vector(\n            hash_string,\n            hash_length=hash_length,\n            hash_format=\"hex\",\n            dtype=dtype,\n            verify_length=verify_length,\n        ),\n        dtype=dtype,\n        hash_format=\"base64\",\n    )\n\n\ndef b64_to_hex(\n    hash_string: str, dtype: str, hash_length: int, verify_length: bool = True\n):\n    \"\"\"Convert a base64-encoded hash to hex.\n\n    Args:\n        hash_string: The input base64 hash string\n        dtype: The data type of the hash\n        hash_length: The length of the hash vector\n        verify_length: Whether to verify the string 
length\n    \"\"\"\n    return vector_to_string(\n        string_to_vector(\n            hash_string,\n            hash_length=hash_length,\n            hash_format=\"base64\",\n            dtype=dtype,\n            verify_length=verify_length,\n        ),\n        dtype=dtype,\n        hash_format=\"hex\",\n    )\n\n\ndef to_image_array(image: ImageInputType, require_color=True) -> np.ndarray:\n    if isinstance(image, np.ndarray):\n        assert image.flags[\"C_CONTIGUOUS\"], (\n            \"Provided arrays must be contiguous to avoid \"\n            \"erroneous results when arrays are passed to \"\n            \"underlying libraries. This can be achieved using \"\n            \"np.ascontiguousarray(image).\"\n        )\n        assert not require_color or (\n            len(image.shape) == 3 and image.shape[-1] == 3\n        ), \"Provided images must be RGB images.\"\n        return image\n    return read(image)\n\n\ndef get_common_framerates(id_rates: dict):\n    \"\"\"Compute an optimal set of framerates for a list\n    of framerates. Optimal here means that reading the video\n    at each of the framerates will allow one to collect all\n    of the frames required with the smallest possible number of\n    frames decoded.\n\n    For example, consider if we need to read a video at\n    3 fps, 5 fps, 1 fps and 0.5 fps. We could read the video\n    4 times (once per framerate). But a more efficient approach\n    is to read the video only twice, once at 3 frames per second\n    and another time at 5 frames per second. For the 1 fps hasher,\n    we simply pass every 3rd frame of the 3 fps pass. For the\n    0.5 fps hasher, we pass every 6th frame of the 3 fps pass. So\n    if you pass this function {A: 3, B: 5, C: 1, D: 0.5}, you will\n    get back {3: [A, C, D], 5: [B]}.\n\n    Args:\n        id_rates: A dictionary with IDs as keys and frame rates as values.\n\n    Returns:\n        rate_ids: A dictionary with framerates as keys and a tuple of\n            ids as values.\n    \"\"\"\n\n    def partition(collection):\n        \"\"\"This function is taken from\n        https://stackoverflow.com/questions/19368375/set-partitions-in-python/30134039#30134039\n        \"\"\"\n        if len(collection) == 1:\n            yield [collection]\n            return\n\n        first = collection[0]\n        for smaller in partition(collection[1:]):\n            # insert `first` in each of the subpartition's subsets\n            for n, subset in enumerate(smaller):\n                yield smaller[:n] + [[first] + subset] + smaller[n + 1 :]\n            # put `first` in its own subset\n            yield [[first]] + smaller\n\n    framerates = list(id_rates.values())\n    factor = 2 * 3 * 5 * 7 * 11 * 60 * 60\n    assert (\n        min(framerates) >= 1 / factor\n    ), \"Framerates must be at least 1 frame per hour.\"\n    best_frame_count = np.inf\n    best_grouping: list | None = None\n    best_frame_rates: list | None = None\n\n    # We try every possible grouping of framerates to minimize the number\n    # of frames we decode. 
There is likely a better way to do this,\n    # but this seems to do the job for now.\n    for grouping in partition(list(set(framerates))):\n        current_frame_rates = [\n            functools.reduce(np.lcm, (np.array(group) * factor).round().astype(int))\n            / factor\n            for group in grouping\n        ]\n        current_frame_count = sum(current_frame_rates)\n        if current_frame_count < best_frame_count:\n            best_frame_count = current_frame_count\n            best_frame_rates = current_frame_rates\n            best_grouping = grouping\n\n    assert best_frame_rates is not None\n    assert best_grouping is not None\n    return {\n        framerate: tuple(name for name, rate in id_rates.items() if rate in group)\n        for framerate, group in zip(best_frame_rates, best_grouping)\n    }\n\n\ndef get_isometric_transforms(image: ImageInputType, require_color=True) -> dict:\n    image_array = to_image_array(image, require_color=require_color)\n    return {\n        \"r0\": image_array,\n        \"fv\": np.ascontiguousarray(image_array[::-1, :]),\n        \"fh\": np.ascontiguousarray(image_array[:, ::-1]),\n        \"r180\": np.ascontiguousarray(image_array[::-1, ::-1]),\n        \"r90\": np.ascontiguousarray(image_array.transpose(1, 0, 2)[::-1, :, :]),\n        \"r90fv\": np.ascontiguousarray(image_array.transpose(1, 0, 2)),\n        \"r90fh\": np.ascontiguousarray(image_array.transpose(1, 0, 2)[::-1, ::-1]),\n        \"r270\": np.ascontiguousarray(image_array.transpose(1, 0, 2)[:, ::-1]),\n    }\n\n\ndef get_isometric_dct_transforms(dct: np.ndarray):\n    T1 = np.empty_like(dct)\n    T1[::2] = 1\n    T1[1::2] = -1\n\n    T2 = np.empty_like(dct)\n    T2[::2, ::2] = 1\n    T2[1::2, 1::2] = 1\n    T2[::2, 1::2] = -1\n    T2[1::2, ::2] = -1\n    return {\n        \"r0\": dct,\n        \"fv\": dct * T1,\n        \"fh\": dct * T1.T,\n        \"r180\": dct * T2,\n        \"r90\": dct.T * T1,\n        \"r90fv\": dct.T,\n        \"r90fh\": dct.T * T2,\n        \"r270\": dct.T * T1.T,\n    }\n\n\ndef read(filepath_or_buffer: ImageInputType, timeout=None) -> np.ndarray:\n    \"\"\"Read a file into an image object\n\n    Args:\n        filepath_or_buffer: The path to the file or any object\n            with a `read` method (such as `io.BytesIO`)\n        timeout: If filepath_or_buffer is a URL, the timeout to\n            use for making the HTTP request.\n    \"\"\"\n    if isinstance(filepath_or_buffer, PIL.Image.Image):\n        return np.array(filepath_or_buffer.convert(\"RGB\"))\n    if isinstance(\n        filepath_or_buffer,\n        (io.BytesIO, client.HTTPResponse, tempfile.SpooledTemporaryFile),\n    ):\n        image = np.asarray(bytearray(filepath_or_buffer.read()), dtype=np.uint8)\n        decoded_image = cv2.imdecode(image, cv2.IMREAD_UNCHANGED)\n    elif isinstance(filepath_or_buffer, str):\n        if validators.url(filepath_or_buffer):\n            with request.urlopen(filepath_or_buffer, timeout=timeout) as response:\n                return read(response)\n        if not os.path.isfile(filepath_or_buffer):\n            raise FileNotFoundError(\n                \"Could not find image at path: \" + filepath_or_buffer\n            )\n        decoded_image = cv2.imread(filepath_or_buffer)\n    else:\n        raise RuntimeError(\n            \"Unhandled filepath_or_buffer type: \" + str(type(filepath_or_buffer))\n        )\n    if decoded_image is None:\n        raise ValueError(f\"An error occurred reading {filepath_or_buffer}.\")\n    # We use cvtColor 
here instead of just ret[..., ::-1]\n    # in order to ensure that we provide a contiguous\n    # array for later processing. Some hashers use ctypes\n    # to pass the array and non-contiguous arrays can lead\n    # to erroneous results.\n    return cv2.cvtColor(decoded_image, cv2.COLOR_BGR2RGB)\n\n\ndef _get_keyframes(filepath):\n    \"\"\"Get the keyframes for a video.\n\n    Args:\n        filepath: Path to the target file\n\n    Returns:\n        A list of frame indexes.\n    \"\"\"\n    # Args are passed to Popen as a list (no shell), so the filepath\n    # must not be wrapped in quotes.\n    args = [\n        get_ffprobe(),\n        \"-select_streams\",\n        \"v\",\n        \"-i\",\n        filepath,\n        \"-print_format\",\n        \"json\",\n        \"-show_entries\",\n        \"frame=pict_type,coded_picture_number\",\n    ]\n    with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:\n        out, err = p.communicate()\n        if p.returncode != 0:\n            raise ValueError(f\"{str(out)}: {str(err)}\")\n        data = json.loads(out.decode(\"utf-8\"))[\"frames\"]\n        frames = [f[\"coded_picture_number\"] for f in data if f[\"pict_type\"] == \"I\"]\n        # ffprobe will return frames repeated and out of order at times. This\n        # last step deduplicates and sorts them.\n        frames = list(set(frames))\n        frames.sort()\n    return frames\n\n\ndef get_video_properties(filepath):\n    cmd = f\"\"\"\n    {get_ffprobe()} -select_streams v:0 -i '{filepath}'\n    -print_format json -show_entries stream=width,height,avg_frame_rate,codec_name,start_time\n    \"\"\"\n    with subprocess.Popen(\n        shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE\n    ) as p:\n        out, err = p.communicate()\n        if p.returncode != 0:\n            raise ValueError(f\"{str(out)}: {str(err)}\")\n        data = json.loads(out.decode(\"utf-8\"))[\"streams\"][0]\n        numerator, denominator = tuple(map(int, data[\"avg_frame_rate\"].split(\"/\")[:2]))\n        avg_frame_rate: fractions.Fraction | None\n        if numerator > 0 and denominator > 0:\n            avg_frame_rate = fractions.Fraction(\n                numerator=numerator, denominator=denominator\n            )\n        else:\n            avg_frame_rate = None\n        return (\n            data[\"width\"],\n            data[\"height\"],\n            avg_frame_rate,\n            data[\"codec_name\"],\n            float(data.get(\"start_time\", \"0\")),\n        )\n\n\ndef read_video_to_generator_ffmpeg(\n    filepath,\n    frames_per_second: str | float | None = None,\n    errors=\"raise\",\n    max_duration: float | None = None,\n    max_size: int | None = None,\n    interp: str | None = None,\n    frame_rounding: str = \"up\",\n    draw_timestamps=False,\n    use_cuda=False,\n) -> FramesWithIndexesAndTimestamps:\n    \"\"\"This is used by :code:`read_video` when :code:`use_ffmpeg` is True. It\n    differs from :code:`read_video_to_generator` in that it uses FFMPEG instead of\n    OpenCV and, optionally, allows for CUDA acceleration. CUDA acceleration\n    can be faster for larger videos (>1080p) where downsampling is desired.\n    For other videos, CUDA may be slower, but the decoding load will still be\n    taken off the CPU, which may still be advantageous. 
You can specify which\n    FFMPEG binary to use by setting PERCEPTION_FFMPEG_BINARY.\n\n    Args:\n        filepath: See read_video\n        frames_per_second: See read_video\n        errors: See read_video\n        max_duration: See read_video\n        max_size: See read_video\n        interp: The interpolation method to use. When not using CUDA, you must choose one\n            of the `interpolation options <https://ffmpeg.org/ffmpeg-scaler.html#sws_005fflags>`_\n            (default: area). When using CUDA, you must choose from the\n            `interp_algo options <http://underpop.online.fr/f/ffmpeg/help/scale_005fnpp.htm.gz>`_\n            (default: super).\n        frame_rounding: The frame rounding method.\n        draw_timestamps: Draw original timestamps onto the frames (for debugging only)\n        use_cuda: Whether to enable CUDA acceleration. Requires a\n            CUDA-accelerated version of ffmpeg.\n\n    To build FFMPEG with CUDA, do the following in a Docker\n    container based on nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04. The\n    FFMPEG binary will be ffmpeg/ffmpeg.\n\n    .. code-block:: bash\n\n        git clone https://git.videolan.org/git/ffmpeg/nv-codec-headers.git\n        cd nv-codec-headers\n        make\n        sudo make install\n        cd ..\n        git clone https://git.ffmpeg.org/ffmpeg.git\n        cd ffmpeg\n        sudo apt-get update && sudo apt-get -y install yasm\n        export PATH=$PATH:/usr/local/cuda/bin\n        # Note: Scroll far right to see full configure command:\n        ./configure --enable-cuda-nvcc --enable-cuvid --enable-nvenc --enable-nvdec \\\n                    --enable-libnpp --enable-nonfree --extra-cflags=-I/usr/local/cuda/include \\\n                    --extra-ldflags=-L/usr/local/cuda/lib64\n        make -j 10\n        sudo make install\n\n    Returns:\n        See :code:`read_video`\n    \"\"\"\n    if interp is None:\n        interp = \"super\" if use_cuda else \"area\"\n    try:\n        (\n            raw_width,\n            raw_height,\n            avg_frame_rate,\n            codec_name,\n            start_time,\n        ) = get_video_properties(filepath)\n        start_time_offset = (\n            0.0 if avg_frame_rate is None else float(1 / (2 * avg_frame_rate))\n        )\n        LOGGER.debug(\n            \"raw_width: %s, raw_height: %s, avg_frame_rate: %s, codec_name: %s, start_time: %s\",\n            raw_width,\n            raw_height,\n            avg_frame_rate,\n            codec_name,\n            start_time,\n        )\n        channels = 3\n        scale = (\n            min(max_size / raw_width, max_size / raw_height, 1)\n            if max_size is not None\n            else 1\n        )\n        width, height = map(lambda d: int(round(scale * d)), [raw_width, raw_height])\n        # If there is no average frame rate, the offset tends to be unreliable.\n        offset = max(start_time, start_time_offset) if avg_frame_rate is not None else 0\n        cmd = (\n            f\"{get_ffmpeg()} -hide_banner -an -vsync 0 -loglevel fatal \"\n            f\"-itsoffset -{offset}\"\n        )\n        filters = []\n        if draw_timestamps:\n            pattern = \"%{pts}-%{frame_num}\"\n            filters.append(\n                f\"drawtext=fontsize={int(raw_height * 0.1)}:\"\n                f\"fontcolor=yellow:text={pattern}\"\n                \":x=(w-text_w):y=(h-text_h)\"\n            )\n        # Add frame rate filters.\n        if frames_per_second is None:\n            seconds_per_frame = (\n             
   float(1 / avg_frame_rate) if avg_frame_rate is not None else None\n            )\n        elif frames_per_second == \"keyframes\":\n            seconds_per_frame = None\n            filters.append(r\"select=eq(pict_type\\,I)\")\n        else:\n            assert isinstance(\n                frames_per_second, (float, int)\n            ), f\"Invalid framerate: {frames_per_second}\"\n            seconds_per_frame = 1 / frames_per_second\n            filters.append(\n                f\"fps={frames_per_second}:round={frame_rounding}:start_time={offset}\"\n            )\n        # Add resizing filters.\n        if use_cuda and codec_name in CUDA_CODECS:\n            cuda_codec = CUDA_CODECS[codec_name]\n            cmd += f\" -hwaccel cuda -c:v {cuda_codec}\"\n            filters.append(\"hwupload_cuda\")\n            if scale != 1:\n                filters.append(f\"scale_npp={width}:{height}:interp_algo={interp}\")\n            filters.extend(\n                [\n                    \"hwdownload\",\n                    \"format=nv12\",\n                ]\n            )\n        elif scale != 1:\n            filters.append(f\"scale={width}:{height}:flags={interp}\")\n        cmd += f\" -i '{filepath}'\"\n        if filters:\n            cmd += f\" -vf '{','.join(filters)}'\"\n        cmd += \" -pix_fmt rgb24 -f image2pipe -vcodec rawvideo -\"\n        LOGGER.debug(\"running ffmpeg with: %s\", cmd)\n        framebytes = width * height * channels\n        bufsize = framebytes * int(os.environ.get(\"PERCEPTION_FFMPEG_BUFSIZE\", \"5\"))\n        with subprocess.Popen(\n            shlex.split(cmd),\n            stdout=subprocess.PIPE,\n            stderr=subprocess.PIPE,\n            bufsize=bufsize,\n        ) as p:\n            assert p.stdout is not None, \"Could not launch subprocess pipe.\"\n            timestamp: float | None = 0\n            frame_index: int | None = 0\n            while True:\n                batch = p.stdout.read(bufsize)\n                if not batch:\n                    break\n                for image in np.frombuffer(batch, dtype=\"uint8\").reshape(\n                    (\n                        -1,\n                        height,\n                        width,\n                        channels,\n                    )\n                ):\n                    if frames_per_second != \"keyframes\":\n                        yield (image, frame_index, timestamp)\n                        if seconds_per_frame is not None:\n                            assert timestamp is not None\n                            timestamp += seconds_per_frame\n                            frame_index = (\n                                math.ceil(avg_frame_rate * timestamp)\n                                if avg_frame_rate is not None\n                                else None\n                            )\n                        else:\n                            timestamp = None\n                            frame_index = None\n                    else:\n                        # Obtaining the keyframe indexes with ffprobe is very slow (slower\n                        # than reading the video sometimes). We don't *have* to do it\n                        # when using ffmpeg, so we don't. 
The OpenCV approach *does*\n                        # get the keyframe indexes, but only because they're required\n                        # in order to select them.\n                        yield (image, None, None)\n                    if (\n                        max_duration is not None\n                        and timestamp is not None\n                        and timestamp > max_duration\n                    ):\n                        break\n            stdout, stderr = p.communicate()\n            if p.returncode != 0:\n                raise ValueError(\n                    f\"Error parsing video: {stdout.decode('utf-8')} {stderr.decode('utf-8')}\"\n                )\n    except Exception as e:\n        if errors not in [\"warn\", \"ignore\"]:\n            raise e\n        if errors == \"warn\":\n            warnings.warn(\n                message=f\"An error occurred while reading {filepath}. Processing may be truncated.\"\n            )\n\n\ndef read_video_to_generator(\n    filepath,\n    frames_per_second: str | float | None = None,\n    errors=\"raise\",\n    max_duration: float | None = None,\n    max_size: int | None = None,\n) -> FramesWithIndexesAndTimestamps:\n    \"\"\"This is used by :code:`read_video` when :code:`use_ffmpeg` is False (default).\n\n    Args:\n        filepath: See :code:`read_video`.\n        frames_per_second: See :code:`read_video`.\n        errors: See :code:`read_video`.\n        max_duration: See :code:`read_video`.\n        max_size: See :code:`read_video`.\n\n    Returns:\n        See :code:`read_video`.\n    \"\"\"\n    if cv2.__version__ < \"4.1.1\" and filepath.lower().endswith(\"gif\"):\n        message = \"Versions of OpenCV < 4.1.1 may read GIF files improperly. Upgrade recommended.\"\n        if errors == \"raise\":\n            raise ValueError(message)\n        warnings.warn(message=message)\n\n    if not os.path.isfile(filepath):\n        raise FileNotFoundError(f\"Could not find {filepath}.\")\n    if not os.access(filepath, os.R_OK):\n        raise OSError(f\"{filepath} is not readable\")\n    cap = cv2.VideoCapture(filename=filepath, apiPreference=cv2.CAP_FFMPEG)\n    try:\n        # The purpose of the following block is largely to create a\n        # frame_indexes (iterator or list) that indicates which\n        # frames we should be returning to the user and then\n        # yielding those frames as we come across them.\n        file_frames_per_second = cap.get(cv2.CAP_PROP_FPS)\n        if file_frames_per_second == 0:\n            if errors == \"raise\":\n                raise ValueError(\"Video file has framerate of 0fps.\")\n            # The known case where this occurs is for GIFs, where\n            # 0 fps is typically inferred as 10 fps.\n            file_frames_per_second = 10\n            if errors == \"warn\":\n                warnings.warn(\n                    message=\"Video file has framerate of 0 fps. 
Guessing framerate of 10fps.\"\n                )\n        if frames_per_second is None:\n            frames_per_second = file_frames_per_second\n        seconds_between_desired_frames = (\n            None\n            if (frames_per_second is not None and isinstance(frames_per_second, str))\n            else 1 / frames_per_second  # type: ignore\n        )\n        seconds_between_grabbed_frames = 1 / file_frames_per_second\n        grabbed_frame_count = 0\n        if frames_per_second == \"keyframes\":\n            frame_indexes: range | list[int] | typing.Iterator[int] = _get_keyframes(\n                filepath\n            )\n            # The repeat flag is used to handle the case where the\n            # desired sampling rate is higher than the file's frame\n            # rate. In this case, we will need to repeat frames in\n            # order to provide the least-surprising behavior that\n            # we can.\n            repeat = False\n        else:\n            num_frames_per_second = float(frames_per_second)\n            frame_indexes = itertools.count(\n                0, max(1, file_frames_per_second / num_frames_per_second)\n            )\n            repeat = file_frames_per_second < num_frames_per_second\n        input_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)\n        input_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)\n        if max_size is not None:\n            scale = min(max_size / max(input_width, input_height), 1)\n        else:\n            scale = 1\n        target_size: tuple[int, int] | None\n        if scale < 1:\n            target_size = (int(scale * input_width), int(scale * input_height))\n        else:\n            target_size = None\n        for frame_index in frame_indexes:\n            while grabbed_frame_count < frame_index:\n                # We need to skip this frame.\n                success = cap.grab()\n                if not success:\n                    break\n                grabbed_frame_count += 1\n            success, frame = cap.read()\n            grabbed_frame_count += 1\n            if not success:\n                # The video is over or an error has occurred.\n                break\n            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n            if target_size is not None:\n                frame = cv2.resize(frame, target_size, interpolation=cv2.INTER_NEAREST)\n            current_timestamp = frame_index / file_frames_per_second\n            yield frame, grabbed_frame_count - 1, current_timestamp\n            if max_duration is not None and current_timestamp > max_duration:\n                break\n            if repeat and isinstance(seconds_between_desired_frames, Number):\n                next_desired_timestamp = (\n                    current_timestamp + seconds_between_desired_frames\n                )\n                next_timestamp = current_timestamp + seconds_between_grabbed_frames\n                while next_desired_timestamp < next_timestamp:\n                    yield (frame, grabbed_frame_count - 1, next_desired_timestamp)\n                    next_desired_timestamp += seconds_between_desired_frames\n    except Exception as e:\n        if errors not in [\"warn\", \"ignore\"]:\n            raise e\n        if errors == \"warn\":\n            warnings.warn(\n                message=f\"An error occurred while reading {filepath}. 
Processing may be truncated.\"\n            )\n    finally:\n        cap.release()\n\n\ndef read_video_into_queue(*args, video_queue, terminate, func, **kwargs):\n    # We're inside a thread now and the queue is being read elsewhere.\n    try:\n        for frame, frame_index, timestamp in func(*args, **kwargs):\n            if not terminate.is_set():\n                video_queue.put((frame, frame_index, timestamp))\n            else:\n                break\n    finally:\n        video_queue.put((None, None, None))\n\n\ndef read_video(\n    filepath,\n    frames_per_second: str | float | None = None,\n    max_queue_size=128,\n    use_queue=True,\n    errors=\"raise\",\n    use_ffmpeg=False,\n    **kwargs,\n) -> FramesWithIndexesAndTimestamps:\n    \"\"\"Provides a generator of RGB frames, frame indexes, and timestamps from a\n    video. This function requires you to have installed ffmpeg. All other\n    arguments are passed to read_video_to_generator.\n\n    Args:\n        filepath: Path to the video file\n        frames_per_second: How many frames to provide for\n            each second of video. If None, all frames\n            are provided. If frames_per_second is \"keyframes\",\n            we use ffmpeg to select I frames from the video.\n        max_queue_size: The maximum number of frames to load in the queue\n        use_queue: Whether to use a queue of frames during processing\n        max_duration: The maximum length of the video to hash.\n        max_size: The maximum size of frames to queue\n        errors: Whether to 'raise', 'warn', or 'ignore' errors\n        use_ffmpeg: Whether to use the FFMPEG CLI to read videos. If True, other\n            kwargs (e.g., :code:`use_cuda`) are passed to\n            :code:`read_video_to_generator_ffmpeg`.\n\n    Yields:\n        (frame, frame_index, timestamp) tuples\n    \"\"\"\n    for ffmpeg_kwarg in [\"interp\", \"frame_rounding\", \"draw_timestamps\", \"use_cuda\"]:\n        if not use_ffmpeg and ffmpeg_kwarg in kwargs:\n            if kwargs[ffmpeg_kwarg] is not None:\n                # Only log a warning if the value is something other than None.\n                warnings.warn(\n                    f\"{ffmpeg_kwarg} is ignored when use_ffmpeg is False.\", UserWarning\n                )\n            del kwargs[ffmpeg_kwarg]\n    generator: typing.Callable[..., FramesWithIndexesAndTimestamps]\n    if use_ffmpeg:\n        generator = read_video_to_generator_ffmpeg\n    else:\n        generator = read_video_to_generator\n    frame_index: int | None\n    timestamp: float | None\n    if use_queue:\n        video_queue: queue.Queue[tuple[np.ndarray, int, float]] = queue.Queue(\n            maxsize=max_queue_size\n        )\n        terminate = threading.Event()\n        thread = threading.Thread(\n            target=read_video_into_queue,\n            kwargs={\n                \"frames_per_second\": frames_per_second,\n                \"func\": generator,\n                \"video_queue\": video_queue,\n                \"filepath\": filepath,\n                \"errors\": errors,\n                \"terminate\": terminate,\n                **kwargs,\n            },\n        )\n        thread.start()\n        try:\n            while True:\n                frame, frame_index, timestamp = video_queue.get()\n                video_queue.task_done()\n                if frame is None:\n                    break\n                yield (frame, frame_index, timestamp)\n        finally:\n            # Set the termination flag for the\n            # 
background thread.\n            terminate.set()\n            try:\n                # Unblock the thread, in the event\n                # that it is waiting.\n                video_queue.get_nowait()\n\n                # Do it twice for the edge case\n                # where the queue is completely\n                # full and the end sentinel is\n                # blocking.\n                video_queue.get_nowait()\n            except queue.Empty:\n                # It doesn't matter if it's empty.\n                pass\n            # Wait for the background thread to terminate.\n            thread.join()\n    else:\n        for frame, frame_index, timestamp in generator(\n            filepath=filepath,\n            frames_per_second=frames_per_second,\n            errors=errors,\n            **kwargs,\n        ):\n            yield (frame, frame_index, timestamp)\n\n\ndef compute_synchronized_video_hashes(\n    filepath: str, hashers: dict, framerates=None, hash_format=\"base64\", use_queue=True\n):\n    \"\"\"Compute the video hashes for a group of hashers with synchronized\n    frame processing wherever possible.\n\n    Args:\n        filepath: Path to video file.\n        hashers: A dictionary mapping hasher names to video hasher objects\n        framerates: An optional mapping of framerates to tuples of hasher\n            names, as returned by get_common_framerates. If None, it is\n            computed from the hashers automatically.\n        hash_format: The format in which to return the hashes\n        use_queue: Whether to use queued video frames\n    \"\"\"\n    if framerates is None:\n        framerates = get_common_framerates(\n            {\n                k: h.frames_per_second\n                for k, h in hashers.items()\n                if h.frames_per_second is not None\n            }\n        )\n    else:\n        assert all(\n            any(hasher_name in hasher_names for hasher_names in framerates.values())\n            for hasher_name, hasher in hashers.items()\n            if hasher.frames_per_second is not None\n        ), \"Provided framerates do not have an entry for all required hashers.\"\n\n    results = {\n        hasher_name: {\n            \"state\": None,\n            \"hash\": None,\n            \"relative_framerate\": next(\n                framerate / hasher.frames_per_second\n                for framerate, hasher_names in framerates.items()\n                if hasher_name in hasher_names\n            ),\n        }\n        for hasher_name, hasher in hashers.items()\n        if hasher.frames_per_second is not None\n    }\n    for current_framerate, current_hasher_names in framerates.items():\n        for frame_index, (frame, grabbed_frame_index, frame_timestamp) in enumerate(\n            read_video(\n                filepath=filepath,\n                frames_per_second=current_framerate,\n                use_queue=use_queue,\n            )\n        ):\n            for hasher_name in current_hasher_names:\n                config = results[hasher_name]\n                hasher = hashers[hasher_name]\n                assert config[\"relative_framerate\"] is not None\n                if frame_index % config[\"relative_framerate\"] == 0:\n                    config[\"state\"] = hasher.process_frame(\n                        frame=frame,\n                        frame_index=grabbed_frame_index,\n                        frame_timestamp=frame_timestamp,\n                        state=config[\"state\"],\n                    )\n        for hasher_name in current_hasher_names:\n            config = results[hasher_name]\n            hasher = hashers[hasher_name]\n            current_hash = hasher.hash_from_final_state(state=config[\"state\"])\n            if 
hash_format == \"vector\":\n                config[\"hash\"] = current_hash\n            else:\n                if not hasher.returns_multiple:\n                    config[\"hash\"] = hasher.vector_to_string(\n                        current_hash, hash_format=hash_format\n                    )\n                else:\n                    config[\"hash\"] = [\n                        hasher.vector_to_string(h, hash_format=hash_format)\n                        for h in current_hash\n                    ]\n            config[\"state\"] = None\n    hashes = {hasher_name: config[\"hash\"] for hasher_name, config in results.items()}\n    for hasher_name, hasher in hashers.items():\n        if hasher.frames_per_second is None:\n            # This is a custom hasher that we just pass a video path to.\n            hashes[hasher_name] = hasher.compute(filepath)\n    return hashes\n\n\ndef unletterbox(\n    image: np.ndarray,\n    only_remove_black: bool = False,\n    min_fraction_meaningful_pixels: float = 0.1,\n    color_threshold: float = 2,\n    min_side_length: int = 50,\n    min_reduction: float = 0.02,\n) -> tuple[tuple[int, int], tuple[int, int]] | None:\n    \"\"\"Return bounds of the non-trivial (content) region of an image, or None.\n\n    Letterboxing refers to uniform-color borders added around an image\n    (e.g., black bars on a video frame). This function detects such borders\n    by identifying the background color from the image corners and finding\n    the bounding box of pixels that differ from that background.\n\n    The function returns bounds as ``(x1, x2), (y1, y2)`` suitable for\n    slicing: ``image[y1:y2, x1:x2]``. The bounds are exclusive on the\n    right/bottom (i.e., x2 and y2 point one past the last content pixel).\n\n    **Algorithm overview:**\n\n    1. Sample the four corner pixels and find the most common value as\n       the candidate background color. If all four corners differ, return\n       ``None`` (no consistent letterbox detected).\n    2. Build a binary content mask where each pixel whose grayscale\n       intensity differs from the background by more than\n       ``color_threshold`` is marked as content.\n    3. Project the mask onto rows and columns and find the first/last\n       row and column where the fraction of content pixels exceeds\n       ``min_fraction_meaningful_pixels``.\n    4. Validate that the resulting crop is meaningfully smaller than the\n       original (controlled by ``min_reduction``) and that both sides\n       exceed ``min_side_length``.\n\n    Returns ``None`` when:\n\n    - No two corners share the same color (no clear background).\n    - Every pixel differs from the detected background (no border).\n    - No row or column meets the content-pixel threshold.\n    - The crop would not remove at least ``min_reduction`` fraction\n      from any dimension.\n    - Either cropped dimension would be smaller than ``min_side_length``.\n\n    Args:\n        image: Input image as an ``np.ndarray``. May be grayscale (H×W)\n            or RGB (H×W×3); RGB images are converted to grayscale\n            internally for background detection.\n        only_remove_black: If ``True``, treat black (intensity 0) as the\n            background regardless of corner colors. 
If ``False`` (default),\n            infer the background color from the most common corner value.\n        min_fraction_meaningful_pixels: The minimum fraction (0–1) of\n            pixels in a row or column that must differ from the background\n            for that row/column to be considered part of the content region.\n            Defaults to 0.1 (10%).\n        color_threshold: The minimum absolute difference in grayscale\n            intensity between a pixel and the background color for that\n            pixel to be classified as content. Defaults to 2.\n        min_side_length: The minimum width or height (in pixels) of the\n            cropped region. If the crop would be smaller, ``None`` is\n            returned. Defaults to 50.\n        min_reduction: The minimum fraction (0–1) of the original width\n            or height that must be removed for the crop to be worthwhile.\n            If the crop removes less than this from both dimensions,\n            the full-image bounds are returned. Defaults to 0.02 (2%).\n\n    Returns:\n        A tuple ``((x1, x2), (y1, y2))`` giving the left, right, top,\n        and bottom bounds of the content region (right/bottom exclusive),\n        or ``None`` if no meaningful letterbox was detected.\n    \"\"\"\n    if not 0 <= min_fraction_meaningful_pixels <= 1:\n        raise ValueError(\"min_fraction_meaningful_pixels must be between 0 and 1\")\n    if not 0 <= min_reduction <= 1:\n        raise ValueError(\"min_reduction must be between 0 and 1\")\n    image = image.astype(np.uint8)\n\n    shape = image.shape\n    h, w = shape[0:2]\n    if len(shape) == 3:\n        image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)\n\n    # Determine background color and build binary content mask.\n    if only_remove_black:\n        bg_gray = 0\n    else:\n        # Sample the four corner pixels. If all four are unique there is no\n        # consistent background color, so we bail out early (O(1) rejection).\n        corners = (\n            image[0, 0],\n            image[0, w - 1],\n            image[h - 1, 0],\n            image[h - 1, w - 1],\n        )\n\n        if len(set(corners)) == 4:\n            LOGGER.debug(\"No common corner color detected, skipping content detection.\")\n            return (\n                (0, w),\n                (0, h),\n            )  # Return full image bounds instead of None to maintain backwards compatibility\n        # Use the most common corner value as the background intensity.\n        counts = Counter(corners)\n        bg_gray = counts.most_common(1)[0][0]\n\n    # Mark pixels whose grayscale intensity differs from the background\n    # by more than color_threshold as content (True).\n    content_mask = np.abs(image.astype(np.int16) - bg_gray) > color_threshold\n\n    # If every pixel is classified as content, there is no border to remove.\n    if content_mask.all():\n        LOGGER.debug(\"All pixels differ from background; no letterbox detected.\")\n        return (\n            (0, w),\n            (0, h),\n        )  # Return full image bounds instead of None to maintain backwards compatibility\n\n    # Find the content bounding box by projecting the mask onto rows and\n    # columns. 
cv2.reduce is used instead of np.sum for performance.\n    mask_u8 = content_mask.astype(np.uint8)\n    row_content = cv2.reduce(mask_u8, 1, cv2.REDUCE_SUM, dtype=cv2.CV_32S).ravel()\n    col_content = cv2.reduce(mask_u8, 0, cv2.REDUCE_SUM, dtype=cv2.CV_32S).ravel()\n\n    # Thresholds for minimum content per row/column\n    row_threshold = min_fraction_meaningful_pixels * w\n    col_threshold = min_fraction_meaningful_pixels * h\n\n    # Find first/last rows and columns with sufficient content\n    content_rows = np.where(row_content > row_threshold)[0]\n    content_cols = np.where(col_content > col_threshold)[0]\n\n    if len(content_rows) == 0 or len(content_cols) == 0:\n        LOGGER.debug(\"No rows or columns with sufficient content detected.\")\n        return None\n\n    top = int(content_rows[0])\n    bottom = int(content_rows[-1]) + 1\n    left = int(content_cols[0])\n    right = int(content_cols[-1]) + 1\n    height = bottom - top\n    width = right - left\n\n    # Reject if the crop does not remove at least min_reduction from\n    # at least one dimension (i.e., the border is negligibly thin).\n    if width >= w * (1 - min_reduction) and height >= h * (1 - min_reduction):\n        LOGGER.debug(\n            \"Crop would not reduce either dimension by %.0f%%; skipping.\",\n            min_reduction * 100,\n        )\n        return (\n            (0, w),\n            (0, h),\n        )  # Return full image bounds instead of None to maintain backwards compatibility\n    # Reject if the remaining content region is too small to be useful.\n    if width < min_side_length or height < min_side_length:\n        LOGGER.debug(\n            \"Cropped region (%dx%d) smaller than min_side_length=%d; skipping.\",\n            width,\n            height,\n            min_side_length,\n        )\n        return None\n\n    return ((left, right), (top, bottom))\n\n\ndef unletterbox_crop(\n    image: np.ndarray,\n    min_fraction_meaningful_pixels: float = 0.1,\n    color_threshold: float = 2,\n    min_side_length: int = 50,\n    min_reduction: float = 0.02,\n) -> np.ndarray | None:\n    \"\"\"Detect and crop the letterboxed regions from an image.\n\n    Args:\n        image: The image from which to remove letterboxing.\n        min_fraction_meaningful_pixels: The minimum fraction (0–1) of\n            pixels in a row or column that must differ from the background\n            for that row/column to be considered part of the content region.\n            Defaults to 0.1 (10%).\n        color_threshold: The minimum absolute difference in grayscale\n            intensity between a pixel and the background color for that\n            pixel to be classified as content. Defaults to 2.\n        min_side_length: The minimum width or height (in pixels) of the\n            cropped region. If the crop would be smaller, ``None`` is\n            returned. Defaults to 50.\n        min_reduction: The minimum fraction (0–1) of the original width\n            or height that must be removed for the crop to be worthwhile.\n            If the crop removes less than this from both dimensions,\n            the original image is returned. 
Defaults to 0.02 (2%).\n    Returns:\n        The cropped image or None if the image is mostly blank space.\n    \"\"\"\n    if not isinstance(image, np.ndarray):\n        raise TypeError(f\"Expected np.ndarray, got {type(image).__name__}\")\n\n    bounds = unletterbox(\n        image,\n        min_fraction_meaningful_pixels=min_fraction_meaningful_pixels,\n        color_threshold=color_threshold,\n        min_side_length=min_side_length,\n        min_reduction=min_reduction,\n    )\n    if bounds is None:\n        return None\n    (x1, x2), (y1, y2) = bounds\n    cropped = np.ascontiguousarray(image[y1:y2, x1:x2])\n    assert cropped.data.contiguous\n    return cropped\n"
  },
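  {
    "path": "examples/tools_example.py",
    "content": "\"\"\"Hypothetical sketch; this example file is not part of the package.\nIt demonstrates three helpers from perception/hashers/tools.py: the hash\nstring round trip, framerate grouping, and letterbox detection.\"\"\"\nimport numpy as np\n\nfrom perception.hashers import tools\n\n# Round-trip a 64-bit boolean hash through hex and back.\nvector = np.random.default_rng(2).integers(0, 2, 64).astype(bool)\nhex_hash = tools.vector_to_string(vector, dtype=\"bool\", hash_format=\"hex\")\nassert hex_hash is not None\nassert len(hex_hash) == tools.get_string_length(64, dtype=\"bool\", hash_format=\"hex\")\nrestored = tools.string_to_vector(hex_hash, dtype=\"bool\", hash_length=64, hash_format=\"hex\")\nassert (restored == vector).all()\n\n# Group hashers by framerate so the video is decoded as few times as\n# possible, e.g. {3.0: ('A', 'C', 'D'), 5.0: ('B',)} per the docstring example.\nprint(tools.get_common_framerates({\"A\": 3, \"B\": 5, \"C\": 1, \"D\": 0.5}))\n\n# unletterbox returns ((x1, x2), (y1, y2)) bounds around the content region.\nframe = np.zeros((200, 200), dtype=np.uint8)\nframe[40:160, 30:170] = 200\nprint(tools.unletterbox(frame))  # ((30, 170), (40, 160))\n"
  },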
  {
    "path": "perception/hashers/video/__init__.py",
    "content": "from .framewise import FramewiseHasher\nfrom .tmk import TMKL1, TMKL2\n\n__all__ = [\"FramewiseHasher\", \"TMKL1\", \"TMKL2\"]\n"
  },
  {
    "path": "perception/hashers/video/framewise.py",
    "content": "import numpy as np\n\nfrom .. import tools\nfrom ..hasher import ImageHasher, VideoHasher\n\n\nclass FramewiseHasher(VideoHasher):\n    \"\"\"A hasher that simply returns frame-wise hashes at some\n    regular interval with some minimum inter-frame distance threshold.\"\"\"\n\n    returns_multiple = True\n\n    def __init__(\n        self,\n        frame_hasher: ImageHasher,\n        interframe_threshold: float,\n        frames_per_second: int = 15,\n        quality_threshold: float | None = None,\n    ):\n        self.hash_length = frame_hasher.hash_length\n        self.frames_per_second = frames_per_second\n        self.frame_hasher = frame_hasher\n        self.distance_metric = frame_hasher.distance_metric\n        if self.distance_metric == \"hamming\" and interframe_threshold > 1:\n            raise ValueError(\n                \"Hamming distance is always between 0 and 1 but \"\n                f\"`interframe_threshold` was set to {interframe_threshold}.\"\n            )\n        self.dtype = frame_hasher.dtype\n        self.interframe_threshold = interframe_threshold\n        self.quality_threshold = quality_threshold\n\n    def process_frame(self, frame, frame_index, frame_timestamp, state=None):\n        if self.quality_threshold is None:\n            current = self.frame_hasher.compute(frame, hash_format=\"vector\")\n        else:\n            current, quality = self.frame_hasher.compute_with_quality(\n                frame, hash_format=\"vector\"\n            )\n            if quality < self.quality_threshold:\n                return state or {\"previous\": None, \"hashes\": []}\n        assert isinstance(current, np.ndarray)  # help type checking below\n        if state is None or state[\"previous\"] is None:\n            # We keep a separate reference to the previous hash instead of using\n            # the last entry in the hashes list because `compute_batches` may\n            # clear the hashes list but we still want to be able to compare\n            # the final entry.\n            state = {\n                \"previous\": current,\n                \"hashes\": [current],\n            }\n        else:\n            if (\n                self.frame_hasher.compute_distance(current, state[\"previous\"])\n                > self.interframe_threshold\n            ):\n                state[\"hashes\"].append(current)\n        return state\n\n    def compute_batches(\n        self, filepath: str, batch_size: int, errors=\"raise\", hash_format=\"base64\"\n    ):\n        \"\"\"Compute hashes for a video in batches.\n\n        Args:\n            filepath: Path to video file\n            batch_size: The batch size to use for returning hashes\n            errors: One of \"raise\", \"ignore\", or \"warn\". 
Passed\n                to perception.hashers.tools.read_video.\n            hash_format: The format in which to return hashes\n        \"\"\"\n\n        def format_batch(hashes):\n            return [\n                (\n                    self.vector_to_string(vector, hash_format=hash_format)\n                    if hash_format != \"vector\"\n                    else vector\n                )\n                for vector in hashes\n            ]\n\n        state = None\n        for frame, frame_index, frame_timestamp in tools.read_video(\n            filepath=filepath, frames_per_second=self.frames_per_second, errors=errors\n        ):\n            state = self.process_frame(\n                frame=frame,\n                frame_index=frame_index,\n                frame_timestamp=frame_timestamp,\n                state=state,\n            )\n            if state is not None and len(state[\"hashes\"]) > batch_size:\n                yield format_batch(state[\"hashes\"])\n                state[\"hashes\"] = []\n        if state is not None and state[\"hashes\"]:\n            yield format_batch(state[\"hashes\"])\n\n    def hash_from_final_state(self, state):\n        if state is None:\n            return []\n        return state[\"hashes\"]\n"
  },
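  {
    "path": "examples/framewise_example.py",
    "content": "\"\"\"Hypothetical sketch; this example file is not part of the package.\nFramewiseHasher wraps an image hasher and keeps one hash per\nsufficiently-different frame. 'video.mp4' is a placeholder path; ffmpeg\nmust be available for video reading.\"\"\"\nfrom perception.hashers.image.phash import PHash\nfrom perception.hashers.video.framewise import FramewiseHasher\n\nhasher = FramewiseHasher(\n    frame_hasher=PHash(hash_size=8),\n    interframe_threshold=0.2,  # normalized hamming distance in [0, 1]\n    frames_per_second=1,\n)\n\n# compute_batches streams hashes so the full list is never held in memory.\nfor batch in hasher.compute_batches(\"video.mp4\", batch_size=32):\n    print(batch)  # base64 strings by default\n"
  },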
  {
    "path": "perception/hashers/video/tmk.py",
    "content": "import platform\nimport warnings\n\nimport numpy as np\nimport scipy.special\n\nfrom ..hasher import ImageHasher, VideoHasher\nfrom ..image.phash import PHashF\n\n\nclass TMKL2(VideoHasher):\n    \"\"\"The TMK L2 video hashing algorithm.\"\"\"\n\n    dtype = \"float32\"\n    distance_metric = \"custom\"\n\n    def __init__(\n        self,\n        frame_hasher: ImageHasher | None = None,\n        frames_per_second: int = 15,\n        normalization: str = \"matrix\",\n    ):\n        if platform.machine() == \"arm64\":\n            warnings.warn(\"TMK is not supported on ARM64\")\n\n        T = np.array([2731, 4391, 9767, 14653]).astype(\"float32\")\n        m = 32\n        if frame_hasher is None:\n            frame_hasher = PHashF(hash_size=16, exclude_first_term=True, freq_shift=1)\n        self.frames_per_second = frames_per_second\n        assert frame_hasher.dtype != \"bool\", \"This hasher requires real valued hashes.\"\n\n        # Beta parameter of the modified Bessel function of the first kind\n        self.beta = 32\n\n        # Number of Fourier coefficients per period\n        self.m = m\n\n        # The periods with shape (T, )\n        self.T = T  # (T)\n\n        # The Fourier coefficients with shape (T, m, 1)\n        self.ms = 2 * np.pi * np.arange(0, self.m).astype(\"float32\")  # (m)\n        self.ms_normed = (self.ms[np.newaxis,] / self.T.reshape(-1, 1)).reshape(\n            len(self.T), self.m, 1\n        )  # (T, m, 1)\n\n        # The weights with shape (T, 2m, 1)\n        a = np.array(\n            [\n                (scipy.special.iv(0, self.beta) - np.exp(-self.beta))\n                / (2 * np.sinh(self.beta))\n            ]\n            + [\n                scipy.special.iv(i, self.beta) / np.sinh(self.beta)\n                for i in range(1, self.m)\n            ]\n        )\n        a = a.reshape(1, -1).repeat(repeats=len(self.T), axis=0)  # type: ignore\n        a = np.sqrt(a)\n        self.a = a[..., np.newaxis]\n\n        # The frame-wise hasher\n        self.frame_hasher = frame_hasher\n\n        self.hash_length = self.T.shape[0] * 2 * self.m * self.frame_hasher.hash_length\n\n        self.normalization = normalization\n\n    def process_frame(self, frame, frame_index, frame_timestamp, state=None):\n        if state is None:\n            state = {\"features\": [], \"timestamps\": []}\n        state[\"features\"].append(self.frame_hasher.compute(frame, hash_format=\"vector\"))\n        state[\"timestamps\"].append(frame_timestamp)\n        return state\n\n    def hash_from_final_state(self, state):\n        timestamps = np.array(state[\"timestamps\"])\n        features = np.array(state[\"features\"]).reshape(\n            (\n                1,\n                1,\n                timestamps.shape[0],\n                self.frame_hasher.hash_length,\n            )\n        )\n        x = self.ms_normed * timestamps\n        yw1 = np.sin(x) * self.a\n        yw2 = np.cos(x) * self.a\n        yw = np.concatenate([yw1, yw2], axis=1)[..., np.newaxis]  # (T, 2m, t, 1)\n        y = (yw * features).sum(axis=2)  # (T, 2m, d)\n        return y.flatten()\n\n    def _compute_distance(self, vector1, vector2):\n        shape = (len(self.T), 2 * self.m, self.frame_hasher.hash_length)\n        return 1 - self._score_pair(\n            fv_a=vector1.reshape(shape),\n            fv_b=vector2.reshape(shape),\n            offsets=None,\n            normalization=self.normalization,\n        )\n\n    def _score_pair(self, fv_a, fv_b, offsets=None, 
normalization=\"matrix\"):\n        eps = 1e-8\n\n        if offsets is None:\n            offsets = np.array([0])\n\n        assert normalization in [\n            \"feat\",\n            \"freq\",\n            \"feat_freq\",\n            \"matrix\",\n        ], \"Invalid normalization\"\n\n        if \"feat\" in normalization:\n            a_xp = np.concatenate([self.a, self.a], axis=1)  # (T, 2m, 1)\n            fv_a_0 = fv_a / a_xp\n            fv_b_0 = fv_b / a_xp\n            norm_a = np.sqrt(np.sum(fv_a_0**2, axis=2, keepdims=True) + eps) + eps\n            norm_b = np.sqrt(np.sum(fv_b_0**2, axis=2, keepdims=True) + eps) + eps\n            fv_a = fv_a / norm_a\n            fv_b = fv_b / norm_b\n\n        if \"freq\" in normalization:\n            norm_a, norm_b = (\n                np.sqrt((fv**2).sum(axis=1, keepdims=True) / self.m + eps) + eps\n                for fv in [fv_a, fv_b]\n            )\n            fv_a = fv_a / norm_a\n            fv_b = fv_b / norm_b\n\n        if normalization == \"matrix\":\n            norm_a, norm_b = (\n                np.sqrt(np.sum(fv**2, axis=(1, 2)) + eps)[..., np.newaxis] + eps\n                for fv in [fv_a, fv_b]\n            )  # (T, 1)\n\n        fv_a_sin, fv_b_sin = (fv[:, : self.m] for fv in [fv_a, fv_b])  # (T, m, d)\n        fv_a_cos, fv_b_cos = (fv[:, self.m :] for fv in [fv_a, fv_b])  # (T, m, d)\n        ms = self.ms.reshape(-1, 1)  # (m, 1)\n        dot_sin_sin, dot_sin_cos, dot_cos_cos, dot_cos_sin = (\n            np.sum(p, axis=2, keepdims=True)\n            for p in [\n                fv_a_sin * fv_b_sin,\n                fv_a_sin * fv_b_cos,\n                fv_a_cos * fv_b_cos,\n                fv_a_cos * fv_b_sin,\n            ]\n        )  # (T, m, 1)\n        delta = (\n            ms.reshape(1, -1, 1) * offsets.reshape(1, -1) / self.T.reshape((-1, 1, 1))\n        )\n        cos_delta = np.cos(delta)  # (T, m, delta)\n        sin_delta = np.sin(delta)  # (T, m, delta)\n        dots = (\n            dot_sin_sin * cos_delta\n            + dot_sin_cos * sin_delta\n            + dot_cos_cos * cos_delta\n            - dot_cos_sin * sin_delta\n        ).sum(axis=1)\n        if normalization == \"matrix\":\n            dots = dots / (norm_a * norm_b)\n        if normalization == \"freq\":\n            dots = dots / self.m  # (T, m, delta)\n        elif normalization in [\"feat\", \"feat_freq\"]:\n            dots = dots / 512\n        return dots.mean(axis=0)\n\n\nclass TMKL1(VideoHasher):\n    \"\"\"The TMK L1 video hashing algorithm.\"\"\"\n\n    def __init__(\n        self,\n        frame_hasher: ImageHasher | None = None,\n        frames_per_second: int = 15,\n        dtype=\"float32\",\n        distance_metric=\"cosine\",\n        norm=2,\n        quality_threshold=None,\n    ):\n        if frame_hasher is None:\n            frame_hasher = PHashF(hash_size=16, exclude_first_term=True, freq_shift=1)\n        self.hash_length = frame_hasher.hash_length\n        self.frames_per_second = frames_per_second\n        assert frame_hasher.dtype != \"bool\", \"This hasher requires real valued hashes.\"\n        self.frame_hasher = frame_hasher\n        self.norm = norm\n        self.dtype = dtype or self.frame_hasher.dtype\n        self.distance_metric = distance_metric or self.frame_hasher.distance_metric\n        self.quality_threshold = quality_threshold\n\n    def process_frame(self, frame, frame_index, frame_timestamp, state=None):\n        if state is None:\n            state = {\"sum\": 
np.zeros(self.frame_hasher.hash_length), \"frame_count\": 0}\n        if self.quality_threshold is None:\n            hash_vector = self.frame_hasher.compute(frame, hash_format=\"vector\")\n        else:\n            hash_vector, quality = self.frame_hasher.compute_with_quality(\n                frame, hash_format=\"vector\"\n            )\n            if quality < self.quality_threshold:\n                return state\n        assert isinstance(hash_vector, np.ndarray)  # help type checking below\n        if hash_vector is not None:\n            state[\"sum\"] += hash_vector.astype(np.float32)\n            state[\"frame_count\"] += 1\n        return state\n\n    def hash_from_final_state(self, state):\n        if state[\"frame_count\"] == 0:\n            return None\n        average_vector = state[\"sum\"] / state[\"frame_count\"]\n        if self.norm is not None:\n            return (\n                average_vector / np.linalg.norm(average_vector, ord=self.norm)\n            ).astype(self.frame_hasher.dtype)\n        return average_vector.astype(self.frame_hasher.dtype)\n"
  },
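Both TMK hashers follow the library's `VideoHasher` interface (see `perception/testing/__init__.py`, which calls `hasher.compute(filepath)` on video files), so hashing and comparing two videos mirrors the image-hashing workflow. A minimal sketch; the video paths are hypothetical placeholders:

```python
# Minimal usage sketch for the TMK hashers above; the video paths are
# hypothetical placeholders.
from perception.hashers.video.tmk import TMKL1

hasher = TMKL1()  # averages frame-wise PHashF hashes over the whole video
hash1 = hasher.compute("video1.mp4")
hash2 = hasher.compute("video2.mp4")
# Lower distance means more similar videos (cosine distance by default).
distance = hasher.compute_distance(hash1, hash2)
```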
  {
    "path": "perception/local_descriptor_deduplication.py",
    "content": "import concurrent.futures\nimport logging\nimport typing\nfrom abc import ABC\nfrom warnings import warn\n\nimport cv2\nimport numpy as np\nimport pandas as pd\nimport tqdm\nimport typing_extensions\n\nimport perception.approximate_deduplication as ad\nimport perception.hashers.tools as pht\n\nLOGGER = logging.getLogger(__name__)\nDEFAULT_MAX_FEATURES = 256\nDEFAULT_OVERLAP = 0.01\nDEFAULT_MATCH_PCT = 0.4\nDEFAULT_INTERSECTION = 0.6\nDEFAULT_INLIERS = 5\nDEFAULT_MAX_SIZE = 256\nDEFAULT_MIN_FEATURES = 10\n\nDEFAULT_THRESHOLD = 100\nDEFAULT_SIFT_THRESHOLD = 100\nDEFAULT_AKAZE_THRESHOLD = 250\n\nDEFAULT_RATIO = 0.5\nDEFAULT_SIFT_RATIO = 0.5\nDEFAULT_AKAZE_RATIO = 0.85\n\n\nclass Descriptors(typing_extensions.TypedDict):\n    keypoints: np.ndarray\n    descriptors: np.ndarray\n    descriptor_count: int\n    dimensions: tuple[int, int]\n    filepath: str\n    hasher: str\n\n\nclass MatchStats(typing_extensions.TypedDict):\n    match: float | None\n    min_kpBM: int | None\n    MAB: str | None\n    intersection: float | None\n    inliers: float | None\n    bounds_intersection: float | None\n    final_matched_a_pts: list[np.ndarray] | None\n    final_matched_b_pts: list[np.ndarray] | None\n\n\nclass LocalHasher(ABC):\n    grayscale = False\n    name: str\n    hasher: typing.Any\n    ratio: float\n    threshold: int\n\n    def __init__(\n        self,\n        max_features: int = DEFAULT_MAX_FEATURES,\n        ratio: float = DEFAULT_SIFT_RATIO,\n        threshold: int = DEFAULT_THRESHOLD,\n        overlap: float = DEFAULT_OVERLAP,\n        validation_match: float = DEFAULT_MATCH_PCT,\n        validation_inliers: int = DEFAULT_INLIERS,\n        validation_intersection: float = DEFAULT_INTERSECTION,\n    ):\n        self.ratio = ratio\n        self.threshold = threshold\n        self.max_features = max_features\n        self.overlap = overlap\n        self.validation_match = validation_match\n        self.validation_inliers = validation_inliers\n        self.validation_intersection = validation_intersection\n\n    def compute(self, image) -> tuple[np.ndarray, np.ndarray]:\n        return self.hasher.detectAndCompute(image, None)\n\n    def validate_match(\n        self,\n        descriptor1: Descriptors,\n        descriptor2: Descriptors,\n        minimum_match: float = DEFAULT_MATCH_PCT,\n        minimum_intersection: float = DEFAULT_INTERSECTION,\n        minimum_inliers: int = DEFAULT_INLIERS,\n    ) -> tuple[bool, MatchStats]:\n        \"\"\"Validate the match between two sets of keypoints and descriptors. The\n        validation algorithm is as follows:\n\n        #. Compute the mutual set of matches between the two sets of descriptors\n           and filter them using Lowe's ratio test.\n        #. If the minimum number of passing matches is less than \"minimum_match\",\n           the match fails. This ensures we don't have trivial matches.\n        #. Compute the intersection area of the matched keypoints versus the\n           raw keypoints. If the area overlap is less than minimum_intersection,\n           the match fails. This ensures we don't match on small subsegments of\n           an image, such as logos.\n        #. Compute a transformation matrix using cv2.findHomography. If we cannot\n           obtain a transformation matrix, the match fails. If the sum total\n           of inliers for the transformation matrix is less than minimum_inliers,\n           the match fails.\n        #. 
Finally, use the transformation matrix on a set of points representing\n           the bounding box of each image. If less than minimum_intersection of\n           the bounding box fits within the bounds of the transformed version,\n           the match fails. This is a second pass safety check for logos and other\n           subsegments of images.\n\n        Args:\n            descriptor1: The keypoints, descriptors, and dimensions for the\n                first image.\n            descriptor2: The keypoints, descriptors, and dimensions for the\n                second image.\n            minimum_match: The minimum fraction of matches passing the ratio test.\n            minimum_intersection: The minimum overlapping area between the keypoints\n                in the filtered set of matches and the original keypoints.\n            minimum_inliers: The minimum number of inliers for the transformation\n                matrix.\n\n        Returns:\n            A (passed, stats) tuple: whether the match passed, along with a\n            MatchStats dictionary of intermediate values.\n        \"\"\"\n        swap = descriptor1[\"keypoints\"].shape[0] < descriptor2[\"keypoints\"].shape[0]\n        descriptorA = descriptor2 if swap else descriptor1\n        descriptorB = descriptor1 if swap else descriptor2\n\n        stats: MatchStats = {\n            \"match\": None,\n            \"min_kpBM\": None,\n            \"MAB\": None,\n            \"intersection\": None,\n            \"inliers\": None,\n            \"bounds_intersection\": None,\n            \"final_matched_a_pts\": None,\n            \"final_matched_b_pts\": None,\n        }\n\n        indexA = ad.build_index(descriptorA[\"descriptors\"], approximate=False)\n        indexB = ad.build_index(descriptorB[\"descriptors\"], approximate=False)\n        if (\n            descriptorA[\"descriptors\"] is None\n            or indexA is None\n            or descriptorB[\"descriptors\"] is None\n            or indexB is None\n        ):\n            return False, stats\n\n        distances_A2B, indexes_A2B = indexB.search(\n            descriptorA[\"descriptors\"].astype(\"float32\"), 2\n        )\n        distances_B2A, _ = indexA.search(\n            descriptorB[\"descriptors\"].astype(\"float32\"), 2\n        )\n        good_A2B, good_B2A = map(\n            lambda distances: (distances[:, 0] < distances[:, 1] * self.ratio),\n            [distances_A2B, distances_B2A],\n        )\n        match = min(\n            good_A2B.sum() / good_A2B.shape[0], good_B2A.sum() / good_B2A.shape[0]\n        )\n        stats[\"match\"] = match\n\n        if match < minimum_match:\n            # We didn't get enough good matches.\n            return False, stats\n        kpAM = descriptorA[\"keypoints\"][good_A2B]\n        kpBM = descriptorB[\"keypoints\"][indexes_A2B[good_A2B, 0]]\n\n        # findHomography requires 4 points from each to work.\n        stats[\"min_kpBM\"] = min(len(kpAM), len(kpBM))\n        if len(kpAM) < 4 or len(kpBM) < 4:\n            return False, stats\n\n        intersection = compute_minimum_intersection(\n            kp1=descriptorA[\"keypoints\"],\n            kp2=descriptorB[\"keypoints\"],\n            filter_arr1=good_A2B,\n            filter_arr2=indexes_A2B[good_A2B, 0],\n        )\n        stats[\"intersection\"] = intersection\n        if intersection < minimum_intersection:\n            return False, stats\n\n        MAB, mask = 
cv2.findHomography(\n            kpAM.reshape(-1, 1, 2),\n            kpBM.reshape(-1, 1, 2),\n            cv2.RANSAC,\n            1.0,\n            maxIters=50_000,\n            confidence=0.9999,\n        )\n        stats[\"MAB\"] = \"good\"\n        if MAB is None:\n            # We didn't get a transformation matrix.\n            stats[\"MAB\"] = \"is-None\"\n            return False, stats\n        stats[\"inliers\"] = mask.sum()\n        if mask.sum() < minimum_inliers:\n            # The transformation matrix didn't include enough inliers.\n            return False, stats\n        # Check how much of each original bounding box fits onto\n        # the other image.\n        try:\n            MBA = np.linalg.inv(MAB)\n        except np.linalg.LinAlgError:\n            # We couldn't compute the matrix inverse.\n            stats[\"MAB\"] = \"inverse-failed\"\n            return False, stats\n        ptsA = np.array([[0, 0], descriptorA[\"dimensions\"]]).astype(\"float32\")\n        ptsB = np.array([[0, 0], descriptorB[\"dimensions\"]]).astype(\"float32\")\n        ptsAt = (\n            cv2.perspectiveTransform(ptsA.reshape((-1, 1, 2)), MAB)\n            .reshape(-1, 2)\n            .clip(0, descriptorB[\"dimensions\"])\n        )\n        ptsBt = (\n            cv2.perspectiveTransform(ptsB.reshape((-1, 1, 2)), MBA)\n            .reshape(-1, 2)\n            .clip(0, descriptorA[\"dimensions\"])\n        )\n        bounds_intersection = min(\n            abs(np.prod(ptsBt[1] - ptsBt[0]) / np.prod(descriptorA[\"dimensions\"])),\n            abs(np.prod(ptsAt[1] - ptsAt[0]) / np.prod(descriptorB[\"dimensions\"])),\n        )\n        stats[\"bounds_intersection\"] = bounds_intersection\n\n        # Apply mask index to kpAM, kpBM for the list of matching points. 
mask == 1 means keep.\n        matched_a_pts = []\n        matched_b_pts = []\n        for i in range(mask.shape[0]):\n            if mask[i][0] == 1:\n                matched_a_pts.append(kpAM[i])\n                matched_b_pts.append(kpBM[i])\n        # Unswap points before final return.\n        if swap:\n            stats[\"final_matched_a_pts\"] = matched_b_pts\n            stats[\"final_matched_b_pts\"] = matched_a_pts\n        else:\n            stats[\"final_matched_a_pts\"] = matched_a_pts\n            stats[\"final_matched_b_pts\"] = matched_b_pts\n\n        return (bounds_intersection >= minimum_intersection, stats)\n\n\nclass SIFT(LocalHasher):\n    name = \"SIFT\"\n\n    def __init__(\n        self,\n        max_features: int = DEFAULT_MAX_FEATURES,\n        ratio: float = DEFAULT_SIFT_RATIO,\n        threshold: int = DEFAULT_SIFT_THRESHOLD,\n        **kwargs,\n    ):\n        super().__init__(max_features, ratio, threshold, **kwargs)\n        self.hasher = cv2.SIFT_create(nfeatures=self.max_features)  # type: ignore[attr-defined]\n\n\nclass AKAZE(LocalHasher):\n    name = \"AKAZE\"\n\n    def __init__(\n        self,\n        max_features: int = DEFAULT_MAX_FEATURES,\n        ratio: float = DEFAULT_AKAZE_RATIO,\n        threshold: int = DEFAULT_AKAZE_THRESHOLD,\n        **kwargs,\n    ):\n        super().__init__(max_features, ratio, threshold, **kwargs)\n        LOGGER.warning(\"The default AKAZE tuning has issues with some cropped images.\")\n        self.hasher = cv2.AKAZE_create()  # type: ignore[attr-defined]\n\n\ndef load_and_preprocess(filepath, max_size=DEFAULT_MAX_SIZE, grayscale=True):\n    \"\"\"Read, unletterbox, and resize an image.\n\n    Args:\n        filepath: The path to the file\n        max_size: The maximum size for a dimension of the image\n        grayscale: Set to false to get RGB\n    \"\"\"\n    image = pht.read(filepath)\n    if image is None:\n        LOGGER.warning(\"Failed to load image %s\", filepath)\n        return None\n    res = pht.unletterbox(image)\n    if res is None:\n        return None\n    (x1, x2), (y1, y2) = res\n    image = np.ascontiguousarray(image[y1:y2, x1:x2])\n    if grayscale:\n        image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)\n\n    max_dimension = max(image.shape[:2])\n    if max_dimension > max_size:\n        scale = max_size / max_dimension\n        image = cv2.resize(\n            image, (int(image.shape[1] * scale), int(image.shape[0] * scale))\n        )\n    return image\n\n\ndef generate_image_descriptors(\n    filepath: str,\n    hasher: LocalHasher | None = None,\n    min_features=DEFAULT_MIN_FEATURES,\n    max_size=DEFAULT_MAX_SIZE,\n) -> Descriptors | None:\n    \"\"\"Generate local descriptors for a file.\n\n    Args:\n        filepath: Path to image file.\n        hasher: The local descriptor hasher to use for\n            feature extraction.\n        min_features: The minimum number of features to\n            extract.\n        max_size: The maximum side length for an image.\n\n    Returns:\n        If successful, returns a Descriptors dictionary containing the\n        keypoints, descriptors, and (width, height) dimensions.\n    \"\"\"\n    if hasher is None:\n        hasher = SIFT(\n            max_features=DEFAULT_MAX_FEATURES,\n        )\n\n    try:\n        image = load_and_preprocess(\n            filepath, max_size=max_size, grayscale=hasher.grayscale\n        )\n        if image is None:\n            return None\n        keypoints, descriptors = hasher.compute(image)\n    except FileNotFoundError:\n        LOGGER.warning(\"Image file %s 
not found.\", filepath)\n        return None\n    except ValueError as e:\n        LOGGER.error(\"Processing image file %s failed.\", filepath, exc_info=e)\n        return None\n\n    if descriptors is None:\n        return None\n    if descriptors.shape[0] < min_features:\n        return None\n    keypoints = np.array([kp.pt for kp in keypoints], dtype=np.float32)\n\n    return {\n        \"keypoints\": keypoints,\n        \"descriptors\": descriptors,\n        \"descriptor_count\": descriptors.shape[0],\n        \"filepath\": filepath,\n        \"dimensions\": (image.shape[1], image.shape[0]),\n        \"hasher\": hasher.name,\n    }\n\n\ndef build_reference_df(\n    filepaths: typing.Iterable[str],\n    hasher: LocalHasher | None = None,\n    min_features=DEFAULT_MIN_FEATURES,\n    max_size=DEFAULT_MAX_SIZE,\n    show_progress=False,\n) -> pd.DataFrame:\n    \"\"\"Build descriptors for a list of files.\n\n    Args:\n        filepaths: A list of filepaths for which descriptors\n            are desired.\n        hasher: The local descriptor hasher to use to extract\n            features.\n        min_features: The minimum number of features to\n            extract.\n        max_size: The maximum side length for an image.\n\n    Returns:\n        A dataframe, indexed by filepath with columns for descriptors\n        and descriptor counts.\n    \"\"\"\n    LOGGER.debug(\"Generating descriptors\")\n\n    if hasher is None:\n        hasher = SIFT()\n\n    features = []\n    for filepath in tqdm.tqdm(filepaths, disable=not show_progress, desc=\"Filepaths\"):\n        features.append(\n            generate_image_descriptors(\n                filepath,\n                hasher=hasher,\n                min_features=min_features,\n                max_size=max_size,\n            )\n        )\n    LOGGER.debug(\"Finished computing descriptors.\")\n    return pd.DataFrame(\n        {\n            \"descriptors\": [\n                f[\"descriptors\"] if f is not None else None for f in features\n            ],\n            \"keypoints\": [f[\"keypoints\"] if f is not None else None for f in features],\n            \"descriptor_count\": [\n                f[\"descriptor_count\"] if f is not None else None for f in features\n            ],  # type: ignore\n            \"dimensions\": [\n                f[\"dimensions\"] if f is not None else None for f in features\n            ],\n            \"hasher\": hasher.name,\n            \"filepath\": filepaths,\n        }\n    ).set_index(\"filepath\")\n\n\ndef hasher_name(df: pd.DataFrame) -> str:\n    return df.iloc[0].get(\"hasher\", \"SIFT\")\n\n\ndef check_hasher(df1: pd.DataFrame, df2: pd.DataFrame):\n    assert hasher_name(df1) == hasher_name(\n        df2\n    ), \"The hashers must mach for deduplication to work.\"\n\n\ndef compute_pairs(\n    match_df,\n    query_df=None,\n    hasher: LocalHasher | None = None,\n    pct_probe=0.1,\n    use_gpu: bool = True,\n    faiss_cache_path: str | None = None,\n    show_progress: bool = False,\n):\n    \"\"\"Compute pairs of matching images from a reference\n    dataframe.\n    Args:\n        match_df: A dataframe, as computed by build_reference_df, will compute pairs against self,\n            unless query_df is provided.\n        query_df: optional, if provided will be used to query against match_df for matches.\n        threshold: The match threshold between two vectors.\n        minimum_overlap: The minimum overlap between a pair of files.\n        pct_probe: The percentage of the dataset to search for 
approximate\n            search.\n        use_gpu: Whether to use the GPU for the approximate search.\n        faiss_cache_path: If provided load any existing faiss index from this path, and if\n            it does not exist then save the generated faiss index to the path.\n        show_progress: Whether or not to show a progress bar while computing pairs\n    \"\"\"\n    match_df = match_df.dropna(subset=[\"descriptors\"])\n    counts = match_df[\"descriptor_count\"].values.astype(\"uint32\")\n    descriptors = np.vstack(match_df[\"descriptors\"].values)\n\n    if hasher is None:\n        hasher = SIFT()\n\n    if query_df is None:\n        assert (\n            hasher_name(match_df) == hasher.name\n        ), \"The hasher must match the original hash format.\"\n        y_counts = None\n        y_descriptors = None\n    else:\n        check_hasher(match_df, query_df)\n        query_df = query_df.dropna(subset=[\"descriptors\"])\n        y_counts = query_df[\"descriptor_count\"].values.astype(\"uint32\")\n        y_descriptors = np.vstack(query_df[\"descriptors\"].values).astype(\"float32\")\n    LOGGER.debug(\"Computing approximate euclidean pairs\")\n    pairs = ad.compute_euclidean_pairwise_duplicates_approx(\n        X=descriptors.astype(\"float32\"),\n        counts=counts,\n        threshold=hasher.threshold,\n        minimum_overlap=hasher.overlap,\n        pct_probe=pct_probe,\n        Y=y_descriptors,\n        y_counts=y_counts,\n        use_gpu=use_gpu,\n        faiss_cache_path=faiss_cache_path,\n        show_progress=show_progress,\n    )\n\n    if query_df is None:\n        query_df = match_df  # Assign query_df to be able to look up matches.\n\n    return [(query_df.iloc[p1].name, match_df.iloc[p2].name) for p1, p2 in pairs]\n\n\ndef compute_area(box):\n    \"\"\"Compute the area of a box given a set\n    of points x1, y1, x2, y2.\n\n    Args:\n        box: A list of coordinates.\n    \"\"\"\n    return (box[3] - box[1]) * (box[2] - box[0])\n\n\ndef compute_intersection(kps, filter_arr):\n    \"\"\"Compute the percentage of area covered by\n    a set of filtered keypoints versus raw keypoints.\n\n    Args:\n        kps: A list of points\n        filter_arr: A filter array of the same length as kps\n            indicating whether to keep that keypoint.\n    \"\"\"\n    kps_filtered = kps[filter_arr]\n    box_after = np.hstack([kps_filtered.min(axis=0), kps_filtered.max(axis=0)])\n    box_before = np.hstack([kps.min(axis=0), kps.max(axis=0)])\n    area_before = compute_area(box_before)\n    area_after = compute_area(box_after)\n    return area_after / area_before\n\n\ndef compute_minimum_intersection(kp1, kp2, filter_arr1, filter_arr2):\n    \"\"\"Compute the minimum intersection between two pairs\n    of keypoints (filtered and unfiltered).\n\n    Args:\n        kp1: A list of the first set of keypoints\n        kp2: A list of the second set of keypoints\n        filter_arr1: A filter array for the first set of keypoints\n        filter_arr2: A filter array for the second set of keypoints\n    \"\"\"\n    return min(\n        compute_intersection(kp1, filter_arr1), compute_intersection(kp2, filter_arr2)\n    )\n\n\ndef deduplicate_sift_dfs(*args, **kwargs):\n    \"DEPRECATED please use deduplicate_dfs.\"\n    warn(\"deduplicate_sift_dfs is deprecated.\", DeprecationWarning, stacklevel=2)\n    return deduplicate_dfs(*args, **kwargs)\n\n\ndef deduplicate_dfs(\n    match_df: pd.DataFrame,\n    query_df: pd.DataFrame | None = None,\n    coarse_pct_probe: float = ad.DEFAULT_PCT_PROBE,\n    max_workers: int | None = None,\n    use_gpu: bool = True,\n    
faiss_cache_path: str | None = None,\n    verbose: bool = False,\n    hasher: LocalHasher | None = None,\n    show_progress: bool = False,\n) -> (\n    list[tuple[typing.Any, typing.Any]]\n    | list[tuple[typing.Any, typing.Any, MatchStats]]\n):\n    \"\"\"Deduplicate images within one set of images or between two sets of images:\n    #. Given a dataframe (or two) of descriptors and keypoints for images.\n    #. Perform a coarse, approximate search for images with common features.\n    #. For each candidate pair, validate it pairwise by checking the features\n    and keypoints with the traditional approach using the ratio test. See\n    validate_match for more information.\n    Args:\n        match_df: Dataframe of features to dedup within.\n        query_df: If provided, search for matches between this and match_df; if None,\n            just search match_df against itself.\n        coarse_pct_probe: The minimum fraction of nearest lists to search. If\n            the product of pct_probe and the number of lists is less\n            than 1, one list will be searched.\n        hasher: The local descriptor hasher to use; its threshold, ratio, and\n            validation settings control the coarse search and validation.\n        use_gpu: Whether to use the GPU for the approximate search.\n        max_workers: The maximum number of threads to use for doing the final validation\n            step.\n        faiss_cache_path: If provided load any existing faiss index from this path, and if\n            it does not exist then save the generated faiss index to the path. 
Most helpful if\n            doing multiple queries against the same match_df.\n        verbose: Return metadata with matches, such as overlap percent, etc.\n        show_progress: Whether or not to show a progress bar while computing duplicate file pairs\n    Returns:\n        A list of pairs of file duplicates.\n        If verbose is true the tuple will be: (match_id1, match_id2, metadata_dict)\n    \"\"\"\n    if hasher is None:\n        hasher = SIFT()\n\n    LOGGER.debug(\"Computing candidate pairs\")\n    candidates = compute_pairs(\n        match_df,\n        query_df,\n        pct_probe=coarse_pct_probe,\n        hasher=hasher,\n        use_gpu=use_gpu,\n        faiss_cache_path=faiss_cache_path,\n        show_progress=show_progress,\n    )\n\n    if query_df is None:\n        query_df = match_df\n\n    assert (\n        match_df.index.is_unique\n    ), \"Index of match_df must be unique, or it will cause wrong matches.\"\n    assert (\n        query_df.index.is_unique\n    ), \"Index of query_df must be unique, or it will cause wrong matches.\"\n\n    LOGGER.debug(\"Validating candidate pairs: %d\", len(candidates))\n    keep: (\n        list[tuple[typing.Any, typing.Any]]\n        | list[tuple[typing.Any, typing.Any, MatchStats]]\n    ) = []  # type: ignore\n    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:\n        batch_size = 10_000\n        for start in tqdm.tqdm(\n            range(0, len(candidates), batch_size), disable=not show_progress\n        ):\n            futures = {\n                executor.submit(\n                    hasher.validate_match,\n                    descriptor1=query_df.loc[c1].to_dict(),\n                    descriptor2=match_df.loc[c2].to_dict(),\n                    minimum_match=hasher.validation_match,\n                    minimum_inliers=hasher.validation_inliers,\n                    minimum_intersection=hasher.validation_intersection,\n                ): (c1, c2)\n                for c1, c2 in candidates[start : start + batch_size]\n            }\n            for future in concurrent.futures.as_completed(futures):\n                is_match, metadata = future.result()\n                if is_match:\n                    if verbose:\n                        keep.append(\n                            (futures[future][0], futures[future][1], metadata)  # type: ignore\n                        )\n                    else:\n                        keep.append(futures[future])  # type: ignore\n    LOGGER.debug(\"Validating complete, keeping: %d\", len(keep))\n    return keep\n\n\ndef deduplicate(\n    filepaths_or_reference_df: typing.Iterable[str] | pd.DataFrame,\n    query_filepaths_or_df: None | (typing.Iterable[str] | pd.DataFrame) = None,\n    max_features: int = DEFAULT_MAX_FEATURES,\n    min_features: int = DEFAULT_MIN_FEATURES,\n    max_size: int = DEFAULT_MAX_SIZE,\n    hasher: LocalHasher | None = None,\n    show_progress: bool = False,\n    **kwargs,\n) -> (\n    list[tuple[typing.Any, typing.Any]]\n    | list[tuple[typing.Any, typing.Any, MatchStats]]\n):\n    \"\"\"Deduplicate images by doing the following:\n    #. Unletterbox all images and resize to some maximum size, preserving\n       aspect ratio.\n    #. Compute the descriptors and keypoints for all the resulting images.\n    #. 
See `deduplicate_dfs` for remaining steps.\n    Args:\n        filepaths_or_reference_df: The list of images to deduplicate, or a precomputed\n            descriptor DataFrame.\n        query_filepaths_or_df: If provided will look for matches between these files and\n            the files in the first param.\n        max_features: The maximum number of features to\n            extract.\n        min_features: The minimum number of features to\n            extract.\n        max_size: The maximum side length for an image.\n        show_progress: Whether or not to show a progress bar while building descriptors and\n            computing pairs of file duplicates\n    Returns:\n        A list of pairs of file duplicates.\n        If verbose is true the tuple will be: (match_id1, match_id2, metadata_dict)\n    \"\"\"\n    if hasher is None:\n        hasher = SIFT(max_features=max_features)\n\n    if isinstance(filepaths_or_reference_df, pd.DataFrame):\n        reference_df = filepaths_or_reference_df\n    else:\n        reference_df = build_reference_df(\n            filepaths=filepaths_or_reference_df,\n            hasher=hasher,\n            min_features=min_features,\n            max_size=max_size,\n            show_progress=show_progress,\n        )\n\n    if query_filepaths_or_df is None:\n        query_df = None\n    else:\n        if isinstance(query_filepaths_or_df, pd.DataFrame):\n            query_df = query_filepaths_or_df\n        else:\n            query_df = build_reference_df(\n                filepaths=query_filepaths_or_df,\n                hasher=hasher,\n                min_features=min_features,\n                max_size=max_size,\n                show_progress=show_progress,\n            )\n\n    return deduplicate_dfs(\n        reference_df,\n        query_df=query_df,\n        hasher=hasher,\n        show_progress=show_progress,\n        **kwargs,\n    )\n"
  },
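Since `deduplicate` accepts either filepaths or a precomputed descriptor DataFrame, an end-to-end call is short. A minimal sketch; the image paths are hypothetical placeholders:

```python
# Minimal usage sketch for the deduplication pipeline above; the image
# paths are hypothetical placeholders.
from perception.local_descriptor_deduplication import deduplicate

# Pairs of files judged duplicates after the coarse FAISS search and the
# pairwise validation implemented in LocalHasher.validate_match.
pairs = deduplicate(["a.jpg", "b.jpg", "c.jpg"], show_progress=True)
for file1, file2 in pairs:
    print(file1, "<->", file2)
```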
  {
    "path": "perception/py.typed",
    "content": ""
  },
  {
    "path": "perception/testing/__init__.py",
    "content": "import atexit\nimport math\nimport typing\nfrom contextlib import ExitStack\nfrom importlib import resources\n\nimport cv2\nimport numpy as np\nimport pandas as pd\nimport pytest\nfrom PIL import Image\n\nfrom .. import hashers, tools\n\nSIZES = {\"float32\": 32, \"uint8\": 8, \"bool\": 1}\n\n\ndef get_low_detail_image():\n    v = np.arange(0, 50, 1)\n    v = np.concatenate([v, v[::-1]])[np.newaxis,]\n    image = np.matmul(v.T, v)\n    image = (image * 255 / image.max()).astype(\"uint8\")\n    image = image[..., np.newaxis].repeat(repeats=3, axis=2)\n    image[:, 50:] = 0\n    image[50:] = 0\n    return image\n\n\nLOW_DETAIL_IMAGE = get_low_detail_image()\n\nfile_manager = ExitStack()\natexit.register(file_manager.close)\n\nDEFAULT_TEST_IMAGES = [\n    str(\n        file_manager.enter_context(\n            resources.as_file(\n                resources.files(\"perception\") / \"testing\" / \"images\" / f\"image{n}.jpg\"\n            )\n        )\n    )\n    for n in range(1, 11)\n]\nDEFAULT_TEST_LOGOS = [\n    str(\n        file_manager.enter_context(\n            resources.as_file(\n                resources.files(\"perception\") / \"testing\" / \"logos\" / \"logoipsum.png\"\n            )\n        )\n    )\n]\nDEFAULT_TEST_VIDEOS = [\n    str(\n        file_manager.enter_context(\n            resources.as_file(\n                resources.files(\"perception\") / \"testing\" / \"videos\" / f\"v{n}.m4v\"\n            )\n        )\n    )\n    for n in range(1, 3)\n] + [\n    str(\n        file_manager.enter_context(\n            resources.as_file(\n                resources.files(\"perception\") / \"testing\" / \"videos\" / \"v2s.mov\"\n            )\n        )\n    )\n]\n\n\n@typing.no_type_check\ndef test_opencv_hasher(hasher: hashers.ImageHasher, image1: str, image2: str):\n    # For OpenCV hashers we make sure the distance we compute\n    # is the same as inside OpenCV\n    f1 = image1\n    f2 = image2\n    opencv_distance = hasher.hasher.compare(\n        hasher.hasher.compute(hashers.tools.read(f1)),\n        hasher.hasher.compute(hashers.tools.read(f2)),\n    )\n    if hasher.distance_metric == \"hamming\":\n        opencv_distance /= hasher.hash_length\n    np.testing.assert_approx_equal(\n        opencv_distance,\n        hasher.compute_distance(hasher.compute(f1), hasher.compute(f2)),\n        significant=4,\n    )\n\n\ndef hash_dicts_to_df(hash_dicts, returns_multiple):\n    assert all(\n        h[\"error\"] is None for h in hash_dicts\n    ), \"An error was found in the hash dictionaries\"\n    if returns_multiple:\n        return pd.DataFrame(\n            {\n                \"filepath\": tools.flatten(\n                    [[h[\"filepath\"]] * len(h[\"hash\"]) for h in hash_dicts]\n                ),\n                \"hash\": tools.flatten([h[\"hash\"] for h in hash_dicts]),\n            }\n        ).assign(error=np.nan)\n    return pd.DataFrame.from_records(hash_dicts).assign(error=np.nan)\n\n\ndef test_hasher_parallelization(hasher, test_filepaths):\n    filepaths_10x = test_filepaths * 10\n    if not hasher.allow_parallel:\n        with pytest.warns(UserWarning, match=\"cannot be used in parallel\"):\n            hashes_parallel_dicts = hasher.compute_parallel(filepaths=filepaths_10x)\n    else:\n        hashes_parallel_dicts = hasher.compute_parallel(filepaths=filepaths_10x)\n    hashes_sequential_dicts = [\n        {\"filepath\": filepath, \"hash\": hasher.compute(filepath), \"error\": None}\n        for filepath in filepaths_10x\n    ]\n    hashes_parallel 
= hash_dicts_to_df(\n        hashes_parallel_dicts, returns_multiple=hasher.returns_multiple\n    ).sort_values([\"filepath\", \"hash\"])\n    hashes_sequential = hash_dicts_to_df(\n        hashes_sequential_dicts, returns_multiple=hasher.returns_multiple\n    ).sort_values([\"filepath\", \"hash\"])\n    assert (hashes_sequential.hash.values == hashes_parallel.hash.values).all()\n    assert (hashes_sequential.filepath.values == hashes_parallel.filepath.values).all()\n\n\ndef test_video_hasher_integrity(\n    hasher: hashers.VideoHasher, test_videos: list[str] = DEFAULT_TEST_VIDEOS\n):\n    test_hasher_parallelization(hasher, test_videos)\n\n\ndef test_image_hasher_integrity(\n    hasher: hashers.ImageHasher,\n    pil_opencv_threshold: float,\n    transform_threshold: float,\n    test_images: list[str] = DEFAULT_TEST_IMAGES,\n    opencv_hasher: bool = False,\n):\n    \"\"\"Test to ensure a hasher works correctly.\n\n    Args:\n        hasher: The hasher to test.\n        test_images: A list of filepaths to images to use for testing.\n        pil_opencv_threshold: The hash distance permitted for an image\n            when loaded with OpenCV vs. PIL.\n        transform_threshold: The permitted error in isometric transform\n            hashes.\n        opencv_hasher: Whether the hasher is an OpenCV hasher. Used to\n            determine whether to check for consistent distances.\n    \"\"\"\n    assert len(test_images) >= 2, \"You must provide at least two test images.\"\n    image1 = test_images[0]\n    image2 = test_images[1]\n    hash1_1 = str(hasher.compute(image1))  # str() games for mypy, not proud\n    hash1_2 = str(hasher.compute(Image.open(image1)))\n    image_cv = cv2.imread(image1)\n    assert image_cv is not None, f\"Failed to load image: {image1}\"\n    hash1_3 = str(hasher.compute(cv2.cvtColor(image_cv, cv2.COLOR_BGR2RGB)))\n\n    hash2_1 = str(hasher.compute(image2))\n\n    # There is a small distance because PIL and OpenCV read\n    # JPEG images a little differently (e.g., libjpeg-turbo vs. 
libjpeg)\n    assert hasher.compute_distance(hash1_1, hash1_2) < pil_opencv_threshold\n    assert hasher.compute_distance(hash1_1, hash2_1) > pil_opencv_threshold\n    assert hasher.compute_distance(hash1_1, hash1_3) == 0\n\n    # Ensure the conversion to and from vectors works for both base64 and hex.\n    assert hasher.vector_to_string(hasher.string_to_vector(hash2_1)) == hash2_1\n    assert (\n        hasher.vector_to_string(\n            hasher.string_to_vector(\n                str(\n                    hasher.vector_to_string(\n                        hasher.string_to_vector(hash2_1), hash_format=\"hex\"\n                    )\n                ),\n                hash_format=\"hex\",\n            )\n        )\n        == hash2_1\n    )\n\n    # Ensure parallelization works properly.\n    test_hasher_parallelization(hasher=hasher, test_filepaths=test_images)\n\n    # Ensure the isometric hashes computation work properly\n    for image in test_images:\n        transforms = hashers.tools.get_isometric_transforms(image)\n        hashes_exp = {\n            key: str(hasher.compute(value)) for key, value in transforms.items()\n        }\n        hashes_act = hasher.compute_isometric(transforms[\"r0\"])\n        for transform_name in hashes_exp.keys():\n            assert (\n                hasher.compute_distance(\n                    hashes_exp[transform_name], hashes_act[transform_name]\n                )\n                < transform_threshold\n            )\n\n    # Verify that hashes are the correct length.\n    hash_bits = hasher.hash_length * SIZES[hasher.dtype]\n\n    words_base64 = math.ceil(hash_bits / 6)  # Base64 uses 8 bits for every 6 bits\n    words_base64 += (\n        0 if words_base64 % 4 == 0 else 4 - (words_base64 % 4)\n    )  # Base64 always uses multiples of four\n    assert len(hash2_1) == words_base64\n\n    words_hex = 2 * math.ceil(hash_bits / 8)  # Hex uses 16 bits for every 8 bits\n    words_hex += (\n        0 if words_hex % 2 == 0 else 1\n    )  # Two characters for every one character.\n    assert (\n        len(\n            str(\n                hasher.vector_to_string(\n                    hasher.string_to_vector(hash2_1), hash_format=\"hex\"\n                )\n            )\n        )\n        == words_hex\n    )\n\n    # Verify that low quality images yield zero quality\n    image = np.zeros((100, 100, 3)).astype(\"uint8\")  # type: ignore\n    _, quality = hasher.compute_with_quality(image)\n    assert quality == 0\n\n    # Verify that high quality images yield high quality\n    # scores.\n    assert (\n        min(hasher.compute_with_quality(filepath)[1] for filepath in test_images) == 100\n    )\n\n    # Verify that medium quality images yield medium quality\n    _, quality = hasher.compute_with_quality(LOW_DETAIL_IMAGE)\n    assert 0 < quality < 100\n\n    if opencv_hasher:\n        test_opencv_hasher(hasher, image1, image2)\n"
  },
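The base64 and hex length checks in `test_image_hasher_integrity` are plain arithmetic, so they can be sanity-checked standalone. A sketch for a hypothetical 144-bit binary hash:

```python
# Standalone sketch of the hash-length arithmetic used in
# test_image_hasher_integrity; the 144-bit size is an arbitrary example.
import math

hash_bits = 144
# One base64 character encodes 6 bits; output is padded to a multiple of 4.
words_base64 = math.ceil(hash_bits / 6)
words_base64 += 0 if words_base64 % 4 == 0 else 4 - (words_base64 % 4)
# Two hex characters encode each byte (8 bits).
words_hex = 2 * math.ceil(hash_bits / 8)
print(words_base64, words_hex)  # -> 24 36
```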
  {
    "path": "perception/testing/images/README.md",
    "content": "# Sample images\nThese images were obtained from Wikimedia Commons.\n\n- [Image 1](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:ADAC-Zentrale,_Munich,_March_2017-05.jpg)\n- [Image 2](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Two-tailed_pasha_(Charaxes_jasius_jasius)_Greece.jpg)\n- [Image 3](https://commons.wikimedia.org/wiki/Main_Page#/media/File:Escolta_presidencial,_Plaza_de_Armas,_Lima,_Per%C3%BA,_2015-07-28,_DD_40.JPG)\n- [Image 4](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Iglesia_de_Ntra._Sra._de_la_Junquera,_Luesma,_Zaragoza,_Espa%C3%B1a,_2017-01-04,_DD_60.jpg)\n- [Image 5](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Bahrain_Fort_March_2015.JPG)\n- [Image 6](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:ET_Gondar_asv2018-02_img18_Fasil_Ghebbi.jpg)\n- [Image 7](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:M%C3%BCnster,_Beresa,_Mercedes-Benz_C-Klasse_Cabrio_--_2018_--_1757.jpg)\n- [Image 8](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Panoramic_sunset_in_Conques_02.jpg)\n- [Image 9](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Catedral_de_San_Basilio,_Mosc%C3%BA,_Rusia,_2016-10-03,_DD_05-06_HDR.jpg)\n- [Image 10](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Tupolev_Tu-160_overflying_Moscow_fix.jpg)"
  },
  {
    "path": "perception/testing/logos/README.md",
    "content": "# Sample Logos\nThese logos were obtained from free sources.\n\n- [LogoIpsum](https://logoipsum.com/)"
  },
  {
    "path": "perception/testing/videos/README.md",
    "content": "Video from https://www.youtube.com/watch?v=84Er4LnWXtI under Creative Commons Attribution License.\n\nNotes\n- v1 is a fairly short, slow moving video\n- v2 is a longer but faster-paced video\n- v2s is the same as v2 but with a snippet removed in the middle (simulates a scene or cut)"
  },
  {
    "path": "perception/tools.py",
    "content": "import base64\nimport json\nimport os\nimport urllib.parse\nimport urllib.request\nimport warnings\n\nimport numpy as np\nfrom scipy import spatial\nfrom tqdm import tqdm\n\nfrom . import hashers as perception_hashers\nfrom .utils import flatten\n\ntry:\n    from . import extensions  # type: ignore\nexcept ImportError:\n    warnings.warn(\n        \"C extensions were not built. Some metrics will be computed more slowly. \"\n        \"Please install from wheels or set up a compiler prior to installation \"\n        \"from source to use extensions.\"\n    )\n    extensions = None\n\n\ndef _multiple_hashes_for_ids(hashes: list[tuple[str, str | np.ndarray]]):\n    \"\"\"Check if a list of (hash_id, hash) tuples has more\n    than one hash for a hash_id.\n\n    Args:\n        hashes: A list of (hash_id, hash) tuples.\n    \"\"\"\n    hash_ids = [hash_id for hash_id, _ in hashes]\n    return len(hash_ids) != len(set(hash_ids))\n\n\ndef deduplicate_hashes(\n    hashes: list[tuple[str, str | np.ndarray]],\n    threshold: float,\n    hash_format: str = \"base64\",\n    hasher: perception_hashers.ImageHasher | None = None,\n    hash_length: int | None = None,\n    hash_dtype: str | None = None,\n    distance_metric: str | None = None,\n    progress: tqdm | None = None,\n) -> list[tuple[str, str]]:\n    \"\"\"Find duplicates using a list of precomputed hashes.\n\n    Args:\n        hashes: A list of (id, hash) tuples\n        threshold: A distance threshold\n        hasher: A hasher to use for computing distances\n        progress: A tqdm object for reporting progress\n\n    Returns:\n        A list of duplicated id pairs. To use, you can just remove the\n        first entry of each pair from your dataset. The pairs are provided\n        in the event that you wish to apply further analysis.\n    \"\"\"\n    assert (\n        hash_length is not None\n        and hash_dtype is not None\n        and distance_metric is not None\n    ) or (hasher is not None), (\n        \"You must provide either `hasher` or all of \"\n        \"`hash_length`, `hash_dtype`, and `distance_metric`.\"\n    )\n    if hasher is not None:\n        assert all(\n            k is None for k in [hash_length, hash_dtype, distance_metric]\n        ), \"If hasher is provided, hash_length, hash_dtype, and distance_metric must all be None.\"\n        hash_length = hasher.hash_length\n        hash_dtype = hasher.dtype\n        distance_metric = hasher.distance_metric\n    assert hash_length is not None\n    assert isinstance(hash_dtype, str)\n    assert isinstance(distance_metric, str)\n    # If there is more than one hash for an id, we want them\n    # to be sequential in case we are able to use the more\n    # efficient distance calculation (compute_euclidean_pairwise_duplicates)\n    # that skips computation of distance between two hashes for the same file.\n    multiple_hashes_per_id = _multiple_hashes_for_ids(hashes)\n    if multiple_hashes_per_id:\n        hashes = sorted(hashes)\n    vectors = np.array(\n        [\n            (\n                perception_hashers.tools.string_to_vector(\n                    hash_string=hash_string_or_vector,\n                    hash_format=hash_format,\n                    hash_length=hash_length,\n                    dtype=hash_dtype,\n                )\n                if isinstance(hash_string_or_vector, str)\n                else hash_string_or_vector\n            )\n            for _, hash_string_or_vector in hashes\n        ]\n    )\n    files = np.array([identifier for 
identifier, _ in hashes])\n    pairs: list[tuple[str, str]] = []\n    n_hashes = len(vectors)\n    start_idx = 0\n    end_idx = None\n    if distance_metric != \"euclidean\" or \"int\" not in hash_dtype or extensions is None:\n        iterator = range(n_hashes)\n        if progress is not None:\n            iterator = progress(iterator, total=n_hashes, desc=\"Deduplicating.\")  # type: ignore[operator]\n        distances = spatial.distance.pdist(vectors, metric=distance_metric)\n        for hash_index in iterator:\n            if end_idx is not None:\n                start_idx = end_idx\n            end_idx = start_idx + (n_hashes - hash_index - 1)\n            current_distances = distances[start_idx:end_idx]\n            duplicated_files = files[hash_index + 1 :][current_distances < threshold]\n            current_file = files[hash_index]\n            # We have to make sure the two files are not the same file\n            # because it can happen for highly symmetric images when\n            # we are including isometric hashes.\n            pairs.extend(\n                [\n                    (current_file, duplicated_file)\n                    for duplicated_file in duplicated_files\n                    if duplicated_file != current_file\n                ]\n            )\n    else:\n        # We want to count the number of hashes for each unique hash ID. There\n        # may be more than one -- for example in the case of video. We need\n        # this so we can pass it to the compute_euclidean_pairwise_duplicates\n        # function.\n        if multiple_hashes_per_id:\n            counts = np.zeros(shape=len({hash_id for hash_id, _ in hashes})).astype(\n                \"uint32\"\n            )\n            previous_hash_id = None\n            counts_idx = 0\n            files_ = (\n                []  # make type check happy\n            )  # We're going to re-build the IDs with deduplicated files.\n            for hash_id, _ in hashes:\n                if hash_id != previous_hash_id:\n                    files_.append(hash_id)\n                if previous_hash_id is not None and hash_id != previous_hash_id:\n                    counts_idx += 1\n                counts[counts_idx] += 1\n                previous_hash_id = hash_id\n            files = np.array(files_)\n        else:\n            counts = None  # type: ignore\n        pairs = [\n            (files[idx1], files[idx2])\n            for idx1, idx2 in extensions.compute_euclidean_pairwise_duplicates_simple(\n                vectors.astype(\"int32\"), threshold=threshold, counts=counts\n            )\n        ]\n    return list(set(pairs))\n\n\ndef deduplicate(\n    files: list[str],\n    hashers: list[tuple[perception_hashers.ImageHasher, float]],\n    isometric: bool = False,\n    progress: tqdm | None = None,\n) -> list[tuple[str, str]]:\n    \"\"\"Find duplicates in a list of files.\n\n    Args:\n        files: A list of filepaths.\n        hashers: A list of tuples of the form (hasher, threshold)\n        isometric: Whether to compare the rotated versions of the images\n        progress: A tqdm progress indicator\n\n    Returns:\n        A list of duplicated file pairs. To use, you can just remove the\n        first entry of each pair from your dataset. The pairs are provided\n        in the event that you wish to apply further analysis.\n    \"\"\"\n    files_dedup = set(files)\n    if len(files_dedup) != len(files):\n        warnings.warn(\n            message=\"Duplicate file paths were provided. 
These will be automatically removed.\",\n            category=UserWarning,\n        )\n        files = list(files_dedup)\n    pairs: list[tuple[str, str]] = []\n    for hasher_idx, (hasher, threshold) in enumerate(hashers):\n        hash_dicts = hasher.compute_parallel(\n            filepaths=files,\n            progress=progress,\n            progress_desc=f\"Computing hashes for hash {hasher_idx+1} of {len(hashers)}.\",\n            isometric=isometric,\n        )\n        hash_list = sorted(hash_dicts, key=lambda h: h[\"filepath\"])\n        if isometric:\n            hash_list = flatten(\n                [\n                    list(row[\"hash\"].values())\n                    for row in hash_dicts\n                    if row[\"error\"] is None\n                ]\n            )\n            # Eight isometric transforms yield eight hashes per file.\n            files_for_hashes = flatten(\n                [[row[\"filepath\"]] * 8 for row in hash_dicts if row[\"error\"] is None]\n            )\n        elif hasher.returns_multiple:\n            hash_list = flatten(\n                [row[\"hash\"] for row in hash_dicts if row[\"error\"] is None]\n            )\n            # Repeat each filepath once per returned hash.\n            files_for_hashes = flatten(\n                [\n                    [row[\"filepath\"]] * len(row[\"hash\"])\n                    for row in hash_dicts\n                    if row[\"error\"] is None\n                ]\n            )\n        else:\n            hash_list = [row[\"hash\"] for row in hash_dicts if row[\"error\"] is None]\n            files_for_hashes = [\n                row[\"filepath\"] for row in hash_dicts if row[\"error\"] is None\n            ]\n        pairs.extend(\n            deduplicate_hashes(\n                hashes=list(zip(files_for_hashes, hash_list)),\n                hasher=hasher,\n                threshold=threshold,\n                progress=progress,\n            )\n        )\n    return list(set(pairs))\n\n\nclass SaferMatcher:\n    \"\"\"An object for matching hashes with the known CSAM hashes in the\n    Safer matching service.\n    Please contact `info@getsafer.io <mailto:info@getsafer.io>`_\n    for details on obtaining credentials and information on how match\n    responses are provided.\n\n    Here's a minimalist example:\n\n    .. code-block:: python\n\n        from perception import hashers, tools\n\n        hasher = hashers.PHash(hash_size=16)\n        matches = tools.SaferMatcher(\n            api_key='YOUR_API_KEY',\n            username='YOUR_USERNAME', # You only need to provide\n            password='YOUR_PASSWORD', # an API key OR username/password.\n            url='MATCHING_SERVICE_URL'\n        )\n\n    For authentication, you must provide the API key OR username and password pair.\n    If neither is provided, the function will attempt to find them as environment\n    variables with names :code:`SAFER_MATCHING_SERVICE_API_KEY`,\n    :code:`SAFER_MATCHING_SERVICE_USERNAME`, and :code:`SAFER_MATCHING_SERVICE_PASSWORD`,\n    respectively. 
You must also provide the URL endpoint for the matching service,\n    either as a keyword argument or as a :code:`SAFER_MATCHING_SERVICE_URL`\n    environment variable.\n\n    Args:\n        api_key: A base64 encoded set of matching service credentials\n        username: Matching service username\n        password: Matching service password\n        url: Safer matching service URL\n        hasher: A hasher to use for matching\n        hasher_api_id: The hasher ID for finding matches.\n        quality_threshold: The quality threshold filter to use\n    \"\"\"\n\n    def __init__(\n        self,\n        api_key: str | None = None,\n        username: str | None = None,\n        password: str | None = None,\n        url: str | None = None,\n        hasher: perception_hashers.ImageHasher | None = None,\n        hasher_api_id: str | None = None,\n        quality_threshold: int = 90,\n    ):\n        if (\n            username is None\n            and password is None\n            and api_key is None\n            and os.environ.get(\"SAFER_MATCHING_SERVICE_USERNAME\") is not None\n            and os.environ.get(\"SAFER_MATCHING_SERVICE_PASSWORD\") is not None\n        ):\n            username = os.environ[\"SAFER_MATCHING_SERVICE_USERNAME\"]\n            password = os.environ[\"SAFER_MATCHING_SERVICE_PASSWORD\"]\n        if username is not None and password is not None:\n            credentials = f\"{username}:{password}\"\n            api_key = base64.b64encode(credentials.encode(\"ascii\")).decode(\"ascii\")\n        if api_key is None:\n            api_key = os.environ.get(\"SAFER_MATCHING_SERVICE_API_KEY\")\n            if api_key is None:\n                raise ValueError(\n                    \"You must provide one of (1) API key, (2) API key provided as \"\n                    \"`SAFER_MATCHING_SERVICE_API_KEY` env var, (3) username and password or \"\n                    \"(4) username and password as `SAFER_MATCHING_SERVICE_USERNAME` and \"\n                    \"`SAFER_MATCHING_SERVICE_PASSWORD` env vars.\"\n                )\n        if url is None:\n            url = os.environ.get(\"SAFER_MATCHING_SERVICE_URL\")\n            if url is None:\n                raise ValueError(\n                    \"You must provide either the url or the SAFER_MATCHING_SERVICE_URL env var.\"\n                )\n        if urllib.parse.urlparse(url).scheme != \"https\" and not os.environ.get(\n            \"SAFER_MATCHING_SERVICE_DEV_ALLOW_HTTP\"\n        ):\n            raise ValueError(\"You must provide a url that begins with `https://`.\")\n        self.api_key = api_key\n        self.url = url\n        if hasher is None:\n            hasher = perception_hashers.PHash(hash_size=16, highfreq_factor=4)\n        if hasher_api_id is None:\n            hasher_api_id = \"phash\"\n        self.hasher = hasher\n        self.hasher_api_id = hasher_api_id\n        self.quality_threshold = quality_threshold\n\n    def match(\n        self,\n        images: list[(str | tuple[perception_hashers.tools.ImageInputType, str])],\n    ) -> dict:\n        \"\"\"Match hashes with the Safer matching service.\n\n        Args:\n            images: A list of image filepaths or (image_like, image_id) tuples.\n\n        Returns:\n            A dictionary of matches. 
See Safer matching service documentation (\n            contact Thorn for a copy).\n        \"\"\"\n        raw_hashes = [\n            self.hasher.compute_with_quality(\n                image if isinstance(image, str) else image[0]\n            )\n            for image in images\n        ]\n        hashes = [\n            {\n                \"id\": image if isinstance(image, str) else image[1],\n                self.hasher_api_id: hash_string,\n                \"md5\": (\n                    perception_hashers.tools.compute_md5(image)\n                    if isinstance(image, str)\n                    else (\n                        perception_hashers.tools.compute_md5(image[0])\n                        if isinstance(image[0], str)\n                        else None\n                    )\n                ),\n            }\n            for image, (hash_string, quality) in zip(images, raw_hashes)\n            if quality > self.quality_threshold\n        ]\n        for hash_dict in hashes:\n            # We cannot include an md5 key if we don't\n            # have the md5.\n            if hash_dict[\"md5\"] is None:\n                del hash_dict[\"md5\"]\n        if not hashes:\n            warnings.warn(\n                message=\"No images of sufficient quality were found.\",\n                category=UserWarning,\n            )\n            return {}\n        body = {\"hashes\": hashes, \"version\": \"v2\"}\n        headers = {\n            \"Authorization\": f\"Basic {self.api_key}\",\n            \"Content-Type\": \"application/json\",\n        }\n        req = urllib.request.Request(\n            url=self.url,\n            data=str(json.dumps(body)).encode(\"utf-8\"),\n            headers=headers,\n            method=\"POST\",\n        )\n        with urllib.request.urlopen(req) as res:\n            ret = json.loads(res.read().decode(\"utf-8\"))\n        return ret\n"
  },
  {
    "path": "perception/utils.py",
    "content": "def flatten(list_of_lists):\n    return [item for sublist in list_of_lists for item in sublist]\n"
  },
  {
    "path": "poetry.toml",
    "content": "[virtualenvs]\ncreate = true\nin-project = true\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[project]\nname = \"Perception\"\ndynamic = [\"version\"]\ndescription = \"Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use.\"\nauthors = [{ name = \"Thorn\", email = \"info@wearethorn.org\" }]\nlicense = \"Apache-2.0\"\nreadme = \"README.md\"\nrequires-python = \">=3.10,<4.0\"\ndependencies = [\n  \"Cython>=3.0.0,<4.0.0\",\n  \"numpy>=1.26.4,<3.0.0\",\n  \"opencv-contrib-python-headless>=4.10.0,<5.0.0\",\n  \"faiss-cpu>=1.8.0,<2.0.0\",\n  \"networkit>=11.1,<12.0.0; sys_platform != 'darwin'\",\n  \"networkx>=3.0,<4.0; sys_platform == 'darwin'\",\n  \"pandas\",\n  \"Pillow\",\n  \"pywavelets>=1.5.0,<2.0.0\",\n  \"validators>=0.22.0,<1.0.0\",\n  \"rich>=13.7.0,<14.0.0\",\n  \"scipy\",\n  \"tqdm>=4.67.1,<5.0.0\",\n]\n\n\n[project.optional-dependencies]\nbenchmarking = [\n  \"matplotlib\",\n  \"albumentations>=2.0.8,<3.0.0\",\n  \"tabulate\",\n  \"scikit-learn\",\n  \"ffmpeg-python\",\n]\nmatching = [\"aiohttp\", \"python-json-logger\"]\npdq = [\"pdqhash>=0.2.7,<0.3.0\"]\n\n\n[tool.poetry]\nversion = \"0.0.0\"\n\n\n[tool.poetry.group.dev.dependencies]\nblack = \"^26\"\ncoverage = \"*\"\nipython = \"*\"\nmypy = \"*\"\npandas-stubs = \"*\"\npre-commit = \"*\"\npytest = \"*\"\npytest-cov = \"*\"\nruff = \"*\"\ntypes-pillow = \"*\"\ntypes-tqdm = \"*\"\ntwine = \"*\"\nalbumentations = \"^2.0.8\"\n\n\n[tool.poetry.build]\nscript = \"build.py\"\ngenerate-setup-file = true\n\n[tool.mypy]\nexclude = [\"/tests/\"]\ncheck_untyped_defs = true\nignore_missing_imports = true\n\n[tool.poetry-dynamic-versioning]\nenable = true\nvcs = \"git\"\n\n[build-system]\nrequires = [\n  \"poetry-core\",\n  \"poetry-dynamic-versioning\",\n  \"numpy\",\n  \"Cython\",\n  \"setuptools\",\n  \"wheel\",\n]\nbuild-backend = \"poetry_dynamic_versioning.backend\"\n"
  },
  {
    "path": "setup.py",
    "content": "# -*- coding: utf-8 -*-\nfrom setuptools import setup\n\npackages = [\n    \"perception\",\n    \"perception.approximate_deduplication\",\n    \"perception.benchmarking\",\n    \"perception.hashers\",\n    \"perception.hashers.image\",\n    \"perception.hashers.video\",\n    \"perception.testing\",\n]\n\npackage_data = {\"\": [\"*\"], \"perception.testing\": [\"images/*\", \"logos/*\", \"videos/*\"]}\n\nextras_require = {\n    \"benchmarking\": [\n        \"matplotlib\",\n        \"scipy\",\n        \"albumentations\",\n        \"tabulate\",\n        \"scikit-learn\",\n        \"ffmpeg-python\",\n    ],\n    \"experimental\": [\"networkit\", \"faiss-cpu\"],\n    \"matching\": [\"aiohttp\", \"python-json-logger\"],\n}\n\nsetup_kwargs = {\n    \"name\": \"Perception\",\n    \"version\": \"0.0.0\",\n    \"description\": \"Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use.\",\n    \"long_description\": \"# perception ![ci](https://github.com/thorn-oss/perception/workflows/ci/badge.svg)\\n\\n`perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use. See [the documentation](https://perception.thorn.engineering/en/latest/) for details.\\n\\n## Background\\n\\n`perception` was initially developed at [Thorn](https://www.thorn.org) as part of our work to eliminate child sexual abuse material from the internet. For more information on the issue, check out [our CEO's TED talk](https://www.thorn.org/blog/time-is-now-eliminate-csam/).\\n\\n## Getting Started\\n\\n### Installation\\n\\n`pip install perception`\\n\\n### Hashing\\n\\nHashing with different functions is simple with `perception`.\\n\\n```python\\nfrom perception import hashers\\n\\nfile1, file2 = 'test1.jpg', 'test2.jpg'\\nhasher = hashers.PHash()\\nhash1, hash2 = hasher.compute(file1), hasher.compute(file2)\\ndistance = hasher.compute_distance(hash1, hash2)\\n```\\n\\n### Examples\\n\\nSee below for end-to-end examples for common use cases for perceptual hashes.\\n\\n- [Detecting child sexual abuse material](https://perception.thorn.engineering/en/latest/examples/detecting_csam.html)\\n- [Deduplicating media](https://perception.thorn.engineering/en/latest/examples/deduplication.html)\\n- [Benchmarking perceptual hashes](https://perception.thorn.engineering/en/latest/examples/benchmarking.html)\\n\\n## Supported Hashing Algorithms\\n\\n`perception` currently ships with:\\n\\n- pHash (DCT hash) (`perception.hashers.PHash`)\\n- Facebook's PDQ Hash (`perception.hashers.PDQ`)\\n- dHash (difference hash) (`perception.hashers.DHash`)\\n- aHash (average hash) (`perception.hashers.AverageHash`)\\n- Marr-Hildreth (`perception.hashers.MarrHildreth`)\\n- Color Moment (`perception.hashers.ColorMoment`)\\n- Block Mean (`perception.hashers.BlockMean`)\\n- wHash (wavelet hash) (`perception.hashers.WaveletHash`)\\n\\n## Contributing\\n\\nTo work on the project, start by doing the following.\\n\\n```bash\\n# Install local dependencies for\\n# code completion, etc.\\nmake init\\n\\n- To do a (close to) comprehensive check before committing code, you can use `make precommit`.\\n\\nTo implement new features, please first file an issue proposing your change for discussion.\\n\\nTo report problems, please file an issue with sample code, expected results, actual results, and a complete traceback.\\n\\n## Alternatives\\n\\nThere are other packages worth checking 
out to see if they meet your needs for perceptual hashing. Here are some\nexamples.\n\n- [dedupe](https://github.com/dedupeio/dedupe)\n- [imagededup](https://idealo.github.io/imagededup/)\n- [ImageHash](https://github.com/JohannesBuchner/imagehash)\n- [PhotoHash](https://github.com/bunchesofdonald/photohash)\\n\",\n    \"author\": \"Thorn\",\n    \"author_email\": \"info@wearethorn.org\",\n    \"maintainer\": \"None\",\n    \"maintainer_email\": \"None\",\n    \"url\": \"None\",\n    \"packages\": packages,\n    \"package_data\": package_data,\n    \"extras_require\": extras_require,\n    \"python_requires\": \">=3.10,<4.0\",\n}\nfrom build import *\n\nbuild(setup_kwargs)\n\nsetup(**setup_kwargs)\n"
  },
  {
    "path": "tests/test_approximate_deduplication.py",
    "content": "import perception.approximate_deduplication as ad\n\n\ndef get_cluster_members(assignments):\n    clusters: dict[int, list[str]] = {}\n    for assignment in assignments:\n        clusters.setdefault(assignment[\"cluster\"], []).append(assignment[\"id\"])\n    return sorted(sorted(members) for members in clusters.values())\n\n\ndef test_pairs_to_clusters_component_strictness():\n    assignments = ad.pairs_to_clusters(\n        ids=[\"a\", \"b\", \"c\", \"d\"],\n        pairs=[(\"a\", \"b\"), (\"b\", \"c\")],\n        strictness=\"component\",\n    )\n\n    assert get_cluster_members(assignments) == [[\"a\", \"b\", \"c\"], [\"d\"]]\n\n\ndef test_pairs_to_clusters_community_strictness():\n    assignments = ad.pairs_to_clusters(\n        ids=[\"a\", \"b\", \"c\"],\n        pairs=[(\"a\", \"b\"), (\"b\", \"c\")],\n        strictness=\"community\",\n    )\n\n    assert get_cluster_members(assignments) == [[\"a\", \"b\", \"c\"]]\n\n\ndef test_pairs_to_clusters_clique_strictness():\n    assignments = ad.pairs_to_clusters(\n        ids=[\"a\", \"b\", \"c\", \"d\"],\n        pairs=[(\"a\", \"b\"), (\"a\", \"c\"), (\"b\", \"c\"), (\"c\", \"d\")],\n        strictness=\"clique\",\n    )\n\n    assert get_cluster_members(assignments) == [[\"a\", \"b\", \"c\"], [\"d\"]]\n"
  },
  {
    "path": "tests/test_benchmarking.py",
    "content": "import base64\nimport os\nimport shutil\nimport tempfile\n\nimport numpy as np\nimport pytest\nimport albumentations\nfrom scipy import spatial\n\nfrom perception import benchmarking, hashers, testing\nfrom perception.benchmarking import video_transforms\nfrom perception.benchmarking.image import BenchmarkImageDataset\nfrom perception.benchmarking.video import BenchmarkVideoDataset\n\nfiles = testing.DEFAULT_TEST_IMAGES\ndataset = BenchmarkImageDataset.from_tuples([(fn, i % 2) for i, fn in enumerate(files)])\n\n\ndef test_deduplicate():\n    tempdir = tempfile.TemporaryDirectory()\n    new_file = os.path.join(tempdir.name, \"dup_file.jpg\")\n    shutil.copy(files[0], new_file)\n    duplicated_files = files + [new_file]\n    deduplicated, duplicates = BenchmarkImageDataset.from_tuples(\n        [(fn, i % 2) for i, fn in enumerate(duplicated_files)]\n    ).deduplicate(hasher=hashers.AverageHash(), threshold=1e-2)\n    assert len(duplicates) == 1\n    assert len(deduplicated._df) == len(files)\n\n\ndef test_bad_dataset():\n    bad_files = files + [\"tests/images/nonexistent.jpg\"]\n    bad_dataset = BenchmarkImageDataset.from_tuples(\n        [(fn, i % 2) for i, fn in enumerate(bad_files)]\n    )\n    transforms = {\n        \"blur0.05\": albumentations.GaussianBlur(sigma_limit=0.05, p=1),\n        \"noop\": albumentations.Resize(height=256, width=256, p=1),\n    }\n    with pytest.raises(Exception):\n        transformed = bad_dataset.transform(\n            transforms=transforms, storage_dir=\"/tmp/transforms\", errors=\"raise\"\n        )\n    with pytest.warns(UserWarning, match=\"occurred reading\"):\n        transformed = bad_dataset.transform(\n            transforms=transforms, storage_dir=\"/tmp/transforms\", errors=\"warn\"\n        )\n    assert len(transformed._df) == len(files) * 2\n\n\ndef test_benchmark_dataset():\n    assert len(dataset._df) == len(files)\n    assert len(dataset.filter(category=[0])._df) == len(files) / 2\n    with pytest.warns(UserWarning, match=\"Did not find\"):\n        assert len(dataset.filter(category=[3])._df) == 0\n\n    dataset.save(\"/tmp/dataset.zip\")\n    dataset.save(\"/tmp/dataset_folder\")\n    o1 = BenchmarkImageDataset.load(\"/tmp/dataset.zip\")\n    o2 = BenchmarkImageDataset.load(\"/tmp/dataset_folder\")\n    o3 = BenchmarkImageDataset.load(\"/tmp/dataset.zip\")\n\n    for opened in [o1, o2, o3]:\n        assert (\n            opened._df[\"filepath\"].apply(os.path.basename)\n            == dataset._df[\"filepath\"].apply(os.path.basename)\n        ).all()\n\n\ndef test_benchmark_transforms():\n    transformed = dataset.transform(\n        transforms={\n            \"blur0.05\": albumentations.GaussianBlur(sigma_limit=0.05, p=1),\n            \"noop\": albumentations.Resize(height=256, width=256, p=1),\n        },\n        storage_dir=\"/tmp/transforms\",\n    )\n\n    assert len(transformed._df) == len(files) * 2\n\n    hashes = transformed.compute_hashes(hashers={\"pdna\": hashers.PHash()})\n    tr = hashes.compute_threshold_recall().reset_index()\n\n    hashes._metrics = None\n    hashes._df.at[0, \"hash\"] = None\n    with pytest.warns(UserWarning, match=\"invalid / empty hashes\"):\n        hashes.compute_threshold_recall()\n\n    assert (tr[tr[\"transform_name\"] == \"noop\"][\"recall\"] == 100.0).all()\n\n    # This is a charting function but we execute it just to make sure\n    # it runs without error.\n    hashes.show_histograms()\n\n\ndef convert_hash_string_to_vector(hash_string):\n    buff = 
base64.decodebytes(hash_string.encode(\"utf-8\"))\n    return np.frombuffer(buff, dtype=np.uint8)\n\n\ndef test_video_benchmark_dataset():\n    video_dataset = BenchmarkVideoDataset.from_tuples(\n        files=[\n            (\"perception/testing/videos/v1.m4v\", \"category1\"),\n            (\"perception/testing/videos/v2.m4v\", \"category1\"),\n            (\"perception/testing/videos/v1.m4v\", \"category2\"),\n            (\"perception/testing/videos/v2.m4v\", \"category2\"),\n        ]\n    )\n    transforms = {\n        \"noop\": video_transforms.get_simple_transform(width=128, sar=\"1/1\"),\n        \"gif\": video_transforms.get_simple_transform(codec=\"gif\", output_ext=\".gif\"),\n        \"clip1s\": video_transforms.get_simple_transform(clip_s=(1, None)),\n        \"blackpad\": video_transforms.get_black_frame_padding_transform(duration_s=1),\n        \"slideshow\": video_transforms.get_slideshow_transform(\n            frame_input_rate=1, frame_output_rate=1\n        ),\n    }\n    transformed = video_dataset.transform(\n        storage_dir=tempfile.TemporaryDirectory().name, transforms=transforms\n    )\n    assert len(transformed._df) == len(transforms) * len(video_dataset._df)\n    assert transformed._df[\"filepath\"].isnull().sum() == 0\n\n    # We will compute hashes for each of the transformed\n    # videos and check the results for correctness.\n    phash_framewise_hasher = hashers.FramewiseHasher(\n        frame_hasher=hashers.PHash(), interframe_threshold=-1, frames_per_second=2\n    )\n    hashes = transformed.compute_hashes(\n        hashers={\"phashframewise\": phash_framewise_hasher}\n    )\n\n    guid = hashes._df.guid.iloc[0]\n    df = hashes._df[hashes._df[\"guid\"] == guid]\n    clip1s = df[(df.transform_name == \"clip1s\")]\n    noop = df[(df.transform_name == \"noop\")]\n    blackpad = df[(df.transform_name == \"blackpad\")]\n    slideshow = df[(df.transform_name == \"slideshow\")]\n\n    # We should have dropped two hashes from the beginning\n    # on the clipped video.\n    assert len(clip1s) == len(noop) - 2\n\n    # The first hash from the clipped video should be the\n    # same as the third hash from the original.\n    np.testing.assert_allclose(\n        convert_hash_string_to_vector(clip1s.hash.iloc[0]),\n        convert_hash_string_to_vector(noop.hash.iloc[2]),\n        rtol=0.2,\n    )\n\n    # The black padding adds four hashes (two on either side).\n    assert len(blackpad) == len(noop) + 4\n\n    # A black frame should yield all zeros for PHash\n    assert phash_framewise_hasher.string_to_vector(blackpad.iloc[0].hash).sum() == 0\n\n    # The slideshow hashes should be the same as the noop\n    # hashes for every other hash.\n    # Note: this is a weird test structure now because the original test, which was\n    # assert (noop.hash.values[::2] == slideshow.hash.values[::2]).all()\n    # kept failing because of 1 bit difference in 1 hash. This keeps the same\n    # spirit, but is more complex with a little leniency. We suspect the difference is\n    # due to some versioning. 
So it might be worthwhile to try replacing this test with the\n    # original one occasionally.\n    noop_hash_vectors = [\n        convert_hash_string_to_vector(h) for h in noop.hash.values[::2]\n    ]\n    slideshow_hash_vectors = [\n        convert_hash_string_to_vector(h) for h in slideshow.hash.values[::2]\n    ]\n    total_missed_bits = 0\n    for noop_vector, slideshow_vector in zip(noop_hash_vectors, slideshow_hash_vectors):\n        for n in range(0, len(noop_vector)):\n            if noop_vector[n] != slideshow_vector[n]:\n                total_missed_bits += 1\n    assert total_missed_bits <= 4\n\n    # Every second hash in the slideshow should be the same as the\n    # previous one.\n    for n in range(0, 10, 2):\n        assert slideshow.hash.values[n] == slideshow.hash.values[n + 1]\n\n\ndef test_euclidean_extension():\n\n    # This function plainly implements the process of computing\n    # the closest positive and negative examples and their indexes.\n    def compute_euclidean_metrics_py(X_noop, X_transformed, mask):\n        distance_matrix = spatial.distance.cdist(\n            XA=X_transformed, XB=X_noop, metric=\"euclidean\"\n        )\n        pos = np.ma.masked_array(distance_matrix, np.logical_not(mask))\n        neg = np.ma.masked_array(distance_matrix, mask)\n        distances = np.concatenate(\n            [neg.min(axis=1).data[np.newaxis], pos.min(axis=1).data[np.newaxis]], axis=0\n        ).T\n        indexes = np.concatenate(\n            [\n                neg.argmin(axis=1)[np.newaxis],\n                pos.argmin(axis=1)[np.newaxis],\n            ]\n        ).T\n        return distances, indexes\n\n    X_noop = np.random.uniform(low=0, high=255, size=(5, 144)).astype(\"int32\")\n    X_trans = np.random.uniform(low=0, high=255, size=(10, 144)).astype(\"int32\")\n    mask = np.array([True, False] * 5 * 5).reshape(10, 5)\n\n    distances, indexes = benchmarking.common.extensions.compute_euclidean_metrics(\n        X_noop, X_trans, mask\n    )\n    distances_py, indexes_py = compute_euclidean_metrics_py(X_noop, X_trans, mask)\n\n    assert (indexes_py == indexes).all()\n    np.testing.assert_allclose(distances, distances_py)\n"
  },
  {
    "path": "tests/test_hashers.py",
    "content": "import os\nimport string\n\nimport pytest\n\nfrom perception import hashers, testing\nfrom perception.hashers.image.pdq import PDQHash\n\nTEST_IMAGES = [os.path.join(\"tests\", \"images\", f\"image{n}.jpg\") for n in range(1, 11)]\n\n\n# The PDQ hash isometric computation is inexact. See\n# https://github.com/faustomorales/pdqhash-python/blob/master/tests/test_compute.py\n# for details.\n@pytest.mark.parametrize(\n    \"hasher_class,pil_opencv_threshold,transform_threshold,opencv_hasher\",\n    [\n        (hashers.AverageHash, 0.1, 0.1, False),\n        (hashers.WaveletHash, 0.1, 0.1, False),\n        (hashers.PHash, 0.1, 0.1, False),\n        (PDQHash, 0.1, 0.15, False),\n        (hashers.DHash, 0.1, 0.1, False),\n        (hashers.MarrHildreth, 0.1, 0.1, True),\n        (hashers.BlockMean, 0.1, 0.1, True),\n        (hashers.ColorMoment, 10, 0.1, True),\n    ],\n)\ndef test_image_hashing_common(\n    hasher_class, pil_opencv_threshold, transform_threshold, opencv_hasher\n):\n    testing.test_image_hasher_integrity(\n        hasher=hasher_class(),\n        pil_opencv_threshold=pil_opencv_threshold,\n        transform_threshold=transform_threshold,\n        opencv_hasher=opencv_hasher,\n    )\n\n\ndef test_video_hashing_common():\n    testing.test_video_hasher_integrity(\n        hasher=hashers.FramewiseHasher(\n            frame_hasher=hashers.PHash(hash_size=16),\n            interframe_threshold=0.1,\n            frames_per_second=1,\n        )\n    )\n\n\ndef test_video_reading():\n    # We should get one red, one green, and one blue frame\n    for frame, _, timestamp in hashers.tools.read_video(\n        filepath=\"perception/testing/videos/rgb.m4v\", frames_per_second=0.5\n    ):\n        assert timestamp in [0.0, 2.0, 4.0]\n        channel = int(timestamp / 2)\n        assert frame[:, :, channel].min() > 220\n        for other in [0, 1, 2]:\n            if other == channel:\n                continue\n            assert frame[:, :, other].max() < 20\n\n\ndef test_common_framerate():\n    assert hashers.tools.get_common_framerates(\n        dict(zip([\"a\", \"b\", \"c\"], [1 / 3, 1 / 2, 1 / 5]))\n    ) == {1.0: (\"a\", \"b\", \"c\")}\n    assert hashers.tools.get_common_framerates(\n        dict(zip([\"a\", \"b\", \"c\"], [1 / 3, 1 / 6, 1 / 9]))\n    ) == {1 / 3: (\"a\", \"b\", \"c\")}\n    assert hashers.tools.get_common_framerates(\n        dict(zip([\"a\", \"b\", \"c\", \"d\", \"e\"], [1 / 3, 1 / 2, 1 / 5, 1 / 7, 1 / 11]))\n    ) == {1.0: (\"a\", \"b\", \"c\", \"d\", \"e\")}\n    assert hashers.tools.get_common_framerates(\n        dict(zip(string.ascii_lowercase[:6], [10, 5, 3, 1 / 3, 1 / 6, 1 / 9]))\n    ) == {3.0: (\"c\", \"d\", \"e\", \"f\"), 10.0: (\"a\", \"b\")}\n    assert hashers.tools.get_common_framerates(dict(zip([\"a\", \"b\"], [100, 1]))) == {\n        100: (\"a\", \"b\")\n    }\n\n\ndef test_synchronized_hashing():\n    video_hashers = {\n        \"phashframewise\": hashers.FramewiseHasher(\n            frame_hasher=hashers.PHash(hash_size=16),\n            frames_per_second=1,\n            interframe_threshold=0.2,\n        ),\n        \"tmkl2\": hashers.TMKL2(frames_per_second=15),\n        \"tmkl1\": hashers.TMKL1(frames_per_second=15),\n    }\n\n    for filepath in [\n        \"perception/testing/videos/v1.m4v\",\n        \"perception/testing/videos/v2.m4v\",\n    ]:\n        # Ensure synchronized hashing\n        hashes1 = {\n            hasher_name: hasher.compute(filepath)\n            for hasher_name, hasher in video_hashers.items()\n        }\n  
      hashes2 = hashers.tools.compute_synchronized_video_hashes(\n            filepath=filepath, hashers=video_hashers\n        )\n        assert hashes1 == hashes2\n\n\ndef test_hex_b64_conversion():\n    b64_string = \"\"\"\n    CFFRABrAaRKCDQigEBIGwAhNBdIISgVZBxQYAgP4fwYNUR0oBgYCPwwIDSqTAmIH\n    FRQhCiT/IT9DpHIeIx4cA2hQcBTwISovFkspMxz/MzdnljeCOEs4LnBYNHHBMC4x\n    EC8mPxLaLkI/dywmNk1lMXoqJyCLSyg7BxwRSgTmIlI/LwsrP04hTCMtBSxaGAFB\n    \"\"\".replace(\"\\n\", \"\").replace(\" \", \"\").strip()\n    hex_string = \"\"\"\n    085151001ac06912820d08a0101206c0084d05d2084a05590714180203f87f06\n    0d511d280606023f0c080d2a930262071514210a24ff213f43a4721e231e1c03\n    68507014f0212a2f164b29331cff333767963782384b382e70583471c1302e31\n    102f263f12da2e423f772c26364d65317a2a27208b4b283b071c114a04e62252\n    3f2f0b2b3f4e214c232d052c5a180141\n    \"\"\".replace(\"\\n\", \"\").replace(\" \", \"\").strip()\n    assert (\n        hashers.tools.hex_to_b64(hex_string, dtype=\"uint8\", hash_length=144)\n        == b64_string\n    )\n    assert (\n        hashers.tools.b64_to_hex(b64_string, dtype=\"uint8\", hash_length=144)\n        == hex_string\n    )\n"
  },
  {
    "path": "tests/test_local_descriptor_deduplication.py",
    "content": "import os\nimport tempfile\n\nimport albumentations\nimport cv2\nimport pandas as pd\nimport pytest\n\n\nimport perception.benchmarking.image as pb\nimport perception.benchmarking.image_transforms as pbit\nimport perception.approximate_deduplication as ad\nimport perception.local_descriptor_deduplication as ldd\nimport perception.hashers.tools as pht\nimport perception.testing as pt\nfrom perception.approximate_deduplication.debug import vizualize_pair\n\n# Params for object level matching.\nOBJECT_MATCH_PARAMS = {\n    \"strong_match_threshold\": 0.3,  # Ideally something close to 95% precision.\n    \"ratio\": 0.5,\n    \"coarse_pct_probe\": 0.1,\n    \"minimum_coarse_overlap\": 0.001,\n    \"coarse_threshold\": 100.0,\n    \"minimum_validation_match\": 0.04,\n    \"minimum_validation_intersection\": 0.04,\n    \"minimum_validation_inliers\": 6,\n}\n\n\n@pytest.mark.parametrize(\"hasher\", [ldd.SIFT(), ldd.AKAZE()])\ndef test_deduplication(hasher):\n    tdir = tempfile.TemporaryDirectory()\n    watermark = cv2.cvtColor(\n        cv2.imread(pt.DEFAULT_TEST_LOGOS[0], cv2.IMREAD_UNCHANGED), cv2.COLOR_BGRA2RGBA\n    )\n    transformed = pb.BenchmarkImageDataset.from_tuples(\n        files=[(filepath, \"test\") for filepath in pt.DEFAULT_TEST_IMAGES]\n    ).transform(\n        transforms={\n            \"noop\": albumentations.NoOp(p=1),\n            \"pad\": albumentations.CropAndPad(percent=0.1, p=1),\n            \"crop\": albumentations.CropAndPad(percent=-0.1, p=1),\n            \"watermark\": pbit.apply_watermark(watermark, alpha=1, size=0.8),  # type: ignore\n        },\n        storage_dir=tdir.name,\n    )\n    df = transformed._df.set_index(\"filepath\")\n    pairs = ldd.deduplicate(\n        filepaths_or_reference_df=df.index, max_workers=2, hasher=hasher\n    )  #  Test throws errors if unset.\n\n    clustered = (\n        pd.DataFrame(\n            ad.pairs_to_clusters(ids=df.index, pairs=pairs, strictness=\"component\")\n        )\n        .set_index(\"id\")\n        .merge(df, left_index=True, right_index=True)\n        .reset_index()\n    )\n    print(\"test2\")\n    n_clusters = clustered[\"cluster\"].nunique()\n    n_transforms = clustered[\"transform_name\"].nunique()\n    perfect = (\n        clustered.groupby(\"cluster\")\n        .apply(\n            lambda g: g[\"guid\"].nunique() == 1\n            and g[\"transform_name\"].nunique() == n_transforms\n        )\n        .sum()\n    )\n\n    tainted = clustered.groupby(\"cluster\")[\"guid\"].nunique().gt(1).sum()\n    pct_perfect = perfect / n_clusters\n    pct_tainted = tainted / n_clusters\n    assert pct_tainted == 0\n    assert pct_perfect > 0.1\n\n\n@pytest.mark.parametrize(\"hasher\", [ldd.SIFT(), ldd.AKAZE()])\ndef test_deduplication_across_sets(hasher):\n    tdir = tempfile.TemporaryDirectory()\n    watermark = cv2.cvtColor(\n        cv2.imread(pt.DEFAULT_TEST_LOGOS[0], cv2.IMREAD_UNCHANGED), cv2.COLOR_BGRA2RGBA\n    )\n    transformed = pb.BenchmarkImageDataset.from_tuples(\n        files=[(filepath, \"test\") for filepath in pt.DEFAULT_TEST_IMAGES]\n    ).transform(\n        transforms={\n            \"noop\": albumentations.NoOp(p=1),\n            \"pad\": albumentations.CropAndPad(percent=0.1, p=1),\n            \"crop\": albumentations.CropAndPad(percent=0.1, p=1),\n            \"watermark\": pbit.apply_watermark(watermark, alpha=1, size=0.8),  # type: ignore\n        },\n        storage_dir=tdir.name,\n    )\n\n    df = transformed._df.set_index(\"filepath\")\n    query_images = 
list(df[df.transform_name == \"noop\"].index.values)\n    images_to_match_to = list(df[~(df.transform_name == \"noop\")].index.values)\n\n    pairs = ldd.deduplicate(\n        filepaths_or_reference_df=images_to_match_to,\n        query_filepaths_or_df=query_images,\n        max_workers=2,\n        hasher=hasher,\n    )  #  Test throws errors if unset.\n\n    assert len(pairs) >= 20, \"Wrong # of pairs.\"\n    only_one_noop = [p for p in pairs if ((\"noop\" in p[0]) != (\"noop\" in p[1]))]\n    assert len(only_one_noop) == len(\n        pairs\n    ), \"All pairs must be between a noop and non-noop file\"\n\n\n@pytest.mark.parametrize(\"hasher\", [ldd.SIFT(), ldd.AKAZE()])\ndef test_validation_for_overlapping_case(hasher):\n    tdir = tempfile.TemporaryDirectory()\n    # Each image will have the center of the other\n    # pasted in the top left corner.\n    image1 = pht.read(pt.DEFAULT_TEST_IMAGES[0])\n    image2 = pht.read(pt.DEFAULT_TEST_IMAGES[1])\n    image1[:100, :100] = image2[100:200, 100:200]\n    image2[:100, :100] = image1[100:200, 100:200]\n    fp1 = os.path.join(tdir.name, \"test1.jpg\")\n    fp2 = os.path.join(tdir.name, \"test2.jpg\")\n    cv2.imwrite(fp1, image1[..., ::-1])\n    cv2.imwrite(fp2, image2[..., ::-1])\n    descriptor1 = ldd.generate_image_descriptors(fp1, hasher)\n    descriptor2 = ldd.generate_image_descriptors(fp2, hasher)\n    assert descriptor1 is not None\n    assert descriptor2 is not None\n\n    # These images should not match.\n    assert not hasher.validate_match(descriptor1=descriptor1, descriptor2=descriptor2)[\n        0\n    ]\n\n\n@pytest.mark.parametrize(\"hasher\", [ldd.SIFT(), ldd.AKAZE()])\ndef test_handling_bad_file_case(caplog, hasher):\n    tdir = tempfile.TemporaryDirectory()\n    missing_file = os.path.join(tdir.name, \"missing-file\")\n    bad_file_handle = tempfile.NamedTemporaryFile()\n    bad_file = bad_file_handle.name\n    transformed = pb.BenchmarkImageDataset.from_tuples(\n        files=[(filepath, \"test\") for filepath in pt.DEFAULT_TEST_IMAGES]\n    ).transform(\n        transforms={\n            \"noop\": lambda image: image,\n        },\n        storage_dir=tdir.name,\n    )\n    df = transformed._df.set_index(\"filepath\")\n    df.loc[missing_file] = df.iloc[0]\n    df.loc[bad_file] = df.iloc[0]\n    pairs = ldd.deduplicate(filepaths_or_reference_df=df.index, hasher=hasher)\n    clustered = (\n        pd.DataFrame(\n            ad.pairs_to_clusters(ids=df.index, pairs=pairs, strictness=\"component\")\n        )\n        .set_index(\"id\")\n        .merge(df, left_index=True, right_index=True)\n        .reset_index()\n    )\n\n    assert bad_file not in clustered.index\n    assert missing_file not in clustered.index\n\n    bad_file_error = next(\n        record for record in caplog.records if bad_file in record.message\n    )\n    assert bad_file_error\n    assert bad_file_error.levelname == \"ERROR\"\n\n    missing_file_warning = next(\n        record for record in caplog.records if missing_file in record.message\n    )\n    assert missing_file_warning\n    assert missing_file_warning.levelname == \"WARNING\"\n\n\ndef test_handling_hasher_mismatch():\n    tdir = tempfile.TemporaryDirectory()\n    transformed = pb.BenchmarkImageDataset.from_tuples(\n        files=[(filepath, \"test\") for filepath in pt.DEFAULT_TEST_IMAGES]\n    ).transform(\n        transforms={\n            \"noop\": lambda image: image,\n        },\n        storage_dir=tdir.name,\n    )\n    df = transformed._df.set_index(\"filepath\")\n    reference_df = 
ldd.build_reference_df(filepaths=df.index, hasher=ldd.SIFT())\n    query_df = ldd.build_reference_df(filepaths=df.index, hasher=ldd.AKAZE())\n    with pytest.raises(AssertionError):\n        ldd.deduplicate(reference_df, query_df)\n\n\ndef test_viz_pair():\n    object_sift = ldd.SIFT(\n        max_features=256,\n        ratio=OBJECT_MATCH_PARAMS[\"ratio\"],\n        threshold=OBJECT_MATCH_PARAMS[\"coarse_threshold\"],\n        overlap=OBJECT_MATCH_PARAMS[\"minimum_coarse_overlap\"],\n        validation_match=OBJECT_MATCH_PARAMS[\"minimum_validation_match\"],\n        validation_inliers=OBJECT_MATCH_PARAMS[\"minimum_validation_inliers\"],\n        validation_intersection=OBJECT_MATCH_PARAMS[\"minimum_validation_intersection\"],\n    )\n    filepaths = [\n        \"tests/images/chair.png\",\n        \"tests/images/chair3.png\",\n        \"tests/images/chair-square.png\",\n        \"tests/images/chair-tall.png\",\n    ]\n    reference_df = ldd.build_reference_df(\n        filepaths=filepaths,\n        hasher=object_sift,\n        min_features=10,\n        max_size=1000,\n        show_progress=False,\n    )\n    pairs = ldd.deduplicate(\n        filepaths_or_reference_df=reference_df,\n        hasher=object_sift,\n        max_size=1000,\n        min_features=10,\n        verbose=True,\n    )\n    row = pairs[0]\n    viz_img = vizualize_pair(\n        reference_df.loc[row[0]],\n        reference_df.loc[row[1]],\n        0.5,\n        match_metadata=row[2],\n        sanitized=False,\n    )\n    viz_img = cv2.cvtColor(viz_img, cv2.COLOR_RGB2BGR)\n    cv2.imwrite(\"tests/images/debug-image.png\", viz_img)\n\n\ndef test_viz_pair_symmetry():\n    # This test catches a regression where, if the smaller image was the query one,\n    # LDD would swap points during distance calculation but not unswap them before\n    # returning.\n    object_sift = ldd.SIFT(\n        max_features=256,\n        ratio=OBJECT_MATCH_PARAMS[\"ratio\"],\n        threshold=OBJECT_MATCH_PARAMS[\"coarse_threshold\"],\n        overlap=OBJECT_MATCH_PARAMS[\"minimum_coarse_overlap\"],\n        validation_match=OBJECT_MATCH_PARAMS[\"minimum_validation_match\"],\n        validation_inliers=OBJECT_MATCH_PARAMS[\"minimum_validation_inliers\"],\n        validation_intersection=OBJECT_MATCH_PARAMS[\"minimum_validation_intersection\"],\n    )\n    filepaths = [\n        \"tests/images/chair.png\",\n        \"tests/images/chair3.png\",\n    ]\n    reference_df = ldd.build_reference_df(\n        filepaths=filepaths,\n        hasher=object_sift,\n        min_features=10,\n        max_size=1000,\n        show_progress=False,\n    )\n    pairs = ldd.deduplicate(\n        filepaths_or_reference_df=filepaths[:1],\n        query_filepaths_or_df=filepaths[1:],\n        hasher=object_sift,\n        max_size=1000,\n        min_features=10,\n        verbose=True,\n    )\n    row = pairs[0]\n    viz_img = vizualize_pair(\n        reference_df.loc[row[0]],\n        reference_df.loc[row[1]],\n        0.5,\n        match_metadata=row[2],\n        sanitized=False,\n    )\n    viz_img = cv2.cvtColor(viz_img, cv2.COLOR_RGB2BGR)\n    cv2.imwrite(\"tests/images/debug-image-symmetry-1.png\", viz_img)\n\n    # Swap order of ref and query files.\n    pairs = ldd.deduplicate(\n        filepaths_or_reference_df=filepaths[1:],\n        query_filepaths_or_df=filepaths[:1],\n        hasher=object_sift,\n        max_size=1000,\n        min_features=10,\n        verbose=True,\n    )\n    row = pairs[0]\n    viz_img = vizualize_pair(\n        reference_df.loc[row[0]],\n   
     reference_df.loc[row[1]],\n        0.5,\n        match_metadata=row[2],\n        sanitized=False,\n    )\n    viz_img = cv2.cvtColor(viz_img, cv2.COLOR_RGB2BGR)\n    cv2.imwrite(\"tests/images/debug-image-symmetry-2.png\", viz_img)\n"
  },
  {
    "path": "tests/test_tmk.py",
    "content": "import gzip\nimport json\nfrom pathlib import Path\nfrom typing import cast\nimport platform\n\nimport numpy as np\nimport pytest\n\nfrom perception.hashers.video import tmk\n\nTEST_FILES = Path(\"perception\") / \"testing\" / \"videos\"\n\n\ndef test_tmk_parity():\n    if platform.machine() == \"arm64\":\n        pytest.xfail(\"TMK is not supported on ARM64\")\n\n    hasher = tmk.TMKL2()\n    with gzip.open(TEST_FILES / \"expected_tmk.json.gz\", \"rt\", encoding=\"utf8\") as f:\n        expected_output = json.load(f)\n    expected_output = {k: np.array(v) for k, v in expected_output.items()}\n\n    output = []\n\n    for filepath in [\n        \"perception/testing/videos/v1.m4v\",\n        \"perception/testing/videos/v2.m4v\",\n    ]:\n        hash_value: np.ndarray = cast(\n            np.ndarray, hasher.compute(filepath=filepath, hash_format=\"vector\")\n        )\n        output.append(hash_value.reshape((4, 64, -1)))\n\n    # Verify the hashes are the same\n    for o, t in zip(output, expected_output[\"hashes\"]):\n        np.testing.assert_allclose(o.reshape(*t.shape), t)\n\n    # Verify the pair-wise scores are the same\n    offsets = np.arange(-5, 5)\n    for normalization in [\"feat\", \"feat_freq\", \"matrix\"]:\n        score = hasher._score_pair(\n            output[0], output[1], offsets=offsets, normalization=normalization\n        )\n        np.testing.assert_allclose(score, expected_output[normalization])\n"
  },
  {
    "path": "tests/test_tools.py",
    "content": "import os\nimport shutil\nimport tempfile\nimport io\n\nimport numpy as np\nimport pytest\n\nfrom perception import hashers, testing, tools\n\n\ndef test_deduplicate():\n    directory = tempfile.TemporaryDirectory()\n    original = testing.DEFAULT_TEST_IMAGES[0]\n    duplicate = os.path.join(directory.name, \"image1.jpg\")\n    shutil.copy(original, duplicate)\n    pairs = tools.deduplicate(\n        files=[\n            testing.DEFAULT_TEST_IMAGES[0],\n            testing.DEFAULT_TEST_IMAGES[1],\n            duplicate,\n        ],\n        hashers=[(hashers.PHash(hash_size=16), 0.25)],\n    )\n    assert len(pairs) == 1\n    file1, file2 = pairs[0]\n    assert ((file1 == duplicate) and (file2 == original)) or (\n        (file1 == original) and (file2 == duplicate)\n    )\n\n\ndef test_deduplicate_u8():\n    # This test verifies that extensions.compute_euclidean_pairwise_duplicates\n    # works properly.\n    directory = tempfile.TemporaryDirectory()\n    original = testing.DEFAULT_TEST_IMAGES[0]\n    duplicate = os.path.join(directory.name, \"image1.jpg\")\n    shutil.copy(original, duplicate)\n    pairs = tools.deduplicate(\n        files=[\n            testing.DEFAULT_TEST_IMAGES[0],\n            testing.DEFAULT_TEST_IMAGES[1],\n            duplicate,\n        ],\n        hashers=[(hashers.PHashU8(hash_size=16), 10)],\n    )\n    assert len(pairs) == 1\n    file1, file2 = pairs[0]\n    assert ((file1 == duplicate) and (file2 == original)) or (\n        (file1 == original) and (file2 == duplicate)\n    )\n\n\ndef test_deduplicate_hashes_multiple():\n    # This test verifies that deduplicate_hashes functions properly\n    # when there is more than one hash for a file.\n    directory = tempfile.TemporaryDirectory()\n    original = testing.DEFAULT_TEST_IMAGES[0]\n    duplicate = os.path.join(directory.name, \"image1.jpg\")\n    hasher = hashers.PHashU8(hash_size=16)\n    shutil.copy(original, duplicate)\n    hashes = [\n        (0, hasher.compute(original)),\n        (1, hasher.compute(duplicate)),\n        (1, hasher.compute(duplicate)),\n        (1, hasher.compute(duplicate)),\n        (2, hasher.compute(testing.DEFAULT_TEST_IMAGES[1])),\n    ]\n    pairs = tools.deduplicate_hashes(\n        hashes=hashes,  # type: ignore[arg-type]\n        threshold=10,\n        hash_format=\"base64\",\n        hash_length=hasher.hash_length,\n        distance_metric=\"euclidean\",\n        hash_dtype=\"uint8\",\n    )\n    assert len(pairs) == 1\n    file1, file2 = pairs[0]\n    assert ((file1 == 0) and (file2 == 1)) or ((file1 == 1) and (file2 == 0))\n\n\ndef test_compute_euclidean_pairwise_duplicates():\n    # The purpose of this test is to verify that the handling of\n    # deduplication with files that have multiple hashes works\n    # properly. 
This is particularly important for video where\n    # we are likely to have many hashes.\n    X = np.array(\n        [\n            # File 1\n            [0, 0, 0],\n            [1, 1, 1],\n            [2, 2, 2],\n            # File 2\n            [1, 1, 1],\n            [2, 2, 2],\n            [3, 3, 3],\n            # File 3\n            [3, 3, 3],\n            [4, 4, 4],\n            # File 4\n            [5, 5, 5],\n            [6, 6, 6],\n        ]\n    )\n\n    # Use grouped files.\n    counts = np.array([3, 3, 2, 2])\n    expected = np.array(\n        [[2 / 3, 2 / 3], [0, 0], [0, 0], [1 / 3, 1 / 2], [0, 0], [0, 0]]\n    )\n    actual = tools.extensions.compute_euclidean_pairwise_duplicates(\n        X=X.astype(\"int32\"),\n        threshold=1,\n        counts=counts.astype(\"uint32\"),\n        compute_overlap=True,\n    )\n    assert (expected == actual).all()\n\n    # Use without computing overlap.\n    expected = np.array([[2, 2], [0, 0], [0, 0], [1, 1], [0, 0], [0, 0]])\n    actual = tools.extensions.compute_euclidean_pairwise_duplicates(\n        X=X.astype(\"int32\"),\n        threshold=1,\n        counts=counts.astype(\"uint32\"),\n        compute_overlap=False,\n    )\n    assert (expected == actual).all()\n\n    # Use ungrouped files.\n    X = np.array(\n        [\n            # File 1\n            [0, 0, 0],\n            [1, 1, 1],\n            [2, 2, 2],\n            [1, 1, 1],\n        ]\n    )\n    expected = np.array([[0, 0], [0, 0], [0, 0], [0, 0], [1, 1], [0, 0]])\n    actual = tools.extensions.compute_euclidean_pairwise_duplicates(\n        X=X.astype(\"int32\"), threshold=1, compute_overlap=True\n    )\n    assert (expected == actual).all()\n\n\ndef test_api_is_over_https():\n    matcher_https = tools.SaferMatcher(api_key=\"foo\", url=\"https://www.example.com/\")\n    assert matcher_https\n\n    if \"SAFER_MATCHING_SERVICE_DEV_ALLOW_HTTP\" in os.environ:\n        del os.environ[\"SAFER_MATCHING_SERVICE_DEV_ALLOW_HTTP\"]\n    with pytest.raises(ValueError):\n        tools.SaferMatcher(api_key=\"foo\", url=\"http://www.example.com/\")\n\n    os.environ[\"SAFER_MATCHING_SERVICE_DEV_ALLOW_HTTP\"] = \"1\"\n    matcher_http_with_escape_hatch = tools.SaferMatcher(\n        api_key=\"foo\", url=\"http://www.example.com/\"\n    )\n    assert matcher_http_with_escape_hatch\n\n\ndef test_unletterbox():\n    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])\n    padded = np.zeros((image.shape[0] + 100, image.shape[1] + 50, 3), dtype=\"uint8\")\n    padded[50 : 50 + image.shape[0], 25 : 25 + image.shape[1]] = image\n    result = hashers.tools.unletterbox(padded)\n    assert result is not None\n    (x1, x2), (y1, y2) = result\n    assert y1 == 50\n    assert y2 == 50 + image.shape[0]\n    assert x1 == 25\n    assert x2 == 25 + image.shape[1]\n\n\ndef test_unletterbox_crop():\n    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])\n    padded = np.zeros((image.shape[0] + 100, image.shape[1] + 50, 3), dtype=\"uint8\")\n    padded[50 : 50 + image.shape[0], 25 : 25 + image.shape[1]] = image\n    cropped_image = hashers.tools.unletterbox_crop(padded)\n    assert cropped_image is not None\n    assert image.shape[0] == cropped_image.shape[0]\n    assert image.shape[1] == cropped_image.shape[1]\n\n\ndef test_unletterbox_crop_meaningful_pixels():\n    \"\"\"Test the value of .5 min_fraction_meaningful_pixels in unletterbox_crop().\"\"\"\n    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])\n    h, w, _ = image.shape\n\n    # make tall skinny images with lots of 
padding around the content\n    # so it's below the min_fraction_meaningful_pixels threshold\n    padding_size = int(5 * h)\n\n    padded = np.r_[\n        np.zeros((padding_size, w, 3)), image, np.zeros((padding_size, w, 3))\n    ]\n    assert None is hashers.tools.unletterbox_crop(\n        padded, min_fraction_meaningful_pixels=0.5\n    )\n\n\ndef test_unletterbox_color():\n    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])\n    padded = np.zeros((image.shape[0] + 100, image.shape[1] + 50, 3), dtype=\"uint8\")\n    padded[:, :] = (200, 0, 200)\n    padded[50 : 50 + image.shape[0], 25 : 25 + image.shape[1]] = image\n    # Should not unletterbox since not black.\n    results = hashers.tools.unletterbox(padded, only_remove_black=True)\n    assert results is not None\n    (x1, x2), (y1, y2) = results\n    assert y1 == 0\n    assert y2 == padded.shape[0]\n    assert x1 == 0\n    assert x2 == padded.shape[1]\n\n    # Should unletterbox color:\n    results = hashers.tools.unletterbox(padded, only_remove_black=False)\n    assert results is not None\n    (x1, x2), (y1, y2) = results\n    assert y1 == 50\n    assert y2 == 50 + image.shape[0]\n    assert x1 == 25\n    assert x2 == 25 + image.shape[1]\n\n\ndef test_unletterbox_aspect_ratio():\n    \"\"\"Test the value of .1 in unletterbox().\"\"\"\n    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])\n    h, w, _ = image.shape\n\n    # make tall skinny images with non-trivial content just below and\n    # above the 10% threshold\n    base = int(4.5 * h)  # 2 * base + h = 100%\n    h_fail, h_pass = base + 10, base - 10\n\n    padded = np.r_[np.zeros((h_fail, w, 3)), image, np.zeros((h_fail, w, 3))]\n    assert None is hashers.tools.unletterbox(padded)\n\n    padded = np.r_[np.zeros((h_pass, w, 3)), image, np.zeros((h_pass, w, 3))]\n\n    results = hashers.tools.unletterbox(padded)\n    assert results is not None\n    (x1, x2), (y1, y2) = results\n\n    assert y1 == h_pass\n    assert y2 == h_pass + image.shape[0]\n    assert x1 == 0\n    assert x2 == image.shape[1]\n\n\ndef test_unletterbox_noblackbars():\n    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])\n\n    results = hashers.tools.unletterbox(image)\n    assert results is not None\n    (x1, x2), (y1, y2) = results\n    assert x1 == 0\n    assert y1 == 0\n    assert x2 == image.shape[1]\n    assert y2 == image.shape[0]\n\n\ndef test_ffmpeg_video():\n    \"\"\"Check that the FFMPEG video parsing code provides substantially similar\n    results to the OpenCV approach (which also uses FFMPEG under the hood but\n    has different frame selection logic).\"\"\"\n    frames_per_second = 2.3\n    for filepath in testing.DEFAULT_TEST_VIDEOS:\n        filename = os.path.basename(filepath)\n        for (frame1, index1, timestamp1), (frame2, index2, timestamp2) in zip(\n            hashers.tools.read_video_to_generator_ffmpeg(\n                filepath, frames_per_second=frames_per_second\n            ),\n            hashers.tools.read_video_to_generator(\n                filepath, frames_per_second=frames_per_second\n            ),\n        ):\n            diff = np.abs(frame1.astype(\"int32\") - frame2.astype(\"int32\")).flatten()\n            assert index1 == index2, f\"Index mismatch for {filename}\"\n            np.testing.assert_allclose(\n                timestamp1, timestamp2, err_msg=f\"Timestamp mismatch for {filename}\"\n            )\n            assert np.percentile(diff, 75) < 25, f\"Frame mismatch for {filename}\"\n\n\ndef test_videos_with_extra_channels():\n    
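\"\"\"Videos with extra attached streams (an attached picture, with and\n    without audio) should still decode to the expected number of frames.\"\"\"\n    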
frames_per_second = 1\n    test_videos = [\n        \"perception/testing/videos/extra_channel_attached_pic.mp4\",\n        \"perception/testing/videos/extra_channel_attached_pic_audio.mp4\",\n    ]\n    expected_frames = 10\n    for filepath in test_videos:\n        filename = os.path.basename(filepath)\n        frame_count = 0\n        for frame1, index1, timestamp1 in hashers.tools.read_video_to_generator_ffmpeg(\n            filepath, frames_per_second=frames_per_second\n        ):\n            frame_count += 1\n        assert frame_count == expected_frames, f\"Frame count mismatch for {filename}\"\n\n\ndef test_image_input_types():\n    image_expected = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])\n\n    with open(testing.DEFAULT_TEST_IMAGES[0], \"rb\") as f:\n        image_data = f.read()\n\n    image_bytes_io = hashers.tools.read(io.BytesIO(image_data))\n    assert (image_expected == image_bytes_io).all()\n\n    with tempfile.SpooledTemporaryFile() as f:\n        f.write(image_data)\n        f.seek(0)\n        image_tempfile = hashers.tools.read(f)\n\n    assert (image_expected == image_tempfile).all()\n"
  }
]