Repository: thorn-oss/perception Branch: main Commit: b17dcb841435 Files: 75 Total size: 366.2 KB Directory structure: gitextract_4qwyzu2o/ ├── .dockerignore ├── .git-blame-ignore-revs ├── .gitattributes ├── .github/ │ ├── dependabot.yaml │ └── workflows/ │ ├── ci.yaml │ ├── gh-pages.yaml │ └── release.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── build.py ├── docs/ │ ├── api/ │ │ ├── benchmarking.rst │ │ ├── hashers.rst │ │ ├── index.rst │ │ └── tools.rst │ ├── conf.py │ ├── examples/ │ │ ├── benchmarking.rst │ │ ├── deduplication.rst │ │ ├── detecting_csam.rst │ │ └── index.rst │ ├── index.rst │ └── requirements.txt ├── perception/ │ ├── __init__.py │ ├── approximate_deduplication/ │ │ ├── __init__.py │ │ ├── _graph_backend.py │ │ ├── debug.py │ │ ├── index.py │ │ └── serve.py │ ├── benchmarking/ │ │ ├── __init__.py │ │ ├── common.py │ │ ├── extensions.pyx │ │ ├── image.py │ │ ├── image_transforms.py │ │ ├── video.py │ │ └── video_transforms.py │ ├── extensions.pyx │ ├── hashers/ │ │ ├── __init__.py │ │ ├── hasher.py │ │ ├── image/ │ │ │ ├── __init__.py │ │ │ ├── average.py │ │ │ ├── dhash.py │ │ │ ├── opencv.py │ │ │ ├── pdq.py │ │ │ ├── phash.py │ │ │ └── wavelet.py │ │ ├── tools.py │ │ └── video/ │ │ ├── __init__.py │ │ ├── framewise.py │ │ └── tmk.py │ ├── local_descriptor_deduplication.py │ ├── py.typed │ ├── testing/ │ │ ├── __init__.py │ │ ├── images/ │ │ │ └── README.md │ │ ├── logos/ │ │ │ └── README.md │ │ └── videos/ │ │ ├── README.md │ │ ├── rgb.m4v │ │ ├── v1.m4v │ │ └── v2.m4v │ ├── tools.py │ └── utils.py ├── poetry.toml ├── pyproject.toml ├── setup.py └── tests/ ├── test_approximate_deduplication.py ├── test_benchmarking.py ├── test_hashers.py ├── test_local_descriptor_deduplication.py ├── test_tmk.py └── test_tools.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ notebooks .venv/ ================================================ FILE: .git-blame-ignore-revs ================================================ # Format with black 6c03f96a9335e548685ece233474125fe453c262 ================================================ FILE: .gitattributes ================================================ perception/_version.py export-subst ================================================ FILE: .github/dependabot.yaml ================================================ version: 2 updates: - package-ecosystem: "github-actions" directory: "/" schedule: # Check for updates to GitHub Actions every week. interval: "weekly" ================================================ FILE: .github/workflows/ci.yaml ================================================ name: ci on: push: branches: - "**" tags-ignore: - v* jobs: test: strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] os: ["ubuntu-latest", "windows-latest", "macos-latest"] runs-on: ${{ matrix.os }} steps: - name: checkout uses: actions/checkout@v6 - name: Setup Poetry uses: abatilo/actions-poetry@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} cache: poetry cache-dependency-path: poetry.lock - name: Setup FFMPEG uses: FedericoCarboni/setup-ffmpeg@v3 if: ${{ ! 
startsWith(matrix.os, 'macos') }} - name: Setup Dependencies with Homebrew if: startsWith(matrix.os, 'macos') run: | brew install llvm ffmpeg echo "CC=$(brew --prefix)/opt/llvm/bin/clang" >> $GITHUB_ENV echo "CXX=$(brew --prefix)/opt/llvm/bin/clang++" >> $GITHUB_ENV - name: Setup Project run: make init-project - name: Normalize OpenCV package run: | poetry run python -m pip uninstall -y opencv-python-headless poetry run python -m pip install --no-deps --force-reinstall opencv-contrib-python-headless - name: Run precommit run: make precommit ================================================ FILE: .github/workflows/gh-pages.yaml ================================================ name: Deploy Sphinx documentation to Pages on: push: branches: - dunnack/sphinx-to-github-pages - main paths: - .github/workflows/gh-pages.yaml - docs/** jobs: pages: runs-on: ubuntu-latest environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} permissions: contents: read pages: write id-token: write steps: - uses: actions/checkout@v6 with: fetch-depth: 0 - id: deployment uses: sphinx-notes/pages@v3 with: checkout: false documentation_path: docs requirements_path: docs/requirements.txt ================================================ FILE: .github/workflows/release.yaml ================================================ name: release on: release: types: [published] workflow_dispatch: jobs: build-wheels: runs-on: ${{ matrix.os }} strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] os: ["ubuntu-latest", "windows-latest", "macos-latest"] name: Build for ${{ matrix.os }} on Python ${{ matrix.python-version }} steps: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Setup Poetry uses: abatilo/actions-poetry@v4 - name: Setup FFMPEG uses: FedericoCarboni/setup-ffmpeg@v3 if: ${{ ! 
startsWith(matrix.os, 'macos') }} - name: Setup Dependencies with Homebrew if: startsWith(matrix.os, 'macos') run: | brew install llvm ffmpeg echo "CC=$(brew --prefix)/opt/llvm/bin/clang" >> $GITHUB_ENV echo "CXX=$(brew --prefix)/opt/llvm/bin/clang++" >> $GITHUB_ENV - uses: actions/checkout@v6 with: # Full clone for version calculation fetch-depth: 0 fetch-tags: true ref: ${{ github.event_name == 'release' && format('refs/tags/{0}', github.event.release.tag_name) || github.ref }} - name: Build Project run: make build-wheel - uses: actions/upload-artifact@v7 with: name: package-wheels-${{ matrix.os }}-${{ matrix.python-version }} path: dist/* build-sdist: runs-on: ubuntu-latest name: Build sdist steps: - name: Set up Python uses: actions/setup-python@v6 with: python-version: "3.13" - name: Setup Poetry uses: abatilo/actions-poetry@v4 - uses: actions/checkout@v6 with: # Full clone for version calculation fetch-depth: 0 fetch-tags: true ref: ${{ github.event_name == 'release' && format('refs/tags/{0}', github.event.release.tag_name) || github.ref }} - name: Build Project run: make build-sdist - uses: actions/upload-artifact@v7 with: name: package-sdist path: dist/* publish: needs: [build-wheels, build-sdist] runs-on: ubuntu-latest if: ${{ github.repository_owner == 'thorn-oss' && github.event_name == 'release' }} steps: - uses: actions/checkout@v6 with: # Full clone for version calculation fetch-depth: 0 fetch-tags: true ref: refs/tags/${{ github.event.release.tag_name }} - uses: actions/setup-python@v6 with: python-version: "3.13" - name: Setup Poetry uses: abatilo/actions-poetry@v4 - name: Setup Dynamic Versioning run: poetry self add "poetry-dynamic-versioning[plugin]" - name: Download wheels uses: actions/download-artifact@v8 with: path: dist pattern: package-* merge-multiple: true - name: Load PyPI Token uses: 1password/load-secrets-action@v4 with: # Export loaded secrets as environment variables export-env: true env: OP_SERVICE_ACCOUNT_TOKEN: ${{ secrets.DATA_SCIENCE_OP_SERVICE_ACCOUNT_TOKEN }} POETRY_PYPI_TOKEN_PYPI: op://data-science-oss/perception-pypi-api-key/secret/value - name: Verify artifacts run: | mapfile -t artifacts < <(find dist -type f \( -name "*.whl" -o -name "*.tar.gz" \)) if [ ${#artifacts[@]} -eq 0 ]; then echo "No artifacts found in dist" exit 1 fi printf '%s\n' "${artifacts[@]}" if printf '%s\n' "${artifacts[@]}" | grep -E -- '-0\.0\.0([.-]|$)'; then echo "Refusing to publish placeholder version 0.0.0 artifacts" exit 1 fi - name: Publish package run: poetry publish -n ================================================ FILE: .gitignore ================================================ # MacOS stuff .DS_Store # Python artifacts *.egg-info # Cache .mypy_cache .pytest_cache __pycache__ .ipynb_checkpoints dist # Any temporary images or CSV files notebooks # Local environment .venv .python-version # Coverage file .coverage # Versioneer artifacts /versioneer.pyc # Build artifacts /build # Docs build artifacts /docs/_build # Remove .vscode folder .vscode # Extension artifacts *.c *.cpp *.so debug-image* ================================================ FILE: .pre-commit-config.yaml ================================================ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - id: check-added-large-files - repo: https://github.com/psf/black rev: 26.3.1 hooks: - id: black 
language_version: python3 - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. rev: v0.11.13 hooks: # Run the linter. - id: ruff args: [ --fix ] - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.8.0 hooks: - id: mypy
================================================ FILE: .readthedocs.yaml ================================================ version: 2 # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py formats: all # Installs the package and the docs requirements. python: version: 3.9 install: - requirements: docs/requirements.txt - method: pip path: . system_packages: true
================================================ FILE: CHANGELOG.md ================================================ # Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [0.4.0] - 2020-10-17 This release switches from using false positive rates in benchmarking to reporting precision, which is more intuitive. ### Breaking changes All references to fpr_threshold now refer to precision_threshold. ### Bug fixes The PDQHash hasher now correctly returns the hash vector instead of the (vector, quality) tuple. ## [0.3.0] - 2020-04-27 This release adds significantly more support for video. ### Breaking changes - Previously, `read_video` returned `(frame, index, timestamp)` tuples where `index` reflected the index of the yielded frame (i.e., it always increased by exactly 1). It now reflects the index of the frame in the original video. This means that, if the requested framerate is higher than the encoded video framerate, this index may repeat the same value, indicating that we have repeated the same frame. ### Enhancements - We now include a `SimpleSceneDetection` hasher that can wrap other video hashers using scene detection. - `compute_metrics` is much faster now for integer-valued hashes that use a euclidean distance metric. - We now include an unsigned 8-bit integer version of `PHash`, called `PHashU8`. This provides a useful framewise hasher for averaging across frames (e.g., using TMK) while being more compact than `PHashF`. - We include more thorough support for benchmarking video hashes. ### Bug fixes - When using `hasher.vector_to_string` with hashers that return multiple hashes, the `hash_format` argument was not respected. - The `compute_threshold_recall` and `show_histograms` functions did not work properly when `grouping=[]`. ## [0.2.0] - 2019-12-20 This release adds more support for hashing videos (including TMK L1 and TMK L2). As part of that, it also includes a refactor to separate `benchmarking.BenchmarkDataset` and `benchmarking.BenchmarkTransforms` into image and video variants. ## [0.1.0] - 2019-11-04 Initial release
================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies within all project spaces, and it also applies when an individual is representing the project or its community in public spaces. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at conduct@thorn.org. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 https://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS Copyright 2019 Thorn Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at https://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: MANIFEST.in ================================================ include perception/testing/images/* include perception/testing/videos/* include perception/testing/logos/* include perception/**/*.pyx include perception/*.pyx include perception/py.typed exclude tests/* ================================================ FILE: Makefile ================================================ TEST_SCOPE?=tests/ .PHONY: build build-wheel build-sdist verify-version init-project init test lint_check type_check format format_check precommit init-project: poetry install --all-extras init: init-project poetry run pre-commit install test: poetry run pytest $(TEST_SCOPE) lint_check: poetry run ruff check perception tests type_check: poetry run mypy perception format: poetry run black . format_check: poetry run black --check . || (echo '\nUnexpected format.' 
&& exit 1) precommit: poetry check make lint_check make type_check make format_check make test verify-version: @echo "Poetry: $$(poetry --version)" @echo "Poetry plugins:" poetry self show plugins @echo "Git describe: $$(git describe --tags --always)" @poetry self show plugins | grep -q "poetry-dynamic-versioning" build-wheel: poetry run pip -q install repairwheel poetry self add "poetry-dynamic-versioning[plugin]" $(MAKE) verify-version poetry build --format="wheel" --output="dist-tmp" poetry run repairwheel -o dist dist-tmp/*.whl @find dist -name "*.whl" -type f | sed -n "s/\(.*\)\.linux.*\.whl$$/& \1.whl/p" | xargs -r -n 2 mv # Fix wheel name @rm -rf dist-tmp build-sdist: poetry self add "poetry-dynamic-versioning[plugin]" $(MAKE) verify-version poetry build --format="sdist" --output="dist" build: build-wheel build-sdist
================================================ FILE: README.md ================================================ # perception ![ci](https://github.com/thorn-oss/perception/workflows/ci/badge.svg) `perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use. See [the documentation](https://perception.thorn.engineering/en/latest/) for details. ## Background `perception` was initially developed at [Thorn](https://www.thorn.org) as part of our work to eliminate child sexual abuse material from the internet. For more information on the issue, check out [our CEO's TED talk](https://www.thorn.org/blog/time-is-now-eliminate-csam/). ## Getting Started ### Installation `pip install perception` ### Hashing Hashing with different functions is simple with `perception`. ```python from perception import hashers file1, file2 = 'test1.jpg', 'test2.jpg' hasher = hashers.PHash() hash1, hash2 = hasher.compute(file1), hasher.compute(file2) distance = hasher.compute_distance(hash1, hash2) ```
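Hashes are computed as strings, and each hasher defines its own distance metric, so distances are only comparable within a single hasher. Here is a minimal sketch (the file paths are placeholders) comparing a few of the bundled hashers on the same pair of files:

```python
from perception import hashers

file1, file2 = 'test1.jpg', 'test2.jpg'  # placeholder paths

# Each hasher computes its own hash and defines its own distance
# metric, so only compare distances produced by the same hasher.
for hasher in [hashers.PHash(), hashers.DHash(), hashers.AverageHash()]:
    hash1, hash2 = hasher.compute(file1), hasher.compute(file2)
    print(type(hasher).__name__, hasher.compute_distance(hash1, hash2))
```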
### Examples See below for end-to-end examples covering common use cases for perceptual hashes. - [Detecting child sexual abuse material](https://perception.thorn.engineering/en/latest/examples/detecting_csam.html) - [Deduplicating media](https://perception.thorn.engineering/en/latest/examples/deduplication.html) - [Benchmarking perceptual hashes](https://perception.thorn.engineering/en/latest/examples/benchmarking.html) ## Supported Hashing Algorithms `perception` currently ships with: - pHash (DCT hash) (`perception.hashers.PHash`) - Facebook's PDQ Hash (`perception.hashers.PDQ`) - dHash (difference hash) (`perception.hashers.DHash`) - aHash (average hash) (`perception.hashers.AverageHash`) - Marr-Hildreth (`perception.hashers.MarrHildreth`) - Color Moment (`perception.hashers.ColorMoment`) - Block Mean (`perception.hashers.BlockMean`) - wHash (wavelet hash) (`perception.hashers.WaveletHash`) ## Contributing To work on the project, start by doing the following. ```bash # Install local dependencies for # code completion, etc. make init ``` - To do a (close to) comprehensive check before committing code, you can use `make precommit`. To implement new features, please first file an issue proposing your change for discussion. To report problems, please file an issue with sample code, expected results, actual results, and a complete traceback. ## Alternatives There are other packages worth checking out to see if they meet your needs for perceptual hashing. Here are some examples. - [dedupe](https://github.com/dedupeio/dedupe) - [imagededup](https://idealo.github.io/imagededup/) - [ImageHash](https://github.com/JohannesBuchner/imagehash) - [PhotoHash](https://github.com/bunchesofdonald/photohash)
================================================ FILE: build.py ================================================ from Cython.Build import cythonize import numpy as np compiler_directives = {"language_level": 3, "embedsignature": True} def build(setup_kwargs): setup_kwargs.update( { "ext_modules": cythonize( "perception/**/extensions.pyx", compiler_directives=compiler_directives ), "include_dirs": [np.get_include()], } )
================================================ FILE: docs/api/benchmarking.rst ================================================ Benchmarking ************ .. autoclass:: perception.benchmarking.BenchmarkImageDataset :members: :inherited-members: .. autoclass:: perception.benchmarking.BenchmarkImageTransforms :members: :inherited-members: .. autoclass:: perception.benchmarking.BenchmarkVideoDataset :members: :inherited-members: .. autoclass:: perception.benchmarking.BenchmarkVideoTransforms :members: :inherited-members: .. autoclass:: perception.benchmarking.BenchmarkHashes :members: :inherited-members: Video Transforms ================ Transforming videos can be more complex, so we provide the following tools. .. automodule:: perception.benchmarking.video_transforms :members: get_simple_transform, get_black_frame_padding_transform, get_slideshow_transform
================================================ FILE: docs/api/hashers.rst ================================================ Hashers ******* All hashers inherit from the :code:`Hasher` class. .. autoclass:: perception.hashers.hasher.Hasher :members: Images ~~~~~~ All image hashers inherit from the :code:`ImageHasher` class. .. autoclass:: perception.hashers.hasher.ImageHasher :members: The following image hash functions are included in the package. .. automodule:: perception.hashers.image :members: :imported-members: Videos ~~~~~~ All video hashers inherit from the :code:`VideoHasher` class. .. autoclass:: perception.hashers.hasher.VideoHasher :members: The following video hash functions are included in the package. .. automodule:: perception.hashers.video :members: :imported-members: Tools ~~~~~ These utility functions are only used by the hashers but are documented here for completeness. .. automodule:: perception.hashers.tools :members:
================================================ FILE: docs/api/index.rst ================================================ API *** .. toctree:: :maxdepth: 2 :caption: Contents: hashers benchmarking tools
================================================ FILE: docs/api/tools.rst ================================================ Tools ***** .. automodule:: perception.tools :members:
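As a quick orientation before the generated reference, a minimal sketch of a deduplication call is shown below; the file paths are placeholders, and the :code:`(hasher, threshold)` pairing follows the deduplication example rather than being required values.

.. code-block:: python

    from perception import hashers, tools

    # Find pairs of likely-duplicate files: each (hasher, threshold) pair
    # marks two files as duplicates when their hash distance falls within
    # the threshold.
    duplicate_pairs = tools.deduplicate(
        files=['a.jpg', 'b.jpg', 'c.jpg'],
        hashers=[(hashers.PHash(hash_size=16), 0.2)],
    )
    for file1, file2 in duplicate_pairs:
        print(file1, file2)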
================================================ FILE: docs/conf.py ================================================ # -*- coding: utf-8 -*- # # Configuration file for the Sphinx documentation builder. # # This file does only contain a selection of the most common options. For a # full list see the documentation: # http://www.sphinx-doc.org/en/master/config # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # # -- Project information ----------------------------------------------------- project = "perception" copyright = "2019, thorn" author = "thorn" # The short X.Y version version = "" # The full version, including alpha/beta/rc tags release = "" # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "sphinx.ext.autodoc", "sphinx.ext.imgmath", "sphinx.ext.napoleon", "sphinx_autodoc_typehints", "m2r", ] # The suffix(es) of source filenames. # You can specify multiple suffixes as a list of strings: # # source_suffix = ['.rst', '.md'] source_suffix = ".rst" # The master toctree document. master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = None html_theme = "sphinx_rtd_theme" html_theme_options = {"navigation_depth": 4, "collapse_navigation": False}
================================================ FILE: docs/examples/benchmarking.rst ================================================ Benchmarking ************ This package provides a fair amount of infrastructure for benchmarking different hashers to evaluate their performance. Image Hashing ============= The below example does the following: - Download a benchmarking dataset (we provide a dataset with images that have compatible licensing for this example) - Load the dataset. If you are using your own datasets, you may wish to call `deduplicate` on it to ensure no duplicates are included. - Transform the dataset to generate synthetic images. - Define a new custom hasher that we want to evaluate. It's not very good -- but it demonstrates how you can evaluate your own custom hash functions. - Compute all the hashes. - Report metrics for each image category / hasher / transformation combination. .. code-block:: python import os import glob import zipfile import urllib.request import cv2 import albumentations import tabulate # Optional: Only used for generating tables for the Sphinx documentation import numpy as np from perception import benchmarking, hashers from perception.hashers.image.pdq import PDQHash urllib.request.urlretrieve( "https://thorn-perception.s3.amazonaws.com/thorn-perceptual-benchmark-v0.zip", "thorn-perceptual-benchmark-v0.zip" ) with zipfile.ZipFile('thorn-perceptual-benchmark-v0.zip') as f: f.extractall('.') # Load the dataset dataset = benchmarking.BenchmarkImageDataset.from_tuples(files=[ (filepath, filepath.split(os.path.sep)[-2]) for filepath in glob.glob( os.path.join('thorn-perceptual-benchmark-v0', '**', '*.jpg') ) ]) # Define the transforms we want to use for # evaluating hash quality.
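# The watermark transform below renders text onto a transparent RGBA
# placeholder and alpha-blends it onto the input image; the vignette
# transform builds a 2-D Gaussian mask from the outer product of two
# 1-D Gaussian kernels and darkens the image toward its edges.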
def watermark(image): fontScale = 5 thickness = 5 text = "TEST" fontFace = cv2.FONT_HERSHEY_SIMPLEX targetWidth = 0.2*image.shape[1] (textWidth, textHeight), _ = cv2.getTextSize( text=text, fontFace=fontFace, fontScale=fontScale, thickness=thickness ) fontScaleCorr = targetWidth / textWidth textHeight *= fontScaleCorr textWidth *= fontScaleCorr fontScale *= fontScaleCorr org = ( textHeight, image.shape[0] - textHeight ) org = tuple(map(int, org)) color = (0, 0, 0, 200) placeholder = cv2.putText( img=np.zeros(image.shape[:2] + (4, ), dtype='uint8'), text=text, org=org, color=color, fontFace=fontFace, fontScale=fontScale, thickness=thickness ).astype('float32') augmented = ( (image.astype('float32')[..., :3]*(255 - placeholder[..., 3:]) + placeholder[..., :3]*placeholder[..., 3:]) ) / 255 return augmented.astype('uint8') def vignette(image): height, width = image.shape[:2] a = cv2.getGaussianKernel(height, height/2) b = cv2.getGaussianKernel(width, width/2) c = (b.T*a)[..., np.newaxis] d = c/c.max() e = image*d return e.astype('uint8') transforms={ 'watermark': watermark, 'blur2': albumentations.GaussianBlur(sigma_limit=2.0, p=1), 'vignette': vignette, 'gamma2': albumentations.RandomGamma(gamma_limit=2, p=1), 'jpeg95': albumentations.ImageCompression(quality=95, p=1), 'pad0.2': albumentations.CropAndPad(percent=(0.2, 2), p=1), 'crop0.05': albumentations.CropAndPad(percent=-0.05, p=1), 'noise0.2': albumentations.GaussNoise(noise_scale_factor=0.2, p=1), 'rotate4': albumentations.Affine(rotate=4, p=1), 'noop': albumentations.NoOp(p=1), } # Compute the transformed versions of the images. # This takes a while but you can reload the # generated dataset without recomputing it (see next line). transformed = dataset.transform( transforms=transforms, storage_dir='transformed', errors="raise" ) # We don't actually have to do this, but it shows # how to reload the transformed dataset later. transformed = benchmarking.BenchmarkImageTransforms.load( path_to_zip_or_directory='transformed', verify_md5=False ) # Create a new hash that we want to evaluate. # perception will handle most of the plumbing but # we do have to specify a few things. class ShrinkHash(hashers.ImageHasher): """This is a simple hash to demonstrate how you can create your own hasher and compare it to others. It just shrinks images to 8x8 pixels and then flattens the result. """ # We have to let perception know # the shape and type of our hash. hash_length = 64 dtype = 'uint8' # We need to specify how distance is # computed between hashes. distance_metric = 'euclidean' def _compute(self, image): gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) resized = cv2.resize(gray, dsize=(8, 8)) return resized.flatten() hashers_dict = { 'ahash': hashers.AverageHash(hash_size=16), 'dhash': hashers.DHash(hash_size=16), 'pdq': PDQHash(), 'phash': hashers.PHash(hash_size=16), 'marrhildreth': hashers.MarrHildreth(), 'wavelet': hashers.WaveletHash(hash_size=16), 'blockmean': hashers.BlockMean(), 'shrinkhash': ShrinkHash() } # Compute the hashes hashes = transformed.compute_hashes(hashers=hashers_dict) # Get performance metrics (i.e., recall) for each hash function based on # a minimum precision threshold. Here we use 99.99%. precision_threshold = 99.99 # The metrics are just pandas dataframes. We use tabulate here to obtain the tables # formatted for the documentation.
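# compute_threshold_recall finds, for each group, the largest distance
# threshold at which precision stays at or above precision_threshold,
# and reports the recall achieved at that threshold.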
metrics = hashes.compute_threshold_recall(precision_threshold=precision_threshold).reset_index() print(tabulate.tabulate(metrics, showindex=False, headers=metrics.columns, tablefmt='rst')) metrics_by_transform = hashes.compute_threshold_recall(grouping=['transform_name'], precision_threshold=precision_threshold).reset_index() print(tabulate.tabulate(metrics_by_transform, showindex=False, headers=metrics_by_transform.columns, tablefmt='rst')) metrics_simple = hashes.compute_threshold_recall(grouping=[], precision_threshold=precision_threshold).reset_index() print(tabulate.tabulate(metrics_simple, showindex=False, headers=metrics_simple.columns, tablefmt='rst')) =========== ================ ============= ============ ======== =========== ============= category transform_name hasher_name threshold recall precision n_exemplars =========== ================ ============= ============ ======== =========== ============= paintings blur2 ahash 0.0078125 51.724 100 2204 paintings blur2 blockmean 0.0123967 85.753 100 2204 paintings blur2 dhash 0.105469 100 100 2204 paintings blur2 marrhildreth 0.0989583 100 100 2204 paintings blur2 pdq 0.117188 100 100 2204 paintings blur2 phash 0.0390625 100 100 2204 paintings blur2 shrinkhash 60.8112 43.33 100 2204 paintings blur2 wavelet 0.0117188 66.379 100 2204 paintings crop0.05 ahash 0.00390625 0.045 100 2204 paintings crop0.05 blockmean 0.0123967 0.227 100 2204 paintings crop0.05 dhash 0.210938 7.577 100 2204 paintings crop0.05 marrhildreth 0.213542 3.584 100 2204 paintings crop0.05 pdq 0.257812 8.439 100 2204 paintings crop0.05 phash 0.226562 6.76 100 2204 paintings crop0.05 shrinkhash 95.0053 2.269 100 2204 paintings crop0.05 wavelet 0.0078125 0 nan 2204 paintings gamma2 ahash 0.00390625 0.998 100 2204 paintings gamma2 blockmean 0.0072314 1.724 100 2204 paintings gamma2 dhash 0.167969 98.639 100 2204 paintings gamma2 marrhildreth 0.159722 99.41 100 2204 paintings gamma2 pdq 0.164062 100 100 2204 paintings gamma2 phash 0.164062 100 100 2204 paintings gamma2 shrinkhash 46.5296 0 nan 2204 paintings gamma2 wavelet 0.0117188 18.512 100 2204 paintings jpeg95 ahash 0.00390625 4.22 100 2204 paintings jpeg95 blockmean 0.0134298 28.811 100 2204 paintings jpeg95 dhash 0.191406 94.782 100 2204 paintings jpeg95 marrhildreth 0.168403 82.985 100 2204 paintings jpeg95 pdq 0.257812 100 100 2204 paintings jpeg95 phash 0.234375 100 100 2204 paintings jpeg95 shrinkhash 66.053 55.172 100 2204 paintings jpeg95 wavelet 0 0 nan 2204 paintings noise0.2 ahash 0.00390625 2.677 100 2204 paintings noise0.2 blockmean 0.00826446 6.987 100 2204 paintings noise0.2 dhash 0.25 93.648 100 2204 paintings noise0.2 marrhildreth 0.170139 73.911 100 2204 paintings noise0.2 pdq 0.257812 99.229 100 2204 paintings noise0.2 phash 0.257812 100 100 2204 paintings noise0.2 shrinkhash 169.387 3.312 100 2204 paintings noise0.2 wavelet 0.0078125 1.407 100 2204 paintings noop ahash 0 100 100 2204 paintings noop blockmean 0 100 100 2204 paintings noop dhash 0 100 100 2204 paintings noop marrhildreth 0 100 100 2204 paintings noop pdq 0 100 100 2204 paintings noop phash 0 100 100 2204 paintings noop shrinkhash 0 100 100 2204 paintings noop wavelet 0 100 100 2204 paintings pad0.2 ahash 0.0703125 0 nan 2204 paintings pad0.2 blockmean 0.0795455 0 nan 2204 paintings pad0.2 dhash 0.210938 1.089 100 2204 paintings pad0.2 marrhildreth 0.177083 0 nan 2204 paintings pad0.2 pdq 0.289062 1.86 100 2204 paintings pad0.2 phash 0.273438 2.541 100 2204 paintings pad0.2 shrinkhash 146.325 0.181 100 2204 paintings pad0.2 wavelet 
0.109375 0 nan 2204 paintings resize0.5 ahash 0.0078125 76.089 100 2204 paintings resize0.5 blockmean 0.0144628 98.185 100 2204 paintings resize0.5 dhash 0.0976562 100 100 2204 paintings resize0.5 marrhildreth 0.154514 99.819 100 2204 paintings resize0.5 pdq 0.1875 100 100 2204 paintings resize0.5 phash 0.09375 100 100 2204 paintings resize0.5 shrinkhash 56.9034 76.27 100 2204 paintings resize0.5 wavelet 0.0117188 84.71 100 2204 paintings rotate4 ahash 0.0390625 2.949 100 2204 paintings rotate4 blockmean 0.0382231 2.949 100 2204 paintings rotate4 dhash 0.207031 36.298 100 2204 paintings rotate4 marrhildreth 0.227431 61.978 100 2204 paintings rotate4 pdq 0.273438 56.08 100 2204 paintings rotate4 phash 0.257812 61.615 100 2204 paintings rotate4 shrinkhash 69.1737 2.813 100 2204 paintings rotate4 wavelet 0.03125 0.136 100 2204 paintings vignette ahash 0.0429688 6.171 100 2204 paintings vignette blockmean 0.0475207 8.122 100 2204 paintings vignette dhash 0.121094 32.305 100 2204 paintings vignette marrhildreth 0.177083 77.904 100 2204 paintings vignette pdq 0.132812 100 100 2204 paintings vignette phash 0.132812 100 100 2204 paintings vignette shrinkhash 102.186 3.267 100 2204 paintings vignette wavelet 0.046875 3.085 100 2204 paintings watermark ahash 0.00390625 20.054 100 2204 paintings watermark blockmean 0.0123967 45.145 100 2204 paintings watermark dhash 0.0585938 100 100 2204 paintings watermark marrhildreth 0.0625 100 100 2204 paintings watermark pdq 0.273438 98.866 100 2204 paintings watermark phash 0.28125 99.456 100 2204 paintings watermark shrinkhash 104.398 75.998 100 2204 paintings watermark wavelet 0.0117188 51.27 100 2204 photographs blur2 ahash 0.015625 76.727 100 1650 photographs blur2 blockmean 0.0330579 98 100 1650 photographs blur2 dhash 0.0859375 98.97 100 1650 photographs blur2 marrhildreth 0.107639 97.576 100 1650 photographs blur2 pdq 0.304688 100 100 1650 photographs blur2 phash 0.179688 100 100 1650 photographs blur2 shrinkhash 117.627 44 100 1650 photographs blur2 wavelet 0.0195312 79.879 100 1650 photographs crop0.05 ahash 0.0078125 0.182 100 1650 photographs crop0.05 blockmean 0.0258264 0.788 100 1650 photographs crop0.05 dhash 0.0976562 1.091 100 1650 photographs crop0.05 marrhildreth 0.173611 3.152 100 1650 photographs crop0.05 pdq 0.304688 30.606 100 1650 photographs crop0.05 phash 0.320312 63.697 100 1650 photographs crop0.05 shrinkhash 125.94 1.152 100 1650 photographs crop0.05 wavelet 0.015625 0.182 100 1650 photographs gamma2 ahash 0.015625 8.182 100 1650 photographs gamma2 blockmean 0.0268595 17.212 100 1650 photographs gamma2 dhash 0.101562 90.303 100 1650 photographs gamma2 marrhildreth 0.105903 90.909 100 1650 photographs gamma2 pdq 0.210938 100 100 1650 photographs gamma2 phash 0.234375 100 100 1650 photographs gamma2 shrinkhash 119.683 0.545 100 1650 photographs gamma2 wavelet 0.0195312 18.424 100 1650 photographs jpeg95 ahash 0.0117188 29.879 100 1650 photographs jpeg95 blockmean 0.0278926 76.788 100 1650 photographs jpeg95 dhash 0.121094 84.182 100 1650 photographs jpeg95 marrhildreth 0.104167 69.576 100 1650 photographs jpeg95 pdq 0.296875 99.879 100 1650 photographs jpeg95 phash 0.28125 99.879 100 1650 photographs jpeg95 shrinkhash 131.031 89.212 100 1650 photographs jpeg95 wavelet 0.0195312 40.242 100 1650 photographs noise0.2 ahash 0.015625 27.636 100 1650 photographs noise0.2 blockmean 0.036157 75.091 100 1650 photographs noise0.2 dhash 0.121094 54.121 100 1650 photographs noise0.2 marrhildreth 0.0989583 46.364 100 1650 photographs noise0.2 pdq 
0.296875 99.697 100 1650 photographs noise0.2 phash 0.304688 99.818 100 1650 photographs noise0.2 shrinkhash 210.661 57.576 100 1650 photographs noise0.2 wavelet 0.0234375 27.03 100 1650 photographs noop ahash 0 100 100 1650 photographs noop blockmean 0 100 100 1650 photographs noop dhash 0 100 100 1650 photographs noop marrhildreth 0 100 100 1650 photographs noop pdq 0 100 100 1650 photographs noop phash 0 100 100 1650 photographs noop shrinkhash 0 100 100 1650 photographs noop wavelet 0 100 100 1650 photographs pad0.2 ahash 0.0429688 0.061 100 1650 photographs pad0.2 blockmean 0.0320248 0 nan 1650 photographs pad0.2 dhash 0.105469 0.545 100 1650 photographs pad0.2 marrhildreth 0.177083 0.121 100 1650 photographs pad0.2 pdq 0.28125 1.455 100 1650 photographs pad0.2 phash 0.289062 3.515 100 1650 photographs pad0.2 shrinkhash 114.721 0.061 100 1650 photographs pad0.2 wavelet 0.0820312 0 nan 1650 photographs resize0.5 ahash 0.015625 87.697 100 1650 photographs resize0.5 blockmean 0.0330579 99.152 100 1650 photographs resize0.5 dhash 0.0898438 98.485 100 1650 photographs resize0.5 marrhildreth 0.111111 95.394 100 1650 photographs resize0.5 pdq 0.328125 99.818 100 1650 photographs resize0.5 phash 0.234375 100 100 1650 photographs resize0.5 shrinkhash 132.117 80.242 100 1650 photographs resize0.5 wavelet 0.0195312 88.97 100 1650 photographs rotate4 ahash 0.0273438 1.818 100 1650 photographs rotate4 blockmean 0.0371901 3.879 100 1650 photographs rotate4 dhash 0.09375 2.97 100 1650 photographs rotate4 marrhildreth 0.149306 4.606 100 1650 photographs rotate4 pdq 0.304688 73.394 100 1650 photographs rotate4 phash 0.3125 89.818 100 1650 photographs rotate4 shrinkhash 130.211 4.424 100 1650 photographs rotate4 wavelet 0.0078125 0.061 100 1650 photographs vignette ahash 0.0273438 8.242 100 1650 photographs vignette blockmean 0.0320248 10 100 1650 photographs vignette dhash 0.0703125 22 100 1650 photographs vignette marrhildreth 0.0954861 38.727 100 1650 photographs vignette pdq 0.117188 100 100 1650 photographs vignette phash 0.125 100 100 1650 photographs vignette shrinkhash 138.989 11.939 100 1650 photographs vignette wavelet 0.0195312 4.242 100 1650 photographs watermark ahash 0.015625 42.667 100 1650 photographs watermark blockmean 0.0247934 60.788 100 1650 photographs watermark dhash 0.078125 100 100 1650 photographs watermark marrhildreth 0.112847 98.727 100 1650 photographs watermark pdq 0.3125 99.818 100 1650 photographs watermark phash 0.3125 99.758 100 1650 photographs watermark shrinkhash 142.046 79.576 100 1650 photographs watermark wavelet 0.0195312 53.455 100 1650 =========== ================ ============= ============ ======== =========== ============= ================ ============= ============ ======== =========== ============= transform_name hasher_name threshold recall precision n_exemplars ================ ============= ============ ======== =========== ============= blur2 ahash 0.0078125 49.014 100 3854 blur2 blockmean 0.0123967 80.773 100 3854 blur2 dhash 0.0859375 99.196 100 3854 blur2 marrhildreth 0.107639 98.962 100 3854 blur2 pdq 0.234375 99.948 100 3854 blur2 phash 0.179688 100 100 3854 blur2 shrinkhash 60.8112 28.412 100 3854 blur2 wavelet 0.0117188 62.247 100 3854 crop0.05 ahash 0.00390625 0.052 100 3854 crop0.05 blockmean 0.0123967 0.208 100 3854 crop0.05 dhash 0.0976562 0.493 100 3854 crop0.05 marrhildreth 0.173611 1.635 100 3854 crop0.05 pdq 0.257812 9.03 100 3854 crop0.05 phash 0.226562 7.058 100 3854 crop0.05 shrinkhash 95.0053 1.427 100 3854 crop0.05 wavelet 
0.0078125 0 nan 3854 gamma2 ahash 0.00390625 0.934 100 3854 gamma2 blockmean 0.0072314 1.713 100 3854 gamma2 dhash 0.101562 90.036 100 3854 gamma2 marrhildreth 0.105903 94.24 100 3854 gamma2 pdq 0.210938 100 100 3854 gamma2 phash 0.234375 100 100 3854 gamma2 shrinkhash 108.457 0.156 100 3854 gamma2 wavelet 0.0117188 14.997 100 3854 jpeg95 ahash 0.00390625 5.319 100 3854 jpeg95 blockmean 0.0134298 32.045 100 3854 jpeg95 dhash 0.121094 74.079 100 3854 jpeg95 marrhildreth 0.104167 59.263 100 3854 jpeg95 pdq 0.257812 99.896 100 3854 jpeg95 phash 0.234375 99.896 100 3854 jpeg95 shrinkhash 66.053 40.296 100 3854 jpeg95 wavelet 0.00390625 3.71 100 3854 noise0.2 ahash 0.00390625 2.984 100 3854 noise0.2 blockmean 0.00826446 8.563 100 3854 noise0.2 dhash 0.121094 40.088 100 3854 noise0.2 marrhildreth 0.0989583 33.083 100 3854 noise0.2 pdq 0.257812 99.222 100 3854 noise0.2 phash 0.273438 99.896 100 3854 noise0.2 shrinkhash 169.387 4.385 100 3854 noise0.2 wavelet 0.0078125 1.894 100 3854 noop ahash 0 100 100 3854 noop blockmean 0 100 100 3854 noop dhash 0 100 100 3854 noop marrhildreth 0 100 100 3854 noop pdq 0 100 100 3854 noop phash 0 100 100 3854 noop shrinkhash 0 100 100 3854 noop wavelet 0 100 100 3854 pad0.2 ahash 0.0429688 0.026 100 3854 pad0.2 blockmean 0.0320248 0 nan 3854 pad0.2 dhash 0.105469 0.234 100 3854 pad0.2 marrhildreth 0.177083 0.052 100 3854 pad0.2 pdq 0.28125 1.349 100 3854 pad0.2 phash 0.273438 2.387 100 3854 pad0.2 shrinkhash 114.721 0.052 100 3854 pad0.2 wavelet 0.0820312 0 nan 3854 resize0.5 ahash 0.0078125 70.784 100 3854 resize0.5 blockmean 0.0144628 95.226 100 3854 resize0.5 dhash 0.0898438 99.299 100 3854 resize0.5 marrhildreth 0.112847 97.846 100 3854 resize0.5 pdq 0.265625 99.844 100 3854 resize0.5 phash 0.234375 100 100 3854 resize0.5 shrinkhash 56.9034 51.453 100 3854 resize0.5 wavelet 0.0117188 80.747 100 3854 rotate4 ahash 0.0273438 1.297 100 3854 rotate4 blockmean 0.0371901 3.036 100 3854 rotate4 dhash 0.09375 1.401 100 3854 rotate4 marrhildreth 0.149306 3.762 100 3854 rotate4 pdq 0.273438 54.489 100 3854 rotate4 phash 0.257812 59.626 100 3854 rotate4 shrinkhash 69.1737 1.894 100 3854 rotate4 wavelet 0.0078125 0.026 100 3854 vignette ahash 0.0273438 4.67 100 3854 vignette blockmean 0.0320248 6.098 100 3854 vignette dhash 0.0703125 12.195 100 3854 vignette marrhildreth 0.0954861 30.54 100 3854 vignette pdq 0.132812 100 100 3854 vignette phash 0.132812 100 100 3854 vignette shrinkhash 103.005 4.541 100 3854 vignette wavelet 0.0195312 1.946 100 3854 watermark ahash 0.00390625 18.5 100 3854 watermark blockmean 0.0123967 41.593 100 3854 watermark dhash 0.078125 100 100 3854 watermark marrhildreth 0.112847 99.455 100 3854 watermark pdq 0.273438 99.014 100 3854 watermark phash 0.28125 99.377 100 3854 watermark shrinkhash 104.398 71.199 100 3854 watermark wavelet 0.0117188 46.912 100 3854 ================ ============= ============ ======== =========== ============= ============= =========== ======== =========== ============= hasher_name threshold recall precision n_exemplars ============= =========== ======== =========== ============= ahash 0.00390625 17.578 100 42394 blockmean 0.00826446 27.714 100 42394 dhash 0.0859375 51.981 99.9952 42394 marrhildreth 0.100694 55.942 99.9957 42394 pdq 0.257812 77.181 99.9969 42394 phash 0.273438 81.967 99.9942 42394 shrinkhash 56.9034 22.378 100 42394 wavelet 0.00390625 18.467 100 42394 ============= =========== ======== =========== ============= Video Hashing ============= The below example does the following: - Download a benchmarking 
dataset. Here we use the Charades dataset, which contains over 9,000 videos. - Load the dataset. - Transform the dataset to generate synthetically altered videos. Our hashers are responsible for matching the altered videos with the originals. - Define some hashers we want to evaluate. - Compute all the hashes. - Report metrics for each video category / hasher / transformation combination to see how well our hashers can match the altered videos to the original ("no-op" videos). .. code-block:: python import os import zipfile import urllib.request import pandas as pd import perception.benchmarking import perception.hashers if not os.path.isdir('Charades_v1_480'): # Download the dataset since it appears we do not have it. Note that # these are large files (> 13GB). urllib.request.urlretrieve( url='http://ai2-website.s3.amazonaws.com/data/Charades_v1_480.zip', filename='Charades_v1_480.zip' ) with zipfile.ZipFile('Charades_v1_480.zip') as zfile: zfile.extractall('.') urllib.request.urlretrieve( url='http://ai2-website.s3.amazonaws.com/data/Charades.zip', filename='Charades.zip' ) with zipfile.ZipFile('Charades.zip') as zfile: zfile.extractall('.') # These are files that we've identified as having identical subsequences, typically # when a person is out of frame and the backgrounds are the same. duplicates = [ ('0HVVN.mp4', 'UZRQD.mp4'), ('ZIOET.mp4', 'YGXX6.mp4'), ('82XPD.mp4', 'E7QDZ.mp4'), ('FQDS1.mp4', 'AIOTI.mp4'), ('PBV4T.mp4', 'XXYWL.mp4'), ('M0P0H.mp4', 'STY6W.mp4'), ('3Q92U.mp4', 'GHPO3.mp4'), ('NFIQM.mp4', 'I2DHG.mp4'), ('PIRMO.mp4', '0GFE8.mp4'), ('LRPBA.mp4', '9VK0J.mp4'), ('UI0QG.mp4', 'FHXKQ.mp4'), ('Y05U8.mp4', '4RVZB.mp4'), ('J6TVB.mp4', '2ZBL5.mp4'), ('A8T8V.mp4', 'IGOQK.mp4'), ('H8QM1.mp4', 'QYMWC.mp4'), ('O45BC.mp4', 'ZS7X6.mp4'), ('NOP6W.mp4', 'F7KFE.mp4'), ('4MPPQ.mp4', 'A3M94.mp4'), ('L8FFR.mp4', 'M8MP0.mp4'), ('EHYXP.mp4', 'O8PO3.mp4'), ('MGBLJ.mp4', 'RIEG6.mp4'), ('53FPM.mp4', 'BLFEV.mp4'), ('UIIF3.mp4', 'TKEKQ.mp4'), ('GVX7E.mp4', '7GPSY.mp4'), ('T7HZB.mp4', '6KGZA.mp4'), ('65M4K.mp4', 'UDGP2.mp4'), ('6SS4H.mp4', 'CK6OL.mp4'), ('OVHFT.mp4', 'GG1X2.mp4'), ('VEHER.mp4', 'XBPEJ.mp4'), ('WN38A.mp4', '2QI8F.mp4'), ('UMXKN.mp4', 'EOKJ0.mp4'), ('OSIKP.mp4', 'WT2C0.mp4'), ('H5V2Y.mp4', 'ZXN6A.mp4'), ('XS6PF.mp4', '1WJ6O.mp4'), ('S2XJW.mp4', 'YH0BX.mp4'), ('UO607.mp4', 'Z5JZD.mp4'), ('XN64E.mp4', 'CSRZM.mp4'), ('YXI7M.mp4', 'IKQLJ.mp4'), ('1B9C8.mp4', '004QE.mp4'), ('V1SQH.mp4', '48WOM.mp4'), ('107YZ.mp4', 'I049A.mp4'), ('3S6WL.mp4', 'SC5YW.mp4'), ('OY50Q.mp4', '5T607.mp4'), ('XKH7W.mp4', '028CE.mp4'), ('X8XQE.mp4', 'J0VXY.mp4'), ('STB0G.mp4', 'J0VXY.mp4'), ('UNXLF.mp4', 'J0VXY.mp4'), ('56PK0.mp4', 'M1TZR.mp4'), ('FVITB.mp4', 'R0M34.mp4'), ('BPZE3.mp4', 'R0M34.mp4'), ('VS7DA.mp4', '1X0M3.mp4'), ('I7MEA.mp4', 'YMM1Z.mp4'), ('9N76L.mp4', '0LDP7.mp4'), ('AXS82.mp4', 'W8WRK.mp4'), ('8TSU4.mp4', 'MXATD.mp4'), ('80FWF.mp4', '18HFG.mp4'), ('RO3A2.mp4', 'V4HY4.mp4'), ('HU409.mp4', 'BDWIX.mp4'), ('3YY88.mp4', 'EHHRS.mp4'), ('65RS3.mp4', 'SLIH4.mp4'), ('LR0L8.mp4', 'Y665P.mp4'), ('DVPL2.mp4', 'EI5M3.mp4'), ('0EGNU.mp4', 'CU3JE.mp4'), ('94KP4.mp4', '94KP4.mp4'), ('79QDP.mp4', '79QDP.mp4'), ('GKBX9.mp4', 'GKBX9.mp4'), ('RX6R8.mp4', 'RX6R8.mp4'), ('PMVT7.mp4', 'PMVT7.mp4'), ('XNXW6.mp4', 'XNXW6.mp4'), ('I005F.mp4', 'I005F.mp4'), ('TF95Y.mp4', 'TF95Y.mp4'), ('79QDP.mp4', '79QDP.mp4'), ('LQGMM.mp4', 'LQGMM.mp4'), ('QCAUL.mp4', 'QCAUL.mp4'), ('GFVSV.mp4', 'GFVSV.mp4'), ('4UYGY.mp4', '4UYGY.mp4'), ('BYDSE.mp4', 'BYDSE.mp4'), ('PV3KQ.mp4', 'PV3KQ.mp4'), ('1X0M3.mp4', '1X0M3.mp4'), ('T5FHD.mp4', 'T5FHD.mp4'),
'T5FHD.mp4'), ('QRHJJ.mp4', 'QRHJJ.mp4'), ('JYBGS.mp4', 'JYBGS.mp4'), ('N2XCF.mp4', 'N2XCF.mp4'), ('OZPA9.mp4', 'OZPA9.mp4'), ('297S4.mp4', '297S4.mp4'), ('LHU7D.mp4', 'LHU7D.mp4'), ('TSKZL.mp4', 'TSKZL.mp4'), ('BCONW.mp4', 'BCONW.mp4'), ('KBPDM.mp4', 'KBPDM.mp4'), ('7FTBS.mp4', '7FTBS.mp4'), ('099Y1.mp4', '099Y1.mp4'), ('S2RIQ.mp4', 'S2RIQ.mp4'), ('22FJU.mp4', '22FJU.mp4'), ('99UA6.mp4', '99UA6.mp4'), ('WJ13E.mp4', 'WJ13E.mp4'), ('5OLVC.mp4', '5OLVC.mp4'), ('YQ6Z6.mp4', 'YQ6Z6.mp4'), ('T5MLJ.mp4', 'T5MLJ.mp4'), ('0VOQC.mp4', '0VOQC.mp4'), ('S2RIQ.mp4', 'S2RIQ.mp4'), ('2VNXF.mp4', '2VNXF.mp4'), ('G87XG.mp4', 'G87XG.mp4'), ('RRS54.mp4', 'RRS54.mp4'), ('TXJK7.mp4', 'TXJK7.mp4'), ('G4KE3.mp4', 'G4KE3.mp4'), ('3SNSC.mp4', '3SNSC.mp4'), ('U2FA5.mp4', 'U2FA5.mp4'), ('9AFQ7.mp4', '9AFQ7.mp4') ] blacklist = [fp1 for fp1, fp2 in duplicates] df = pd.concat([pd.read_csv('Charades/Charades_v1_test.csv'), pd.read_csv('Charades/Charades_v1_train.csv')]) df = df[~(df['id'] + '.mp4').isin(blacklist)] df['filepath'] = df['id'].apply(lambda video_id: os.path.join('Charades_v1_480', video_id + '.mp4')) assert df['filepath'].apply(os.path.isfile).all(), 'Some video files are missing.' dataset = perception.benchmarking.BenchmarkVideoDataset.from_tuples( files=df[['filepath', 'scene']].itertuples(index=False) ) if not os.path.isdir('benchmarking_videos'): # We haven't computed the transforms yet, so we do that # now. Below, we create the following files for each of # the videos in our dataset. Note that the only required # transform is `noop` (see documentation for # perception.benchmarking.BenchmarkVideoDataset.transform). # # noop: This is the base video we'll actually use in benchmarking, rather # than using the raw video. It is the same as the raw video but downsampled # to a size that is reasonable for hashing (240p). This is because all # of our hashers downsample to a size smaller than this anyway, so there # is no benefit to a higher resolution. Also, we limit the length to the # first five minutes of the video, which speeds everything up significantly. # shrink: Shrink the noop video down to 70% of its original size. # clip0.2: Clip the first 20% and last 20% of the noop video off. # slideshow: Create a slideshow version of the video that grabs frames periodically # from the original. # black_frames: Add black frames at the start and end of the video. # gif: Create a GIF from the video (similar to slideshow but with re-encoding) # black_padding: Add black bars to the top and bottom of the video.
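# As a worked example of the noop scale expressions below (hypothetical
# 640x480 input): min(240/max(640, 480), 1) = 0.375, so
# width = ceil(0.375*640/2)*2 = 240 and height = ceil(0.375*480/2)*2 = 180.
# Inputs already smaller than 240px are left unscaled because the scale
# factor is capped at 1.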
pad_width = 240 pad_height = 320 transforms = { 'noop': perception.benchmarking.video_transforms.get_simple_transform( width='ceil(min(240/max(iw, ih), 1)*iw/2)*2', height='ceil(min(240/max(iw, ih), 1)*ih/2)*2', codec='h264', output_ext='.m4v', sar='1/1', clip_s=(None, 60*5) ), 'shrink': perception.benchmarking.video_transforms.get_simple_transform( width='ceil(0.7*iw/2)*2', height='ceil(0.7*ih/2)*2' ), 'clip0.2': perception.benchmarking.video_transforms.get_simple_transform(clip_pct=(0.2, 0.8)), 'slideshow': perception.benchmarking.video_transforms.get_slideshow_transform( frame_input_rate=1/2.5, frame_output_rate=0.5, max_frames=10, offset=1.3), 'black_frames': perception.benchmarking.video_transforms.get_black_frame_padding_transform(0.5, 0.05), 'gif': perception.benchmarking.video_transforms.get_simple_transform( output_ext='.gif', codec='gif', clip_s=(1.2, 10.2), fps=1/2.5 ), 'black_padding': perception.benchmarking.video_transforms.get_simple_transform( width=f'(iw*sar)*min({pad_width}/(iw*sar),{pad_height}/ih)', height=f'ih*min({pad_width}/(iw*sar),{pad_height}/ih)', pad=f'{pad_width}:{pad_height}:({pad_width}-iw*min({pad_width}/iw,{pad_height}/ih))/2:({pad_height}-ih*min({pad_width}/iw,{pad_height}/ih))/2' ) } # Save the transforms for later. transformed = dataset.transform(transforms=transforms, storage_dir='benchmarking_videos') transformed = perception.benchmarking.BenchmarkVideoTransforms.load('benchmarking_videos', verify_md5=False) phashu8 = perception.hashers.PHashU8(exclude_first_term=False, freq_shift=1, hash_size=12) hashers = { 'phashu8_framewise': perception.hashers.FramewiseHasher( frames_per_second=1, frame_hasher=phashu8, interframe_threshold=50, quality_threshold=90), 'phashu8_tmkl1': perception.hashers.TMKL1( frames_per_second=5, frame_hasher=phashu8, distance_metric='euclidean', dtype='uint8', norm=None, quality_threshold=90) } if not os.path.isfile('hashes.csv'): # We haven't computed the hashes, so we do that now. hashes = transformed.compute_hashes(hashers=hashers, max_workers=5) # Save the hashes for later. It took a long time after all!
hashes.save('hashes.csv') hashes = perception.benchmarking.BenchmarkHashes.load('hashes.csv') hashes.compute_threshold_recall(precision_threshold=99.9, grouping=['transform_name']) ================ ================= =========== ======== =========== ============= transform_name hasher_name threshold recall precision n_exemplars ================ ================= =========== ======== =========== ============= black_frames phashu8_framewise 51.0979 88.12 99.9069 278644 black_frames phashu8_tmkl1 55.7584 99.918 99.9079 403768 black_padding phashu8_framewise 74.6391 7.662 100 277399 black_padding phashu8_tmkl1 53.8702 99.898 99.9079 406899 clip0.2 phashu8_framewise 54.8635 90.741 99.9098 224264 clip0.2 phashu8_tmkl1 59.0424 99.724 99.9077 324251 gif phashu8_framewise 55.4437 68.21 99.9088 82232 gif phashu8_tmkl1 55.4887 81.029 99.9103 39757 noop phashu8_framewise 0 100 100 282658 noop phashu8_tmkl1 0 100 100 408871 shrink phashu8_framewise 24.7184 100 100 281731 shrink phashu8_tmkl1 49.8999 99.836 99.9078 400650 slideshow phashu8_framewise 56.9825 99.713 99.9076 172829 slideshow phashu8_tmkl1 56.8683 95.934 99.9035 90684 ================ ================= =========== ======== =========== ============= ================================================ FILE: docs/examples/deduplication.rst ================================================ Media Deduplication ******************* Perceptual hashes can be used to deduplicate sets of images. Below we provide two examples (one simple, one larger scale). **For most use cases, we recommend using PHash with** :code:`hash_size=16` **and with 0.2 as the distance threshold as in the example below.** You may wish to adjust this threshold up or down based on your tolerance for false negatives / positives. In practice, deduplicating in memory on your machine by the methods below may be impractical. For larger-scale applications, you may wish to use tools like `FAISS `_, `Annoy `_, or databases with functionality for querying based on distance such as `MemSQL `_. For the supported hashers, below are our recommended thresholds with expected false positive rates of <1%. ====================== =========== hasher threshold ====================== =========== ahash (hash_size=16) 0.008 blockmean 0.008 dhash (hash_size=16) 0.07 marrhildreth 0.1 pdq 0.2 phash (hash_size=16) 0.2 wavelet (hash_size=16) 0.02 ====================== =========== Simple example ============== In this example, we download a ZIP file containing 18 images. One of the images is duplicated twice and another image is duplicated once. .. code-block:: python import os import glob import zipfile import urllib.request import tabulate import pandas as pd from perception import tools, hashers urllib.request.urlretrieve( "https://thorn-perception.s3.amazonaws.com/thorn-perceptual-deduplication-example.zip", "thorn-perceptual-deduplication-example.zip" ) with zipfile.ZipFile('thorn-perceptual-deduplication-example.zip') as f: f.extractall('.') filepaths = glob.glob('thorn-perceptual-deduplication-example/*.jpg') duplicate_pairs = tools.deduplicate(files=filepaths, hashers=[(hashers.PHash(hash_size=16), 0.2)]) print(tabulate.tabulate(pd.DataFrame(duplicate_pairs), showindex=False, headers=['file1', 'file2'], tablefmt='rst')) # Now we can do whatever we want with the duplicates. We could just delete # the first entry in each pair or manually verify the pairs to ensure they # are, in fact duplicates. 
=============================================== =============================================== file1 file2 =============================================== =============================================== thorn-perceptual-deduplication-example/309b.jpg thorn-perceptual-deduplication-example/309.jpg thorn-perceptual-deduplication-example/309b.jpg thorn-perceptual-deduplication-example/309a.jpg thorn-perceptual-deduplication-example/309a.jpg thorn-perceptual-deduplication-example/309.jpg thorn-perceptual-deduplication-example/315a.jpg thorn-perceptual-deduplication-example/315.jpg =============================================== =============================================== Real-world example ================== In the example below, we use the `Caltech 256 Categories `_ dataset. Like most other public image datasets, it contains a handful of duplicates in some categories. The code below will: 1. Download the dataset 2. Group all the filepaths by category (the dataset is provided in folders) 3. Within each group, find duplicates using PHash. We will compare not just the original images, but also the 8 isometric transformations for each image. .. code-block:: python import os import tarfile from glob import glob import urllib.request import tqdm from perception import hashers, tools urllib.request.urlretrieve( "http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar", "256_ObjectCategories.tar" ) with tarfile.open('256_ObjectCategories.tar') as tfile: tfile.extractall() files = glob('256_ObjectCategories/**/*.jpg') # To reduce the number of pairwise comparisons, # we can deduplicate within each image category # (i.e., we don't need to compare images of # butterflies with images of chess boards). filepath_group = [ ( filepath, os.path.normpath(filepath).split(os.sep)[-2] ) for filepath in files ] groups = list(set([group for _, group in filepath_group])) # We consider any pair of images with a PHash distance of < 0.2 # as a duplicate. comparison_hashers = [(hashers.PHash(hash_size=16), 0.2)] duplicate_pairs = [] for current_group in groups: current_filepaths = [ filepath for filepath, group in filepath_group if group == current_group ] current_duplicate_pairs = tools.deduplicate( files=current_filepaths, hashers=comparison_hashers, isometric=True, progress=tqdm.tqdm ) duplicate_pairs.extend(current_duplicate_pairs) # Now we can do whatever we want with the duplicates. We could just delete # the first entry in each pair or manually verify the pairs to ensure they # are, in fact, duplicates. Video deduplication =================== Video deduplication requires more thought depending on your tolerance for false positives and how important temporal relationships are. Below is one example approach for deduplicating a group of videos by taking frames from each video that are sufficiently different from each other (to avoid keeping too many) and then using them all to find pairs of videos that have matching frames. .. code-block:: python import urllib.request import zipfile import glob import tqdm import perception.hashers import perception.tools # Download some example videos.
urllib.request.urlretrieve( "https://thorn-perception.s3.amazonaws.com/thorn-perceptual-video-deduplication-example.zip", "thorn-perceptual-video-deduplication-example.zip" ) with zipfile.ZipFile('thorn-perceptual-video-deduplication-example.zip') as f: f.extractall('.') frame_hasher = perception.hashers.PHash(hash_size=16) hasher = perception.hashers.FramewiseHasher(frames_per_second=1, frame_hasher=frame_hasher, interframe_threshold=50, quality_threshold=90) # Set a threshold for matching frames within videos and across videos. filepaths = glob.glob('thorn-perceptual-video-deduplication-example/*.m4v') + \ glob.glob('thorn-perceptual-video-deduplication-example/*.gif') # Returns a list of dicts with a "filepath" and "hash" key. "hash" contains a # list of hashes. hashes = hasher.compute_parallel(filepaths=filepaths, progress=tqdm.tqdm) # Flatten the hashes into a list of (filepath, hash) tuples. hashes_flattened = perception.tools.flatten([ [(hash_group['filepath'], hash_string) for hash_string in hash_group['hash']] for hash_group in hashes ]) duplicates = perception.tools.deduplicate_hashes( hashes=hashes_flattened, threshold=50, hasher=hasher ) ================================================ FILE: docs/examples/detecting_csam.rst ================================================ Detecting Child Sexual Abuse Material ************************************* Using `perception` and a subscription to Thorn's Safer service, you can easily check for child sexual abuse material against a database of known bad content **without** having to send any images to a third party. You do this by sending compact, irreversible image hashes to get matches with a high degree of precision. We support matching using 16x16 PHash hashes and md5 hashes. See usage example below. Please contact info@getsafer.io to discuss Thorn's Safer service and subscription options and visit `getsafer.io `_ to learn more. .. code-block:: python from perception import tools matcher = tools.SaferMatcher( api_key='YOUR_API_KEY', url='MATCHING_SERVICE_URL' ) matches = matcher.match(['myfile.jpg']) In some cases, you may have a username/password instead of an API key, in which case you can pass those instead (see API documentation for details). ================================================ FILE: docs/examples/index.rst ================================================ Examples ******** .. toctree:: :maxdepth: 2 :caption: Contents: deduplication detecting_csam benchmarking ================================================ FILE: docs/index.rst ================================================ perception ========== :code:`perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use. It provides a common wrapper around existing, popular perceptual hashes (such as those implemented by `ImageHash `_) along with tools to compare their performance and use them for common tasks. Perceptual hashes are used to create compact image "fingerprints" which are invariant to small alterations to the original image. Typically, the representations are compact enough that they are irreversible, which makes them useful for deduplication and detecting abusive content while preserving the privacy of content owners. Installation ************ You can install :code:`perception` using pip. You must install OpenCV separately (e.g., with :code:`pip install opencv-python`). ..
code-block:: bash # Install from PyPI pip install perception # Install from GitHub pip install git+https://github.com/thorn-oss/perception.git#egg=perception To install with the necessary dependencies for benchmarking, use: .. code-block:: bash # Install from PyPI pip install perception[benchmarking] # Install from GitHub pip install opencv-python git+https://github.com/thorn-oss/perception.git#egg=perception[benchmarking] Getting Started *************** Please see the examples for code snippets for common use cases. .. toctree:: :maxdepth: 2 :caption: Contents: examples/index api/index ================================================ FILE: docs/requirements.txt ================================================ sphinx-autodoc-typehints==3.2.0 # sphinx-autobuild==3.0.2 # sphinx==1.8.3 sphinx_rtd_theme==3.0.2 m2r==0.3.1 opencv-contrib-python-headless tqdm albumentations ffmpeg-python typing-extensions faiss-cpu aiohttp python-json-logger networkit ================================================ FILE: perception/__init__.py ================================================ from importlib import metadata __version__ = metadata.version("perception") ================================================ FILE: perception/approximate_deduplication/__init__.py ================================================ import logging import math import os.path as op import typing import faiss import numpy as np import tqdm import typing_extensions from ._graph_backend import get_graph_backend LOGGER = logging.getLogger(__name__) DEFAULT_PCT_PROBE = 0 # For faiss training on datasets larger than 50,000 vectors, we take a random sub-sample. TRAIN_LARGE_SIZE: int = 50_000 class ClusterAssignment(typing_extensions.TypedDict): cluster: int id: typing.Any def build_index( X: np.ndarray, pct_probe: float = DEFAULT_PCT_PROBE, approximate: bool = True, use_gpu: bool = True, ): """Build a FAISS index from an array of vectors. Args: X: The vectors to add to the index. pct_probe: The minimum fraction of nearest lists to search. If the product of pct_probe and the number of lists is less than 1, one list will be searched. approximate: Whether to build an approximate or exact index. use_gpu: Whether to attempt building the index on a GPU, falling back to the CPU if GPU support is unavailable. Returns: The FAISS index, or None if X is None. """ if X is None: return None X = X.astype("float32") d = X.shape[1] if approximate: ntotal = X.shape[0] nlist = int(max(min(4 * np.sqrt(ntotal), ntotal / 39), 1)) quantizer = faiss.IndexFlatL2(d) index = faiss.IndexIVFFlat(quantizer, d, nlist) gpu = False if use_gpu: try: res = faiss.StandardGpuResources() index = faiss.index_cpu_to_gpu(res, 0, index) gpu = True except AttributeError: LOGGER.info("Building approximate FAISS index on CPU.") if X.shape[0] > TRAIN_LARGE_SIZE: # Take random sample of 50,000 or 39 points per centroid. # 39 points per centroid is the minimum for not getting warnings. # https://github.com/facebookresearch/faiss/wiki/FAQ#can-i-ignore-warning-clustering-xxx-points-to-yyy-centroids
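# Worked example (hypothetical): for ntotal = 1_000_000 vectors,
# nlist = int(max(min(4*sqrt(1e6), 1e6/39), 1)) = 4000, so the training
# sample below is max(39 * 4000, 50_000) = 156_000 vectors.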
sample_size = max(39 * nlist, TRAIN_LARGE_SIZE) index.train(X[np.random.choice(X.shape[0], sample_size, replace=False)]) else: index.train(X) batch_size = 10_000 for i in range(0, X.shape[0], batch_size): index.add(X[i : i + batch_size]) if gpu: index = faiss.index_gpu_to_cpu(index) nprobe = max(math.ceil(pct_probe * nlist), 1) faiss.ParameterSpace().set_index_parameter(index, "nprobe", nprobe) else: index = faiss.IndexFlat(d) index.add(X) return index def compute_euclidean_pairwise_duplicates_approx( X, counts, threshold, minimum_overlap, Y=None, y_counts=None, pct_probe=0.1, use_gpu: bool = True, faiss_cache_path: str | None = None, show_progress: bool = False, ): """Provides the same result as perception.extensions.compute_pairwise_duplicates_simple but uses an approximate search instead of an exhaustive search, which can dramatically reduce processing time. Args: X: An array of vectors to compute pairs for. Y: If provided, we search in X for the Y vectors. counts: A list of counts of vectors for separate files in the vectors (should add up to the length of X). y_counts: Counts for the Y vectors, analogous to counts for X. threshold: The threshold for a match as a euclidean distance. minimum_overlap: The minimum overlap between two files to qualify as a match. pct_probe: The minimum percentage of sublists to search for matches. The larger the value, the more exhaustive the search. use_gpu: Whether to attempt to build the index on a GPU. faiss_cache_path: If provided load any existing faiss index from this path, and if it does not exist then save the generated faiss index to the path. show_progress: Whether or not to show a progress bar while computing pairs Returns: A list of pairs of matching file indexes. """ assert ( counts.sum() == X.shape[0] ), "Length of counts incompatible with vectors shape." assert (Y is None) == ( y_counts is None ), "Must provide both or neither for y, y_counts." if X.dtype != "float32": # Only make the copy if we have to. X = X.astype("float32") if Y is not None and Y.dtype != "float32": # Only make the copy if we have to. Y = Y.astype("float32") lookup_ = [] for idx, count in enumerate(counts): lookup_.extend([idx] * count) lookup = np.array(lookup_) if faiss_cache_path is not None and op.exists(faiss_cache_path): LOGGER.debug("Loading cached FAISS index from %s", faiss_cache_path) index = faiss.read_index(faiss_cache_path) assert ( X.shape[0] == index.ntotal ), "Cached FAISS index does not match provided X." else: LOGGER.debug("Building FAISS index.") index = build_index(X=X, pct_probe=pct_probe, approximate=True, use_gpu=use_gpu) if faiss_cache_path is not None: faiss.write_index(index, faiss_cache_path) LOGGER.debug("FAISS index ready, starting approximate search.") pairs = [] # Only use y_counts if present. if y_counts is None: iterator_counts = counts M = X else: iterator_counts = y_counts M = Y for end, length, query in tqdm.tqdm( zip(iterator_counts.cumsum(), iterator_counts, range(len(iterator_counts))), total=len(iterator_counts), disable=not show_progress, desc="Vectors", ): if length == 0: continue Xq = M[end - length : end] lims, _, idxs = index.range_search(Xq, threshold**2) lims = lims.astype("int32") matched = [ match for match in np.unique(lookup[list(set(idxs))]) # type: ignore if match != query or Y is not None # Protect self matches if Y is not present.
] query_in_match: typing.Mapping[int, set] = {m: set() for m in matched} match_in_query: typing.Mapping[int, set] = {m: set() for m in matched} for query_idx in range(length): for match_idx in idxs[lims[query_idx] : lims[query_idx + 1]]: match = lookup[match_idx] if ( match == query and Y is None ): # Protect self matches if Y is not present. continue match_in_query[match].add(match_idx) query_in_match[match].add(query_idx) for match in matched: overlap = min( [ len(query_in_match[match]) / length, len(match_in_query[match]) / counts[match], ] ) if overlap >= minimum_overlap and overlap > 0: if Y is None: pairs.append(tuple(sorted([query, match]))) else: pairs.append(tuple([query, match])) return list(set(pairs)) def pairs_to_clusters( ids: typing.Iterable[str], pairs: typing.Iterable[tuple[str, str]], strictness: typing_extensions.Literal[ "clique", "community", "component" ] = "clique", max_clique_batch_size: int = 1000, ) -> list[ClusterAssignment]: """Given a list of pairs of matching files, compute sets of cliques where all files in a clique are connected. Args: ids: A list of node ids (e.g., filepaths). pairs: A list of pairs of node ids, each pair is assumed to have an edge strictness: The level at which groups will be clustered. "component" means that all clusters will be connected components. "community" will select clusters of files within components that are clustered together. "clique" will result in clusters where every file is connected to every other file. max_clique_batch_size: The maximum batch size for identifying cliques. Returns: A list of cluster assignments (dicts with id and cluster entries). """ assert strictness in ["component", "community", "clique"], "Invalid strictness." list_ids = list(ids) id_to_node_map = {v: i for i, v in enumerate(list_ids)} node_to_id_map = {v: k for k, v in id_to_node_map.items()} LOGGER.debug("Building graph.") node_pairs = {(id_to_node_map[pair[0]], id_to_node_map[pair[1]]) for pair in pairs} backend = get_graph_backend() graph = backend.build_graph(len(list_ids), node_pairs) assignments: list[ClusterAssignment] = [] cluster_index = 0 components = backend.connected_components(graph) for component in components: LOGGER.debug("Got component with size: %s", len(component)) if strictness == "component": assignments.extend( [{"id": node_to_id_map[n], "cluster": cluster_index} for n in component] ) cluster_index += 1 continue communities = backend.communities(graph, component) for community_members in communities: LOGGER.debug("Got community with size: %s", len(community_members)) if strictness == "community": assignments.extend( [ {"id": node_to_id_map[n], "cluster": cluster_index} for n in community_members ] ) cluster_index += 1 continue for clique_members in backend.maximal_cliques( graph, community_members, max_clique_batch_size=max_clique_batch_size, ): assignments.extend( [ { "id": node_to_id_map[n], "cluster": cluster_index, } for n in clique_members ] ) cluster_index += 1 return assignments ================================================ FILE: perception/approximate_deduplication/_graph_backend.py ================================================ import sys import typing from abc import ABC, abstractmethod class GraphBackend(ABC): @abstractmethod def build_graph( self, node_count: int, edges: typing.Iterable[tuple[int, int]] ) -> typing.Any: ... @abstractmethod def connected_components(self, graph: typing.Any) -> list[list[int]]: ... 
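# Concrete backends supply the three graph operations that pairs_to_clusters
# relies on: connected components, community detection, and maximal-clique
# extraction.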
@abstractmethod def communities( self, graph: typing.Any, component: list[int] ) -> list[list[int]]: ... @abstractmethod def maximal_cliques( self, graph: typing.Any, community_nodes: list[int], max_clique_batch_size: int, ) -> list[list[int]]: ... class NetworkitGraphBackend(GraphBackend): def __init__(self): import networkit as nk self.nk = nk def build_graph( self, node_count: int, edges: typing.Iterable[tuple[int, int]] ) -> typing.Any: graph = self.nk.Graph(node_count) for start, end in edges: graph.addEdge(start, end) return graph def connected_components(self, graph: typing.Any) -> list[list[int]]: cc_query = self.nk.components.ConnectedComponents(graph) cc_query.run() return cc_query.getComponents() def communities(self, graph: typing.Any, component: list[int]) -> list[list[int]]: component_node_map = dict(enumerate(component)) subgraph = self.nk.graphtools.subgraphFromNodes(graph, component, compact=True) algo = self.nk.community.PLP(subgraph, maxIterations=32) algo.run() communities = algo.getPartition() return [ [component_node_map[node] for node in communities.getMembers(community)] for community in communities.subsetSizeMap().keys() ] def maximal_cliques( self, graph: typing.Any, community_nodes: list[int], max_clique_batch_size: int, ) -> list[list[int]]: cliques: list[list[int]] = [] for start in range(0, len(community_nodes), max_clique_batch_size): batch_nodes = community_nodes[start : start + max_clique_batch_size] community_node_map = dict(enumerate(batch_nodes)) subgraph = self.nk.graphtools.subgraphFromNodes( graph, batch_nodes, compact=True ) while subgraph.numberOfNodes() > 0: clique = self.nk.clique.MaximalCliques(subgraph, maximumOnly=True) clique.run() clique_members = clique.getCliques()[0] cliques.append([community_node_map[node] for node in clique_members]) for node in clique_members: subgraph.removeNode(node) return cliques class NetworkxGraphBackend(GraphBackend): def __init__(self): import networkx as nx self.nx = nx def build_graph( self, node_count: int, edges: typing.Iterable[tuple[int, int]] ) -> typing.Any: graph = self.nx.Graph() graph.add_nodes_from(range(node_count)) graph.add_edges_from(edges) return graph def connected_components(self, graph: typing.Any) -> list[list[int]]: return [list(component) for component in self.nx.connected_components(graph)] def communities(self, graph: typing.Any, component: list[int]) -> list[list[int]]: subgraph = graph.subgraph(component) return [ list(community) for community in self.nx.algorithms.community.asyn_lpa_communities( subgraph, seed=0 ) ] def maximal_cliques( self, graph: typing.Any, community_nodes: list[int], max_clique_batch_size: int, ) -> list[list[int]]: cliques: list[list[int]] = [] for start in range(0, len(community_nodes), max_clique_batch_size): batch_nodes = community_nodes[start : start + max_clique_batch_size] subgraph = graph.subgraph(batch_nodes).copy() while subgraph.number_of_nodes() > 0: clique_members = max( self.nx.find_cliques(subgraph), key=lambda clique: ( len(clique), tuple(sorted(clique)), ), ) cliques.append(list(clique_members)) subgraph.remove_nodes_from(clique_members) return cliques def get_graph_backend() -> GraphBackend: if sys.platform == "darwin": return NetworkxGraphBackend() return NetworkitGraphBackend() ================================================ FILE: perception/approximate_deduplication/debug.py ================================================ import logging import random import cv2 import numpy as np import perception.local_descriptor_deduplication as ldd 
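# Usage sketch (hypothetical filepaths; assumes `df` is a reference dataframe
# from the local descriptor deduplication pipeline, indexed by filepath with
# one row per image):
#
#   img = vizualize_pair(df.loc["a.jpg"], df.loc["b.jpg"], ratio=0.5)
#   cv2.imwrite("pair.png", img)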
LOGGER = logging.getLogger(__name__) # Set a fixed size for drawing, we don't have the real descriptor size. KEYPOINT_SIZE: int = 8 def vizualize_pair( features_1, features_2, ratio: float, match_metadata=None, local_path_col: str | None = None, sanitized: bool = False, include_all_points=False, circle_size=KEYPOINT_SIZE, ): """Given two rows from a reference df vizualize their overlap. Currently recalcs overlap using cv2 default logic. Args: features_1: The row from a reference df for one image. features_2: The row from a reference df for the other image. ratio: Value for ratio test, suggest re-using value from matching. match_metadata: metadata returned from matching, if None will redo brute force matching. local_path_col: column in df with path to the image. If None will use the index: features_1.name and features_2.name sanitized: if True images themselves will not be rendered, only the points. include_all_points: if True will draw all points, not just matched points. circle_size: size of the circle to draw around keypoints. Returns: An image of the two images concatted together and matching keypoints drawn. """ # Set a fixed size for drawing, we don't have the real descriptor size. if local_path_col is not None: features_1_path = features_1[local_path_col] features_2_path = features_2[local_path_col] else: features_1_path = features_1.name features_2_path = features_2.name img1 = np.zeros( (features_1.dimensions[1], features_1.dimensions[0], 1), dtype="uint8" ) img2 = np.zeros( (features_2.dimensions[1], features_2.dimensions[0], 1), dtype="uint8" ) if not sanitized: try: img1 = ldd.load_and_preprocess( features_1_path, max_size=max(features_1.dimensions), grayscale=False ) except Exception: LOGGER.warning("Failed to load image %s", features_1_path) try: img2 = ldd.load_and_preprocess( features_2_path, max_size=max(features_2.dimensions), grayscale=False ) except Exception: LOGGER.warning("Failed to load image %s", features_2_path) if match_metadata is not None: img_matched = viz_match_data( features_1, features_2, img1, img2, match_metadata, include_all_points=include_all_points, circle_size=circle_size, ) else: LOGGER.warning("""No match_metadata provided, recalculating match points, won't match perception match points.""") img_matched = viz_brute_force(features_1, features_2, img1, img2, ratio=ratio) return img_matched def viz_match_data( features_1, features_2, img1, img2, match_metadata, include_all_points=False, circle_size=KEYPOINT_SIZE, ): """Given match data viz matching points. Args: features_1: The row from a reference df for one image. features_2: The row from a reference df for the other image. img1: cv2 of first image img2: cv2 of second image match_metadata: metadata returned from matching, if None will redo brute force matching. include_all_points: if True will draw all points, not just matched points. circle_size: size of the circle to draw around keypoints. Returns: cv2 img with matching keypoints drawn. """ # NOTE: could refactor to put matches in to correct format and use: cv2.drawMatchesKnn, # but python docs on necessary class not clear. 
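# (For reference: cv2.drawMatchesKnn expects a list of lists of cv2.DMatch,
# each carrying queryIdx/trainIdx indices into the two keypoint lists, so the
# point coordinates in match_metadata would need to be mapped back to keypoint
# indices first.)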
# Pad img1 or img2 vertically with black pixels to match the height of the other image if img1.shape[0] > img2.shape[0]: img2 = np.pad( img2, ((0, img1.shape[0] - img2.shape[0]), (0, 0), (0, 0)), mode="constant", constant_values=0, ) elif img1.shape[0] < img2.shape[0]: img1 = np.pad( img1, ((0, img2.shape[0] - img1.shape[0]), (0, 0), (0, 0)), mode="constant", constant_values=0, ) # draw two images h concat: img_matched = np.concatenate((img1, img2), axis=1) overlay = img_matched.copy() if include_all_points: # draw all points in kp_1 for k in features_1["keypoints"]: new_color = ( random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), ) # Draw semi transparent circle cv2.circle(img_matched, (int(k[0]), int(k[1])), circle_size, new_color, 1) # draw all points in kp_2 for k in features_2["keypoints"]: new_color = ( random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), ) cv2.circle( img_matched, (int(k[0] + features_1.dimensions[0]), int(k[1])), circle_size, new_color, 1, ) # draw lines between matching points for i in range(len(match_metadata["final_matched_b_pts"])): new_color = ( random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), ) a_pt = ( int(match_metadata["final_matched_a_pts"][i][0]), int(match_metadata["final_matched_a_pts"][i][1]), ) b_pt = ( int(match_metadata["final_matched_b_pts"][i][0] + features_1.dimensions[0]), int(match_metadata["final_matched_b_pts"][i][1]), ) cv2.circle(img_matched, a_pt, circle_size, new_color, 1) cv2.circle(img_matched, b_pt, circle_size, new_color, 1) cv2.line( img_matched, a_pt, b_pt, new_color, 1, ) # Re-overlay original image to add some transparency effect to lines and circles. alpha = 0.4 # Transparency factor. # Following line overlays transparent rectangle over the image img_matched = cv2.addWeighted(overlay, alpha, img_matched, 1 - alpha, 0) return img_matched def viz_brute_force(features_1, features_2, img1, img2, ratio: float): """ Given two rows from a reference df vizualize their overlap. NOTE: It redoes matching using cv2 bruteforce, so will not match the same as the perception matching code. Args: features_1: The row from a reference df for one image. features_2: The row from a reference df for the other image. img1: cv2 of first image img2: cv2 of second image ratio: Value for ratio test, suggest re-using value from matching. Returns: An image of the two images concatted together and matching keypoints drawn. 
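Example (hypothetical reference-df rows and preloaded cv2 images): img = viz_brute_force(df.loc["a.jpg"], df.loc["b.jpg"], img1, img2, ratio=0.5)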
""" # Convert numpy keypoints to cv2.KeyPoints kp1_fixed = [] for k in features_1["keypoints"]: kp1_fixed.append(cv2.KeyPoint(k[0], k[1], KEYPOINT_SIZE)) kp2_fixed = [] for k in features_2["keypoints"]: kp2_fixed.append(cv2.KeyPoint(k[0], k[1], KEYPOINT_SIZE)) brute_force_matcher = cv2.BFMatcher() kn_matches = brute_force_matcher.knnMatch( features_1["descriptors"], features_2["descriptors"], k=2 ) # Apply ratio test good = [] for nearest_match, next_nearest_match in kn_matches: if nearest_match.distance < ratio * next_nearest_match.distance: good.append([nearest_match]) img_matched = cv2.drawMatchesKnn( # type: ignore[call-overload] img1, kp1_fixed, img2, kp2_fixed, good, None, flags=cv2.DrawMatchesFlags_DRAW_RICH_KEYPOINTS, ) return img_matched ================================================ FILE: perception/approximate_deduplication/index.py ================================================ import time import typing import warnings import faiss import numpy as np import pandas as pd import typing_extensions import perception.hashers.tools as pht class QueryInput(typing_extensions.TypedDict): id: str hash: str class QueryMatch(typing_extensions.TypedDict): id: typing.Any matches: list[dict] class TuningFailure(Exception): pass class QueryDecodingFailure(Exception): pass def build_query(table, ids, paramstyle, columns): query = "SELECT {} FROM {} WHERE id in {}" if paramstyle == "pyformat": sql = query.format(",".join(columns), table, "%(ids)s") params = {"ids": tuple(ids)} elif paramstyle == "qmark": params = ids sql = query.format(",".join(columns), table, f"({','.join('?' * len(ids))})") else: raise NotImplementedError("Unsupported paramstyle.") return sql, params def query_by_id(con, table, ids, paramstyle, extra_columns=None) -> pd.DataFrame: """Get data from the database. Args: con: A connection to the database table: The table in which to look up hashes ids: The list of IDs to pull paramstyle: The paramstyle for the database extra_columns: A list of additional (non-ID) columns to pull. """ columns = ["id"] if extra_columns is not None: columns += extra_columns if isinstance(ids, np.ndarray): # If it's a numpy array, coerce to a list. ids = ids.tolist() dfs = [] batch_size = 1000 for start in range(0, len(ids), batch_size): sql, params = build_query( table=table, ids=ids[start : start + batch_size], paramstyle=paramstyle, columns=columns, ) dfs.append(pd.read_sql(con=con, sql=sql, params=params)) return pd.concat(dfs, ignore_index=True).set_index("id") class ApproximateNearestNeighbors: """A wrapper for a FAISS index. Args: con: A database connection from which to obtain metadata for matched hashes. table: The table in the database that we should query for metadata. paramstyle: The parameter style for the given database index: A FAISS index (or filepath to a FAISS index) hash_length: The length of the hash that is being matched against. metadata_columns: The metadata that should be returned for queries. dtype: The data type for the vectors distance_metric: The distance metric for the vectors """ def __init__( self, con, table, paramstyle, index, hash_length, metadata_columns=None, dtype="uint8", distance_metric="euclidean", ): assert ( dtype == "uint8" ), "Only unsigned 8-bit integer hashes are supported at this time." assert ( distance_metric == "euclidean" ), "Only euclidean distance is supported at this time." 
if isinstance(index, str): index = faiss.read_index(index) self.con = con self.index = index self.distance_metric = distance_metric self.hash_length = hash_length self.dtype = dtype self.table = table self.metadata_columns = metadata_columns self.paramstyle = paramstyle assert ( self.index.d == self.hash_length ), "Index is incompatible with hash length." @classmethod def from_database( cls, con, table, paramstyle, hash_length, ids_train=None, train_size=None, chunksize=100000, metadata_columns=None, index=None, gpu=False, dtype="uint8", distance_metric="euclidean", ): """Train and build a FAISS index from a database connection. Args: con: A database connection from which to obtain metadata for matched hashes. table: The table in the database that we should query for metadata. paramstyle: The paramstyle for the given database hash_length: The length of the hash that is being matched against. ids_train: The IDs for the vectors to train on. train_size: The number of vectors to use for training; the vectors are selected at random from the database. Ignored if ids_train is not None. chunksize: The chunks of data to draw from the database at a time when adding vectors to the index. metadata_columns: The metadata that should be returned for queries. index: If a pretrained index is provided, training will be skipped, any existing vectors will be discarded, and the index will be repopulated with the current contents of the database. gpu: If true, will attempt to carry out training on a GPU. dtype: The data type for the vectors distance_metric: The distance metric for the vectors """ assert ( dtype == "uint8" ), "Only unsigned 8-bit integer hashes are supported at this time." assert ( distance_metric == "euclidean" ), "Only euclidean distance is supported at this time." if index is None: # Train the index using the practices from # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#if-below-1m-vectors-ivfx ntotal = pd.read_sql( sql=f"select count(*) as count from {table}", con=con ).iloc[0]["count"] nlist = int(min(4 * np.sqrt(ntotal), ntotal / 39)) min_train_size = 39 * nlist if ids_train is not None: train_size = len(ids_train) if train_size is None: train_size = min_train_size assert ( train_size <= ntotal ), "Cannot train on more hashes than are available." assert ( train_size >= min_train_size ), f"Training an index used for {ntotal} hashes requires at least {min_train_size} training hashes." if ids_train is None: ids_train = np.random.choice( np.arange(ntotal), size=train_size, replace=False ) df_train = query_by_id( con=con, table=table, ids=ids_train, paramstyle=paramstyle, extra_columns=["hash"], ) x_train = np.array( [np.frombuffer(h, dtype=dtype) for h in df_train["hash"]] ).astype("float32") assert x_train.shape[1] == hash_length, "Hashes are of incorrect length." index = faiss.IndexIVFFlat( faiss.IndexFlatL2(hash_length), hash_length, nlist ) if gpu: res = faiss.StandardGpuResources() gpu_index = faiss.index_cpu_to_gpu(res, 0, index) gpu_index.train(x_train) index = faiss.index_gpu_to_cpu(gpu_index) else: index.train(x_train) else: index.reset() # Add hashes to the index in chunks.
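# (add_with_ids keys each vector by its database id, so search results can be
# joined back to their metadata rows.)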
for df_add in pd.read_sql( sql=f"SELECT id, hash FROM {table}", con=con, chunksize=chunksize ): x_add = np.array( [np.frombuffer(h, dtype=dtype) for h in df_add["hash"]] ).astype("float32") index.add_with_ids(x_add, df_add["id"].values) return cls( con=con, index=index, hash_length=hash_length, distance_metric=distance_metric, dtype=dtype, table=table, paramstyle=paramstyle, metadata_columns=metadata_columns, ) def query_by_id( self, ids, include_metadata=True, include_hashes=False ) -> pd.DataFrame: """Get data from the database. Args: ids: The hash IDs to get from the database. include_metadata: Whether to include metadata columns. include_hashes: Whether to include the hashes """ if not self.metadata_columns and include_metadata and not include_hashes: # There won't be anything to return. return pd.DataFrame() extra_columns = [] if self.metadata_columns and include_metadata: extra_columns += self.metadata_columns if include_hashes: extra_columns += ["hash"] return query_by_id( con=self.con, table=self.table, ids=ids, paramstyle=self.paramstyle, extra_columns=extra_columns, ) def string_to_vector(self, s: str, hash_format="base64") -> np.ndarray: """Convert a string to vector form. Args: s: The hash string hash_format: The format for the hash string """ return pht.string_to_vector( s, hash_format=hash_format, dtype=self.dtype, hash_length=self.hash_length ) def vector_to_string(self, vector, hash_format="base64") -> str | None: """Convert a vector back to string Args: vector: The hash vector hash_format: The format for the hash """ return pht.vector_to_string(vector, dtype=self.dtype, hash_format=hash_format) def search( self, queries: list[QueryInput], threshold: int | None = None, threshold_func: typing.Callable[[np.ndarray], np.ndarray] | None = None, hash_format="base64", k=1, ): """Search the index and return matches. Args: queries: A list of queries in the form of {"id": , "hash": ""} threshold: The threshold to use for matching. Takes precedence over threshold_func. threshold_func: A function that, given a query vector, returns the desired match threshold for that query. hash_format: The hash format used for the strings in the query. k: The number of nearest neighbors to return. Returns: Matches in the form of a list of dicts of the form: { "id": , "matches": [{"distance": , "id": , "metadata": {}}]} The metadata consists of the contents of the metadata columns specified for this matching instance. 
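Example (hypothetical id and hash string, assuming a populated index): matches = ann.search(queries=[{"id": "query-1", "hash": "..."}], threshold=100, k=5)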
""" try: xq = np.array( [ self.string_to_vector(h["hash"], hash_format=hash_format) for h in queries ] ).astype("float32") except Exception as exc: raise QueryDecodingFailure("Failed to parse hash query.") from exc thresholds: np.ndarray = np.ones((len(xq), 1)) * np.inf if threshold: thresholds = np.ones((len(xq), 1)) * threshold if not threshold and threshold_func: thresholds = threshold_func(xq) else: thresholds = np.ones((len(xq), 1)) * np.inf distances, indices = self.index.search(xq, k=k) distances = np.sqrt(distances) metadata = ( None if not self.metadata_columns else self.query_by_id(ids=np.unique(indices[distances < thresholds])) ) matches: list[QueryMatch] = [] for match_distances, match_ids, q, q_threshold in zip( distances, indices, queries, thresholds ): match_filter = match_distances < q_threshold match_ids = match_ids[match_filter] match_distances = match_distances[match_filter] match: QueryMatch = {"id": q["id"], "matches": []} for match_id, distance in zip(match_ids, match_distances): entry = {"distance": float(distance), "id": match_id} if metadata is not None: entry["metadata"] = metadata.loc[match_id].to_dict() match["matches"].append(entry) matches.append(match) return matches def tune(self, n_query=100, min_recall=99, max_noise=3): """Obtain minimum value for nprobe that achieves a target level of recall. Args: n_query: The number of hashes to use as test hashes. min_recall: The minimum desired recall for the index. max_noise: The maximum amount of noise to add to each test hash Returns: A tuple of recall, latency (in ms), and nprobe where the nprobe value is the one that achieved the resulting recall. Raises: TuningFailure if no suitable nprobe value is found. """ assert ( n_query <= self.ntotal ), "Cannot use a test larger than ntotal (total number of hashes)." # Pick a random set of query hashes ids = np.random.choice( np.arange(1, self.ntotal + 1), size=n_query, replace=False ) df = self.query_by_id(ids, include_metadata=False, include_hashes=True) xq = np.array( [np.frombuffer(v, dtype=self.dtype) for v in df["hash"]], dtype=np.uint8 ) noise = np.random.randint( low=(-xq.astype("int32")).clip(-max_noise, max_noise), high=(255 - xq.astype("float32")).clip(-max_noise, max_noise), ) xq = (xq.astype("int32") + noise).astype("uint8").astype("float32") if min_recall == 100: warnings.warn( "100% recall can only be ensured with exhaustive search.", UserWarning ) self.set_nprobe(self.nlist) start = time.time() self.index.search(xq, k=1) latency = time.time() - start return (100, 1000 * latency, self.nlist) # Make the search exhaustive so we get ground truth. self.set_nprobe(self.nlist) _, expected = self.index.search(xq, k=1) for nprobe in range(1, self.nlist): self.set_nprobe(nprobe) start = time.time() _, actual = self.index.search(xq, k=1) latency = time.time() - start recall = 100 * (actual[:, 0] == expected).sum() / xq.shape[0] if recall >= min_recall: break else: # If we never break, it means we never reached the target recall # for this query. raise TuningFailure( "Failed to find suitable parameters for selected recall." ) return recall, 1000 * latency, nprobe def save(self, filepath): """Save an index to disk. Args: filepath: Where to save the index. """ faiss.write_index(self.index, filepath) def set_nprobe(self, nprobe) -> int: """Set the value of nprobe. 
Args: nprobe: The new value for nprobe """ faiss.ParameterSpace().set_index_parameter(self.index, "nprobe", nprobe) return faiss.downcast_index(self.index).nprobe @property def nlist(self): """The number of lists in the index.""" return faiss.downcast_index(self.index).nlist @property def nprobe(self): """The current value of nprobe.""" return faiss.downcast_index(self.index).nprobe @property def ntotal(self): """The number of vectors in the index.""" return self.index.ntotal ================================================ FILE: perception/approximate_deduplication/serve.py ================================================ import asyncio import functools import json import logging import typing import aiohttp.web import numpy as np from pythonjsonlogger import jsonlogger import perception.hashers.tools as pht from .index import ApproximateNearestNeighbors def is_similarity_valid(data, index: ApproximateNearestNeighbors): """Validates input to the similarity endpoint.""" hash_format = data.get("hash_format", "base64") expected_string_length = pht.get_string_length( hash_length=index.hash_length, dtype=index.dtype, hash_format=hash_format ) return ( isinstance(data, dict) and "queries" in data and isinstance(data["queries"], list) and all(isinstance(x.get("hash", None), str) for x in data["queries"]) and hash_format in ["hex", "base64"] and all( len(x.get("hash", None)) == expected_string_length for x in data["queries"] ) ) async def similarity(request): """Responds to a vector similarity query of the form: ``` { "queries": [{"id": str, "hash": "base64_encoded_hash1"}, ...], "k": int, "threshold": float, "hash_format": "base64" } ``` with information about similar vectors in the index in the form: ``` { "queries": [{"id": str, "matches": [{"metadata": {json metadata}, "distance": float},...],...] } ``` """ try: request_data = await request.json() except json.JSONDecodeError: return aiohttp.web.json_response({"reason": "Malformed JSON"}, status=400) index = request.app["index"] try: assert is_similarity_valid(request_data, index) except Exception: return aiohttp.web.json_response({"reason": "Invalid JSON request"}, status=400) async with request.app["query_semaphore"]: matches = await asyncio.get_event_loop().run_in_executor( None, functools.partial( index.search, queries=request_data["queries"], threshold=request_data.get( "threshold", request.app["default_threshold"] ), threshold_func=request.app["default_threshold_func"], k=request_data.get("k", request.app["default_k"]), hash_format=request_data.get("hash_format", "base64"), ), ) matches = json.loads(json.dumps({"queries": matches})) return aiohttp.web.json_response(matches) def get_logger(name, log_level): logger = logging.Logger(name=name, level=log_level) handler = logging.StreamHandler() handler.setFormatter( jsonlogger.JsonFormatter( "%(asctime)s:%(levelname)s:%(name)s:%(message)s%(exc_info)" ) ) logger.addHandler(handler) return logger async def serve( index: ApproximateNearestNeighbors, default_threshold: int | None = None, default_threshold_func: typing.Callable[[np.ndarray], np.ndarray] | None = None, default_k: int = 1, concurrency: int = 2, log_level=logging.INFO, host="localhost", port=8080, ): """Serve an index as a web API. This function does not block. If you wish to use the function in a blocking manner, you can do something like .. code-block:: python loop = asyncio.get_event_loop() loop.run_until_complete(serve(...)) loop.run_forever() You can query the API with something like: .. 
code-block:: bash curl --header "Content-Type: application/json" \\ --request POST \\ --data '{"queries": [{"hash": "", "id": "bar"}], "threshold": 1200}' \\ http://localhost:8080/v1/similarity Args: index: The underlying index default_threshold: The default threshold for matches default_threshold_func: A function that, given an array of query vectors, returns per-query match thresholds (used when no explicit threshold is provided) default_k: The default number of nearest neighbors to look for concurrency: The number of concurrent requests served log_level: The log level to use for the logger host: The host for the service port: The port for the service """ logger = get_logger(name="serve", log_level=log_level) logger.info("Initializing web service") app = aiohttp.web.Application() app.router.add_post("/v1/similarity", similarity, name="similarity") # Store globals in the application object app["default_threshold"] = default_threshold app["logger"] = logger app["default_k"] = default_k app["default_threshold_func"] = default_threshold_func app["index"] = index app["query_semaphore"] = asyncio.Semaphore(concurrency) logger.info("Entering web service listener loop.") runner = aiohttp.web.AppRunner(app, logger=logger) await runner.setup() site = aiohttp.web.TCPSite(runner, host, port) await site.start() return site ================================================ FILE: perception/benchmarking/__init__.py ================================================ from perception.benchmarking import video_transforms from perception.benchmarking import video from perception.benchmarking import image from perception.benchmarking.image import ( BenchmarkImageDataset, BenchmarkImageTransforms, ) from perception.benchmarking.video import ( BenchmarkVideoDataset, BenchmarkVideoTransforms, ) from perception.benchmarking.common import BenchmarkHashes __all__ = [ "BenchmarkImageDataset", "BenchmarkImageTransforms", "BenchmarkVideoDataset", "BenchmarkVideoTransforms", "BenchmarkHashes", "video_transforms", "video", "image", ] ================================================ FILE: perception/benchmarking/common.py ================================================ import itertools import logging import os import shutil import tempfile import uuid import warnings import zipfile from abc import ABC import matplotlib.pyplot as plt import numpy as np import pandas as pd import tqdm from scipy import spatial, stats from ..hashers.tools import compute_md5, string_to_vector try: from . import extensions # type: ignore except ImportError: warnings.warn( "C extensions were not built. Some metrics will be computed more slowly. " "Please install from wheels or set up a compiler prior to installation " "from source to use extensions." ) extensions = None log = logging.getLogger(__name__) def create_mask(transformed_guids, noop_guids): """Given a list of transformed guids and noop guids, computes an MxN array indicating whether noop n has the same guid as transform m. Used for applying a mask to a distance matrix for efficient computation of recall at different thresholds.
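For example (hypothetical guids), transformed_guids=["a", "a", "b"] and noop_guids=["a", "b"] produce [[True, False], [True, False], [False, True]]. Both iterables are expected to be sorted by guid.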
Args: transformed_guids: An iterable of transformed guids noop_guids: An iterable of noop guids Returns: A boolean array of shape `(len(transformed_guids), len(noop_guids))` """ n_noops = len(noop_guids) previous_guid = None start = None end = 0 mask = np.zeros((len(transformed_guids), len(noop_guids)), dtype="bool") for current_guid, row in zip(transformed_guids, mask): if previous_guid is None or current_guid != previous_guid: start = end end = start + next( ( other_index for other_index, guid in enumerate(noop_guids[start:]) if guid != current_guid ), n_noops, ) previous_guid = current_guid row[start:end] = True return mask def compute_threshold_precision_recall(pos, neg, precision_threshold=99.9): # Sort both arrays according to the positive distance neg = neg[pos.argsort()] pos = pos[pos.argsort()] # Compute the precision for every candidate threshold in pos tp = np.arange(1, len(pos) + 1) fp = np.array([(neg <= t).sum() for t in pos]) precision = 100 * tp / (tp + fp) # Choose the optimal threshold bad_threshold_idxs = np.where(precision < precision_threshold)[0] if len(bad_threshold_idxs) > 0 and bad_threshold_idxs[0] > 0: optimal_threshold = pos[bad_threshold_idxs[0] - 1] recovered = (pos <= optimal_threshold).sum() if recovered == 0: optimal_precision = np.nan else: optimal_precision = precision[pos <= optimal_threshold].min() optimal_recall = round(100 * recovered / len(pos), 3) elif len(bad_threshold_idxs) > 0: # The closest hash was a false positive. optimal_threshold = pos[0] optimal_recall = 0 optimal_precision = np.nan else: optimal_precision = 100 optimal_threshold = pos.max() optimal_recall = 100 return optimal_threshold, optimal_precision, optimal_recall class Filterable(ABC): _df: pd.DataFrame expected_columns: list def __init__(self, df): assert sorted(df.columns) == sorted( self.expected_columns ), f"Column mismatch: Expected {sorted(self.expected_columns)}, found {sorted(df.columns)}." self._df = df @property def categories(self): """The categories included in the dataset""" return self._df["category"].unique() def filter(self, **kwargs): """Obtain a new dataset filtered with the given keyword arguments.""" df = self._df.copy() for field, included in kwargs.items(): existing = self._df[field].unique() if not all(inc in existing for inc in included): missing = ", ".join( [str(inc) for inc in included if inc not in existing] ) message = f"Did not find {missing} in column {field} of the dataset." warnings.warn(message, UserWarning) df = df[df[field].isin(included)] return self.__class__(df.copy()) class Saveable(Filterable): @classmethod def load( cls, path_to_zip_or_directory: str, storage_dir: str | None = None, verify_md5=True, ): """Load a dataset from a ZIP file or directory. Args: path_to_zip_or_directory: Pretty self-explanatory storage_dir: If providing a ZIP file, where to extract the contents. If None, contents will be extracted to a folder with the same name as the ZIP file in the same directory as the ZIP file. verify_md5: Verify md5s when loading """ # Load index whether from inside ZIP file or from directory. if os.path.splitext(path_to_zip_or_directory)[1] == ".zip": if storage_dir is None: storage_dir = os.path.join( os.path.dirname(os.path.abspath(path_to_zip_or_directory)), os.path.splitext(os.path.basename(path_to_zip_or_directory))[0], ) os.makedirs(storage_dir, exist_ok=True) with zipfile.ZipFile(path_to_zip_or_directory, "r") as z: # Try extracting only the index at first so we can # compare md5.
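# (If every file is already present locally with a matching md5, the full
# extraction is skipped.)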
z.extract("index.csv", os.path.join(storage_dir)) index: pd.DataFrame = pd.read_csv( os.path.join(storage_dir, "index.csv") ) index["filepath"] = index["filename"].apply( lambda fn: ( os.path.join(storage_dir, fn) if not pd.isnull(fn) else None ) ) do_zip_extraction = True if index["filepath"].apply(os.path.isfile).all(): if verify_md5: do_zip_extraction = not all( row["md5"] == compute_md5(row["filepath"]) for _, row in tqdm.tqdm( index.iterrows(), desc="Checking cache" ) ) else: do_zip_extraction = False if do_zip_extraction: z.extractall(storage_dir) else: log.info("Found all files already extracted. Skipping extraction.") verify_md5 = False else: assert ( storage_dir is None ), "Storage directory only valid if path is to ZIP file." index = pd.read_csv(os.path.join(path_to_zip_or_directory, "index.csv")) index["filepath"] = index["filename"].apply( lambda fn: ( os.path.join(path_to_zip_or_directory, fn) if not pd.isnull(fn) else None ) ) if verify_md5: assert all( row["md5"] == compute_md5(row["filepath"]) for _, row in tqdm.tqdm( index.iterrows(), desc="Performing final md5 integrity check.", total=len(index.index), ) ), "An md5 mismatch has occurred." return cls(index.drop(["filename", "md5"], axis=1)) def save(self, path_to_zip_or_directory): """Save a dataset to a directory or ZIP file. Args: path_to_zip_or_directory: Pretty self-explanatory """ df = self._df assert "filepath" in df.columns, "Index dataframe must contain filepath." # Build index using filename instead of filepath. index = df.copy() index["filename"] = df["filepath"].apply( lambda filepath: ( os.path.basename(filepath) if not pd.isnull(filepath) else None ) ) if index["filename"].dropna().duplicated().sum() > 0: warnings.warn("Changing filenames to UUID due to duplicates.", UserWarning) index["filename"] = [ ( str(uuid.uuid4()) + os.path.splitext(row["filename"])[1] if not pd.isnull(row["filename"]) else None ) for _, row in index.iterrows() ] index["md5"] = [ compute_md5(filepath) if not pd.isnull(filepath) else None for filepath in tqdm.tqdm(index["filepath"], desc="Computing md5s.") ] # Add all files as well as the dataframe index to # a ZIP file if path is to ZIP file or to the directory if it is # not a ZIP file. if os.path.splitext(path_to_zip_or_directory)[1] == ".zip": with zipfile.ZipFile(path_to_zip_or_directory, "w") as f: with tempfile.TemporaryFile(mode="w+") as index_file: index.drop("filepath", axis=1).to_csv(index_file, index=False) index_file.seek(0) f.writestr("index.csv", index_file.read()) for _, row in tqdm.tqdm( index.iterrows(), desc="Saving files", total=len(df) ): if pd.isnull(row["filepath"]): # There was an error associated with this file. continue f.write(row["filepath"], row["filename"]) else: os.makedirs(path_to_zip_or_directory, exist_ok=True) index.drop("filepath", axis=1).to_csv( os.path.join(path_to_zip_or_directory, "index.csv"), index=False ) for _, row in tqdm.tqdm( index.iterrows(), desc="Saving files", total=len(df) ): if pd.isnull(row["filepath"]): # There was an error associated with this file. continue if row["filepath"] == os.path.join( path_to_zip_or_directory, row["filename"] ): # The source file is the same as the target file. continue shutil.copy( row["filepath"], os.path.join(path_to_zip_or_directory, row["filename"]), ) class BenchmarkHashes(Filterable): """A dataset of hashes for transformed images. 
It is essentially a wrapper around a `pandas.DataFrame` with the following columns: - guid - error - filepath - input_filepath - category - transform_name - hasher_name - hasher_dtype - hasher_distance_metric - hasher_hash_length - hash """ expected_columns = [ "error", "filepath", "hash", "hasher_name", "hasher_dtype", "hasher_distance_metric", "category", "guid", "input_filepath", "transform_name", "hasher_hash_length", ] def __init__(self, df: pd.DataFrame): super().__init__(df) self._metrics: pd.DataFrame | None = None def __add__(self, other): return BenchmarkHashes(df=pd.concat([self._df, other._df]).drop_duplicates()) def __radd__(self, other): return self.__add__(other) @classmethod def load(cls, filepath: str): return cls(pd.read_csv(filepath)) def save(self, filepath): self._df.to_csv(filepath, index=False) def compute_metrics( self, custom_distance_metrics: dict | None = None ) -> pd.DataFrame: if self._metrics is not None: return self._metrics metrics = [] hashsets = self._df.sort_values("guid") n_dropped = hashsets["hash"].isnull().sum() if n_dropped > 0: hashsets = hashsets.dropna(subset=["hash"]) warnings.warn(f"Dropping {n_dropped} invalid / empty hashes.", UserWarning) for (hasher_name, transform_name, category), hashset in tqdm.tqdm( hashsets.groupby(["hasher_name", "transform_name", "category"]), desc="Computing metrics.", ): # Note the guid filtering below. We need to include only guids # for which we have the transform *and* the noop. One of them # may have been dropped due to being invalid. noops = hashsets[ (hashsets["transform_name"] == "noop") & (hashsets["hasher_name"] == hasher_name) & (hashsets["guid"].isin(hashset["guid"])) ] valid_hashset = hashset[hashset["guid"].isin(noops["guid"])] dtype, distance_metric, hash_length = valid_hashset.iloc[0][ ["hasher_dtype", "hasher_distance_metric", "hasher_hash_length"] ] n_noops = len(noops.guid) n_hashset = len(valid_hashset.guid) noop_guids = noops.guid.values mask = create_mask(valid_hashset.guid.values, noops.guid.values) if distance_metric != "custom": X_trans = np.array( valid_hashset.hash.apply( string_to_vector, # type: ignore[arg-type] hash_length=int(hash_length), dtype=dtype, hash_format="base64", ).tolist() ) X_noop = np.array( noops.hash.apply( string_to_vector, # type: ignore[arg-type] dtype=dtype, hash_format="base64", hash_length=int(hash_length), ).tolist() ) if ( distance_metric != "euclidean" or "int" not in dtype or extensions is None ): distance_matrix = spatial.distance.cdist( XA=X_trans, XB=X_noop, metric=distance_metric ) distance_to_closest_image = distance_matrix.min(axis=1) distance_to_correct_image = np.ma.masked_array( distance_matrix, np.logical_not(mask) ).min(axis=1) distance_matrix_incorrect_image: np.ndarray = np.ma.masked_array( distance_matrix, mask ) distance_to_incorrect_image = distance_matrix_incorrect_image.min( axis=1 ) closest_incorrect_guid = noop_guids[ distance_matrix_incorrect_image.argmin(axis=1) ] else: distances, indexes = extensions.compute_euclidean_metrics( X_noop.astype("int32"), X_trans.astype("int32"), mask ) distance_to_correct_image = distances[:, 1] distance_to_incorrect_image = distances[:, 0] distance_to_closest_image = distances.min(axis=1) closest_incorrect_guid = [noop_guids[idx] for idx in indexes[:, 0]] else: assert ( custom_distance_metrics is not None and hasher_name in custom_distance_metrics ), f"You must provide a custom distance metric for {hasher_name}."
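# Custom metrics operate on the raw hash values rather than decoded
# vectors, so the distance matrix is built with an explicit pairwise loop.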
noops_hash_values = noops.hash.values hashset_hash_values = valid_hashset.hash.values distance_matrix = np.zeros((n_hashset, n_noops)) distance_function = custom_distance_metrics[hasher_name] for i1, i2 in itertools.product(range(n_hashset), range(n_noops)): distance_matrix[i1, i2] = distance_function( hashset_hash_values[i1], noops_hash_values[i2] ) distance_to_closest_image = distance_matrix.min(axis=1) distance_to_correct_image = np.ma.masked_array( distance_matrix, np.logical_not(mask) ).min(axis=1) distance_matrix_incorrect_image = np.ma.masked_array( distance_matrix, mask ) distance_to_incorrect_image = distance_matrix_incorrect_image.min( axis=1 ) closest_incorrect_guid = noop_guids[ distance_matrix_incorrect_image.argmin(axis=1) ] metrics.append( pd.DataFrame( { "guid": valid_hashset["guid"].values, "transform_name": transform_name, "hasher_name": hasher_name, "category": category, "distance_to_closest_correct_image": distance_to_correct_image, "distance_to_closest_incorrect_image": distance_to_incorrect_image, "distance_to_closest_image": distance_to_closest_image, "closest_incorrect_guid": closest_incorrect_guid, } ) ) metrics_df = pd.concat(metrics) self._metrics = metrics_df return metrics_df def show_histograms(self, grouping=None, precision_threshold=99.9, **kwargs): """Plot histograms for true and false positives, similar to https://tech.okcupid.com/evaluating-perceptual-image-hashes-okcupid/ Additional arguments passed to compute_metrics. Args: grouping: List of fields to group by. By default, all fields are used (category, and transform_name). """ if grouping is None: grouping = ["category", "transform_name"] metrics = self.compute_metrics(**kwargs) hasher_names = metrics["hasher_name"].unique().tolist() bounds = ( metrics.groupby("hasher_name")[ ["distance_to_closest_image", "distance_to_closest_incorrect_image"] ] .max() .max(axis=1) ) if grouping: group_names = [ ":".join(map(str, row.values)) for idx, row in metrics[grouping].drop_duplicates().iterrows() ] else: group_names = [""] ncols = len(hasher_names) nrows = len(group_names) fig, axs = plt.subplots( ncols=ncols, nrows=nrows, figsize=(ncols * 4, nrows * 3), sharey=True ) for group_name, subset in metrics.groupby(["hasher_name"] + grouping): # Get names of group and hasher if grouping: hasher_name = group_name[0] group_name = ":".join(map(str, group_name[1:])) else: hasher_name = group_name group_name = "" # Get the correct axis. 
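# matplotlib returns a 2-D array of axes only when both grid dimensions
# exceed one, a bare Axes object for a 1x1 grid, and a 1-D array otherwise.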
colIdx = hasher_names.index(hasher_name) rowIdx = group_names.index(group_name) if ncols > 1 and nrows > 1: ax = axs[rowIdx, colIdx] elif ncols == 1 and nrows == 1: ax = axs else: ax = axs[rowIdx if nrows > 1 else colIdx] # Plot the charts inner_keys = ["guid"] + ( ["transform_name"] if "transform_name" in subset.columns else [] ) pos, neg = ( subset.groupby(inner_keys)[ [ "distance_to_closest_correct_image", "distance_to_closest_incorrect_image", ] ] .min() .values.T ) optimal_threshold, _, optimal_recall = compute_threshold_precision_recall( pos=pos, neg=neg, precision_threshold=precision_threshold ) optimal_threshold = optimal_threshold.round(3) emd = stats.wasserstein_distance(pos, neg).round(2) ax.hist(neg, label="neg", bins=10) ax.hist(pos, label="pos", bins=10) ax.text( 0.5, 0.5, f"Recall: {optimal_recall:.0f}% @ {optimal_threshold}\nemd: {emd:.2f}", horizontalalignment="center", color="black", verticalalignment="center", transform=ax.transAxes, fontsize=12, fontweight=1000, ) ax.set_xlim(-0.05 * bounds[hasher_name], bounds[hasher_name]) if rowIdx == 0: ax.set_title(hasher_name) ax.legend() if colIdx == 0: ax.set_ylabel(group_name) fig.tight_layout() def compute_threshold_recall( self, precision_threshold=99.9, grouping=None, **kwargs ) -> pd.DataFrame: """Compute a table for threshold and recall for each category, hasher, and transformation combinations. Additional arguments passed to compute_metrics. Args: precision_threshold: The precision threshold to use for choosing a distance threshold for each hasher. grouping: List of fields to group by. By default, all fields are used (category, and transform_name). Returns: A pandas DataFrame with 7 columns. The key columns are threshold (The optimal distance threshold for detecting a match for this combination), recall (the number of correct matches divided by the number of possible matches), and precision (the number correct matches divided by the total number of matches whether correct or incorrect). """ if grouping is None: grouping = ["category", "transform_name"] def group_func(subset): inner_keys = ["guid"] + ( ["transform_name"] if "transform_name" in subset.columns else [] ) pos, neg = ( subset.groupby(inner_keys)[ [ "distance_to_closest_correct_image", "distance_to_closest_incorrect_image", ] ] .min() .values.T ) ( optimal_threshold, optimal_precision, optimal_recall, ) = compute_threshold_precision_recall( pos=pos, neg=neg, precision_threshold=precision_threshold ) return pd.Series( { "threshold": optimal_threshold, "recall": optimal_recall, "precision": optimal_precision, "n_exemplars": len(subset), } ) return ( self.compute_metrics(**kwargs) .groupby(grouping + ["hasher_name"]) .apply(group_func) ) class BenchmarkDataset(Saveable): """A dataset of images separated into categories. It is essentially a wrapper around a pandas dataframe with the following columns: - filepath - category """ expected_columns = ["filepath", "category"] @classmethod def from_tuples(cls, files: list[tuple[str, str]]): """Build dataset from a set of files. Args: files: A list of tuples where each entry is a pair filepath and category. """ df = pd.DataFrame.from_records( [{"filepath": f, "category": c} for f, c in files] ) return cls(df) def transform(self, transforms, storage_dir, errors): raise NotImplementedError() class BenchmarkTransforms(Saveable): """A dataset of transformed images. 
Essentially wraps a DataFrame with the following columns: - guid - filepath - category - transform_name - input_filepath (for memo purposes only) """ expected_columns = [ "filepath", "category", "transform_name", "input_filepath", "guid", ] def compute_hashes(self, hashers, max_workers): raise NotImplementedError() ================================================ FILE: perception/benchmarking/extensions.pyx ================================================ # cython: language_level=3 import cython import numpy as np from cython.parallel import parallel, prange cimport numpy as np from libc.math cimport sqrt from libc.stdlib cimport abort, free, malloc cdef extern from "limits.h": int INT_MAX ctypedef np.uint8_t uint8 @cython.boundscheck(False) @cython.wraparound(False) def compute_euclidean_metrics(int[:, :] X_noop, int[:, :] X_tran, uint8[:, :] mask): """Compute the positive / negative distance metrics between two sets of vectors using euclidean distance. This function obtains the necessary metrics roughly 10x faster than using scipy.spatial.distance.cdist and numpy functions. Args: X_noop: The vectors for the noop hashes with shape (N, K) X_tran: The vectors for the transformed instances with shape (M, K) mask: A (M, N) array indicating whether noop n corresponds to transform m Returns: distances: An M by 2 array with the closest false positive and closest true positive for each transform. indexes: An M by 2 array with the index for the closest false positive noop and the closest true positive noop. """ cdef Py_ssize_t n_noop = X_noop.shape[0] cdef Py_ssize_t d_noop = X_noop.shape[1] cdef Py_ssize_t n_tran = X_tran.shape[0] cdef Py_ssize_t d_tran = X_tran.shape[1] cdef Py_ssize_t n_mask_tran = mask.shape[0] cdef Py_ssize_t n_mask_noop = mask.shape[1] cdef Py_ssize_t i_mask_tran cdef Py_ssize_t i_mask_noop cdef int n_pos cdef int current_distance cdef int current_closest_fp cdef int current_closest_tp cdef int[:] x cdef int[:] y cdef uint8 is_pos cdef Py_ssize_t i_noop, i_tran, i_d cdef Py_ssize_t i_closest_fp = 0 cdef Py_ssize_t i_closest_tp = 1 cdef Py_ssize_t i_closest_fp_idx = 0 cdef Py_ssize_t i_closest_tp_idx = 1 cdef int * local_buf cdef size_t size = 5 cdef float NAN NAN = float("NaN") assert d_noop == d_tran, "Dimensionality of vectors must match." assert n_mask_tran == n_tran, "Dimension 0 of mask must correspond to n_transforms." assert n_mask_noop == n_noop, "Dimension 1 of mask must correspond to n_noops." for i_mask_tran in range(n_mask_tran): n_pos = 0 for i_mask_noop in range(n_mask_noop): if mask[i_mask_tran, i_mask_noop] == True: n_pos += 1 assert n_pos > 0, "All transforms must have at least one positive noop." assert n_pos < n_mask_noop, "All transforms must have at least one negative noop." 
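# Output buffers: column 0 holds the closest false positive and column 1
# the closest true positive for each transform (see i_closest_fp and
# i_closest_tp above).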
distances = np.zeros((n_tran, 2), dtype=np.float32) indexes = np.zeros((n_tran, 2), dtype=np.int32) cdef np.float32_t[:, :] distances_view = distances cdef int[:, :] indexes_view = indexes with nogil, parallel(): local_buf = malloc(sizeof(int) * size) if local_buf is NULL: abort() for i_tran in prange(n_tran): local_buf[1] = INT_MAX # Smallest false positive distance local_buf[2] = INT_MAX # Smallest true positive distance local_buf[3] = 0 # Smallest false positive index local_buf[4] = 0 # Smallest true positive index for i_noop in range(n_noop): local_buf[0] = 0 # Current distance is_pos = mask[i_tran, i_noop] == True for i_d in range(d_noop): local_buf[0] += (X_noop[i_noop, i_d] - X_tran[i_tran, i_d]) ** 2 if is_pos and (local_buf[0] < local_buf[2]): local_buf[2] = local_buf[0] local_buf[4] = i_noop if not is_pos and (local_buf[0] < local_buf[1]): local_buf[1] = local_buf[0] local_buf[3] = i_noop # I do not think that a distance can ever actually be # greater than INT_MAX but we'll leave the check in. if local_buf[1] < INT_MAX: distances_view[i_tran, i_closest_fp] = sqrt(local_buf[1]) else: distances_view[i_tran, i_closest_fp] = NAN if local_buf[2] < INT_MAX: distances_view[i_tran, i_closest_tp] = sqrt(local_buf[2]) else: distances_view[i_tran, i_closest_tp] = NAN indexes_view[i_tran, i_closest_fp_idx] = local_buf[3] indexes_view[i_tran, i_closest_tp_idx] = local_buf[4] free(local_buf) return distances, indexes ================================================ FILE: perception/benchmarking/image.py ================================================ import logging import os import uuid import warnings import cv2 import albumentations import pandas as pd from tqdm import tqdm from ..hashers import tools from ..hashers.hasher import ImageHasher from ..tools import deduplicate, flatten from .common import BenchmarkDataset, BenchmarkHashes, BenchmarkTransforms log = logging.getLogger(__name__) class BenchmarkImageTransforms(BenchmarkTransforms): def compute_hashes( self, hashers: dict[str, ImageHasher], max_workers: int = 5 ) -> BenchmarkHashes: """Compute hashes for a series of files given some set of hashers. Args: hashers: A dictionary of hashers. max_workers: Maximum number of workers for parallel hash computation. Returns: hashes: A BenchmarkHashes object.
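Example (a minimal sketch; assumes `transforms` is an existing
BenchmarkImageTransforms instance):

    from perception.hashers import PHash

    hashes = transforms.compute_hashes(hashers={"phash": PHash(hash_size=16)})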
""" hashsets = [] filepaths = self._df["filepath"] for hasher_name, hasher in hashers.items(): hash_dicts = hasher.compute_parallel( filepaths, progress=tqdm, progress_desc=f"Computing hashes for {hasher_name}", max_workers=max_workers, ) if not hasher.returns_multiple: hashes_df = pd.DataFrame.from_records(hash_dicts) else: hash_groups = [ hash_dict["hash"] if hash_dict["error"] is None else [None] for hash_dict in hash_dicts ] hash_group_sizes = [len(hash_group) for hash_group in hash_groups] current_hashes = flatten(hash_groups) current_filepaths = flatten( [ [hash_dict["filepath"]] * hash_group_size for hash_dict, hash_group_size in zip( hash_dicts, hash_group_sizes ) ] ) current_errors = flatten( [ [hash_dict["error"]] * hash_group_size for hash_dict, hash_group_size in zip( hash_dicts, hash_group_sizes ) ] ) hashes_df = pd.DataFrame( { "error": current_errors, "filepath": current_filepaths, "hash": current_hashes, } ) hashset = hashes_df.assign( hasher_name=hasher_name, hasher_hash_length=hasher.hash_length, hasher_dtype=hasher.dtype, hasher_distance_metric=hasher.distance_metric, ) hashset = hashset.merge(self._df, on="filepath") hashsets.append(hashset) return BenchmarkHashes(pd.concat(hashsets, sort=True)) class BenchmarkImageDataset(BenchmarkDataset): def deduplicate( self, hasher: ImageHasher, threshold=0.001, isometric=False ) -> tuple["BenchmarkImageDataset", set[tuple[str, str]]]: """Remove duplicate files from dataset. Args: files: A list of file paths hasher: A hasher to use for finding a duplicate threshold: The threshold required for a match isometric: Whether to compute the rotated versions of the images Returns: A list where each entry is a list of files that are duplicates of each other. We keep only the last entry. """ pairs: set[tuple[str, str]] = set() for _, group in tqdm( self._df.groupby(["category"]), desc="Deduplicating categories." ): pairs = pairs.union( set( deduplicate( files=group["filepath"].tolist(), hashers=[(hasher, threshold)], isometric=isometric, ) ) ) removed = [pair[0] for pair in pairs] return ( BenchmarkImageDataset(self._df[~self._df["filepath"].isin(removed)].copy()), pairs, ) def transform( self, transforms: dict[str, albumentations.BasicTransform], storage_dir: str, errors: str = "raise", ) -> BenchmarkImageTransforms: """Prepare files to be used as part of benchmarking run. Args: transforms: A dictionary of transformations. The only required key is `noop` which determines how the original, untransformed image is saved. For a true copy, simply make the `noop` key `albumentations.NoOp` storage_dir: A directory to store all the images along with their transformed counterparts. errors: How to handle errors reading files. If "raise", exceptions are raised. If "warn", the error is printed as a warning. Returns: transforms: A BenchmarkImageTransforms object """ assert ( "noop" in transforms ), "You must provide a no-op transform such as `lambda img: img`." os.makedirs(storage_dir, exist_ok=True) files = self._df.copy() files["guid"] = [str(uuid.uuid4()) for n in range(len(files))] def apply_transform(files, transform_name): transform = transforms[transform_name] transformed_arr = [] for _, row in tqdm( files.iterrows(), desc=f"Creating files for {transform_name}", total=len(files), ): filepath, guid, category = row[["filepath", "guid", "category"]] try: image = tools.read(filepath) except Exception as exception: message = f"An error occurred reading {filepath}." 
if errors == "raise": raise exception warnings.warn(message, UserWarning) continue try: transformed = transform(image=image) # If albumentations, output is a dict with 'image' key if isinstance(transformed, dict) and "image" in transformed: transformed = transformed["image"] except Exception as e: raise RuntimeError( f"An exception occurred while processing {filepath} " f"with transform {transform_name}." ) from e transformed_path = os.path.join( storage_dir, f"{guid}_{transform_name}.jpg" ) cv2.imwrite( transformed_path, cv2.cvtColor(transformed, cv2.COLOR_RGB2BGR) ) transformed_arr.append( { "guid": guid, "transform_name": transform_name, "input_filepath": filepath, "filepath": transformed_path, "category": category, } ) return pd.DataFrame.from_records(transformed_arr) results = [apply_transform(files, transform_name="noop")] for transform_name in transforms.keys(): if transform_name == "noop": continue results.append(apply_transform(results[0], transform_name=transform_name)) benchmark_transforms = BenchmarkImageTransforms( df=pd.concat(results, axis=0, ignore_index=True) ) benchmark_transforms.save(storage_dir) return benchmark_transforms ================================================ FILE: perception/benchmarking/image_transforms.py ================================================ import cv2 import numpy as np def apply_watermark(watermark, alpha: float = 1.0, size: float = 1.0): """Apply a watermark to the bottom right of images. Based on the work provided at https://www.pyimagesearch.com/2016/04/25/watermarking-images-with-opencv-and-python/ Args: watermark: The watermark to overlay alpha: The strength of the overlay size: The maximum proportion of the image taken by the watermark. """ assert watermark.shape[-1] == 4, "Watermark must have an alpha channel." # Why do we have to do this? It's not clear. But the process doesn't work # without it. B, G, R, A = cv2.split(watermark) B = cv2.bitwise_and(B, B, mask=A) G = cv2.bitwise_and(G, G, mask=A) R = cv2.bitwise_and(R, R, mask=A) watermark = cv2.merge([B, G, R, A]) def transform(image): # Add alpha channel h, w = image.shape[:2] wh, ww = watermark.shape[:2] scale = size * min(h / wh, w / ww) image = np.dstack([image, np.ones((h, w), dtype="uint8") * 255]) # Construct an overlay that is the same size as the input. overlay = np.zeros((h, w, 4), dtype="uint8") scaled = cv2.resize(watermark, (int(scale * ww), int(scale * wh))) sh, sw = scaled.shape[:2] overlay[max(h - sh, 0) :, max(w - sw, 0) : w] = scaled # Blend the two images together using transparent overlays output = image.copy() cv2.addWeighted(overlay, alpha, output, 1.0, 0, output) return cv2.cvtColor(output, cv2.COLOR_RGBA2RGB) return transform ================================================ FILE: perception/benchmarking/video.py ================================================ import concurrent.futures import os import typing import uuid import pandas as pd import tqdm from ..hashers import VideoHasher, tools from ..tools import flatten from .common import BenchmarkDataset, BenchmarkHashes, BenchmarkTransforms def _process_row(row, hashers, framerates): error = None try: assert not pd.isnull(row["filepath"]), "No filepath provided." 
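# Compute hashes for all hashers together; the precomputed common
# framerates let hashers with compatible rates share decoded frames.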
hashes = tools.compute_synchronized_video_hashes( filepath=row["filepath"], hashers=hashers, framerates=framerates, hash_format="base64", ) except Exception as exception: error = str(exception) hashes = { hasher_name: [None] if hasher.returns_multiple else None for hasher_name, hasher in hashers.items() } base_dict = { "guid": row["guid"], "filepath": row["filepath"], "error": error, "category": row["category"], "transform_name": row["transform_name"], "input_filepath": row["input_filepath"], } hash_dicts = [] for hasher_name, hasher in hashers.items(): base_hash_dict = { "hasher_name": hasher_name, "hasher_dtype": hasher.dtype, "hasher_distance_metric": hasher.distance_metric, "hasher_hash_length": hasher.hash_length, } if not hasher.returns_multiple: hash_dicts.append( { **{ "hash": hashes[hasher_name], }, **base_hash_dict, } ) else: for hash_value in hashes[hasher_name]: hash_dicts.append( { **{ "hash": hash_value, }, **base_hash_dict, } ) return [{**hash_dict, **base_dict} for hash_dict in hash_dicts] class BenchmarkVideoDataset(BenchmarkDataset): def transform( self, transforms: dict[str, typing.Callable], storage_dir: str, errors: str = "raise", ): """Prepare files to be used as part of benchmarking run. Args: transforms: A dictionary of transformations. The only required key is `noop` which determines how the original, untransformed video is saved. Each transform should be a callable function with that accepts an `input_filepath` and `output_filepath` argument and it should return the `output_filepath` (which may have a different extension appended by the transform function). storage_dir: A directory to store all the videos along with their transformed counterparts. errors: How to handle errors reading files. If "raise", exceptions are raised. If "warn", the error is printed as a warning. Returns: transforms: A BenchmarkVideoTransforms object """ assert "noop" in transforms, "You must provide a no-op transform." os.makedirs(storage_dir, exist_ok=True) files = self._df.copy() files["guid"] = [str(uuid.uuid4()) for n in range(len(files))] def apply_transform_to_file(input_filepath, guid, transform_name, category): if input_filepath is None: # This can happen if the noop transform did not yield # a file. We don't want to drop the records so we # keep them. 
return { "guid": guid, "error": "No source file provided", "transform_name": transform_name, "input_filepath": input_filepath, "filepath": None, "category": category, } try: output_filepath = transforms[transform_name]( input_filepath, output_filepath=os.path.join( storage_dir, f"{guid}_{transform_name}" ), ) error = None except Exception as e: output_filepath = None error = str(e) return { "guid": guid, "error": error, "transform_name": transform_name, "input_filepath": input_filepath, "filepath": output_filepath, "category": category, } def apply_transform_to_files(files, transform_name): return pd.DataFrame.from_records( [ apply_transform_to_file( input_filepath=row["filepath"], guid=row["guid"], transform_name=transform_name, category=row["category"], ) for _, row in tqdm.tqdm( files.iterrows(), desc=f"Creating files for {transform_name}", total=len(files), ) ] ) results = [apply_transform_to_files(files, transform_name="noop")] for transform_name in transforms.keys(): if transform_name == "noop": continue results.append( apply_transform_to_files(results[0], transform_name=transform_name) ) benchmark_transforms = BenchmarkVideoTransforms( df=pd.concat(results, axis=0, ignore_index=True) ) benchmark_transforms.save(storage_dir) return benchmark_transforms class BenchmarkVideoTransforms(BenchmarkTransforms): expected_columns = [ "filepath", "category", "transform_name", "input_filepath", "guid", "error", ] def compute_hashes( self, hashers: dict[str, VideoHasher], max_workers: int = 5 ) -> BenchmarkHashes: """Compute hashes for a series of files given some set of hashers. Args: hashers: A dictionary of hashers. max_workers: Maximum number of workers for parallel hash computation. Returns: hashes: A BenchmarkHashes object. """ id_rates = { hasher_name: hasher.frames_per_second for hasher_name, hasher in hashers.items() if hasher.frames_per_second is not None } if id_rates: framerates = tools.get_common_framerates( { hasher_name: hasher.frames_per_second for hasher_name, hasher in hashers.items() if hasher.frames_per_second is not None } ) else: framerates = {} with concurrent.futures.ProcessPoolExecutor( max_workers=max_workers ) as executor: futures = [ executor.submit( _process_row, row=row, framerates=framerates, hashers=hashers ) for index, row in self._df.iterrows() ] return BenchmarkHashes( pd.DataFrame.from_records( flatten( [ future.result() for future in tqdm.tqdm( concurrent.futures.as_completed(futures), desc="Computing hashes.", total=len(self._df), ) ] ) ) ) ================================================ FILE: perception/benchmarking/video_transforms.py ================================================ import os import cv2 import ffmpeg from ..hashers.tools import read_video def probe(filepath): """Get the output of ffprobe.""" return ffmpeg.probe(filepath) def sanitize_output_filepath(input_filepath, output_filepath, output_ext=None): """Get a suitable output filepath with an extension based on an input filepath. Args: input_filepath: The filepath for the source file. output_filepath: The filepath for the output file. 
output_ext: A new extension to add (e.g., '.gif') """ _, input_ext = os.path.splitext(input_filepath) if not output_filepath.lower().endswith(output_ext or input_ext): output_filepath += output_ext or input_ext return output_filepath def get_simple_transform( width: str | int = -1, height: str | int = -1, pad: str | None = None, codec: str | None = None, clip_pct: tuple[float, float] | None = None, clip_s: tuple[float, float] | None = None, sar=None, fps=None, output_ext=None, ): """Resize to a specific size and re-encode. Args: width: The target width (-1 to maintain aspect ratio) height: The target height (-1 to maintain aspect ratio) pad: An ffmpeg pad argument provided as a string. codec: The codec for encoding the video. fps: The new frame rate for the video. clip_pct: The video start and end in percentages of video duration. clip_s: The video start and end in seconds (used over clip_pct if both are provided). sar: Whether to make all videos have a common sample aspect ratio (i.e., for all square pixels, set this to '1/1'). output_ext: The extension to use when re-encoding (used to select video format). It should include the leading '.'. """ def transform(input_filepath, output_filepath): output_filepath = sanitize_output_filepath( input_filepath, output_filepath, output_ext ) data = None if codec is None: data = data or probe(input_filepath) output_codec = [s for s in data["streams"] if s["codec_type"] == "video"][ 0 ]["codec_name"] else: output_codec = codec format_kwargs = {"codec:v": output_codec} if clip_pct is not None or clip_s is not None: pct_start, pct_end, pos_start, pos_end = None, None, None, None if clip_pct is not None: pct_start, pct_end = clip_pct if clip_s is not None: pos_start, pos_end = clip_s if pct_start is not None: assert 0 <= pct_start <= 1, "Start position must be between 0 and 1." if pct_end is not None: assert 0 <= pct_end <= 1, "End position must be between 0 and 1." if pct_start is not None and pct_end is not None: assert pct_start < pct_end, "End must be greater than start." if (pct_start is not None and pos_start is None) or ( pct_end is not None and pos_end is None ): # We only want to get the duration for the video if we need # it. data = data or probe(input_filepath) duration = float(data["streams"][0]["duration"]) if pct_start is not None or pos_start is not None: format_kwargs["ss"] = pos_start or pct_start * duration # type: ignore if pct_end is not None or pos_end is not None: format_kwargs["t"] = pos_end or pct_end * duration # type: ignore stream = ffmpeg.input(input_filepath) if not (width == -1 and height == -1): stream = stream.filter("scale", width, height) if pad is not None: stream = stream.filter("pad", *pad.split(":")) if fps is not None: stream = stream.filter("fps", fps) if sar is not None: stream = stream.filter("setsar", sar) stream = stream.output(output_filepath, **format_kwargs).overwrite_output() ffmpeg.run(stream) if os.path.isfile(output_filepath): return output_filepath return None return transform def get_slideshow_transform( frame_input_rate, frame_output_rate, max_frames=None, offset=0 ): """Get a slideshow transform to create slideshows from videos. Args: frame_input_rate: The rate at which frames will be sampled from the source video (e.g., a rate of 1 means we collect one frame per second of the input video). frame_output_rate: The rate at which the sampled frames are played in the slideshow (e.g., a rate of 0.5 means each frame will appear for 2 seconds). max_frames: The maximum number of frames to write. 
offset: The number of seconds to wait before beginning the slide show. """ def transform(input_filepath, output_filepath): output_filepath = sanitize_output_filepath( input_filepath, output_filepath, output_ext=".avi" ) writer = None frame_count = 0 try: for frame, _, timestamp in read_video( filepath=input_filepath, frames_per_second=frame_input_rate ): if timestamp < offset: continue if writer is None: writer = cv2.VideoWriter( filename=output_filepath, fourcc=cv2.VideoWriter_fourcc(*"MJPG"), # type: ignore[attr-defined] fps=frame_output_rate, frameSize=tuple(frame.shape[:2][::-1]), isColor=True, ) writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) frame_count += 1 if max_frames is not None and frame_count >= max_frames: break finally: if writer is not None: writer.release() if os.path.isfile(output_filepath): return output_filepath return None return transform def get_black_frame_padding_transform(duration_s=0, duration_pct=0): """Get a transform that adds black frames at the start and end of a video. Args: duration_s: The duration of the black frames in seconds. duration_pct: The duration of the black frames as a percentage of video duration. If both duration_s and duration_pct are provided, the maximum value is used. """ def transform(input_filepath, output_filepath): output_filepath = sanitize_output_filepath(input_filepath, output_filepath) stream = next( stream for stream in probe(input_filepath)["streams"] if stream["codec_type"] == "video" ) assert stream["sample_aspect_ratio"] == "1:1", "SAR is not 1:1." width = stream["width"] height = stream["height"] duration = max(duration_s, duration_pct * float(stream["duration"])) ffmpeg.input(input_filepath).output( output_filepath, vf=( "color=c=black:s={width}x{height}:d={duration} [pre] ; " "color=c=black:s={width}x{height}:d={duration} [post] ; " "[pre] [in] [post] concat=n=3" ).format(width=width, height=height, duration=duration), fps_mode="vfr", ).overwrite_output().run() if os.path.isfile(output_filepath): return output_filepath return None return transform ================================================ FILE: perception/extensions.pyx ================================================ # cython: language_level=3 # cython: language=c++ import math import sys import cython import numpy as np from cython.parallel import parallel, prange cimport numpy as np from libc.stdlib cimport abort, free, malloc from libcpp cimport bool as cppbool from libcpp.vector cimport vector cdef extern from "limits.h": int INT_MAX ctypedef np.uint8_t uint8 @cython.boundscheck(False) @cython.wraparound(False) def compute_euclidean_pairwise_duplicates(int[:, :] X, float threshold, counts: np.uint32_t[:] = None, compute_overlap=False): """Find the pairwise overlap within an array of vectors, where there may be multiple vectors for the same file. This function is faster than using scipy.spatial.distance because it computes distances in parallel, avoids computing full distances when they're not necessary, skips computing distances for pairs of hashes that are for the same file, and skips computing distances for vectors if both have already been matched. Args: X: The vectors with shape (N, D). Vectors for the same file need to be supplied sequentially so that we can use the counts argument to determine which vectors are for the same file. counts: For each file, the number of sequential vectors in X. If not provided, each vector is assumed to be for a different file (i.e., this is equivalent to `counts = np.ones(N)`). 
threshold: The maximum distance between two vectors to allow for a match. compute_overlap: If True, the values returned will be divided by the number of hashes in each file. If False, the raw duplicate counts will be returned. Returns: duplicates: An array of shape (M!/(2*((M-2)!)), 2) indicating the fraction of vectors for each file found in another file. The indexing matches that of scipy.spatial.pdist. M is the number of files. So if M = 4, the array will represent comparisons of the file indexes as follows: [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]. So (assuming compute_overlap=True), a possible return would be [(1.0, 1.0), (0, 0), (0, 0), (0.66, 1.0), (0, 0), (0.5, 0.25)] which means that: - There was 100% overlap between file 0 and file 1 - 66% of file 1 was in file 2 and 100% of file 2 was in file 1 - 50% of file 2 was in file 3 and 25% of file 3 was in file 2 """ if counts is None: counts = np.ones(X.shape[0], dtype=np.uint32) cdef Py_ssize_t n = X.shape[0] cdef Py_ssize_t m = counts.shape[0] cdef Py_ssize_t d = X.shape[1] n_pairs_python = int(math.factorial(m)/(2*math.factorial(m-2))) assert n_pairs_python < sys.maxsize, 'Too many files were provided for deduplication.' cdef Py_ssize_t n_pairs = n_pairs_python cdef Py_ssize_t max_counts = np.max(counts) cdef int compute_overlap_int = 0 if compute_overlap: compute_overlap_int = 1 # i_1 is the index of file1, i_2 is the index of file2, i_d is the # index of the vector dimension we're on, i_i is used to compute # the starting index in the flattened vector in the different threads. # i_1_subhash is the index of the hash on file1, i_2_subhash is # the index of the hash on file2. cdef Py_ssize_t i_1, i_2, i_d, i_i, i_1_sub, i_2_sub, i_1_offset duplicate_arr = np.zeros((n_pairs, 2), dtype=np.double) cdef double[:, :] duplicate = duplicate_arr offsets_arr = np.zeros(m, dtype=np.int32) cdef np.int32_t[:] offsets = offsets_arr for i_1 in range(m): for i_i in range(i_1): offsets[i_1] += counts[i_i] # local_buf will contain distance, flattened array offset, index_offset_1, index_offset_2 cdef size_t local_buf_size = 4 cdef float threshold2 = threshold ** 2 with nogil, parallel(): local_buf = malloc(sizeof(np.uint64_t) * local_buf_size) # An array of flags indicating whether a vector in file 1 was # matched. matched_1 = malloc(sizeof(int) * max_counts) # An array of flags indicating whether a vector in file 2 was # matched. matched_2 = malloc(sizeof(int) * max_counts) if local_buf is NULL or matched_1 is NULL or matched_2 is NULL: abort() # Iterate over all of the files. for i_1 in prange(m-1): local_buf[1] = 0 local_buf[2] = offsets[i_1] # Compute the index of the output vector # where we will count the number of duplicates. for i_i in range(i_1): local_buf[1] += m - i_i - 1 # Iterate over all the other files to compare. for i_2 in range(i_1 + 1, m): local_buf[3] = offsets[i_2] # Initialize all match flags to zero for # both file 1 and file 2. for i_1_sub in range(counts[i_1]): matched_1[i_1_sub] = 0 for i_2_sub in range(counts[i_2]): matched_2[i_2_sub] = 0 # Iterate over all the hashes in file1 for i_1_sub in range(counts[i_1]): # Iterate over all the hashes in file2 for i_2_sub in range(counts[i_2]): local_buf[0] = 0 if matched_1[i_1_sub] == 1 and matched_2[i_2_sub] == 1: # Both the vectors in this pair have already been matched, so # there is nothing to gain from this comparison.
continue for i_d in range(d): local_buf[0] += (X[local_buf[2] + i_1_sub, i_d] - X[local_buf[3] + i_2_sub, i_d]) ** 2 if local_buf[0] > threshold2: # If we're already beyond the distance threshold, # we don't need to continue computing squared # distances. break if local_buf[0] < threshold2: # A match was found. Set flags for both vectors # to 1. matched_1[i_1_sub] = 1 matched_2[i_2_sub] = 1 # Add up the number of matches for file 1. for i_1_sub in range(counts[i_1]): duplicate[local_buf[1], 0] += matched_1[i_1_sub] # Add up the number of matches for file 2. for i_2_sub in range(counts[i_2]): duplicate[local_buf[1], 1] += matched_2[i_2_sub] # Divide by the total number of vectors for each file. if compute_overlap_int: duplicate[local_buf[1], 0] /= counts[i_1] duplicate[local_buf[1], 1] /= counts[i_2] # Advance to the next pair index. local_buf[1] += 1 free(local_buf) free(matched_1) free(matched_2) return duplicate_arr @cython.boundscheck(False) @cython.wraparound(False) def compute_euclidean_pairwise_duplicates_simple(int[:, :] X, float threshold, np.uint32_t[:] counts = None, float minimum_overlap = 0): """Find the pairwise overlap within an array of vectors, where there may be multiple vectors for the same file. This function is similar to compute_euclidean_pairwise_duplicates but uses much less memory. Args: X: The vectors with shape (N, D). Vectors for the same file need to be supplied sequentially so that we can use the counts argument to determine which vectors are for the same file. threshold: The maximum distance between two vectors to allow for a match. counts: For each of the M files, the number of sequential vectors in X. If not provided, each vector is assumed to be for a different file (i.e., this is equivalent to `counts = np.ones(N)` which also implies M == N). Otherwise, assumed to have length M. The counts should add up to N. minimum_overlap: The minimum overlap between two groups of hashes to call it a match. Returns: pairs: Pairs of indexes that met the matching criteria. """ if counts is None: counts_arr = np.ones(X.shape[0], dtype=np.uint32) counts = counts_arr cdef Py_ssize_t n = X.shape[0] cdef Py_ssize_t m = counts.shape[0] cdef Py_ssize_t d = X.shape[1] n_pairs_python = int(math.factorial(m)/(2*math.factorial(m-2))) assert n_pairs_python < sys.maxsize, 'Too many files were provided for deduplication.' cdef Py_ssize_t n_pairs = n_pairs_python cdef Py_ssize_t max_counts = np.max(counts) # i_1 is the index of file1, i_2 is the index of file2, i_d is the # index of the vector dimension we're on, i_i is used to compute # the starting index in the flattened vector in the different threads. # i_1_subhash is the index of the hash on file1, i_2_subhash is # the index of the hash on file2. cdef Py_ssize_t i_1, i_2, i_d, i_i, i_1_sub, i_2_sub cdef vector[cppbool] duplicate duplicate.resize(n_pairs) offsets_arr = np.zeros(m, dtype=np.uint64) cdef np.uint64_t[:] offsets = offsets_arr cdef np.int32_t expected_n = 0 for i_1 in range(m): for i_i in range(i_1): offsets[i_1] += counts[i_i] expected_n += counts[i_1] assert expected_n == n, "Provided value for counts is inconsistent with X." # local_buf will contain: # distance, flattened array offset, # index_offset_1, index_offset_2, # early-termination flag cdef size_t local_buf_size = 5 cdef float threshold2 = threshold ** 2 with nogil, parallel(): local_buf = malloc(sizeof(np.uint64_t) * local_buf_size) # An array of flags indicating whether a vector in file 1 was # matched.
matched_1 = malloc(sizeof(int) * max_counts) # An array of flags indicating whether a vector in file 2 was # matched. matched_2 = malloc(sizeof(int) * max_counts) # Pair overlap and minimum required overlap overlap = malloc(sizeof(float) * 4) if local_buf is NULL or matched_1 is NULL or matched_2 is NULL or overlap is NULL: abort() # Iterate over all of the files. for i_1 in prange(m-1): local_buf[1] = 0 local_buf[2] = offsets[i_1] # Compute the index of the output vector # where we will count the number of duplicates. for i_i in range(i_1): local_buf[1] += m - i_i - 1 # Iterate over all the other files to compare. for i_2 in range(i_1 + 1, m): # Set the current and minimum overlaps overlap[0] = 0 overlap[1] = 0 overlap[2] = minimum_overlap * counts[i_1] overlap[3] = minimum_overlap * counts[i_2] local_buf[3] = offsets[i_2] # Set early termination flag. local_buf[4] = 0 # Initialize all match flags to zero for # both file 1 and file 2. for i_1_sub in range(counts[i_1]): matched_1[i_1_sub] = 0 for i_2_sub in range(counts[i_2]): matched_2[i_2_sub] = 0 # Iterate over all the hashes in file1 for i_1_sub in range(counts[i_1]): # Stop early if there's no way to get enough # matches from i1 to i2 if overlap[0] + counts[i_1] - i_1_sub < overlap[2]: break # Stop early if we've already reached the minimum overlap if overlap[0] >= overlap[2] and overlap[1] >= overlap[3] and overlap[0] > 0 and overlap[1] > 0: break # Iterate over all the hashes in file2 for i_2_sub in range(counts[i_2]): local_buf[0] = 0 if matched_1[i_1_sub] == 1 and matched_2[i_2_sub] == 1: # Both the vectors in this pair have already been matched, so # there is nothing to gain from this comparison. continue for i_d in range(d): local_buf[0] += (X[local_buf[2] + i_1_sub, i_d] - X[local_buf[3] + i_2_sub, i_d]) ** 2 if local_buf[0] > threshold2: # If we're already beyond the distance threshold, # we don't need to continue computing squared # distances. break if local_buf[0] < threshold2: # A match was found. Set flags for both vectors # to 1 and increment the overlap. if matched_1[i_1_sub] != 1: overlap[0] += 1 if matched_2[i_2_sub] != 1: overlap[1] += 1 matched_1[i_1_sub] = 1 matched_2[i_2_sub] = 1 if overlap[0] >= overlap[2] and overlap[1] >= overlap[3] and overlap[0] > 0 and overlap[1] > 0: duplicate[local_buf[1]] = 1 local_buf[1] += 1 free(matched_1) free(matched_2) free(overlap) free(local_buf) cdef int n_duplicates = 0 cdef Py_ssize_t i_offset = 0 for i_offset in range(n_pairs): if duplicate[i_offset] > 0: n_duplicates += 1 pairs_arr = np.zeros((n_duplicates, 2), dtype=np.int32) cdef np.int32_t[:, :] pairs = pairs_arr i_offset = 0 cdef Py_ssize_t pair_offset = 0 for i_1 in range(m-1): # Compute the index of the output vector # where we will count the number of duplicates. 
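# Here the flat pair index (i_offset) simply advances in the same order
# used above; emit an (i_1, i_2) row for every entry flagged as a duplicate.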
for i_2 in range(i_1 + 1, m): if duplicate[i_offset] > 0: pairs[pair_offset][0] = i_1 pairs[pair_offset][1] = i_2 pair_offset += 1 i_offset += 1 return pairs_arr ================================================ FILE: perception/hashers/__init__.py ================================================ from .hasher import ImageHasher, VideoHasher from .image.average import AverageHash from .image.dhash import DHash from .image.opencv import BlockMean, ColorMoment, MarrHildreth from .image.phash import PHash, PHashF, PHashU8 from .image.wavelet import WaveletHash from .video.framewise import FramewiseHasher from .video.tmk import TMKL1, TMKL2 __all__ = [ "ImageHasher", "VideoHasher", "AverageHash", "PHash", "WaveletHash", "MarrHildreth", "BlockMean", "ColorMoment", "DHash", "FramewiseHasher", "TMKL1", "TMKL2", "PHashU8", "PHashF", ] try: from .image.pdq import PDQHash as PDQHash, PDQHashF as PDQHashF except ImportError: pass else: __all__.extend(["PDQHash", "PDQHashF"]) ================================================ FILE: perception/hashers/hasher.py ================================================ import concurrent.futures import typing import warnings from abc import ABC, abstractmethod from logging import warning import numpy as np import scipy.spatial import tqdm from perception.hashers import tools class Hasher(ABC): """All hashers implement a common set of methods from the Hasher base class. """ #: The metric to use when computing distance between two hashes. All hashers #: must supply this parameter. distance_metric: str #: The numpy type to use when converting from string to array form. #: All hashers must supply this parameter. dtype: str #: Indicates the length of the hash vector hash_length: int #: Whether or not this hash returns multiple values returns_multiple: bool = False #: Indicates whether the hashes can be computed in parallel allow_parallel: bool = True def string_to_vector(self, hash_string: str, hash_format: str = "base64"): """Convert hash string to vector. Args: hash_string: The input hash string hash_format: One of 'base64' or 'hex' """ return tools.string_to_vector( hash_string, dtype=self.dtype, hash_length=self.hash_length, hash_format=hash_format, ) def vector_to_string( self, vector: np.ndarray, hash_format: str = "base64" ) -> str | None: """Convert vector to hash string. Args: vector: Input vector hash_format: One of 'base64' or 'hex' """ return tools.vector_to_string(vector, dtype=self.dtype, hash_format=hash_format) def compute_distance( self, hash1: np.ndarray | str, hash2: np.ndarray | str, hash_format="base64", ): """Compute the distance between two hashes. Args: hash1: The first hash or vector hash2: The second hash or vector hash_format: If either or both of the hashes are hash strings, what format the string is encoded in. 
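Example (a minimal sketch; assumes two images on disk):

    from perception.hashers import PHash

    hasher = PHash()
    distance = hasher.compute_distance(
        hasher.compute("a.jpg"), hasher.compute("b.jpg")
    )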
""" hash1 = ( self.string_to_vector(hash1, hash_format=hash_format) if isinstance(hash1, str) else hash1 ) # makes mypy happy hash2 = ( self.string_to_vector(hash2, hash_format=hash_format) if isinstance(hash2, str) else hash2 ) if self.distance_metric == "sqeuclidean": return scipy.spatial.distance.sqeuclidean( hash1.astype("float32"), hash2.astype("float32") ) if self.distance_metric == "euclidean": return scipy.spatial.distance.euclidean( hash1.astype("float32"), hash2.astype("float32") ) if self.distance_metric == "hamming": return scipy.spatial.distance.hamming(hash1, hash2) if self.distance_metric == "cosine": return scipy.spatial.distance.cosine( hash1.astype("float32"), hash2.astype("float32") ) if self.distance_metric == "custom": return self._compute_distance(hash1, hash2) raise NotImplementedError( f"Distance metric: {self.distance_metric} not supported." ) def _compute_distance(self, vector1, vector2): raise ValueError("Called a custom distance function but it is not implemented.") @typing.no_type_check def compute_parallel( self, filepaths: list[str], progress: tqdm.tqdm | None = None, progress_desc: str | None = None, max_workers: int = 5, isometric: bool = False, ): """Compute hashes in a parallelized fashion. Args: filepaths: A list of paths to images or videos (depending on the hasher). progress: A tqdm-like wrapper for reporting progress. If None, progress is not reported. progress_desc: The title of the progress bar. max_workers: The maximum number of workers isometric: Whether to compute all eight isometric transforms for each image. """ if not self.allow_parallel and max_workers != 1: warnings.warn( message="This hash cannot be used in parallel. Setting max_workers to 1.", category=UserWarning, ) max_workers = 1 assert all( isinstance(p, str) for p in filepaths ), "All images should be provided as paths." if isinstance(self, VideoHasher) and isometric: raise ValueError("Computing isometric hashes for videos is not supported.") # We can use a with statement to ensure threads are cleaned up promptly records = [] if isinstance(self, VideoHasher): executor_class = concurrent.futures.ProcessPoolExecutor else: executor_class = concurrent.futures.ThreadPoolExecutor with executor_class(max_workers=max_workers) as executor: # Start the load operations and mark each future with its filepath compute: typing.Callable = ( self.compute_isometric if isometric else self.compute ) future_to_path: dict = { executor.submit(compute, path): path for path in filepaths } generator = concurrent.futures.as_completed(future_to_path) if progress is not None: generator = progress( generator, total=len(filepaths), desc=progress_desc ) for future in generator: path = future_to_path[future] try: hash_value = future.result() except Exception as exc: records.append({"filepath": path, "hash": None, "error": str(exc)}) else: records.append( {"filepath": path, "hash": hash_value, "error": None} ) return records class ImageHasher(Hasher): @abstractmethod def _compute(self, image: np.ndarray) -> np.ndarray: """Compute hash from an image. Args: image: A numpy array representing an image as of shape (H, W, 3) where channels are ordered as RGB or a filepath to an image. """ def compute_isometric_from_hash(self, hash_string_or_vector, hash_format="base64"): """For supported hashes, obtain the hashes for the dihedral transformations of the original image. 
They are provided in the following order: - Vertical flip - Horizontal flip - 180 degree rotation - 90 degree rotation - 90 degree rotation and vertical flip - 90 degree rotation and horizontal flip - 270 degree rotation Args: hash_string_or_vector: The hash string or vector hash_format: One of 'base64' or 'hex' """ if not hasattr(self, "_compute_isometric_from_hash"): raise NotImplementedError("This hasher does not support hash rotation.") rotations = self._compute_isometric_from_hash( # type: ignore hash_string_or_vector if isinstance(hash_string_or_vector, np.ndarray) else self.string_to_vector(hash_string_or_vector, hash_format=hash_format) ) return { transform_name: self.vector_to_string(vector, hash_format=hash_format) for transform_name, vector in rotations.items() } def compute_isometric(self, image: tools.ImageInputType): image = tools.to_image_array(image) if hasattr(self, "_compute_isometric"): hashes = self._compute_isometric(image) # type: ignore elif hasattr(self, "_compute_isometric_from_hash"): hashes = self._compute_isometric_from_hash( # type: ignore self._compute(image) ) else: transforms = tools.get_isometric_transforms(image) for name, transform in transforms.items(): transforms[name] = self._compute(transform) hashes = transforms return { transform_name: self.vector_to_string(vector) for transform_name, vector in hashes.items() } def compute( self, image: tools.ImageInputType, hash_format="base64" ) -> np.ndarray | str | None | list[str | None]: """Compute a hash from an image. Args: image: An image represented as a filepath, a PIL image object, or as an np.ndarray object. If it is an np.ndarray object, it must be in RGB color order (note the OpenCV default is BGR). hash_format: One of 'base64', 'hex', or 'vector' """ vector = self._compute(tools.to_image_array(image)) if hash_format == "vector": # Take care of this separately because we took out `vector` # as valid return type to vector_to_string(). # The .tolist() might seem unnecessary for the # ndarray `vector` but downstream expects a list and it # stays consistent with original, so keeping for now. # return (vector.tolist() if self.returns_multiple # else vector) return vector # should iterate the same as vector.tolist() if self.returns_multiple: return [self.vector_to_string(v, hash_format=hash_format) for v in vector] return self.vector_to_string(vector, hash_format=hash_format) def compute_with_quality( self, image: tools.ImageInputType, hash_format="base64" ) -> tuple[ (np.ndarray | str | None | list[str | None]), int, ]: """Compute hash and hash quality from image. Args: image: An image represented as a filepath, a PIL image object, or as an np.ndarray object. If it is an np.ndarray object, it must be in RGB color order (note the OpenCV default is BGR).
hash_format: One of 'base64', 'hex', or 'vector' Returns: A tuple of (hash, quality) """ vector, quality = self._compute_with_quality(tools.to_image_array(image)) if hash_format == "vector": return vector, quality if self.returns_multiple: return ( [self.vector_to_string(v, hash_format=hash_format) for v in vector], quality, ) return (self.vector_to_string(vector, hash_format=hash_format), quality) def _compute_with_quality(self, image: np.ndarray) -> tuple[np.ndarray, int]: return self._compute(image), tools.compute_quality(image) class VideoHasher(Hasher): #: The frame rate at which videos are read frames_per_second: float = 1 @abstractmethod def process_frame( self, frame: np.ndarray, frame_index: int | None, frame_timestamp: float | None, state: dict | None = None, ) -> dict: """Called for each frame in the video. For all but the first frame, a state is provided recording the state from the previous frame. Args: frame: The current frame as an RGB ndarray frame_index: The current frame index frame_timestamp: The current frame timestamp state: The state from the last call to process_frame """ @abstractmethod def hash_from_final_state(self, state: dict) -> np.ndarray: """Called after all frames have been processed. Returns the final feature vector. Args: state: The state dictionary at the end of processing. """ def compute( self, filepath, errors="raise", hash_format="base64", scenes=None, **kwargs, ): """Compute a hash for a video at a given filepath. All other arguments are passed to perception.hashers.tools.read_video. Args: filepath: Path to video file errors: One of "raise", "ignore", or "warn". Passed to perception.hashers.tools.read_video. hash_format: One of "vector", "base64", or "hex" max_duration: The maximum length of the video to hash. max_size: The maximum size of frames to queue scenes: An array used to pass scene info back to wrapper functions """ frame_timestamp, state = None, None # Iterate through the video, aggregating scene info in the state # dict for frame, frame_index, frame_timestamp in tools.read_video( filepath=filepath, frames_per_second=self.frames_per_second, errors=errors, **kwargs, ): state = self.process_frame( frame=frame, frame_index=frame_index, frame_timestamp=frame_timestamp, state=state, ) if state is None: if errors == "raise": raise ValueError( f"Video processing failed for {filepath}, State is None." ) if errors == "warn": warning(f"Video processing failed for {filepath}, State is None.") return None # Persist the final timestamp in the state to allow us to pass along # duration state["end"] = frame_timestamp vectors = self.hash_from_final_state(state=state) if scenes is not None: scenes += state.get("scenes", []) if hash_format == "vector": # Take care of this separately because we took out `vector` # as valid return type to vector_to_string(). # The .tolist() might seem unnecessary for the # ndarray `vector` but downstream expects a list and it # stays consistent with original, so keeping for now.
# return (vector.tolist() if self.returns_multiple # else vector) return vectors # should iterate the same as vector.tolist() if self.returns_multiple: return [self.vector_to_string(v, hash_format=hash_format) for v in vectors] return self.vector_to_string(vectors, hash_format=hash_format) ================================================ FILE: perception/hashers/image/__init__.py ================================================ from .average import AverageHash from .dhash import DHash from .opencv import BlockMean, ColorMoment, MarrHildreth from .phash import PHash, PHashF, PHashU8 from .wavelet import WaveletHash __all__ = [ "AverageHash", "PHash", "WaveletHash", "MarrHildreth", "BlockMean", "ColorMoment", "DHash", "PHashF", "PHashU8", ] ================================================ FILE: perception/hashers/image/average.py ================================================ import cv2 from .. import tools from ..hasher import ImageHasher class AverageHash(ImageHasher): """Computes a simple hash comparing the intensity of each pixel in a resized version of the image to the mean. Implementation based on that of `ImageHash `_.""" distance_metric = "hamming" dtype = "bool" def __init__(self, hash_size=8): assert hash_size >= 2, "Hash size must be greater than or equal to 2." self.hash_size = hash_size self.hash_length = hash_size * hash_size def _compute(self, image): image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) image = cv2.resize( image, dsize=(self.hash_size, self.hash_size), interpolation=cv2.INTER_AREA ) diff = image > image.mean() return diff.flatten() def _compute_isometric_from_hash(self, vector): return { transform_name: diff.flatten() for transform_name, diff in tools.get_isometric_transforms( vector.reshape(self.hash_size, self.hash_size, 1), require_color=False ).items() } ================================================ FILE: perception/hashers/image/dhash.py ================================================ import cv2 from ..hasher import ImageHasher class DHash(ImageHasher): """A hash based on the differences between adjacent pixels. Implementation based on that of `ImageHash `_. """ dtype = "bool" distance_metric = "hamming" def __init__(self, hash_size=8): assert hash_size > 1, "Hash size must be greater than 1." self.hash_size = hash_size self.hash_length = hash_size * hash_size def _compute(self, image): image = cv2.resize( image, dsize=(self.hash_size + 1, self.hash_size), interpolation=cv2.INTER_AREA, ) image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) previous = image[:, :-1] current = image[:, 1:] difference = previous > current return difference.flatten() ================================================ FILE: perception/hashers/image/opencv.py ================================================ import cv2 import numpy as np from ..hasher import ImageHasher class OpenCVHasher(ImageHasher): allow_parallel = False def __init__(self): if not hasattr(cv2, "img_hash"): raise RuntimeError( "You do not appear to have opencv-contrib installed. It is required for pure OpenCV hashers." ) class MarrHildreth(OpenCVHasher): """A wrapper around OpenCV's Marr-Hildreth hash. See `paper `_ for details.""" dtype = "bool" distance_metric = "hamming" hash_length = 576 def __init__(self): super().__init__() self.hasher = cv2.img_hash.MarrHildrethHash.create() # type: ignore[attr-defined] def _compute(self, image): return np.unpackbits(self.hasher.compute(image)[0]) class ColorMoment(OpenCVHasher): """A wrapper around OpenCV's Color Moments hash. 
See `paper `_ for details.""" dtype = "float32" distance_metric = "euclidean" hash_length = 42 def __init__(self): super().__init__() self.hasher = cv2.img_hash.ColorMomentHash.create() # type: ignore[attr-defined] def _compute(self, image): return 10000 * self.hasher.compute(image)[0] class BlockMean(OpenCVHasher): """A wrapper around OpenCV's Block Mean hash. See `paper `_ for details.""" dtype = "bool" distance_metric = "hamming" hash_length = 968 def __init__(self): super().__init__() self.hasher = cv2.img_hash.BlockMeanHash.create(1) # type: ignore[attr-defined] def _compute(self, image): # https://stackoverflow.com/questions/54762896/why-cv2-norm-hamming-gives-different-value-than-actual-hamming-distance return np.unpackbits(self.hasher.compute(image)[0]) ================================================ FILE: perception/hashers/image/pdq.py ================================================ import pdqhash from ..hasher import ImageHasher class PDQHash(ImageHasher): """The Facebook PDQ hash. Based on the original implementation located at the `official repository `_. """ distance_metric = "hamming" dtype = "bool" hash_length = 256 def _compute(self, image): return pdqhash.compute(image)[0] > 0 def _compute_with_quality(self, image): hash_vector, quality = pdqhash.compute(image) return hash_vector > 0, quality def _compute_isometric(self, image): hash_vectors, _ = pdqhash.compute_dihedral(image) names = ["r0", "r90", "r180", "r270", "fv", "fh", "r90fv", "r90fh"] return dict(zip(names, hash_vectors)) class PDQHashF(PDQHash): dtype = "float32" distance_metric = "euclidean" hash_length = 256 def _compute(self, image): return pdqhash.compute_float(image)[0] ================================================ FILE: perception/hashers/image/phash.py ================================================ import cv2 import numpy as np import scipy.fftpack from .. import tools from ..hasher import ImageHasher class PHash(ImageHasher): """Also known as the DCT hash, a hash based on discrete cosine transforms of images. See `complete paper `_ for details. Implementation based on that of `ImageHash `_. Args: hash_size: The number of DCT elements to retain (the hash length will be hash_size * hash_size). highfreq_factor: The multiple of the hash size to resize the input image to before computing the DCT. exclude_first_term: Whether to exclude the first term of the DCT freq_shift: The number of DCT low frequency elements to skip. """ distance_metric = "hamming" dtype = "bool" def __init__( self, hash_size=8, highfreq_factor=4, exclude_first_term=False, freq_shift=0 ): assert hash_size >= 2, "Hash size must be greater than or equal to 2" assert ( freq_shift <= highfreq_factor * hash_size - hash_size ), "Frequency shift is too large for this hash size / highfreq_factor combination."
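# Illustrative note (not part of the constructor): with the defaults
# hash_size=8 and highfreq_factor=4, the image is resized to 32x32 before
# the DCT, so the assertion above allows freq_shift values up to
# 32 - 8 = 24:
#     >>> PHash(hash_size=8, highfreq_factor=4, freq_shift=24)  # largest valid shift
#     >>> PHash(freq_shift=25)  # doctest: +SKIP (raises AssertionError)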
self.hash_size = hash_size self.highfreq_factor = highfreq_factor self.exclude_first_term = exclude_first_term self.hash_length = hash_size * hash_size self.freq_shift = freq_shift if exclude_first_term: self.hash_length -= 1 def _compute_dct(self, image): img_size = self.hash_size * self.highfreq_factor image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) image = cv2.resize( image, dsize=(img_size, img_size), interpolation=cv2.INTER_AREA ) dct = scipy.fftpack.dct(scipy.fftpack.dct(image, axis=0), axis=1) return dct[ self.freq_shift : self.hash_size + self.freq_shift, self.freq_shift : self.hash_size + self.freq_shift, ] def _dct_to_hash(self, dct): dct = dct.flatten() if self.exclude_first_term: dct = dct[1:] return dct > np.median(dct) def _compute(self, image): dct = self._compute_dct(image) return self._dct_to_hash(dct) def _compute_isometric(self, image): return { transform_name: self._dct_to_hash(dct) for transform_name, dct in tools.get_isometric_dct_transforms( self._compute_dct(image) ).items() } class PHashF(PHash): """A real-valued version of PHash. It returns the raw 32-bit floats in the DCT. For a more compact approach, see PHashU8.""" dtype = "float32" distance_metric = "euclidean" def _dct_to_hash(self, dct): dct = dct.flatten() if self.exclude_first_term: dct = dct[1:] if (dct == 0).all(): return None return dct class PHashU8(PHash): """A real-valued version of PHash. It uses minimum / maximum scaling to convert DCT values to unsigned 8-bit integers (more compact than the 32-bit floats used by PHashF at the cost of precision).""" dtype = "uint8" distance_metric = "euclidean" def _dct_to_hash(self, dct): dct = dct.flatten() if self.exclude_first_term: dct = dct[1:] if (dct == 0).all(): return None min_value = dct.min() max_value = dct.max() dct = np.uint8(255 * (dct - min_value) / (max_value - min_value)) return dct ================================================ FILE: perception/hashers/image/wavelet.py ================================================ import cv2 import numpy as np import pywt from ..hasher import ImageHasher class WaveletHash(ImageHasher): """Similar to PHash but using wavelets instead of DCT. Implementation based on that of `ImageHash `_. """ distance_metric = "hamming" dtype = "bool" def __init__(self, hash_size=8, image_scale=None, mode="haar"): assert hash_size & (hash_size - 1) == 0, "Hash size must be a power of 2." if image_scale is not None: assert ( image_scale & (image_scale - 1) == 0 ), "Image scale must be a power of 2." assert ( image_scale >= hash_size ), "Image scale must be greater than or equal to the hash size."
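# Illustrative note: the assertions above use the standard bit trick
# n & (n - 1) == 0 to require powers of two, so WaveletHash(hash_size=8,
# image_scale=64) is valid while WaveletHash(hash_size=12) raises an
# AssertionError.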
self.hash_size = hash_size self.image_scale = image_scale self.mode = mode self.hash_length = hash_size * hash_size def _compute(self, image): if self.image_scale is None: image_scale = max(2 ** int(np.log2(min(image.shape[:2]))), self.hash_size) else: image_scale = self.image_scale image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) image = cv2.resize( image, dsize=(image_scale, image_scale), interpolation=cv2.INTER_AREA ) image = np.float32(image) / 255 ll_max_level = int(np.log2(image_scale)) level = int(np.log2(self.hash_size)) dwt_level = ll_max_level - level if self.mode == "haar": coeffs = pywt.wavedec2(image, "haar", level=ll_max_level) coeffs = list(coeffs) coeffs[0] *= 0 image = pywt.waverec2(coeffs, "haar") coeffs = pywt.wavedec2(image, self.mode, level=dwt_level) dwt_low = coeffs[0] # Subtract median and compute hash med = np.median(dwt_low) diff = dwt_low > med return diff.flatten() ================================================ FILE: perception/hashers/tools.py ================================================ import base64 import fractions import functools import hashlib import io import itertools import json import logging import math import os import queue import shlex import subprocess import tempfile import threading import typing import warnings from collections import Counter from http import client from numbers import Number from urllib import request import cv2 import numpy as np import PIL import PIL.Image import validators LOGGER = logging.getLogger(__name__) ImageInputType = typing.Union[ str, np.ndarray, "PIL.Image.Image", io.BytesIO, tempfile.SpooledTemporaryFile ] SIZES = {"float32": 32, "uint8": 8, "bool": 1} # Map codec names to the CUDA-accelerated version. Obtain # from ffmpeg -codecs after building using CUDA. CUDA_CODECS = { "h264": "h264_cuvid", "hevc": "hevc_cuvid", "mjpeg": "mjpeg_cuvid", "mpeg1video": "mpeg1_cuvid", "mpeg2video": "mpeg2_cuvid", "mpeg4": "mpeg4_cuvid", "vc1": "vc1_cuvid", "vp8": "vp8_cuvid", "vp9": "vp9_cuvid", } FramesWithIndexesAndTimestamps = typing.Generator[ tuple[np.ndarray, int | None, float | None], None, None ] def get_ffprobe(): return os.environ.get("PERCEPTION_FFPROBE_BINARY", "ffprobe") def get_ffmpeg(): return os.environ.get("PERCEPTION_FFMPEG_BINARY", "ffmpeg") def compute_quality(image) -> int: """Compute a quality metric, using the calculation proposed by `Facebook `_ for their PDQ hash algorithm.""" if len(image.shape) == 3: image = cv2.cvtColor(image, code=cv2.COLOR_RGB2GRAY) if image.shape[0] != 64 or image.shape[1] != 64: image = cv2.resize(src=image, dsize=(64, 64)).astype("float32") dx = 100 * np.abs(image[:, 1:] - image[:, :-1]) / 255 dy = 100 * np.abs(image[1:] - image[:-1]) / 255 dx = dx.astype("int").sum() dy = dy.astype("int").sum() return int(np.clip(a=int((dx + dy) / 90), a_min=0, a_max=100)) def compute_md5(filepath) -> str: """Compute the md5 hash for a file at `filepath`. Args: filepath: The path to the file """ with open(filepath, "rb") as f: hash_str = hashlib.md5(f.read()).hexdigest() return hash_str def get_string_length(hash_length: int, dtype: str, hash_format="hex") -> int: """Compute the expected length of a hash string. 
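The string length follows from packing the vector into bytes
(``hash_bytes = ceil(hash_length * bits_per_element / 8)``), with 2
characters per byte for hex and padded 4/3 expansion for base64. For
example, a 64-element boolean hash packs into 8 bytes:

    >>> get_string_length(64, dtype="bool", hash_format="hex")
    16
    >>> get_string_length(64, dtype="bool", hash_format="base64")
    12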
Args: hash_length: The length of the hash vector dtype: The dtype of the vector hash_format: One of 'base64' or 'hex' Returns: The expected string length """ hash_bytes = math.ceil(hash_length * SIZES[dtype] / 8) if hash_format == "base64": return int((4 * hash_bytes / 3) + 3) & ~3 if hash_format == "hex": return 2 * hash_bytes raise NotImplementedError("Unknown hash format: " + hash_format) def vector_to_string(vector: np.ndarray, dtype: str, hash_format: str) -> str | None: """Convert vector to hash. Args: vector: Input vector dtype: The data type of the vector hash_format: One of 'base64' or 'hex' """ # At times, a vector returned by a hasher is None (e.g., for hashes # that depend on the image not being featureless). In those cases, # we simply return None. if vector is None: return None if hash_format == "vector": # return vector.astype(dtype) # old behavior raise DeprecationWarning("`hash_format` `vector` has been removed.") if dtype == "uint8": vector_bytes = vector.astype("uint8") elif dtype == "float32": vector_bytes = vector.astype("float32") elif dtype == "bool": vector_bytes = np.packbits(vector.astype("bool")) else: raise NotImplementedError(f"Cannot convert hash of type {dtype}.") if hash_format == "base64": return base64.b64encode(vector_bytes.tobytes()).decode("utf-8") if hash_format == "hex": return vector_bytes.tobytes().hex() raise NotImplementedError(f"Cannot convert to string format: {hash_format}.") def string_to_vector( hash_string: str, dtype: str, hash_length: int, hash_format: str, verify_length: bool = True, ) -> np.ndarray: """Convert hash back to vector. Args: hash_string: The input hash string dtype: The data type of the hash hash_length: The length of the hash vector hash_format: The input format of the hash (base64 or hex) verify_length: Whether to verify the string length """ assert not verify_length or len(hash_string) == get_string_length( hash_length=hash_length, hash_format=hash_format, dtype=dtype ), "Incorrect string length for this hash format." if hash_format == "base64": vector_bytes = np.frombuffer( base64.b64decode(hash_string), dtype="uint8" if dtype in ["bool", "uint8"] else dtype, ) elif hash_format == "hex": vector_bytes = np.frombuffer( bytearray.fromhex(hash_string), dtype="uint8" if dtype in ["bool", "uint8"] else dtype, ) else: raise NotImplementedError(f"Cannot convert to string format: {hash_format}") if dtype == "uint8": return vector_bytes[:hash_length] if dtype == "float32": return vector_bytes[:hash_length] if dtype == "bool": return np.unpackbits(vector_bytes)[:hash_length].astype("bool") raise NotImplementedError(f"Cannot convert hash of type {dtype}.") def hex_to_b64( hash_string: str, dtype: str, hash_length: int, verify_length: bool = True ): """Convert a hex-encoded hash to base64. Args: hash_string: The input hex hash string dtype: The data type of the hash hash_length: The length of the hash vector verify_length: Whether to verify the string length """ return vector_to_string( string_to_vector( hash_string, hash_length=hash_length, hash_format="hex", dtype=dtype, verify_length=verify_length, ), dtype=dtype, hash_format="base64", ) def b64_to_hex( hash_string: str, dtype: str, hash_length: int, verify_length: bool = True ): """Convert a base64-encoded hash to hex.
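This is the inverse of :code:`hex_to_b64`. For example, for an 8-bit
boolean hash (the single byte 0b10101010):

    >>> b64_to_hex("qg==", dtype="bool", hash_length=8)
    'aa'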
Args: hash_string: The input base64 hash string dtype: The data type of the hash hash_length: The length of the hash vector verify_length: Whether to verify the string length """ return vector_to_string( string_to_vector( hash_string, hash_length=hash_length, hash_format="base64", dtype=dtype, verify_length=verify_length, ), dtype=dtype, hash_format="hex", ) def to_image_array(image: ImageInputType, require_color=True) -> np.ndarray: if isinstance(image, np.ndarray): assert image.flags["C_CONTIGUOUS"], ( "Provided arrays must be contiguous to avoid " "erroneous results when arrays are passed to " "underlying libraries. This can be achieved using " "np.ascontiguousarray(image)" ) assert not require_color or ( len(image.shape) == 3 and image.shape[-1] == 3 ), "Provided images must be RGB images." return image return read(image) def get_common_framerates(id_rates: dict): """Compute an optimal set of framerates for a list of framerates. Optimal here means that reading the video at each of the framerates will allow one to collect all of the frames required with the smallest possible number of frames decoded. For example, consider if we need to read a video at 3 fps, 5 fps, 1 fps and 0.5 fps. We could read the video 4 times (once per framerate). But a more optimal approach is to read the video only twice, once at 3 frames per second and another time at 5 frames per second. For the 1 fps hasher, we simply pass every 3rd frame of the 3 fps pass. For the 0.5 fps hasher, we pass every 6th frame of the 3 fps pass. So if you pass this function {A: 3, B: 5, C: 1, D: 0.5}, you will get back {3: (A, C, D), 5: (B,)}. Args: id_rates: A dictionary with IDs as keys and frame rates as values. Returns: rate_ids: A dictionary with framerates as keys and tuples of IDs as values. """ def partition(collection): """This function is taken from https://stackoverflow.com/questions/19368375/set-partitions-in-python/30134039#30134039 """ if len(collection) == 1: yield [collection] return first = collection[0] for smaller in partition(collection[1:]): # insert `first` in each of the subpartition's subsets for n, subset in enumerate(smaller): yield smaller[:n] + [[first] + subset] + smaller[n + 1 :] # put `first` in its own subset yield [[first]] + smaller framerates = list(id_rates.values()) factor = 2 * 3 * 5 * 7 * 11 * 60 * 60 assert ( min(framerates) >= 1 / factor ), "Framerates must be at least 1 frame per hour." best_frame_count = np.inf best_grouping: list | None = None best_frame_rates: list | None = None # We try every possible grouping of framerates to minimize the number # of frames we decode. There is likely a better way to do this, # but this seems to do the job for now.
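# For example (illustrative): grouping the 3, 1, and 0.5 fps requirements
# together costs lcm(3, 1, 0.5) = 3 fps for that pass, so the grouping
# {3: (A, C, D), 5: (B,)} decodes 3 + 5 = 8 frames per second of video
# versus 3 + 5 + 1 + 0.5 = 9.5 for four separate passes.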
for grouping in partition(list(set(framerates))): current_frame_rates = [ functools.reduce(np.lcm, (np.array(group) * factor).round().astype(int)) / factor for group in grouping ] current_frame_count = sum(current_frame_rates) if current_frame_count < best_frame_count: best_frame_count = current_frame_count best_frame_rates = current_frame_rates best_grouping = grouping assert best_frame_rates is not None assert best_grouping is not None return { framerate: tuple(name for name, rate in id_rates.items() if rate in group) for framerate, group in zip(best_frame_rates, best_grouping) } def get_isometric_transforms(image: ImageInputType, require_color=True) -> dict: image_array = to_image_array(image, require_color=require_color) return { "r0": image_array, "fv": np.ascontiguousarray(image_array[::-1, :]), "fh": np.ascontiguousarray(image_array[:, ::-1]), "r180": np.ascontiguousarray(image_array[::-1, ::-1]), "r90": np.ascontiguousarray(image_array.transpose(1, 0, 2)[::-1, :, :]), "r90fv": np.ascontiguousarray(image_array.transpose(1, 0, 2)), "r90fh": np.ascontiguousarray(image_array.transpose(1, 0, 2)[::-1, ::-1]), "r270": np.ascontiguousarray(image_array.transpose(1, 0, 2)[:, ::-1]), } def get_isometric_dct_transforms(dct: np.ndarray): T1 = np.empty_like(dct) T1[::2] = 1 T1[1::2] = -1 T2 = np.empty_like(dct) T2[::2, ::2] = 1 T2[1::2, 1::2] = 1 T2[::2, 1::2] = -1 T2[1::2, ::2] = -1 return { "r0": dct, "fv": dct * T1, "fh": dct * T1.T, "r180": dct * T2, "r90": dct.T * T1, "r90fv": dct.T, "r90fh": dct.T * T2, "r270": dct.T * T1.T, } def read(filepath_or_buffer: ImageInputType, timeout=None) -> np.ndarray: """Read a file into an image object Args: filepath_or_buffer: The path to the file or any object with a `read` method (such as `io.BytesIO`) timeout: If filepath_or_buffer is a URL, the timeout to use for making the HTTP request. """ if isinstance(filepath_or_buffer, PIL.Image.Image): return np.array(filepath_or_buffer.convert("RGB")) if isinstance( filepath_or_buffer, (io.BytesIO, client.HTTPResponse, tempfile.SpooledTemporaryFile), ): image = np.asarray(bytearray(filepath_or_buffer.read()), dtype=np.uint8) decoded_image = cv2.imdecode(image, cv2.IMREAD_UNCHANGED) elif isinstance(filepath_or_buffer, str): if validators.url(filepath_or_buffer): with request.urlopen(filepath_or_buffer, timeout=timeout) as response: return read(response) if not os.path.isfile(filepath_or_buffer): raise FileNotFoundError( "Could not find image at path: " + filepath_or_buffer ) decoded_image = cv2.imread(filepath_or_buffer) else: raise RuntimeError( "Unhandled filepath_or_buffer type: " + str(type(filepath_or_buffer)) ) if decoded_image is None: raise ValueError(f"An error occurred reading {filepath_or_buffer}.") # We use cvtColor here instead of just ret[..., ::-1] # in order to ensure that we provide a contiguous # array for later processing. Some hashers use ctypes # to pass the array and non-contiguous arrays can lead # to erroneous results. return cv2.cvtColor(decoded_image, cv2.COLOR_BGR2RGB) def _get_keyframes(filepath): """Get the keyframes for a video. Args: filepath: Path to the target file Returns: A list of frame indexes. 
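Example (illustrative; requires ffprobe and a real video file):

    >>> _get_keyframes("perception/testing/videos/v1.m4v")  # doctest: +SKIP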
""" args = [ get_ffprobe(), "-select_streams", "v", "-i", f"'{filepath}'", "-print_format", "json", "-show_entries", "frame=pict_type,coded_picture_number", ] with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p: out, err = p.communicate() if p.returncode != 0: raise ValueError(f"{str(out)}: {str(err)}") data = json.loads(out.decode("utf-8"))["frames"] frames = [f["coded_picture_number"] for f in data if f["pict_type"] == "I"] # ffprobe will return frames repeated and out of order at times. This # last step deduplicates and sorts them. frames = list(set(frames)) frames.sort() return frames def get_video_properties(filepath): cmd = f""" {get_ffprobe()} -select_streams v:0 -i '{filepath}' -print_format json -show_entries stream=width,height,avg_frame_rate,codec_name,start_time """ with subprocess.Popen( shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE ) as p: out, err = p.communicate() if p.returncode != 0: raise ValueError(f"{str(out)}: {str(err)}") data = json.loads(out.decode("utf-8"))["streams"][0] numerator, denominator = tuple(map(int, data["avg_frame_rate"].split("/")[:2])) avg_frame_rate: fractions.Fraction | None if numerator > 0 and denominator > 0: avg_frame_rate = fractions.Fraction( numerator=numerator, denominator=denominator ) else: avg_frame_rate = None return ( data["width"], data["height"], avg_frame_rate, data["codec_name"], float(data.get("start_time", "0")), ) def read_video_to_generator_ffmpeg( filepath, frames_per_second: str | float | None = None, errors="raise", max_duration: float | None = None, max_size: int | None = None, interp: str | None = None, frame_rounding: str = "up", draw_timestamps=False, use_cuda=False, ) -> FramesWithIndexesAndTimestamps: """This is used by :code:`read_video` when :code:`use_ffmpeg` is True. It differs from :code:`read_video_to_generator` in that it uses FFMPEG instead of OpenCV and, optionally, allows for CUDA acceleration. CUDA acceleration can be faster for larger videos (>1080p) where downsampling is desired. For other videos, CUDA may be slower, but the decoding load will still be taken off the CPU, which may still be advantageous. You can specify which FFMPEG binary to use by setting PERCEPTION_FFMPEG_BINARY. Args: filepath: See read_video frames_per_second: See read_video errors: See read_video max_duration: See read_video max_size: See read_video interp: The interpolation method to use. When not using CUDA, you must choose one of the `interpolation options `_ (default: area). When using CUDA, you must choose from the `interp_algo options `_ (default: super). frame_rounding: The frame rounding method. draw_timestamps: Draw original timestamps onto the frames (for debugging only) use_cuda: Whether to enable CUDA acceleration. Requires a CUDA-accelerated version of ffmpeg. To build FFMPEG with CUDA, do the following in a Docker container based on nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04. The FFMPEG binary will be ffmpeg/ffmpeg. .. code-block:: bash git clone https://git.videolan.org/git/ffmpeg/nv-codec-headers.git cd nv-codec-headers make sudo make install cd .. 
git clone https://git.ffmpeg.org/ffmpeg.git cd ffmpeg sudo apt-get update && sudo apt-get -y install yasm export PATH=$PATH:/usr/local/cuda/bin # Note: Scroll far right to see full configure command: ./configure --enable-cuda-nvcc --enable-cuvid --enable-nvenc --enable-nvdec \ --enable-libnpp --enable-nonfree --extra-cflags=-I/usr/local/cuda/include \ --extra-ldflags=-L/usr/local/cuda/lib64 make -j 10 sudo make install Returns: See :code:`read_video` """ if interp is None: interp = "super" if use_cuda else "area" try: ( raw_width, raw_height, avg_frame_rate, codec_name, start_time, ) = get_video_properties(filepath) start_time_offset = ( 0.0 if avg_frame_rate is None else float(1 / (2 * avg_frame_rate)) ) LOGGER.debug( "raw_width: %s, raw_height: %s, avg_frame_rate: %s, codec_name: %s, start_time: %s", raw_width, raw_height, avg_frame_rate, codec_name, start_time, ) channels = 3 scale = ( min(max_size / raw_width, max_size / raw_height, 1) if max_size is not None else 1 ) width, height = map(lambda d: int(round(scale * d)), [raw_width, raw_height]) # If there is no average frame rate, the offset tends to be unreliable. offset = max(start_time, start_time_offset) if avg_frame_rate is not None else 0 cmd = ( f"{get_ffmpeg()} -hide_banner -an -vsync 0 -loglevel fatal " f"-itsoffset -{offset}" ) filters = [] if draw_timestamps: pattern = "%{pts}-%{frame_num}" filters.append( f"drawtext=fontsize={int(raw_height * 0.1)}:" f"fontcolor=yellow:text={pattern}" ":x=(w-text_w):y=(h-text_h)" ) # Add frame rate filters. if frames_per_second is None: seconds_per_frame = ( float(1 / avg_frame_rate) if avg_frame_rate is not None else None ) elif frames_per_second == "keyframes": seconds_per_frame = None filters.append(r"select=eq(pict_type\,I)") else: assert isinstance( frames_per_second, (float, int) ), f"Invalid framerate: {frames_per_second}" seconds_per_frame = 1 / frames_per_second filters.append( f"fps={frames_per_second}:round={frame_rounding}:start_time={offset}" ) # Add resizing filters. if use_cuda and codec_name in CUDA_CODECS: cuda_codec = CUDA_CODECS[codec_name] cmd += f" -hwaccel cuda -c:v {cuda_codec}" filters.append("hwupload_cuda") if scale != 1: filters.append(f"scale_npp={width}:{height}:interp_algo={interp}") filters.extend( [ "hwdownload", "format=nv12", ] ) elif scale != 1: filters.append(f"scale={width}:{height}:flags={interp}") cmd += f" -i '{filepath}'" if filters: cmd += f" -vf '{','.join(filters)}'" cmd += " -pix_fmt rgb24 -f image2pipe -vcodec rawvideo -" LOGGER.debug("running ffmpeg with: %s", cmd) framebytes = width * height * channels bufsize = framebytes * int(os.environ.get("PERCEPTION_FFMPEG_BUFSIZE", "5")) with subprocess.Popen( shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=bufsize, ) as p: assert p.stdout is not None, "Could not launch subprocess pipe." timestamp: float | None = 0 frame_index: int | None = 0 while True: batch = p.stdout.read(bufsize) if not batch: break for image in np.frombuffer(batch, dtype="uint8").reshape( ( -1, height, width, channels, ) ): if frames_per_second != "keyframes": yield (image, frame_index, timestamp) if seconds_per_frame is not None: assert timestamp is not None timestamp += seconds_per_frame frame_index = ( math.ceil(avg_frame_rate * timestamp) if avg_frame_rate is not None else None ) else: timestamp = None frame_index = None else: # Obtaining the keyframe indexes with ffprobe is very slow (slower # than reading the video sometimes). We don't *have* to do it # when using ffmpeg, so we don't. 
The OpenCV approach *does* # get the keyframe indexes, but only because they're required # in order to select them. yield (image, None, None) if ( max_duration is not None and timestamp is not None and timestamp > max_duration ): break stdout, stderr = p.communicate() if p.returncode != 0: raise ValueError( f"Error parsing video: {stdout.decode('utf-8')} {stderr.decode('utf-8')}" ) except Exception as e: if errors not in ["warn", "ignore"]: raise e if errors == "warn": warnings.warn( message=f"An error occurred while reading {filepath}. Processing may be truncated." ) def read_video_to_generator( filepath, frames_per_second: str | float | None = None, errors="raise", max_duration: float | None = None, max_size: int | None = None, ) -> FramesWithIndexesAndTimestamps: """This is used by :code:`read_video` when :code:`use_ffmpeg` is False (default). Args: filepath: See :code:`read_video`. frames_per_second: See :code:`read_video`. errors: See :code:`read_video`. max_duration: See :code:`read_video`. max_size: See :code:`read_video`. Returns: See :code:`read_video`. """ if cv2.__version__ < "4.1.1" and filepath.lower().endswith("gif"): message = "Versions of OpenCV < 4.1.1 may read GIF files improperly. Upgrade recommended." if errors == "raise": raise ValueError(message) warnings.warn(message=message) if not os.path.isfile(filepath): raise FileNotFoundError(f"Could not find {filepath}.") if not os.access(filepath, os.R_OK): raise OSError(f"{filepath} is not readable") cap = cv2.VideoCapture(filename=filepath, apiPreference=cv2.CAP_FFMPEG) try: # The purpose of the following block is largely to create a # frame_indexes (iterator or list) that indicates which # frames we should be returning to the user and then # yielding those frames as we come across them. file_frames_per_second = cap.get(cv2.CAP_PROP_FPS) if file_frames_per_second == 0: if errors == "raise": raise ValueError("Video file has framerate of 0fps.") # The known case where this occurs is for GIFs, where # 0 fps is typically inferred as 10 fps. file_frames_per_second = 10 if errors == "warn": warnings.warn( message="Video file has framerate of 0 fps. Guessing framerate of 10fps." ) if frames_per_second is None: frames_per_second = file_frames_per_second seconds_between_desired_frames = ( None if (frames_per_second is not None and isinstance(frames_per_second, str)) else 1 / frames_per_second # type: ignore ) seconds_between_grabbed_frames = 1 / file_frames_per_second grabbed_frame_count = 0 if frames_per_second == "keyframes": frame_indexes: range | list[int] | typing.Iterator[int] = _get_keyframes( filepath ) # The repeat flag is used to handle the case where the # desired sampling rate is higher than the file's frame # rate. In this case, we will need to repeat frames in # order to provide the least-surprising behavior that # we can. repeat = False else: num_frames_per_second = float(frames_per_second) frame_indexes = itertools.count( 0, max(1, file_frames_per_second / num_frames_per_second) ) repeat = file_frames_per_second < num_frames_per_second input_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) input_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) if max_size is not None: scale = min(max_size / max(input_width, input_height), 1) else: scale = 1 target_size: tuple[int, int] | None if scale < 1: target_size = (int(scale * input_width), int(scale * input_height)) else: target_size = None for frame_index in frame_indexes: while grabbed_frame_count < frame_index: # We need to skip this frame. 
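# For example (illustrative): sampling a 30 fps file at 5 fps makes
# frame_indexes yield 0, 6, 12, ..., so five of every six frames are
# grab()-ed (decoded but not retrieved) and only the sixth is read().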
success = cap.grab() if not success: break grabbed_frame_count += 1 success, frame = cap.read() grabbed_frame_count += 1 if not success: # The video is over or an error has occurred. break frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) if target_size is not None: frame = cv2.resize(frame, target_size, interpolation=cv2.INTER_NEAREST) current_timestamp = frame_index / file_frames_per_second yield frame, grabbed_frame_count - 1, current_timestamp if max_duration is not None and current_timestamp > max_duration: break if repeat and isinstance(seconds_between_desired_frames, Number): next_desired_timestamp = ( current_timestamp + seconds_between_desired_frames ) next_timestamp = current_timestamp + seconds_between_grabbed_frames while next_desired_timestamp < next_timestamp: yield (frame, grabbed_frame_count - 1, next_desired_timestamp) next_desired_timestamp += seconds_between_desired_frames except Exception as e: if errors not in ["warn", "ignore"]: raise e if errors == "warn": warnings.warn( message=f"An error occurred while reading {filepath}. Processing may be truncated." ) finally: cap.release() def read_video_into_queue(*args, video_queue, terminate, func, **kwargs): # We're inside a thread now and the queue is being read elsewhere. try: for frame, frame_index, timestamp in func(*args, **kwargs): if not terminate.is_set(): video_queue.put((frame, frame_index, timestamp)) else: break finally: video_queue.put((None, None, None)) def read_video( filepath, frames_per_second: str | float | None = None, max_queue_size=128, use_queue=True, errors="raise", use_ffmpeg=False, **kwargs, ) -> FramesWithIndexesAndTimestamps: """Provides a generator of RGB frames, frame indexes, and timestamps from a video. This function requires you to have installed ffmpeg. All other arguments passed to read_video_to_generator. Args: filepath: Path to the video file frames_per_second: How many frames to provide for each second of video. If None, all frames are provided. If frames_per_second is "keyframes", we use ffmpeg to select I frames from the video. max_queue_size: The maximum number of frames to load in the queue use_queue: Whether to use a queue of frames during processing max_duration: The maximum length of the video to hash. max_size: The maximum size of frames to queue errors: Whether to 'raise', 'warn', or 'ignore' errors use_ffmpeg: Whether to use the FFMPEG CLI to read videos. If True, other kwargs (e.g., :code:`use_cuda`) are passed to :code:`read_video_to_generator_ffmpeg`. Yields: (frame, frame_index, timestamp) tuples """ for ffmpeg_kwarg in ["interp", "frame_rounding", "draw_timestamps", "use_cuda"]: if not use_ffmpeg and ffmpeg_kwarg in kwargs: if kwargs[ffmpeg_kwarg] is not None: # Only log a warning if the value is something other than None. 
warnings.warn( f"{ffmpeg_kwarg} is ignored when use_ffmpeg is False.", UserWarning ) del kwargs[ffmpeg_kwarg] generator: typing.Callable[..., FramesWithIndexesAndTimestamps] if use_ffmpeg: generator = read_video_to_generator_ffmpeg else: generator = read_video_to_generator frame_index: int | None timestamp: float | None if use_queue: video_queue: queue.Queue[tuple[np.ndarray, int, float]] = queue.Queue( maxsize=max_queue_size ) terminate = threading.Event() thread = threading.Thread( target=read_video_into_queue, kwargs={ "frames_per_second": frames_per_second, "func": generator, "video_queue": video_queue, "filepath": filepath, "errors": errors, "terminate": terminate, **kwargs, }, ) thread.start() try: while True: frame, frame_index, timestamp = video_queue.get() video_queue.task_done() if frame is None: break yield (frame, frame_index, timestamp) finally: # Set the termination flag for the # background thread. terminate.set() try: # Unblock the thread, in the event # that it is waiting. video_queue.get_nowait() # Do it twice for the edge case # where the queue is completely # full and the end sentinel is # blocking. video_queue.get_nowait() except queue.Empty: # It doesn't matter if it's empty. pass # Wait for the background thread to terminate. thread.join() else: for frame, frame_index, timestamp in generator( filepath=filepath, frames_per_second=frames_per_second, errors=errors, **kwargs, ): yield (frame, frame_index, timestamp) def compute_synchronized_video_hashes( filepath: str, hashers: dict, framerates=None, hash_format="base64", use_queue=True ): """Compute the video hashes for a group of hashers with synchronized frame processing wherever possible. Args: filepath: Path to video file. hashers: A dictionary mapping hasher names to video hasher objects hash_format: The format in which to return the hashes use_queue: Whether to use queued video frames """ if framerates is None: framerates = get_common_framerates( { k: h.frames_per_second for k, h in hashers.items() if h.frames_per_second is not None } ) else: assert all( any(hasher_name in hasher_names for hasher_names in framerates.values()) for hasher_name, hasher in hashers.items() if hasher.frames_per_second is not None ), "Provided framerates do not have an entry for all required hashers." 
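# For example (illustrative): framerates={3.0: ("a", "c"), 5.0: ("b",)}
# means the file is read twice (once at 3 fps, once at 5 fps);
# relative_framerate below determines how many frames of a pass each
# hasher actually consumes (e.g., a 1 fps hasher in the 3 fps pass
# processes every 3rd frame).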
results = { hasher_name: { "state": None, "hash": None, "relative_framerate": next( framerate / hasher.frames_per_second for framerate, hasher_names in framerates.items() if hasher_name in hasher_names ), } for hasher_name, hasher in hashers.items() if hasher.frames_per_second is not None } for current_framerate, current_hasher_names in framerates.items(): for frame_index, (frame, grabbed_frame_index, frame_timestamp) in enumerate( read_video( filepath=filepath, frames_per_second=current_framerate, use_queue=use_queue, ) ): for hasher_name in current_hasher_names: config = results[hasher_name] hasher = hashers[hasher_name] assert config["relative_framerate"] is not None if frame_index % config["relative_framerate"] == 0: config["state"] = hasher.process_frame( frame=frame, frame_index=grabbed_frame_index, frame_timestamp=frame_timestamp, state=config["state"], ) for hasher_name in current_hasher_names: config = results[hasher_name] hasher = hashers[hasher_name] current_hash = hasher.hash_from_final_state(state=config["state"]) if hash_format == "vector": config["hash"] = current_hash else: if not hasher.returns_multiple: config["hash"] = hasher.vector_to_string( current_hash, hash_format=hash_format ) else: config["hash"] = [ hasher.vector_to_string(h, hash_format=hash_format) for h in current_hash ] config["state"] = None hashes = {hasher_name: config["hash"] for hasher_name, config in results.items()} for hasher_name, hasher in hashers.items(): if hasher.frames_per_second is None: # This is a custom hasher that we just pass a video path to. hashes[hasher_name] = hasher.compute(filepath) return hashes def unletterbox( image: np.ndarray, only_remove_black: bool = False, min_fraction_meaningful_pixels: float = 0.1, color_threshold: float = 2, min_side_length: int = 50, min_reduction: float = 0.02, ) -> tuple[tuple[int, int], tuple[int, int]] | None: """Return bounds of the non-trivial (content) region of an image, or None. Letterboxing refers to uniform-color borders added around an image (e.g., black bars on a video frame). This function detects such borders by identifying the background color from the image corners and finding the bounding box of pixels that differ from that background. The function returns bounds as ``(x1, x2), (y1, y2)`` suitable for slicing: ``image[y1:y2, x1:x2]``. The bounds are exclusive on the right/bottom (i.e., x2 and y2 point one past the last content pixel). **Algorithm overview:** 1. Sample the four corner pixels and find the most common value as the candidate background color. If all four corners differ, the full-image bounds are returned (no consistent letterbox detected). 2. Build a binary content mask where each pixel whose grayscale intensity differs from the background by more than ``color_threshold`` is marked as content. 3. Project the mask onto rows and columns and find the first/last row and column where the fraction of content pixels exceeds ``min_fraction_meaningful_pixels``. 4. Validate that the resulting crop is meaningfully smaller than the original (controlled by ``min_reduction``) and that both sides exceed ``min_side_length``. Returns the full-image bounds ``((0, w), (0, h))`` (kept for backwards compatibility) when: - No two corners share the same color (no clear background). - Every pixel differs from the detected background (no border). - The crop would not reduce either dimension by at least the ``min_reduction`` fraction (the border is negligibly thin). Returns ``None`` when: - No row or column meets the content-pixel threshold. - Either cropped dimension would be smaller than ``min_side_length``. Args: image: Input image as an ``np.ndarray``.
May be grayscale (H×W) or RGB (H×W×3); RGB images are converted to grayscale internally for background detection. only_remove_black: If ``True``, treat black (intensity 0) as the background regardless of corner colors. If ``False`` (default), infer the background color from the most common corner value. min_fraction_meaningful_pixels: The minimum fraction (0–1) of pixels in a row or column that must differ from the background for that row/column to be considered part of the content region. Defaults to 0.1 (10%). color_threshold: The minimum absolute difference in grayscale intensity between a pixel and the background color for that pixel to be classified as content. Defaults to 2. min_side_length: The minimum width or height (in pixels) of the cropped region. If the crop would be smaller, ``None`` is returned. Defaults to 50. min_reduction: The minimum fraction (0–1) of the original width or height that must be removed for the crop to be worthwhile. If the crop removes less than this from both dimensions, ``None`` is returned. Defaults to 0.02 (2%). Returns: A tuple ``((x1, x2), (y1, y2))`` giving the left, right, top, and bottom bounds of the content region (right/bottom exclusive), or ``None`` if no meaningful letterbox was detected. """ if not 0 <= min_fraction_meaningful_pixels <= 1: raise ValueError("min_fraction_meaningful_pixels must be between 0 and 1") if not 0 <= min_reduction <= 1: raise ValueError("min_reduction must be between 0 and 1") image = image.astype(np.uint8) shape = image.shape h, w = shape[0:2] if len(shape) == 3: image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) # Determine background color and build binary content mask. if only_remove_black: bg_gray = 0 else: # Sample the four corner pixels. If all four are unique there is no # consistent background color, so we bail out early (O(1) rejection). corners = ( image[0, 0], image[0, w - 1], image[h - 1, 0], image[h - 1, w - 1], ) if len(set(corners)) == 4: LOGGER.debug("No common corner color detected, skipping content detection.") return ( (0, w), (0, h), ) # Return full image bounds instead of None to maintain backwards compatibility # Use the most common corner value as the background intensity. counts = Counter(corners) bg_gray = counts.most_common(1)[0][0] # Mark pixels whose grayscale intensity differs from the background # by more than color_threshold as content (True). content_mask = np.abs(image.astype(np.int16) - bg_gray) > color_threshold # If every pixel is classified as content, there is no border to remove. if content_mask.all(): LOGGER.debug("All pixels differ from background; no letterbox detected.") return ( (0, w), (0, h), ) # Return full image bounds instead of None to maintain backwards compatibility # Find the content bounding box by projecting the mask onto rows and # columns. cv2.reduce is used instead of np.sum for performance. 
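# For example (illustrative): a 100x200 frame with 20-pixel background
# bars at the top and bottom produces near-zero row sums for rows 0-19
# and 80-99, so the bounds computed below become ((0, 200), (20, 80)).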
mask_u8 = content_mask.astype(np.uint8) row_content = cv2.reduce(mask_u8, 1, cv2.REDUCE_SUM, dtype=cv2.CV_32S).ravel() col_content = cv2.reduce(mask_u8, 0, cv2.REDUCE_SUM, dtype=cv2.CV_32S).ravel() # Thresholds for minimum content per row/column row_threshold = min_fraction_meaningful_pixels * w col_threshold = min_fraction_meaningful_pixels * h # Find first/last rows and columns with sufficient content content_rows = np.where(row_content > row_threshold)[0] content_cols = np.where(col_content > col_threshold)[0] if len(content_rows) == 0 or len(content_cols) == 0: LOGGER.debug("No rows or columns with sufficient content detected.") return None top = int(content_rows[0]) bottom = int(content_rows[-1]) + 1 left = int(content_cols[0]) right = int(content_cols[-1]) + 1 height = bottom - top width = right - left # Reject if the crop does not remove at least min_reduction from # at least one dimension (i.e., the border is negligibly thin). if width >= w * (1 - min_reduction) and height >= h * (1 - min_reduction): LOGGER.debug( "Crop would not reduce either dimension by %.0f%%; skipping.", min_reduction * 100, ) return ( (0, w), (0, h), ) # Return full image bounds instead of None to maintain backwards compatibility # Reject if the remaining content region is too small to be useful. if width < min_side_length or height < min_side_length: LOGGER.debug( "Cropped region (%dx%d) smaller than min_side_length=%d; skipping.", width, height, min_side_length, ) return None return ((left, right), (top, bottom)) def unletterbox_crop( image: np.ndarray, min_fraction_meaningful_pixels: float = 0.1, color_threshold: float = 2, min_side_length: int = 50, min_reduction: float = 0.02, ) -> np.ndarray | None: """Detect and crop the letterboxed regions from an image. Args: image: The image from which to remove letterboxing. min_fraction_meaningful_pixels: The minimum fraction (0–1) of pixels in a row or column that must differ from the background for that row/column to be counted as content. Defaults to 0.1 (10%). color_threshold: The minimum absolute difference in grayscale intensity between a pixel and the background color for that pixel to be classified as content. Defaults to 2. min_side_length: The minimum width or height (in pixels) of the cropped region. If the crop would be smaller, ``None`` is returned. Defaults to 50. min_reduction: The minimum fraction (0–1) of the original width or height that must be removed for the crop to be worthwhile. If the crop removes less than this from both dimensions, the original image is returned. Defaults to 0.02 (2%). Returns: The cropped image or None if the image is mostly blank space. """ if not isinstance(image, np.ndarray): raise TypeError(f"Expected np.ndarray, got {type(image).__name__}") bounds = unletterbox( image, min_fraction_meaningful_pixels=min_fraction_meaningful_pixels, color_threshold=color_threshold, min_side_length=min_side_length, min_reduction=min_reduction, ) if bounds is None: return None (x1, x2), (y1, y2) = bounds cropped = np.ascontiguousarray(image[y1:y2, x1:x2]) assert cropped.data.contiguous return cropped ================================================ FILE: perception/hashers/video/__init__.py ================================================ from .framewise import FramewiseHasher from .tmk import TMKL1, TMKL2 __all__ = ["FramewiseHasher", "TMKL1", "TMKL2"] ================================================ FILE: perception/hashers/video/framewise.py ================================================ import numpy as np from ..
import tools from ..hasher import ImageHasher, VideoHasher class FramewiseHasher(VideoHasher): """A hasher that simply returns frame-wise hashes at some regular interval with some minimum inter-frame distance threshold.""" returns_multiple = True def __init__( self, frame_hasher: ImageHasher, interframe_threshold: float, frames_per_second: int = 15, quality_threshold: float | None = None, ): self.hash_length = frame_hasher.hash_length self.frames_per_second = frames_per_second self.frame_hasher = frame_hasher self.distance_metric = frame_hasher.distance_metric if self.distance_metric == "hamming" and interframe_threshold > 1: raise ValueError( "Hamming distance is always between 0 and 1 but " f"`interframe_threshold` was set to {interframe_threshold}." ) self.dtype = frame_hasher.dtype self.interframe_threshold = interframe_threshold self.quality_threshold = quality_threshold def process_frame(self, frame, frame_index, frame_timestamp, state=None): if self.quality_threshold is None: current = self.frame_hasher.compute(frame, hash_format="vector") else: current, quality = self.frame_hasher.compute_with_quality( frame, hash_format="vector" ) if quality < self.quality_threshold: return state or {"previous": None, "hashes": []} assert isinstance(current, np.ndarray) # help type checking below if state is None or state["previous"] is None: # We keep a separate reference to the previous hash instead of using # the last entry in the hashes list because `compute_batches` may # clear the hashes list but we still want to be able to compare # the final entry. state = { "previous": current, "hashes": [current], } else: if ( self.frame_hasher.compute_distance(current, state["previous"]) > self.interframe_threshold ): state["hashes"].append(current) return state def compute_batches( self, filepath: str, batch_size: int, errors="raise", hash_format="base64" ): """Compute hashes for a video in batches. Args: filepath: Path to video file batch_size: The batch size to use for returning hashes errors: One of "raise", "ignore", or "warn". Passed to perception.hashers.tools.read_video. 
hash_format: The format in which to return hashes """ def format_batch(hashes): return [ ( self.vector_to_string(vector, hash_format=hash_format) if hash_format != "vector" else vector ) for vector in hashes ] state = None for frame, frame_index, frame_timestamp in tools.read_video( filepath=filepath, frames_per_second=self.frames_per_second, errors=errors ): state = self.process_frame( frame=frame, frame_index=frame_index, frame_timestamp=frame_timestamp, state=state, ) if state is not None and len(state["hashes"]) > batch_size: yield format_batch(state["hashes"]) state["hashes"] = [] if state is not None and state["hashes"]: yield format_batch(state["hashes"]) def hash_from_final_state(self, state): if state is None: return [] return state["hashes"] ================================================ FILE: perception/hashers/video/tmk.py ================================================ import platform import warnings import numpy as np import scipy.special from ..hasher import ImageHasher, VideoHasher from ..image.phash import PHashF class TMKL2(VideoHasher): """The TMK L2 video hashing algorithm.""" dtype = "float32" distance_metric = "custom" def __init__( self, frame_hasher: ImageHasher | None = None, frames_per_second: int = 15, normalization: str = "matrix", ): if platform.machine() == "arm64": warnings.warn("TMK is not supported on ARM64") T = np.array([2731, 4391, 9767, 14653]).astype("float32") m = 32 if frame_hasher is None: frame_hasher = PHashF(hash_size=16, exclude_first_term=True, freq_shift=1) self.frames_per_second = frames_per_second assert frame_hasher.dtype != "bool", "This hasher requires real valued hashes." # Beta parameter of the modified Bessel function of the first kind self.beta = 32 # Number of Fourier coefficients per period self.m = m # The periods with shape (T, ) self.T = T # (T) # The Fourier coefficients with shape (T, m, 1) self.ms = 2 * np.pi * np.arange(0, self.m).astype("float32") # (m) self.ms_normed = (self.ms[np.newaxis,] / self.T.reshape(-1, 1)).reshape( len(self.T), self.m, 1 ) # (T, m, 1) # The weights with shape (T, 2m, 1) a = np.array( [ (scipy.special.iv(0, self.beta) - np.exp(-self.beta)) / (2 * np.sinh(self.beta)) ] + [ scipy.special.iv(i, self.beta) / np.sinh(self.beta) for i in range(1, self.m) ] ) a = a.reshape(1, -1).repeat(repeats=len(self.T), axis=0) # type: ignore a = np.sqrt(a) self.a = a[..., np.newaxis] # The frame-wise hasher self.frame_hasher = frame_hasher self.hash_length = self.T.shape[0] * 2 * self.m * self.frame_hasher.hash_length self.normalization = normalization def process_frame(self, frame, frame_index, frame_timestamp, state=None): if state is None: state = {"features": [], "timestamps": []} state["features"].append(self.frame_hasher.compute(frame, hash_format="vector")) state["timestamps"].append(frame_timestamp) return state def hash_from_final_state(self, state): timestamps = np.array(state["timestamps"]) features = np.array(state["features"]).reshape( ( 1, 1, timestamps.shape[0], self.frame_hasher.hash_length, ) ) x = self.ms_normed * timestamps yw1 = np.sin(x) * self.a yw2 = np.cos(x) * self.a yw = np.concatenate([yw1, yw2], axis=1)[..., np.newaxis] # (T, 2m, t, 1) y = (yw * features).sum(axis=2) # (T, 2m, d) return y.flatten() def _compute_distance(self, vector1, vector2): shape = (len(self.T), 2 * self.m, self.frame_hasher.hash_length) return 1 - self._score_pair( fv_a=vector1.reshape(shape), fv_b=vector2.reshape(shape), offsets=None, normalization=self.normalization, ) def _score_pair(self, fv_a, fv_b, 
offsets=None, normalization="matrix"): eps = 1e-8 if offsets is None: offsets = np.array([0]) assert normalization in [ "feat", "freq", "feat_freq", "matrix", ], "Invalid normalization" if "feat" in normalization: a_xp = np.concatenate([self.a, self.a], axis=1) # (T, 2m, 1) fv_a_0 = fv_a / a_xp fv_b_0 = fv_b / a_xp norm_a = np.sqrt(np.sum(fv_a_0**2, axis=2, keepdims=True) + eps) + eps norm_b = np.sqrt(np.sum(fv_b_0**2, axis=2, keepdims=True) + eps) + eps fv_a = fv_a / norm_a fv_b = fv_b / norm_b if "freq" in normalization: norm_a, norm_b = ( np.sqrt((fv**2).sum(axis=1, keepdims=True) / self.m + eps) + eps for fv in [fv_a, fv_b] ) fv_a = fv_a / norm_a fv_b = fv_b / norm_b if normalization == "matrix": norm_a, norm_b = ( np.sqrt(np.sum(fv**2, axis=(1, 2)) + eps)[..., np.newaxis] + eps for fv in [fv_a, fv_b] ) # (T, 1) fv_a_sin, fv_b_sin = (fv[:, : self.m] for fv in [fv_a, fv_b]) # (T, m, d) fv_a_cos, fv_b_cos = (fv[:, self.m :] for fv in [fv_a, fv_b]) # (T, m, d) ms = self.ms.reshape(-1, 1) # (m, 1) dot_sin_sin, dot_sin_cos, dot_cos_cos, dot_cos_sin = ( np.sum(p, axis=2, keepdims=True) for p in [ fv_a_sin * fv_b_sin, fv_a_sin * fv_b_cos, fv_a_cos * fv_b_cos, fv_a_cos * fv_b_sin, ] ) # (T, m, 1) delta = ( ms.reshape(1, -1, 1) * offsets.reshape(1, -1) / self.T.reshape((-1, 1, 1)) ) cos_delta = np.cos(delta) # (T, m, delta) sin_delta = np.sin(delta) # (T, m, delta) dots = ( dot_sin_sin * cos_delta + dot_sin_cos * sin_delta + dot_cos_cos * cos_delta - dot_cos_sin * sin_delta ).sum(axis=1) if normalization == "matrix": dots = dots / (norm_a * norm_b) if normalization == "freq": dots = dots / self.m # (T, m, delta) elif normalization in ["feat", "feat_freq"]: dots = dots / 512 return dots.mean(axis=0) class TMKL1(VideoHasher): """The TMK L1 video hashing algorithm.""" def __init__( self, frame_hasher: ImageHasher | None = None, frames_per_second: int = 15, dtype="float32", distance_metric="cosine", norm=2, quality_threshold=None, ): if frame_hasher is None: frame_hasher = PHashF(hash_size=16, exclude_first_term=True, freq_shift=1) self.hash_length = frame_hasher.hash_length self.frames_per_second = frames_per_second assert frame_hasher.dtype != "bool", "This hasher requires real valued hashes." 
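# Illustrative usage (assumes ffmpeg is available; the test video ships
# with the repository):
#     >>> from perception.hashers.video import TMKL1
#     >>> hasher = TMKL1(frames_per_second=15)
#     >>> h = hasher.compute("perception/testing/videos/v1.m4v")  # doctest: +SKIP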
self.frame_hasher = frame_hasher self.norm = norm self.dtype = dtype or self.frame_hasher.dtype self.distance_metric = distance_metric or self.frame_hasher.distance_metric self.quality_threshold = quality_threshold def process_frame(self, frame, frame_index, frame_timestamp, state=None): if state is None: state = {"sum": np.zeros(self.frame_hasher.hash_length), "frame_count": 0} if self.quality_threshold is None: hash_vector = self.frame_hasher.compute(frame, hash_format="vector") else: hash_vector, quality = self.frame_hasher.compute_with_quality( frame, hash_format="vector" ) if quality < self.quality_threshold: return state assert isinstance(hash_vector, np.ndarray) # help type checking below if hash_vector is not None: state["sum"] += hash_vector.astype(np.float32) state["frame_count"] += 1 return state def hash_from_final_state(self, state): if state["frame_count"] == 0: return None average_vector = state["sum"] / state["frame_count"] if self.norm is not None: return ( average_vector / np.linalg.norm(average_vector, ord=self.norm) ).astype(self.frame_hasher.dtype) return average_vector.astype(self.frame_hasher.dtype) ================================================ FILE: perception/local_descriptor_deduplication.py ================================================ import concurrent.futures import logging import typing from abc import ABC from warnings import warn import cv2 import numpy as np import pandas as pd import tqdm import typing_extensions import perception.approximate_deduplication as ad import perception.hashers.tools as pht LOGGER = logging.getLogger(__name__) DEFAULT_MAX_FEATURES = 256 DEFAULT_OVERLAP = 0.01 DEFAULT_MATCH_PCT = 0.4 DEFAULT_INTERSECTION = 0.6 DEFAULT_INLIERS = 5 DEFAULT_MAX_SIZE = 256 DEFAULT_MIN_FEATURES = 10 DEFAULT_THRESHOLD = 100 DEFAULT_SIFT_THRESHOLD = 100 DEFAULT_AKAZE_THRESHOLD = 250 DEFAULT_RATIO = 0.5 DEFAULT_SIFT_RATIO = 0.5 DEFAULT_AKAZE_RATIO = 0.85 class Descriptors(typing_extensions.TypedDict): keypoints: np.ndarray descriptors: np.ndarray descriptor_count: int dimensions: tuple[int, int] filepath: str hasher: str class MatchStats(typing_extensions.TypedDict): match: float | None min_kpBM: int | None MAB: str | None intersection: float | None inliers: float | None bounds_intersection: float | None final_matched_a_pts: list[np.ndarray] | None final_matched_b_pts: list[np.ndarray] | None class LocalHasher(ABC): grayscale = False name: str hasher: typing.Any ratio: float threshold: int def __init__( self, max_features: int = DEFAULT_MAX_FEATURES, ratio: float = DEFAULT_SIFT_RATIO, threshold: int = DEFAULT_THRESHOLD, overlap: float = DEFAULT_OVERLAP, validation_match: float = DEFAULT_MATCH_PCT, validation_inliers: int = DEFAULT_INLIERS, validation_intersection: float = DEFAULT_INTERSECTION, ): self.ratio = ratio self.threshold = threshold self.max_features = max_features self.overlap = overlap self.validation_match = validation_match self.validation_inliers = validation_inliers self.validation_intersection = validation_intersection def compute(self, image) -> tuple[np.ndarray, np.ndarray]: return self.hasher.detectAndCompute(image, None) def validate_match( self, descriptor1: Descriptors, descriptor2: Descriptors, minimum_match: float = DEFAULT_MATCH_PCT, minimum_intersection: float = DEFAULT_INTERSECTION, minimum_inliers: int = DEFAULT_INLIERS, ) -> tuple[bool, MatchStats]: """Validate the match between two sets of keypoints and descriptors. The validation algorithm is as follows: #. 
Compute the mutual set of matches between the two sets of descriptors and filter them using Lowe's ratio test. #. If the fraction of passing matches (in either direction) is less than "minimum_match", the match fails. This ensures we don't have trivial matches. #. Compute the intersection area of the matched keypoints versus the raw keypoints. If the area overlap is less than minimum_intersection, the match fails. This ensures we don't match on small subsegments of an image, such as logos. #. Compute a transformation matrix using cv2.findHomography. If we cannot obtain a transformation matrix, the match fails. If the sum total of inliers for the transformation matrix is less than minimum_inliers, the match fails. #. Finally, use the transformation matrix on a set of points representing the bounding box of each image. If less than minimum_intersection of the bounding box fits within the bounds of the transformed version, the match fails. This is a second pass safety check for logos and other subsegments of images. Args: descriptor1: The first set of descriptors (keypoints, descriptors, and image dimensions). descriptor2: The second set of descriptors. minimum_match: The minimum fraction of matches passing the ratio test. minimum_intersection: The minimum overlapping area between the keypoints in the filtered set of matches and the original keypoints. minimum_inliers: The minimum number of inliers for the transformation matrix. The Lowe's ratio itself is taken from the hasher's ``ratio`` attribute. Returns: A tuple of (passed, stats), where passed is True if the match passes and stats records intermediate match statistics. """ swap = descriptor1["keypoints"].shape[0] < descriptor2["keypoints"].shape[0] descriptorA = descriptor2 if swap else descriptor1 descriptorB = descriptor1 if swap else descriptor2 stats: MatchStats = { "match": None, "min_kpBM": None, "MAB": None, "intersection": None, "inliers": None, "bounds_intersection": None, "final_matched_a_pts": None, "final_matched_b_pts": None, } indexA = ad.build_index(descriptorA["descriptors"], approximate=False) indexB = ad.build_index(descriptorB["descriptors"], approximate=False) if ( descriptorA["descriptors"] is None or indexA is None or descriptorB["descriptors"] is None or indexB is None ): return False, stats distances_A2B, indexes_A2B = indexB.search( descriptorA["descriptors"].astype("float32"), 2 ) distances_B2A, _ = indexA.search( descriptorB["descriptors"].astype("float32"), 2 ) good_A2B, good_B2A = map( lambda distances: (distances[:, 0] < distances[:, 1] * self.ratio), [distances_A2B, distances_B2A], ) match = min( good_A2B.sum() / good_A2B.shape[0], good_B2A.sum() / good_B2A.shape[0] ) stats["match"] = match if match < minimum_match: # We didn't get enough good matches. return False, stats kpAM = descriptorA["keypoints"][good_A2B] kpBM = descriptorB["keypoints"][indexes_A2B[good_A2B, 0]] # findHomography requires 4 points from each to work. stats["min_kpBM"] = min(len(kpAM), len(kpBM)) if len(kpAM) < 4 or len(kpBM) < 4: return False, stats intersection = compute_minimum_intersection( kp1=descriptorA["keypoints"], kp2=descriptorB["keypoints"], filter_arr1=good_A2B, filter_arr2=indexes_A2B[good_A2B, 0], ) stats["intersection"] = intersection if intersection < minimum_intersection: return False, stats MAB, mask = cv2.findHomography( kpAM.reshape(-1, 1, 2), kpBM.reshape(-1, 1, 2), cv2.RANSAC, 1.0, maxIters=50_000, confidence=0.9999, ) stats["MAB"] = "good" if MAB is None: # We didn't get a transformation matrix.
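# (A None homography from cv2.findHomography typically means RANSAC could
# not find a consistent model at the 1.0-pixel reprojection threshold.)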
stats["MAB"] = "is-None" return False, stats stats["inliers"] = mask.sum() if mask.sum() < minimum_inliers: # The transformation matrix didn't include enough inliers. return False, stats # Check how much of each original bounding box fits onto # the other image. try: MBA = np.linalg.inv(MAB) except np.linalg.LinAlgError: # We couldn't compute the matrix inverse. stats["MAB"] = "inverse-failed" return False, stats ptsA = np.array([[0, 0], descriptorA["dimensions"]]).astype("float32") ptsB = np.array([[0, 0], descriptorB["dimensions"]]).astype("float32") ptsAt = ( cv2.perspectiveTransform(ptsA.reshape((-1, 1, 2)), MAB) .reshape(-1, 2) .clip(0, descriptorB["dimensions"]) ) ptsBt = ( cv2.perspectiveTransform(ptsB.reshape((-1, 1, 2)), MBA) .reshape(-1, 2) .clip(0, descriptorA["dimensions"]) ) bounds_intersection = min( abs(np.prod(ptsBt[1] - ptsBt[0]) / np.prod(descriptorA["dimensions"])), abs(np.prod(ptsAt[1] - ptsAt[0]) / np.prod(descriptorB["dimensions"])), ) stats["bounds_intersection"] = bounds_intersection # Apply mask index to kpAM, kpBM for list of matcihing points. mask ==1 for keep matched_a_pts = [] matched_b_pts = [] for i in range(mask.shape[0]): if mask[i][0] == 1: matched_a_pts.append(kpAM[i]) matched_b_pts.append(kpBM[i]) # Unswap points before final return. if swap: stats["final_matched_a_pts"] = matched_b_pts stats["final_matched_b_pts"] = matched_a_pts else: stats["final_matched_a_pts"] = matched_a_pts stats["final_matched_b_pts"] = matched_b_pts return (bounds_intersection >= minimum_intersection, stats) class SIFT(LocalHasher): name = "SIFT" def __init__( self, max_features: int = DEFAULT_MAX_FEATURES, ratio: float = DEFAULT_SIFT_RATIO, threshold: int = DEFAULT_SIFT_THRESHOLD, **kwargs, ): super().__init__(max_features, ratio, threshold, **kwargs) self.hasher = cv2.SIFT_create(nfeatures=self.max_features) # type: ignore[attr-defined] class AKAZE(LocalHasher): name = "AKAZE" def __init__( self, max_features: int = DEFAULT_MAX_FEATURES, ratio: float = DEFAULT_AKAZE_RATIO, threshold: int = DEFAULT_AKAZE_THRESHOLD, **kwargs, ): super().__init__(max_features, ratio, threshold, **kwargs) LOGGER.warning("The default AKAZE tuning has issues with some cropped images.") self.hasher = cv2.AKAZE_create() # type: ignore[attr-defined] def load_and_preprocess(filepath, max_size=DEFAULT_MAX_SIZE, grayscale=True): """Read, unletterbox, and resize an image. Args: filepath: The path to the file max_size: The maximum size for a dimension of the image grayscale: Set to false to get RGB """ image = pht.read(filepath) if image is None: LOGGER.warning("Failed to load image %s", filepath) return None res = pht.unletterbox(image) if res is None: return None (x1, x2), (y1, y2) = res image = np.ascontiguousarray(image[y1:y2, x1:x2]) if grayscale: image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) max_dimension = max(image.shape[:2]) if max_dimension > max_size: scale = max_size / max_dimension image = cv2.resize( image, (int(image.shape[1] * scale), int(image.shape[0] * scale)) ) return image def generate_image_descriptors( filepath: str, hasher: LocalHasher | None = None, min_features=DEFAULT_MIN_FEATURES, max_size=DEFAULT_MAX_SIZE, ) -> Descriptors | None: """Generate local descriptors for a file. Args: filepath: Path to image file. max_features: The maximum number of features to extract. min_features: The minimum number of features to extract. max_size: The maximum side length for an image. Returns: If successful, returns a tuple of keypoints, descriptors, and a (width, height) tuple. 
""" if hasher is None: hasher = SIFT( max_features=DEFAULT_MAX_FEATURES, ) try: image = load_and_preprocess( filepath, max_size=max_size, grayscale=hasher.grayscale ) if image is None: return None keypoints, descriptors = hasher.compute(image) except FileNotFoundError: LOGGER.warning("Image file %s not found.", filepath) return None except ValueError as e: LOGGER.error("Processing image file %s failed.", filepath, exc_info=e) return None if descriptors is None: return None if descriptors.shape[0] < min_features: return None keypoints = np.array([kp.pt for kp in keypoints], dtype=np.float32) return { "keypoints": keypoints, "descriptors": descriptors, "descriptor_count": descriptors.shape[0], "filepath": filepath, "dimensions": (image.shape[1], image.shape[0]), "hasher": hasher.name, } def build_reference_df( filepaths: typing.Iterable[str], hasher: LocalHasher | None = None, min_features=DEFAULT_MIN_FEATURES, max_size=DEFAULT_MAX_SIZE, show_progress=False, ) -> pd.DataFrame: """Build descriptors for a list of files. Args: filepaths: A list of filepaths for which descriptors are desired. hasher: The local descriptor hasher to use to extract features. min_features: The minimum number of features to extract. max_size: The maximum side length for an image. Returns: A dataframe, indexed by filepath with columns for descriptors and descriptor counts. """ LOGGER.debug("Generating descriptors") if hasher is None: hasher = SIFT() features = [] for filepath in tqdm.tqdm(filepaths, disable=not show_progress, desc="Filepaths"): features.append( generate_image_descriptors( filepath, hasher=hasher, min_features=min_features, max_size=max_size, ) ) LOGGER.debug("Finished computing descriptors.") return pd.DataFrame( { "descriptors": [ f["descriptors"] if f is not None else None for f in features ], "keypoints": [f["keypoints"] if f is not None else None for f in features], "descriptor_count": [ f["descriptor_count"] if f is not None else None for f in features ], # type: ignore "dimensions": [ f["dimensions"] if f is not None else None for f in features ], "hasher": hasher.name, "filepath": filepaths, } ).set_index("filepath") def hasher_name(df: pd.DataFrame) -> str: return df.iloc[0].get("hasher", "SIFT") def check_hasher(df1: pd.DataFrame, df2: pd.DataFrame): assert hasher_name(df1) == hasher_name( df2 ), "The hashers must mach for deduplication to work." def compute_pairs( match_df, query_df=None, hasher: LocalHasher | None = None, pct_probe=0.1, use_gpu: bool = True, faiss_cache_path: str | None = None, show_progress: bool = False, ): """Compute pairs of matching images from a reference dataframe. Args: match_df: A dataframe, as computed by build_reference_df, will compute pairs against self, unless query_df is provided. query_df: optional, if provided will be used to query against match_df for matches. threshold: The match threshold between two vectors. minimum_overlap: The minimum overlap between a pair of files. pct_probe: The percentage of the dataset to search for approximate search. faiss_cache_path: If provided load any existing faiss index from this path, and if it does not exist then save the generated faiss index to the path. 
show_progress: Whether or not to show a progress bar while computing pairs """ match_df = match_df.dropna(subset=["descriptors"]) counts = match_df["descriptor_count"].values.astype("uint32") descriptors = np.vstack(match_df["descriptors"].values) if hasher is None: hasher = SIFT() if query_df is None: assert ( hasher_name(match_df) == hasher.name ), "The hasher must match the original hash format." y_counts = None y_descriptors = None else: check_hasher(match_df, query_df) query_df = query_df.dropna(subset=["descriptors"]) y_counts = query_df["descriptor_count"].values.astype("uint32") y_descriptors = np.vstack(query_df["descriptors"].values).astype("float32") LOGGER.debug("Computing approximate euclidean pairs") pairs = ad.compute_euclidean_pairwise_duplicates_approx( X=descriptors.astype("float32"), counts=counts, threshold=hasher.threshold, minimum_overlap=hasher.overlap, pct_probe=pct_probe, Y=y_descriptors, y_counts=y_counts, use_gpu=use_gpu, faiss_cache_path=faiss_cache_path, show_progress=show_progress, ) if query_df is None: query_df = match_df # Assign query_df to be able to lookup matches. return [(query_df.iloc[p1].name, match_df.iloc[p2].name) for p1, p2 in pairs] def compute_area(box): """Compute the area of a box given a set of points x1, y1, x2, y2. Args: box: A list of coordinates. """ return (box[3] - box[1]) * (box[2] - box[0]) def compute_intersection(kps, filter_arr): """Compute the percentage of area covered by a set of filtered keypoints versus raw keypoints. Args: kps: A list of points filter_arr: A filter array of same length as kps indicating whether to keep that keypoint. """ kps_filtered = kps[filter_arr] box_after = np.hstack([kps_filtered.min(axis=0), kps_filtered.max(axis=0)]) box_before = np.hstack([kps.min(axis=0), kps.max(axis=0)]) area_before = compute_area(box_before) area_after = compute_area(box_after) return area_after / area_before def compute_minimum_intersection(kp1, kp2, filter_arr1, filter_arr2): """Compute the minimum intersection between two pairs of keypoints (filtered and unfiltered). Args: kp1: A list of the first set of keypoints kp2: A list of the second set of keypoints filter_arr1: A filter array for the first set of keypoints filter_arr2: A filter array for the second set of keypoints """ return min( compute_intersection(kp1, filter_arr1), compute_intersection(kp2, filter_arr2) ) def deduplicate_sift_dfs(*args, **kwargs): "DEPRECATED please use deduplicate_dfs." warn("deduplicate_sift_dfs is deprecated.", DeprecationWarning, stacklevel=2) return deduplicate_dfs(*args, **kwargs) def deduplicate_dfs( match_df: pd.DataFrame, query_df: pd.DataFrame | None = None, coarse_pct_probe: float = ad.DEFAULT_PCT_PROBE, max_workers: int | None = None, use_gpu: bool = True, faiss_cache_path: str | None = None, verbose: bool = False, hasher: LocalHasher | None = None, show_progress: bool = False, ) -> ( list[tuple[typing.Any, typing.Any]] | list[tuple[typing.Any, typing.Any, MatchStats]] ): """Deduplicate images within one set of images or between two sets of images: #. Given a dataframe (or two) of descriptors and keypoints for images. #. Perform a coarse, approximate search for images with common features. #. For each candidate pair, validate it pairwise by checking the features and keypoints with the traditional approach using the ratio test. See validate_match for more information. Args: match_df: Dataframe of features to dedup within. query_df: If provided will search for matches between this and match_df, if None will just search match_df against itself.
coarse_pct_probe: The minimum fraction of nearest lists to search. If the product of pct_probe and the number of lists is less than 1, one list will be searched. hasher: The local descriptor hasher to use; the coarse match threshold (a euclidean distance), the minimum coarse overlap between two files, the ratio for Lowe's ratio test, and the validation thresholds (minimum match fraction, minimum keypoint intersection, and minimum inliers) are all configured on the hasher. max_workers: The maximum number of threads to use for doing the final validation step. use_gpu: Whether to use the GPU for the coarse approximate search. faiss_cache_path: If provided load any existing faiss index from this path, and if it does not exist then save the generated faiss index to the path. Most helpful if doing multiple queries against the same match_df. verbose: Return metadata with matches, such as overlap percent, etc. show_progress: Whether or not to show a progress bar while computing duplicate file pairs Returns: A list of pairs of file duplicates. If verbose is True, each tuple will be: (match_id1, match_id2, metadata_dict) """ if hasher is None: hasher = SIFT() LOGGER.debug("Computing candidate pairs") candidates = compute_pairs( match_df, query_df, pct_probe=coarse_pct_probe, hasher=hasher, use_gpu=use_gpu, faiss_cache_path=faiss_cache_path, show_progress=show_progress, ) if query_df is None: query_df = match_df assert ( match_df.index.is_unique ), "Index of match_df must be unique, or it will cause wrong matches." assert ( query_df.index.is_unique ), "Index of query_df must be unique, or it will cause wrong matches." LOGGER.debug("Validating candidate pairs: %d", len(candidates)) keep: ( list[tuple[typing.Any, typing.Any]] | list[tuple[typing.Any, typing.Any, MatchStats]] ) = []  # type: ignore with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: batch_size = 10_000 for start in tqdm.tqdm(range(0, len(candidates), batch_size)): futures = { executor.submit( hasher.validate_match, descriptor1=query_df.loc[c1].to_dict(), descriptor2=match_df.loc[c2].to_dict(), minimum_match=hasher.validation_match, minimum_inliers=hasher.validation_inliers, minimum_intersection=hasher.validation_intersection, ): (c1, c2) for c1, c2 in candidates[start : start + batch_size] } for future in concurrent.futures.as_completed(futures): is_match, metadata = future.result() if is_match: if verbose: keep.append( (futures[future][0], futures[future][1], metadata) # type: ignore ) else: keep.append(futures[future]) # type: ignore LOGGER.debug("Validating complete, keeping: %d", len(keep)) return keep def deduplicate( filepaths_or_reference_df: typing.Iterable[str] | pd.DataFrame, query_filepaths_or_df: None | (typing.Iterable[str] | pd.DataFrame) = None, max_features: int = DEFAULT_MAX_FEATURES, min_features: int = DEFAULT_MIN_FEATURES, max_size: int = DEFAULT_MAX_SIZE, hasher: LocalHasher | None = None, show_progress: bool = False, **kwargs, ) -> ( list[tuple[typing.Any, typing.Any]] | list[tuple[typing.Any, typing.Any, MatchStats]] ): """Deduplicate images by doing the following: #. Unletterbox all images and resize to some maximum size, preserving aspect ratio. #. Compute the descriptors and keypoints for all the resulting images. #. See `deduplicate_dfs` for remaining steps. A usage sketch follows.
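A minimal usage sketch (the image paths here are hypothetical):

.. code-block:: python

    import perception.local_descriptor_deduplication as ldd

    pairs = ldd.deduplicate(["a.jpg", "b.jpg", "c.jpg"])
    for file_a, file_b in pairs:
        print(file_a, "is a likely duplicate of", file_b)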
Args: filepaths_or_reference_df: The list of images to deduplicate, or a precomputed descriptor DataFrame. query_filepaths_or_df: If provided will look for matches between these files and the files in the first param. max_features: The maximum number of features to extract. min_features: The minimum number of features to extract. max_size: The maximum side length for an image. show_progress: Whether or not to show a progress bar while building descriptors and computing pairs of file duplicates Returns: A list of pairs of file duplicates. If verbose is true the tuple will be: (match_id1, match_id2, metadata_dict) """ if hasher is None: hasher = SIFT(max_features=max_features) if isinstance(filepaths_or_reference_df, pd.DataFrame): reference_df = filepaths_or_reference_df else: reference_df = build_reference_df( filepaths=filepaths_or_reference_df, hasher=hasher, min_features=min_features, max_size=max_size, show_progress=show_progress, ) if query_filepaths_or_df is None: query_df = None else: if isinstance(query_filepaths_or_df, pd.DataFrame): query_df = query_filepaths_or_df else: query_df = build_reference_df( filepaths=query_filepaths_or_df, hasher=hasher, min_features=min_features, max_size=max_size, show_progress=show_progress, ) return deduplicate_dfs( reference_df, query_df=query_df, hasher=hasher, show_progress=show_progress, **kwargs, ) ================================================ FILE: perception/py.typed ================================================ ================================================ FILE: perception/testing/__init__.py ================================================ import atexit import math import typing from contextlib import ExitStack from importlib import resources import cv2 import numpy as np import pandas as pd import pytest from PIL import Image from .. 
import hashers, tools SIZES = {"float32": 32, "uint8": 8, "bool": 1} def get_low_detail_image(): v = np.arange(0, 50, 1) v = np.concatenate([v, v[::-1]])[np.newaxis,] image = np.matmul(v.T, v) image = (image * 255 / image.max()).astype("uint8") image = image[..., np.newaxis].repeat(repeats=3, axis=2) image[:, 50:] = 0 image[50:] = 0 return image LOW_DETAIL_IMAGE = get_low_detail_image() file_manager = ExitStack() atexit.register(file_manager.close) DEFAULT_TEST_IMAGES = [ str( file_manager.enter_context( resources.as_file( resources.files("perception") / "testing" / "images" / f"image{n}.jpg" ) ) ) for n in range(1, 11) ] DEFAULT_TEST_LOGOS = [ str( file_manager.enter_context( resources.as_file( resources.files("perception") / "testing" / "logos" / "logoipsum.png" ) ) ) ] DEFAULT_TEST_VIDEOS = [ str( file_manager.enter_context( resources.as_file( resources.files("perception") / "testing" / "videos" / f"v{n}.m4v" ) ) ) for n in range(1, 3) ] + [ str( file_manager.enter_context( resources.as_file( resources.files("perception") / "testing" / "videos" / "v2s.mov" ) ) ) ] @typing.no_type_check def test_opencv_hasher(hasher: hashers.ImageHasher, image1: str, image2: str): # For OpenCV hashers we make sure the distance we compute # is the same as inside OpenCV f1 = image1 f2 = image2 opencv_distance = hasher.hasher.compare( hasher.hasher.compute(hashers.tools.read(f1)), hasher.hasher.compute(hashers.tools.read(f2)), ) if hasher.distance_metric == "hamming": opencv_distance /= hasher.hash_length np.testing.assert_approx_equal( opencv_distance, hasher.compute_distance(hasher.compute(f1), hasher.compute(f2)), significant=4, ) def hash_dicts_to_df(hash_dicts, returns_multiple): assert all( h["error"] is None for h in hash_dicts ), "An error was found in the hash dictionaries" if returns_multiple: return pd.DataFrame( { "filepath": tools.flatten( [[h["filepath"]] * len(h["hash"]) for h in hash_dicts] ), "hash": tools.flatten([h["hash"] for h in hash_dicts]), } ).assign(error=np.nan) return pd.DataFrame.from_records(hash_dicts).assign(error=np.nan) def test_hasher_parallelization(hasher, test_filepaths): filepaths_10x = test_filepaths * 10 if not hasher.allow_parallel: with pytest.warns(UserWarning, match="cannot be used in parallel"): hashes_parallel_dicts = hasher.compute_parallel(filepaths=filepaths_10x) else: hashes_parallel_dicts = hasher.compute_parallel(filepaths=filepaths_10x) hashes_sequential_dicts = [ {"filepath": filepath, "hash": hasher.compute(filepath), "error": None} for filepath in filepaths_10x ] hashes_parallel = hash_dicts_to_df( hashes_parallel_dicts, returns_multiple=hasher.returns_multiple ).sort_values(["filepath", "hash"]) hashes_sequential = hash_dicts_to_df( hashes_sequential_dicts, returns_multiple=hasher.returns_multiple ).sort_values(["filepath", "hash"]) assert (hashes_sequential.hash.values == hashes_parallel.hash.values).all() assert (hashes_sequential.filepath.values == hashes_parallel.filepath.values).all() def test_video_hasher_integrity( hasher: hashers.VideoHasher, test_videos: list[str] = DEFAULT_TEST_VIDEOS ): test_hasher_parallelization(hasher, test_videos) def test_image_hasher_integrity( hasher: hashers.ImageHasher, pil_opencv_threshold: float, transform_threshold: float, test_images: list[str] = DEFAULT_TEST_IMAGES, opencv_hasher: bool = False, ): """Test to ensure a hasher works correctly. Args: hasher: The hasher to test. test_images: A list of filepaths to images to use for testing. 
pil_opencv_threshold: The hash distance permitted for an image when loaded with OpenCV vs. PIL. transform_threshold: The permitted error in isometric transform hashes. opencv_hasher: Whether the hasher is an OpenCV hasher. Used to determine whether to check for consistent distances. """ assert len(test_images) >= 2, "You must provide at least two test images." image1 = test_images[0] image2 = test_images[1] hash1_1 = str(hasher.compute(image1)) # str() games for mypy, not proud hash1_2 = str(hasher.compute(Image.open(image1))) image_cv = cv2.imread(image1) assert image_cv is not None, f"Failed to load image: {image1}" hash1_3 = str(hasher.compute(cv2.cvtColor(image_cv, cv2.COLOR_BGR2RGB))) hash2_1 = str(hasher.compute(image2)) # There is a small distance because PIL and OpenCV read # JPEG images a little differently (e.g., libjpeg-turbo vs. libjpeg) assert hasher.compute_distance(hash1_1, hash1_2) < pil_opencv_threshold assert hasher.compute_distance(hash1_1, hash2_1) > pil_opencv_threshold assert hasher.compute_distance(hash1_1, hash1_3) == 0 # Ensure the conversion to and from vectors works for both base64 and hex. assert hasher.vector_to_string(hasher.string_to_vector(hash2_1)) == hash2_1 assert ( hasher.vector_to_string( hasher.string_to_vector( str( hasher.vector_to_string( hasher.string_to_vector(hash2_1), hash_format="hex" ) ), hash_format="hex", ) ) == hash2_1 ) # Ensure parallelization works properly. test_hasher_parallelization(hasher=hasher, test_filepaths=test_images) # Ensure the isometric hashes computation work properly for image in test_images: transforms = hashers.tools.get_isometric_transforms(image) hashes_exp = { key: str(hasher.compute(value)) for key, value in transforms.items() } hashes_act = hasher.compute_isometric(transforms["r0"]) for transform_name in hashes_exp.keys(): assert ( hasher.compute_distance( hashes_exp[transform_name], hashes_act[transform_name] ) < transform_threshold ) # Verify that hashes are the correct length. hash_bits = hasher.hash_length * SIZES[hasher.dtype] words_base64 = math.ceil(hash_bits / 6) # Base64 uses 8 bits for every 6 bits words_base64 += ( 0 if words_base64 % 4 == 0 else 4 - (words_base64 % 4) ) # Base64 always uses multiples of four assert len(hash2_1) == words_base64 words_hex = 2 * math.ceil(hash_bits / 8) # Hex uses 16 bits for every 8 bits words_hex += ( 0 if words_hex % 2 == 0 else 1 ) # Two characters for every one character. assert ( len( str( hasher.vector_to_string( hasher.string_to_vector(hash2_1), hash_format="hex" ) ) ) == words_hex ) # Verify that low quality images yield zero quality image = np.zeros((100, 100, 3)).astype("uint8") # type: ignore _, quality = hasher.compute_with_quality(image) assert quality == 0 # Verify that high quality images yield high quality # scores. assert ( min(hasher.compute_with_quality(filepath)[1] for filepath in test_images) == 100 ) # Verify that medium quality images yield medium quality _, quality = hasher.compute_with_quality(LOW_DETAIL_IMAGE) assert 0 < quality < 100 if opencv_hasher: test_opencv_hasher(hasher, image1, image2) ================================================ FILE: perception/testing/images/README.md ================================================ # Sample images These images were obtained from Wikimedia Commons. 
- [Image 1](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:ADAC-Zentrale,_Munich,_March_2017-05.jpg) - [Image 2](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Two-tailed_pasha_(Charaxes_jasius_jasius)_Greece.jpg) - [Image 3](https://commons.wikimedia.org/wiki/Main_Page#/media/File:Escolta_presidencial,_Plaza_de_Armas,_Lima,_Per%C3%BA,_2015-07-28,_DD_40.JPG) - [Image 4](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Iglesia_de_Ntra._Sra._de_la_Junquera,_Luesma,_Zaragoza,_Espa%C3%B1a,_2017-01-04,_DD_60.jpg) - [Image 5](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Bahrain_Fort_March_2015.JPG) - [Image 6](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:ET_Gondar_asv2018-02_img18_Fasil_Ghebbi.jpg) - [Image 7](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:M%C3%BCnster,_Beresa,_Mercedes-Benz_C-Klasse_Cabrio_--_2018_--_1757.jpg) - [Image 8](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Panoramic_sunset_in_Conques_02.jpg) - [Image 9](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Catedral_de_San_Basilio,_Mosc%C3%BA,_Rusia,_2016-10-03,_DD_05-06_HDR.jpg) - [Image 10](https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day#/media/File:Tupolev_Tu-160_overflying_Moscow_fix.jpg) ================================================ FILE: perception/testing/logos/README.md ================================================ # Sample Logos These logos were obtained from free sources. - [LogoIpsum](https://logoipsum.com/) ================================================ FILE: perception/testing/videos/README.md ================================================ Video from https://www.youtube.com/watch?v=84Er4LnWXtI under Creative Commons Attribution License. Notes - v1 is a fairly short, slow moving video - v2 is a longer but faster-paced video - v2s is the same as v2 but with a snippet removed in the middle (simulates a scene or cut) ================================================ FILE: perception/tools.py ================================================ import base64 import json import os import urllib.parse import urllib.request import warnings import numpy as np from scipy import spatial from tqdm import tqdm from . import hashers as perception_hashers from .utils import flatten try: from . import extensions # type: ignore except ImportError: warnings.warn( "C extensions were not built. Some metrics will be computed more slowly. " "Please install from wheels or set up a compiler prior to installation " "from source to use extensions." ) extensions = None def _multiple_hashes_for_ids(hashes: list[tuple[str, str | np.ndarray]]): """Check if a list of (hash_id, hash) tuples has more than one hash for a hash_id. Args: hashes: A list of (hash_id, hash) tuples. """ hash_ids = [hash_id for hash_id, _ in hashes] return len(hash_ids) != len(set(hash_ids)) def deduplicate_hashes( hashes: list[tuple[str, str | np.ndarray]], threshold: float, hash_format: str = "base64", hasher: perception_hashers.ImageHasher | None = None, hash_length: int | None = None, hash_dtype: str | None = None, distance_metric: str | None = None, progress: tqdm | None = None, ) -> list[tuple[str, str]]: """Find duplicates using a list of precomputed hashes. 
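A minimal sketch (the file paths and threshold below are hypothetical):

.. code-block:: python

    from perception import hashers, tools

    hasher = hashers.PHash()
    hashes = [(fp, hasher.compute(fp)) for fp in ["a.jpg", "b.jpg"]]
    pairs = tools.deduplicate_hashes(hashes=hashes, threshold=0.2, hasher=hasher)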
Args: hashes: A list of (id, hash) tuples threshold: A distance threshold hasher: A hasher to use for computing distances progress: A tqdm object for reporting progress Returns: A list of duplicated id pairs. To use, you can just remove the first entry of each pair from your dataset. The pairs are provided in the event that you wish to apply further analysis. """ assert ( hash_length is not None and hash_dtype is not None and distance_metric is not None ) or (hasher is not None), ( "You must provide either `hasher` or all of " "`hash_length`, `hash_dtype`, and `distance_metric`." ) if hasher is not None: assert all( k is None for k in [hash_length, hash_dtype, distance_metric] ), "If hasher is provided, hash_length, hash_dtype, and distance_metric must all be None." hash_length = hasher.hash_length hash_dtype = hasher.dtype distance_metric = hasher.distance_metric assert hash_length is not None assert isinstance(hash_dtype, str) assert isinstance(distance_metric, str) # If there is more than one hash for an id, we want them # to be sequential in case we are able to use the more # efficient distance calculation (compute_euclidean_pairwise_duplicates) # that skips computation of distance between two hashes for the same file. multiple_hashes_per_id = _multiple_hashes_for_ids(hashes) if multiple_hashes_per_id: hashes = sorted(hashes) vectors = np.array( [ ( perception_hashers.tools.string_to_vector( hash_string=hash_string_or_vector, hash_format=hash_format, hash_length=hash_length, dtype=hash_dtype, ) if isinstance(hash_string_or_vector, str) else hash_string_or_vector ) for _, hash_string_or_vector in hashes ] ) files = np.array([identifier for identifier, _ in hashes]) pairs: list[tuple[str, str]] = [] n_hashes = len(vectors) start_idx = 0 end_idx = None if distance_metric != "euclidean" or "int" not in hash_dtype or extensions is None: iterator = range(n_hashes) if progress is not None: iterator = progress(iterator, total=n_hashes, desc="Deduplicating.") # type: ignore[operator] distances = spatial.distance.pdist(vectors, metric=distance_metric) for hash_index in iterator: if end_idx is not None: start_idx = end_idx end_idx = start_idx + (n_hashes - hash_index - 1) current_distances = distances[start_idx:end_idx] duplicated_files = files[hash_index + 1 :][current_distances < threshold] current_file = files[hash_index] # We have to make sure the two files are not the same file # because it can happen for highly symmetric images when # we are including isometric hashes. pairs.extend( [ (current_file, duplicated_file) for duplicated_file in duplicated_files if duplicated_file != current_file ] ) else: # We want to count the number of hashes for each unique hash ID. There # may be more than one -- for example in the case of video. We need # this so we can pass it to the compute_euclidean_pairwise_duplicates # function. if multiple_hashes_per_id: counts = np.zeros(shape=len({hash_id for hash_id, _ in hashes})).astype( "uint32" ) previous_hash_id = None counts_idx = 0 files_ = ( [] # make type check happy ) # We're going to re-build the IDs with deduplicated files. 
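# Because the hashes were sorted above, identical hash IDs are adjacent, so a single pass can rebuild the unique ID list and count the hashes belonging to each ID.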
for hash_id, _ in hashes: if hash_id != previous_hash_id: files_.append(hash_id) if previous_hash_id is not None and hash_id != previous_hash_id: counts_idx += 1 counts[counts_idx] += 1 previous_hash_id = hash_id files = np.array(files_) else: counts = None # type: ignore pairs = [ (files[idx1], files[idx2]) for idx1, idx2 in extensions.compute_euclidean_pairwise_duplicates_simple( vectors.astype("int32"), threshold=threshold, counts=counts ) ] return list(set(pairs)) def deduplicate( files: list[str], hashers: list[tuple[perception_hashers.ImageHasher, float]], isometric: bool = False, progress: tqdm | None = None, ) -> list[tuple[str, str]]: """Find duplicates in a list of files. Args: files: A list of filepaths. hashers: A list of tuples of the form (hasher, threshold) isometric: Whether to compare the rotated versions of the images progress: A tqdm progress indicator Returns: A list of duplicated file pairs. To use, you can just remove the first entry of each pair from your dataset. The pairs are provided in the event that you wish to apply further analysis. """ files_dedup = set(files) if len(files_dedup) != len(files): warnings.warn( message="Duplicate file paths were provided. These will be automatically removed.", category=UserWarning, ) files = list(files_dedup) pairs: list[tuple[str, str]] = [] for hasher_idx, (hasher, threshold) in enumerate(hashers): hash_dicts = hasher.compute_parallel( filepaths=files, progress=progress, progress_desc=f"Computing hashes for hash {hasher_idx+1} of {len(hashers)}.", isometric=isometric, ) hash_list = sorted(hash_dicts, key=lambda h: h["filepath"]) if isometric: hash_list = flatten( [ list(row["hash"].values()) for row in hash_dicts if row["error"] is None ] ) files_for_hashes = flatten( [[row["filepath"]] * 8 for row in hash_dicts if row["error"] is None] ) elif hasher.returns_multiple: hash_list = flatten( [row["hash"] for row in hash_dicts if row["error"] is None] ) files_for_hashes = flatten( [ [row["filepath"]] * len(row["hash"]) for row in hash_dicts if row["error"] is None ] ) else: hash_list = [row["hash"] for row in hash_dicts if row["error"] is None] files_for_hashes = [ row["filepath"] for row in hash_dicts if row["error"] is None ] pairs.extend( deduplicate_hashes( hashes=list(zip(files_for_hashes, hash_list)), hasher=hasher, threshold=threshold, progress=progress, ) ) return list(set(pairs)) class SaferMatcher: """An object for matching hashes with the known CSAM hashes in the Safer matching service. Please contact `info@getsafer.io `_ for details on obtaining credentials and information on how match responses are provided. Here's a minimalist example: .. code-block:: python from perception import hashers, tools hasher = hashers.PHash(hash_size=16) matcher = tools.SaferMatcher( api_key='YOUR_API_KEY', username='YOUR_USERNAME', # You only need to provide password='YOUR_PASSWORD', # an API key OR username/password. url='MATCHING_SERVICE_URL' ) For authentication, you must provide the API key OR username and password pair. If neither is provided, the function will attempt to find them as environment variables with names :code:`SAFER_MATCHING_SERVICE_API_KEY`, :code:`SAFER_MATCHING_SERVICE_USERNAME`, and :code:`SAFER_MATCHING_SERVICE_PASSWORD`, respectively. You must also provide the URL endpoint for the matching service, either as a keyword argument or as a :code:`SAFER_MATCHING_SERVICE_URL` environment variable.
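Once constructed, matching is a single call (a sketch; the filepath is hypothetical and the response format is defined by the matching service):

.. code-block:: python

    matches = matcher.match(['example.jpg'])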
Args: api_key: A base64 encoded set of matching service credentials username: Matching service username password: Matching service password url: Safer matching service URL hasher: A hasher to use for matching hasher_api_id: The hasher ID for finding matches. quality_threshold: The quality threshold filter to use """ def __init__( self, api_key: str | None = None, username: str | None = None, password: str | None = None, url: str | None = None, hasher: perception_hashers.ImageHasher | None = None, hasher_api_id: str | None = None, quality_threshold: int = 90, ): if ( username is None and password is None and api_key is None and os.environ.get("SAFER_MATCHING_SERVICE_USERNAME") is not None and os.environ.get("SAFER_MATCHING_SERVICE_PASSWORD") is not None ): username = os.environ["SAFER_MATCHING_SERVICE_USERNAME"] password = os.environ["SAFER_MATCHING_SERVICE_PASSWORD"] if username is not None and password is not None: credentials = f"{username}:{password}" api_key = base64.b64encode(credentials.encode("ascii")).decode("ascii") if api_key is None: api_key = os.environ.get("SAFER_MATCHING_SERVICE_API_KEY") if api_key is None: raise ValueError( "You must provide one of (1) API key, (2) API key provided as " "`SAFER_MATCHING_SERVICE_API_KEY` env var, (3) username and password or " "(4) username and password as `SAFER_MATCHING_SERVICE_USERNAME` and " "`SAFER_MATCHING_SERVICE_PASSWORD` env vars." ) if url is None: url = os.environ.get("SAFER_MATCHING_SERVICE_URL") if url is None: raise ValueError( "You must provide either the url or the SAFER_MATCHING_SERVICE_URL env var." ) if urllib.parse.urlparse(url).scheme != "https" and not os.environ.get( "SAFER_MATCHING_SERVICE_DEV_ALLOW_HTTP" ): raise ValueError("You must provide an url that begins with `https://`.") self.api_key = api_key self.url = url if hasher is None: hasher = perception_hashers.PHash(hash_size=16, highfreq_factor=4) if hasher_api_id is None: hasher_api_id = "phash" self.hasher = hasher self.hasher_api_id = hasher_api_id self.quality_threshold = quality_threshold def match( self, images: list[(str | tuple[perception_hashers.tools.ImageInputType, str])], ) -> dict: """Match hashes with the Safer matching service. Args: images: A list of image filepaths or (image_like, image_id) tuples. Returns: A dictionary of matches. See Safer matching service documentation ( contact Thorn for a copy). """ raw_hashes = [ self.hasher.compute_with_quality( image if isinstance(image, str) else image[0] ) for image in images ] hashes = [ { "id": image if isinstance(image, str) else image[1], self.hasher_api_id: hash_string, "md5": ( perception_hashers.tools.compute_md5(image) if isinstance(image, str) else ( perception_hashers.tools.compute_md5(image[0]) if isinstance(image[0], str) else None ) ), } for image, (hash_string, quality) in zip(images, raw_hashes) if quality > self.quality_threshold ] for hash_dict in hashes: # We cannot include an md5 key if we don't # have the md5. 
if hash_dict["md5"] is None: del hash_dict["md5"] if not hashes: warnings.warn( message="No images of sufficient quality were found.", category=UserWarning, ) return {} body = {"hashes": hashes, "version": "v2"} headers = { "Authorization": f"Basic {self.api_key}", "Content-Type": "application/json", } req = urllib.request.Request( url=self.url, data=str(json.dumps(body)).encode("utf-8"), headers=headers, method="POST", ) with urllib.request.urlopen(req) as res: ret = json.loads(res.read().decode("utf-8")) return ret ================================================ FILE: perception/utils.py ================================================ def flatten(list_of_lists): return [item for sublist in list_of_lists for item in sublist] ================================================ FILE: poetry.toml ================================================ [virtualenvs] create = true in-project = true ================================================ FILE: pyproject.toml ================================================ [project] name = "Perception" dynamic = ["version"] description = "Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use." authors = [{ name = "Thorn", email = "info@wearethorn.org" }] license = "Apache-2.0" readme = "README.md" requires-python = ">=3.10,<4.0" dependencies = [ "Cython>=3.0.0,<4.0.0", "numpy>=1.26.4,<3.0.0", "opencv-contrib-python-headless>=4.10.0,<5.0.0", "faiss-cpu>=1.8.0,<2.0.0", "networkit>=11.1,<12.0.0; sys_platform != 'darwin'", "networkx>=3.0,<4.0; sys_platform == 'darwin'", "pandas", "Pillow", "pywavelets>=1.5.0,<2.0.0", "validators>=0.22.0,<1.0.0", "rich>=13.7.0,<14.0.0", "scipy", "tqdm>=4.67.1,<5.0.0", ] [project.optional-dependencies] benchmarking = [ "matplotlib", "albumentations>=2.0.8,<3.0.0", "tabulate", "scikit-learn", "ffmpeg-python", ] matching = ["aiohttp", "python-json-logger"] pdq = ["pdqhash>=0.2.7,<0.3.0"] [tool.poetry] version = "0.0.0" [tool.poetry.group.dev.dependencies] black = "^26" coverage = "*" ipython = "*" mypy = "*" pandas-stubs = "*" pre-commit = "*" pytest = "*" pytest-cov = "*" ruff = "*" types-pillow = "*" types-tqdm = "*" twine = "*" albumentations = "^2.0.8" [tool.poetry.build] script = "build.py" generate-setup-file = true [tool.mypy] exclude = ["/tests/"] check_untyped_defs = true ignore_missing_imports = true [tool.poetry-dynamic-versioning] enable = true vcs = "git" [build-system] requires = [ "poetry-core", "poetry-dynamic-versioning", "numpy", "Cython", "setuptools", "wheel", ] build-backend = "poetry_dynamic_versioning.backend" ================================================ FILE: setup.py ================================================ # -*- coding: utf-8 -*- from setuptools import setup packages = [ "perception", "perception.approximate_deduplication", "perception.benchmarking", "perception.hashers", "perception.hashers.image", "perception.hashers.video", "perception.testing", ] package_data = {"": ["*"], "perception.testing": ["images/*", "logos/*", "videos/*"]} extras_require = { "benchmarking": [ "matplotlib", "scipy", "albumentations", "tabulate", "scikit-learn", "ffmpeg-python", ], "experimental": ["networkit", "faiss-cpu"], "matching": ["aiohttp", "python-json-logger"], } setup_kwargs = { "name": "Perception", "version": "0.0.0", "description": "Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use.", "long_description": "# perception 
![ci](https://github.com/thorn-oss/perception/workflows/ci/badge.svg)\n\n`perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use. See [the documentation](https://perception.thorn.engineering/en/latest/) for details.\n\n## Background\n\n`perception` was initially developed at [Thorn](https://www.thorn.org) as part of our work to eliminate child sexual abuse material from the internet. For more information on the issue, check out [our CEO's TED talk](https://www.thorn.org/blog/time-is-now-eliminate-csam/).\n\n## Getting Started\n\n### Installation\n\n`pip install perception`\n\n### Hashing\n\nHashing with different functions is simple with `perception`.\n\n```python\nfrom perception import hashers\n\nfile1, file2 = 'test1.jpg', 'test2.jpg'\nhasher = hashers.PHash()\nhash1, hash2 = hasher.compute(file1), hasher.compute(file2)\ndistance = hasher.compute_distance(hash1, hash2)\n```\n\n### Examples\n\nSee below for end-to-end examples for common use cases for perceptual hashes.\n\n- [Detecting child sexual abuse material](https://perception.thorn.engineering/en/latest/examples/detecting_csam.html)\n- [Deduplicating media](https://perception.thorn.engineering/en/latest/examples/deduplication.html)\n- [Benchmarking perceptual hashes](https://perception.thorn.engineering/en/latest/examples/benchmarking.html)\n\n## Supported Hashing Algorithms\n\n`perception` currently ships with:\n\n- pHash (DCT hash) (`perception.hashers.PHash`)\n- Facebook's PDQ Hash (`perception.hashers.PDQ`)\n- dHash (difference hash) (`perception.hashers.DHash`)\n- aHash (average hash) (`perception.hashers.AverageHash`)\n- Marr-Hildreth (`perception.hashers.MarrHildreth`)\n- Color Moment (`perception.hashers.ColorMoment`)\n- Block Mean (`perception.hashers.BlockMean`)\n- wHash (wavelet hash) (`perception.hashers.WaveletHash`)\n\n## Contributing\n\nTo work on the project, start by doing the following.\n\n```bash\n# Install local dependencies for\n# code completion, etc.\nmake init\n```\n\nTo do a (close to) comprehensive check before committing code, you can use `make precommit`.\n\nTo implement new features, please first file an issue proposing your change for discussion.\n\nTo report problems, please file an issue with sample code, expected results, actual results, and a complete traceback.\n\n## Alternatives\n\nThere are other packages worth checking out to see if they meet your needs for perceptual hashing.
Here are some\nexamples.\n\n- [dedupe](https://github.com/dedupeio/dedupe)\n- [imagededup](https://idealo.github.io/imagededup/)\n- [ImageHash](https://github.com/JohannesBuchner/imagehash)\n- [PhotoHash](https://github.com/bunchesofdonald/photohash)\n", "author": "Thorn", "author_email": "info@wearethorn.org", "maintainer": "None", "maintainer_email": "None", "url": "None", "packages": packages, "package_data": package_data, "extras_require": extras_require, "python_requires": ">=3.10,<4.0", } from build import * build(setup_kwargs) setup(**setup_kwargs) ================================================ FILE: tests/test_approximate_deduplication.py ================================================ import perception.approximate_deduplication as ad def get_cluster_members(assignments): clusters: dict[int, list[str]] = {} for assignment in assignments: clusters.setdefault(assignment["cluster"], []).append(assignment["id"]) return sorted(sorted(members) for members in clusters.values()) def test_pairs_to_clusters_component_strictness(): assignments = ad.pairs_to_clusters( ids=["a", "b", "c", "d"], pairs=[("a", "b"), ("b", "c")], strictness="component", ) assert get_cluster_members(assignments) == [["a", "b", "c"], ["d"]] def test_pairs_to_clusters_community_strictness(): assignments = ad.pairs_to_clusters( ids=["a", "b", "c"], pairs=[("a", "b"), ("b", "c")], strictness="community", ) assert get_cluster_members(assignments) == [["a", "b", "c"]] def test_pairs_to_clusters_clique_strictness(): assignments = ad.pairs_to_clusters( ids=["a", "b", "c", "d"], pairs=[("a", "b"), ("a", "c"), ("b", "c"), ("c", "d")], strictness="clique", ) assert get_cluster_members(assignments) == [["a", "b", "c"], ["d"]] ================================================ FILE: tests/test_benchmarking.py ================================================ import base64 import os import shutil import tempfile import numpy as np import pytest import albumentations from scipy import spatial from perception import benchmarking, hashers, testing from perception.benchmarking import video_transforms from perception.benchmarking.image import BenchmarkImageDataset from perception.benchmarking.video import BenchmarkVideoDataset files = testing.DEFAULT_TEST_IMAGES dataset = BenchmarkImageDataset.from_tuples([(fn, i % 2) for i, fn in enumerate(files)]) def test_deduplicate(): tempdir = tempfile.TemporaryDirectory() new_file = os.path.join(tempdir.name, "dup_file.jpg") shutil.copy(files[0], new_file) duplicated_files = files + [new_file] deduplicated, duplicates = BenchmarkImageDataset.from_tuples( [(fn, i % 2) for i, fn in enumerate(duplicated_files)] ).deduplicate(hasher=hashers.AverageHash(), threshold=1e-2) assert len(duplicates) == 1 assert len(deduplicated._df) == len(files) def test_bad_dataset(): bad_files = files + ["tests/images/nonexistent.jpg"] bad_dataset = BenchmarkImageDataset.from_tuples( [(fn, i % 2) for i, fn in enumerate(bad_files)] ) transforms = { "blur0.05": albumentations.GaussianBlur(sigma_limit=0.05, p=1), "noop": albumentations.Resize(height=256, width=256, p=1), } with pytest.raises(Exception): transformed = bad_dataset.transform( transforms=transforms, storage_dir="/tmp/transforms", errors="raise" ) with pytest.warns(UserWarning, match="occurred reading"): transformed = bad_dataset.transform( transforms=transforms, storage_dir="/tmp/transforms", errors="warn" ) assert len(transformed._df) == len(files) * 2 def test_benchmark_dataset(): assert len(dataset._df) == len(files) assert
len(dataset.filter(category=[0])._df) == len(files) / 2 with pytest.warns(UserWarning, match="Did not find"): assert len(dataset.filter(category=[3])._df) == 0 dataset.save("/tmp/dataset.zip") dataset.save("/tmp/dataset_folder") o1 = BenchmarkImageDataset.load("/tmp/dataset.zip") o2 = BenchmarkImageDataset.load("/tmp/dataset_folder") o3 = BenchmarkImageDataset.load("/tmp/dataset.zip") for opened in [o1, o2, o3]: assert ( opened._df["filepath"].apply(os.path.basename) == dataset._df["filepath"].apply(os.path.basename) ).all() def test_benchmark_transforms(): transformed = dataset.transform( transforms={ "blur0.05": albumentations.GaussianBlur(sigma_limit=0.05, p=1), "noop": albumentations.Resize(height=256, width=256, p=1), }, storage_dir="/tmp/transforms", ) assert len(transformed._df) == len(files) * 2 hashes = transformed.compute_hashes(hashers={"pdna": hashers.PHash()}) tr = hashes.compute_threshold_recall().reset_index() hashes._metrics = None hashes._df.at[0, "hash"] = None with pytest.warns(UserWarning, match="invalid / empty hashes"): hashes.compute_threshold_recall() assert (tr[tr["transform_name"] == "noop"]["recall"] == 100.0).all() # This is a charting function but we execute it just to make sure # it runs without error. hashes.show_histograms() def convert_hash_string_to_vector(hash_string): buff = base64.decodebytes(hash_string.encode("utf-8")) return np.frombuffer(buff, dtype=np.uint8) def test_video_benchmark_dataset(): video_dataset = BenchmarkVideoDataset.from_tuples( files=[ ("perception/testing/videos/v1.m4v", "category1"), ("perception/testing/videos/v2.m4v", "category1"), ("perception/testing/videos/v1.m4v", "category2"), ("perception/testing/videos/v2.m4v", "category2"), ] ) transforms = { "noop": video_transforms.get_simple_transform(width=128, sar="1/1"), "gif": video_transforms.get_simple_transform(codec="gif", output_ext=".gif"), "clip1s": video_transforms.get_simple_transform(clip_s=(1, None)), "blackpad": video_transforms.get_black_frame_padding_transform(duration_s=1), "slideshow": video_transforms.get_slideshow_transform( frame_input_rate=1, frame_output_rate=1 ), } transformed = video_dataset.transform( storage_dir=tempfile.TemporaryDirectory().name, transforms=transforms ) assert len(transformed._df) == len(transforms) * len(video_dataset._df) assert transformed._df["filepath"].isnull().sum() == 0 # We will compute hashes for each of the transformed # videos and check the results for correctness. phash_framewise_hasher = hashers.FramewiseHasher( frame_hasher=hashers.PHash(), interframe_threshold=-1, frames_per_second=2 ) hashes = transformed.compute_hashes( hashers={"phashframewise": phash_framewise_hasher} ) guid = hashes._df.guid.iloc[0] df = hashes._df[hashes._df["guid"] == guid] clip1s = df[(df.transform_name == "clip1s")] noop = df[(df.transform_name == "noop")] blackpad = df[(df.transform_name == "blackpad")] slideshow = df[(df.transform_name == "slideshow")] # We should have dropped two hashes from the beginning # on the clipped video. assert len(clip1s) == len(noop) - 2 # The first hash from the clipped video should be the # same as the third hash from the original np.testing.assert_allclose( convert_hash_string_to_vector(clip1s.hash.iloc[0]), convert_hash_string_to_vector(noop.hash.iloc[2]), rtol=0.2, ) # The black padding adds four hashes (two on either side). 
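# (The padding transform adds one second of black at each end and the hasher runs at two frames per second, hence two extra hashes per side.)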
assert len(blackpad) == len(noop) + 4 # A black frame should yield all zeros for PHash assert phash_framewise_hasher.string_to_vector(blackpad.iloc[0].hash).sum() == 0 # The slideshow hashes should be the same as the noop # hashes for every other hash. # Note: this is a weird test structure now because the original test, which was # assert (noop.hash.values[::2] == slideshow.hash.values[::2]).all() # kept failing because of 1 bit difference in 1 hash. This keeps the same # spirit, but is more complex with a little leniency. We suspect the difference is # due to some versioning. So it might be worthwhile to try replacing the test with the # original one occasionally. noop_hash_vectors = [ convert_hash_string_to_vector(h) for h in noop.hash.values[::2] ] slideshow_hash_vectors = [ convert_hash_string_to_vector(h) for h in slideshow.hash.values[::2] ] total_missed_bits = 0 for noop_vector, slideshow_vector in zip(noop_hash_vectors, slideshow_hash_vectors): for n in range(0, len(noop_vector)): if noop_vector[n] != slideshow_vector[n]: total_missed_bits += 1 assert total_missed_bits <= 4 # Every second hash in the slideshow should be the same as the # previous one. for n in range(0, 10, 2): assert slideshow.hash.values[n] == slideshow.hash.values[n + 1] def test_euclidean_extension(): # This function plainly implements the process of computing # the closest positive and negative examples and their indexes. def compute_euclidean_metrics_py(X_noop, X_transformed, mask): distance_matrix = spatial.distance.cdist( XA=X_transformed, XB=X_noop, metric="euclidean" ) pos = np.ma.masked_array(distance_matrix, np.logical_not(mask)) neg = np.ma.masked_array(distance_matrix, mask) distances = np.concatenate( [neg.min(axis=1).data[np.newaxis], pos.min(axis=1).data[np.newaxis]], axis=0 ).T indexes = np.concatenate( [ neg.argmin(axis=1)[np.newaxis], pos.argmin(axis=1)[np.newaxis], ] ).T return distances, indexes X_noop = np.random.uniform(low=0, high=255, size=(5, 144)).astype("int32") X_trans = np.random.uniform(low=0, high=255, size=(10, 144)).astype("int32") mask = np.array([True, False] * 5 * 5).reshape(10, 5) distances, indexes = benchmarking.common.extensions.compute_euclidean_metrics( X_noop, X_trans, mask ) distances_py, indexes_py = compute_euclidean_metrics_py(X_noop, X_trans, mask) assert (indexes_py == indexes).all() np.testing.assert_allclose(distances, distances_py) ================================================ FILE: tests/test_hashers.py ================================================ import os import string import pytest from perception import hashers, testing from perception.hashers.image.pdq import PDQHash TEST_IMAGES = [os.path.join("tests", "images", f"image{n}.jpg") for n in range(1, 11)] # The PDQ hash isometric computation is inexact. See # https://github.com/faustomorales/pdqhash-python/blob/master/tests/test_compute.py # for details.
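# That inexactness is why PDQHash gets a looser transform_threshold (0.15 instead of 0.1) in the parametrize below.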
@pytest.mark.parametrize( "hasher_class,pil_opencv_threshold,transform_threshold,opencv_hasher", [ (hashers.AverageHash, 0.1, 0.1, False), (hashers.WaveletHash, 0.1, 0.1, False), (hashers.PHash, 0.1, 0.1, False), (PDQHash, 0.1, 0.15, False), (hashers.DHash, 0.1, 0.1, False), (hashers.MarrHildreth, 0.1, 0.1, True), (hashers.BlockMean, 0.1, 0.1, True), (hashers.ColorMoment, 10, 0.1, True), ], ) def test_image_hashing_common( hasher_class, pil_opencv_threshold, transform_threshold, opencv_hasher ): testing.test_image_hasher_integrity( hasher=hasher_class(), pil_opencv_threshold=pil_opencv_threshold, transform_threshold=transform_threshold, opencv_hasher=opencv_hasher, ) def test_video_hashing_common(): testing.test_video_hasher_integrity( hasher=hashers.FramewiseHasher( frame_hasher=hashers.PHash(hash_size=16), interframe_threshold=0.1, frames_per_second=1, ) ) def test_video_reading(): # We should get one red, one green, and one blue frame for frame, _, timestamp in hashers.tools.read_video( filepath="perception/testing/videos/rgb.m4v", frames_per_second=0.5 ): assert timestamp in [0.0, 2.0, 4.0] channel = int(timestamp / 2) assert frame[:, :, channel].min() > 220 for other in [0, 1, 2]: if other == channel: continue assert frame[:, :, other].max() < 20 def test_common_framerate(): assert hashers.tools.get_common_framerates( dict(zip(["a", "b", "c"], [1 / 3, 1 / 2, 1 / 5])) ) == {1.0: ("a", "b", "c")} assert hashers.tools.get_common_framerates( dict(zip(["a", "b", "c"], [1 / 3, 1 / 6, 1 / 9])) ) == {1 / 3: ("a", "b", "c")} assert hashers.tools.get_common_framerates( dict(zip(["a", "b", "c", "d", "e"], [1 / 3, 1 / 2, 1 / 5, 1 / 7, 1 / 11])) ) == {1.0: ("a", "b", "c", "d", "e")} assert hashers.tools.get_common_framerates( dict(zip(string.ascii_lowercase[:6], [10, 5, 3, 1 / 3, 1 / 6, 1 / 9])) ) == {3.0: ("c", "d", "e", "f"), 10.0: ("a", "b")} assert hashers.tools.get_common_framerates(dict(zip(["a", "b"], [100, 1]))) == { 100: ("a", "b") } def test_synchronized_hashing(): video_hashers = { "phashframewise": hashers.FramewiseHasher( frame_hasher=hashers.PHash(hash_size=16), frames_per_second=1, interframe_threshold=0.2, ), "tmkl2": hashers.TMKL2(frames_per_second=15), "tmkl1": hashers.TMKL1(frames_per_second=15), } for filepath in [ "perception/testing/videos/v1.m4v", "perception/testing/videos/v2.m4v", ]: # Ensure synchronized hashing hashes1 = { hasher_name: hasher.compute(filepath) for hasher_name, hasher in video_hashers.items() } hashes2 = hashers.tools.compute_synchronized_video_hashes( filepath=filepath, hashers=video_hashers ) assert hashes1 == hashes2 def test_hex_b64_conversion(): b64_string = """ CFFRABrAaRKCDQigEBIGwAhNBdIISgVZBxQYAgP4fwYNUR0oBgYCPwwIDSqTAmIH FRQhCiT/IT9DpHIeIx4cA2hQcBTwISovFkspMxz/MzdnljeCOEs4LnBYNHHBMC4x EC8mPxLaLkI/dywmNk1lMXoqJyCLSyg7BxwRSgTmIlI/LwsrP04hTCMtBSxaGAFB """.replace("\n", "").replace(" ", "").strip() hex_string = """ 085151001ac06912820d08a0101206c0084d05d2084a05590714180203f87f06 0d511d280606023f0c080d2a930262071514210a24ff213f43a4721e231e1c03 68507014f0212a2f164b29331cff333767963782384b382e70583471c1302e31 102f263f12da2e423f772c26364d65317a2a27208b4b283b071c114a04e62252 3f2f0b2b3f4e214c232d052c5a180141 """.replace("\n", "").replace(" ", "").strip() assert ( hashers.tools.hex_to_b64(hex_string, dtype="uint8", hash_length=144) == b64_string ) assert ( hashers.tools.b64_to_hex(b64_string, dtype="uint8", hash_length=144) == hex_string ) ================================================ FILE: tests/test_local_descriptor_deduplication.py 
================================================ import os import tempfile import albumentations import cv2 import pandas as pd import pytest import perception.benchmarking.image as pb import perception.benchmarking.image_transforms as pbit import perception.approximate_deduplication as ad import perception.local_descriptor_deduplication as ldd import perception.hashers.tools as pht import perception.testing as pt from perception.approximate_deduplication.debug import vizualize_pair # Params for object level matching. OBJECT_MATCH_PARAMS = { "strong_match_threshold": 0.3, # Ideally something close to 95% precision. "ratio": 0.5, "coarse_pct_probe": 0.1, "minimum_coarse_overlap": 0.001, "coarse_threshold": 100.0, "minimum_validation_match": 0.04, "minimum_validation_intersection": 0.04, "minimum_validation_inliers": 6, } @pytest.mark.parametrize("hasher", [ldd.SIFT(), ldd.AKAZE()]) def test_deduplication(hasher): tdir = tempfile.TemporaryDirectory() watermark = cv2.cvtColor( cv2.imread(pt.DEFAULT_TEST_LOGOS[0], cv2.IMREAD_UNCHANGED), cv2.COLOR_BGRA2RGBA ) transformed = pb.BenchmarkImageDataset.from_tuples( files=[(filepath, "test") for filepath in pt.DEFAULT_TEST_IMAGES] ).transform( transforms={ "noop": albumentations.NoOp(p=1), "pad": albumentations.CropAndPad(percent=0.1, p=1), "crop": albumentations.CropAndPad(percent=-0.1, p=1), "watermark": pbit.apply_watermark(watermark, alpha=1, size=0.8), # type: ignore }, storage_dir=tdir.name, ) df = transformed._df.set_index("filepath") pairs = ldd.deduplicate( filepaths_or_reference_df=df.index, max_workers=2, hasher=hasher ) # Test throws errors if unset. clustered = ( pd.DataFrame( ad.pairs_to_clusters(ids=df.index, pairs=pairs, strictness="component") ) .set_index("id") .merge(df, left_index=True, right_index=True) .reset_index() ) n_clusters = clustered["cluster"].nunique() n_transforms = clustered["transform_name"].nunique() perfect = ( clustered.groupby("cluster") .apply( lambda g: g["guid"].nunique() == 1 and g["transform_name"].nunique() == n_transforms ) .sum() ) tainted = clustered.groupby("cluster")["guid"].nunique().gt(1).sum() pct_perfect = perfect / n_clusters pct_tainted = tainted / n_clusters assert pct_tainted == 0 assert pct_perfect > 0.1 @pytest.mark.parametrize("hasher", [ldd.SIFT(), ldd.AKAZE()]) def test_deduplication_across_sets(hasher): tdir = tempfile.TemporaryDirectory() watermark = cv2.cvtColor( cv2.imread(pt.DEFAULT_TEST_LOGOS[0], cv2.IMREAD_UNCHANGED), cv2.COLOR_BGRA2RGBA ) transformed = pb.BenchmarkImageDataset.from_tuples( files=[(filepath, "test") for filepath in pt.DEFAULT_TEST_IMAGES] ).transform( transforms={ "noop": albumentations.NoOp(p=1), "pad": albumentations.CropAndPad(percent=0.1, p=1), "crop": albumentations.CropAndPad(percent=0.1, p=1), "watermark": pbit.apply_watermark(watermark, alpha=1, size=0.8), # type: ignore }, storage_dir=tdir.name, ) df = transformed._df.set_index("filepath") query_images = list(df[df.transform_name == "noop"].index.values) images_to_match_to = list(df[~(df.transform_name == "noop")].index.values) pairs = ldd.deduplicate( filepaths_or_reference_df=images_to_match_to, query_filepaths_or_df=query_images, max_workers=2, hasher=hasher, ) # Test throws errors if unset. assert len(pairs) >= 20, "Wrong # of pairs."
@pytest.mark.parametrize("hasher", [ldd.SIFT(), ldd.AKAZE()])
def test_deduplication_across_sets(hasher):
    tdir = tempfile.TemporaryDirectory()
    watermark = cv2.cvtColor(
        cv2.imread(pt.DEFAULT_TEST_LOGOS[0], cv2.IMREAD_UNCHANGED), cv2.COLOR_BGRA2RGBA
    )
    transformed = pb.BenchmarkImageDataset.from_tuples(
        files=[(filepath, "test") for filepath in pt.DEFAULT_TEST_IMAGES]
    ).transform(
        transforms={
            "noop": albumentations.NoOp(p=1),
            "pad": albumentations.CropAndPad(percent=0.1, p=1),
            "crop": albumentations.CropAndPad(percent=-0.1, p=1),
            "watermark": pbit.apply_watermark(watermark, alpha=1, size=0.8),  # type: ignore
        },
        storage_dir=tdir.name,
    )
    df = transformed._df.set_index("filepath")
    query_images = list(df[df.transform_name == "noop"].index.values)
    images_to_match_to = list(df[~(df.transform_name == "noop")].index.values)
    pairs = ldd.deduplicate(
        filepaths_or_reference_df=images_to_match_to,
        query_filepaths_or_df=query_images,
        max_workers=2,
        hasher=hasher,
    )  # Test throws errors if unset.
    assert len(pairs) >= 20, "Wrong # of pairs."
    only_one_noop = [p for p in pairs if (("noop" in p[0]) != ("noop" in p[1]))]
    assert len(only_one_noop) == len(
        pairs
    ), "All pairs must be between a noop and non-noop file"


@pytest.mark.parametrize("hasher", [ldd.SIFT(), ldd.AKAZE()])
def test_validation_for_overlapping_case(hasher):
    tdir = tempfile.TemporaryDirectory()
    # Each image will have the center of the other
    # pasted in the top left corner.
    image1 = pht.read(pt.DEFAULT_TEST_IMAGES[0])
    image2 = pht.read(pt.DEFAULT_TEST_IMAGES[1])
    image1[:100, :100] = image2[100:200, 100:200]
    image2[:100, :100] = image1[100:200, 100:200]
    fp1 = os.path.join(tdir.name, "test1.jpg")
    fp2 = os.path.join(tdir.name, "test2.jpg")
    cv2.imwrite(fp1, image1[..., ::-1])
    cv2.imwrite(fp2, image2[..., ::-1])
    descriptor1 = ldd.generate_image_descriptors(fp1, hasher)
    descriptor2 = ldd.generate_image_descriptors(fp2, hasher)
    assert descriptor1 is not None
    assert descriptor2 is not None
    # These images should not match.
    assert not hasher.validate_match(descriptor1=descriptor1, descriptor2=descriptor2)[
        0
    ]


@pytest.mark.parametrize("hasher", [ldd.SIFT(), ldd.AKAZE()])
def test_handling_bad_file_case(caplog, hasher):
    tdir = tempfile.TemporaryDirectory()
    missing_file = os.path.join(tdir.name, "missing-file")
    bad_file_handle = tempfile.NamedTemporaryFile()
    bad_file = bad_file_handle.name
    transformed = pb.BenchmarkImageDataset.from_tuples(
        files=[(filepath, "test") for filepath in pt.DEFAULT_TEST_IMAGES]
    ).transform(
        transforms={
            "noop": lambda image: image,
        },
        storage_dir=tdir.name,
    )
    df = transformed._df.set_index("filepath")
    df.loc[missing_file] = df.iloc[0]
    df.loc[bad_file] = df.iloc[0]
    pairs = ldd.deduplicate(filepaths_or_reference_df=df.index, hasher=hasher)
    clustered = (
        pd.DataFrame(
            ad.pairs_to_clusters(ids=df.index, pairs=pairs, strictness="component")
        )
        .set_index("id")
        .merge(df, left_index=True, right_index=True)
        .reset_index()
    )
    assert bad_file not in clustered.index
    assert missing_file not in clustered.index
    bad_file_error = next(
        record for record in caplog.records if bad_file in record.message
    )
    assert bad_file_error
    assert bad_file_error.levelname == "ERROR"
    missing_file_warning = next(
        record for record in caplog.records if missing_file in record.message
    )
    assert missing_file_warning
    assert missing_file_warning.levelname == "WARNING"


def test_handling_hasher_mismatch():
    tdir = tempfile.TemporaryDirectory()
    transformed = pb.BenchmarkImageDataset.from_tuples(
        files=[(filepath, "test") for filepath in pt.DEFAULT_TEST_IMAGES]
    ).transform(
        transforms={
            "noop": lambda image: image,
        },
        storage_dir=tdir.name,
    )
    df = transformed._df.set_index("filepath")
    reference_df = ldd.build_reference_df(filepaths=df.index, hasher=ldd.SIFT())
    query_df = ldd.build_reference_df(filepaths=df.index, hasher=ldd.AKAZE())
    with pytest.raises(AssertionError):
        ldd.deduplicate(reference_df, query_df)
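
# Explanatory note (general properties of these descriptor types, not a claim
# about perception internals): SIFT produces 128-dimensional float vectors
# while AKAZE produces compact binary descriptors, so features built with one
# hasher are not comparable to features built with the other. deduplicate
# asserts that the reference and query frames were built with the same hasher
# rather than silently matching across incompatible descriptor spaces.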
def test_viz_pair():
    object_sift = ldd.SIFT(
        max_features=256,
        ratio=OBJECT_MATCH_PARAMS["ratio"],
        threshold=OBJECT_MATCH_PARAMS["coarse_threshold"],
        overlap=OBJECT_MATCH_PARAMS["minimum_coarse_overlap"],
        validation_match=OBJECT_MATCH_PARAMS["minimum_validation_match"],
        validation_inliers=OBJECT_MATCH_PARAMS["minimum_validation_inliers"],
        validation_intersection=OBJECT_MATCH_PARAMS["minimum_validation_intersection"],
    )
    filepaths = [
        "tests/images/chair.png",
        "tests/images/chair3.png",
        "tests/images/chair-square.png",
        "tests/images/chair-tall.png",
    ]
    reference_df = ldd.build_reference_df(
        filepaths=filepaths,
        hasher=object_sift,
        min_features=10,
        max_size=1000,
        show_progress=False,
    )
    pairs = ldd.deduplicate(
        filepaths_or_reference_df=reference_df,
        hasher=object_sift,
        max_size=1000,
        min_features=10,
        verbose=True,
    )
    row = pairs[0]
    viz_img = vizualize_pair(
        reference_df.loc[row[0]],
        reference_df.loc[row[1]],
        0.5,
        match_metadata=row[2],
        sanitized=False,
    )
    viz_img = cv2.cvtColor(viz_img, cv2.COLOR_RGB2BGR)
    cv2.imwrite("tests/images/debug-image.png", viz_img)


def test_viz_pair_symmetry():
    # This test catches a regression where, if the smaller image was the
    # query, LDD would swap points during distance calculation but not
    # unswap them before returning.
    object_sift = ldd.SIFT(
        max_features=256,
        ratio=OBJECT_MATCH_PARAMS["ratio"],
        threshold=OBJECT_MATCH_PARAMS["coarse_threshold"],
        overlap=OBJECT_MATCH_PARAMS["minimum_coarse_overlap"],
        validation_match=OBJECT_MATCH_PARAMS["minimum_validation_match"],
        validation_inliers=OBJECT_MATCH_PARAMS["minimum_validation_inliers"],
        validation_intersection=OBJECT_MATCH_PARAMS["minimum_validation_intersection"],
    )
    filepaths = [
        "tests/images/chair.png",
        "tests/images/chair3.png",
    ]
    reference_df = ldd.build_reference_df(
        filepaths=filepaths,
        hasher=object_sift,
        min_features=10,
        max_size=1000,
        show_progress=False,
    )
    pairs = ldd.deduplicate(
        filepaths_or_reference_df=filepaths[:1],
        query_filepaths_or_df=filepaths[1:],
        hasher=object_sift,
        max_size=1000,
        min_features=10,
        verbose=True,
    )
    row = pairs[0]
    viz_img = vizualize_pair(
        reference_df.loc[row[0]],
        reference_df.loc[row[1]],
        0.5,
        match_metadata=row[2],
        sanitized=False,
    )
    viz_img = cv2.cvtColor(viz_img, cv2.COLOR_RGB2BGR)
    cv2.imwrite("tests/images/debug-image-symmetry-1.png", viz_img)

    # Swap the order of the reference and query files.
    pairs = ldd.deduplicate(
        filepaths_or_reference_df=filepaths[1:],
        query_filepaths_or_df=filepaths[:1],
        hasher=object_sift,
        max_size=1000,
        min_features=10,
        verbose=True,
    )
    row = pairs[0]
    viz_img = vizualize_pair(
        reference_df.loc[row[0]],
        reference_df.loc[row[1]],
        0.5,
        match_metadata=row[2],
        sanitized=False,
    )
    viz_img = cv2.cvtColor(viz_img, cv2.COLOR_RGB2BGR)
    cv2.imwrite("tests/images/debug-image-symmetry-2.png", viz_img)


================================================
FILE: tests/test_tmk.py
================================================
import gzip
import json
import platform
from pathlib import Path
from typing import cast

import numpy as np
import pytest

from perception.hashers.video import tmk

TEST_FILES = Path("perception") / "testing" / "videos"


def test_tmk_parity():
    if platform.machine() == "arm64":
        pytest.xfail("TMK is not supported on ARM64")
    hasher = tmk.TMKL2()
    with gzip.open(TEST_FILES / "expected_tmk.json.gz", "rt", encoding="utf8") as f:
        expected_output = json.load(f)
    expected_output = {k: np.array(v) for k, v in expected_output.items()}
    output = []
    for filepath in [
        "perception/testing/videos/v1.m4v",
        "perception/testing/videos/v2.m4v",
    ]:
        hash_value: np.ndarray = cast(
            np.ndarray, hasher.compute(filepath=filepath, hash_format="vector")
        )
        output.append(hash_value.reshape((4, 64, -1)))

    # Verify the hashes are the same.
    for o, t in zip(output, expected_output["hashes"]):
        np.testing.assert_allclose(o.reshape(*t.shape), t)

    # Verify the pair-wise scores are the same.
    offsets = np.arange(-5, 5)
    for normalization in ["feat", "feat_freq", "matrix"]:
        score = hasher._score_pair(
            output[0], output[1], offsets=offsets, normalization=normalization
        )
        np.testing.assert_allclose(score, expected_output[normalization])
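
# Conceptual sketch only: this is not hasher._score_pair or the TMK
# normalizations exercised above, just an illustration of the general idea of
# scoring two temporal feature arrays under a set of circular time offsets
# and keeping the best-aligned cosine similarity.
def _best_offset_similarity(a: np.ndarray, b: np.ndarray, offsets) -> float:
    scores = []
    for k in offsets:
        shifted = np.roll(b, int(k), axis=-1)  # shift along the time axis
        denom = float(np.linalg.norm(a) * np.linalg.norm(shifted))
        scores.append(float((a * shifted).sum()) / denom if denom else 0.0)
    return max(scores)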

================================================
FILE: tests/test_tools.py
================================================
import io
import os
import shutil
import tempfile

import numpy as np
import pytest

from perception import hashers, testing, tools


def test_deduplicate():
    directory = tempfile.TemporaryDirectory()
    original = testing.DEFAULT_TEST_IMAGES[0]
    duplicate = os.path.join(directory.name, "image1.jpg")
    shutil.copy(original, duplicate)
    pairs = tools.deduplicate(
        files=[
            testing.DEFAULT_TEST_IMAGES[0],
            testing.DEFAULT_TEST_IMAGES[1],
            duplicate,
        ],
        hashers=[(hashers.PHash(hash_size=16), 0.25)],
    )
    assert len(pairs) == 1
    file1, file2 = pairs[0]
    assert ((file1 == duplicate) and (file2 == original)) or (
        (file1 == original) and (file2 == duplicate)
    )


def test_deduplicate_u8():
    # This test verifies that extensions.compute_euclidean_pairwise_duplicates
    # works properly.
    directory = tempfile.TemporaryDirectory()
    original = testing.DEFAULT_TEST_IMAGES[0]
    duplicate = os.path.join(directory.name, "image1.jpg")
    shutil.copy(original, duplicate)
    pairs = tools.deduplicate(
        files=[
            testing.DEFAULT_TEST_IMAGES[0],
            testing.DEFAULT_TEST_IMAGES[1],
            duplicate,
        ],
        hashers=[(hashers.PHashU8(hash_size=16), 10)],
    )
    assert len(pairs) == 1
    file1, file2 = pairs[0]
    assert ((file1 == duplicate) and (file2 == original)) or (
        (file1 == original) and (file2 == duplicate)
    )


def test_deduplicate_hashes_multiple():
    # This test verifies that deduplicate_hashes functions properly
    # when there is more than one hash for a file.
    directory = tempfile.TemporaryDirectory()
    original = testing.DEFAULT_TEST_IMAGES[0]
    duplicate = os.path.join(directory.name, "image1.jpg")
    hasher = hashers.PHashU8(hash_size=16)
    shutil.copy(original, duplicate)
    hashes = [
        (0, hasher.compute(original)),
        (1, hasher.compute(duplicate)),
        (1, hasher.compute(duplicate)),
        (1, hasher.compute(duplicate)),
        (2, hasher.compute(testing.DEFAULT_TEST_IMAGES[1])),
    ]
    pairs = tools.deduplicate_hashes(
        hashes=hashes,  # type: ignore[arg-type]
        threshold=10,
        hash_format="base64",
        hash_length=hasher.hash_length,
        distance_metric="euclidean",
        hash_dtype="uint8",
    )
    assert len(pairs) == 1
    file1, file2 = pairs[0]
    assert ((file1 == 0) and (file2 == 1)) or ((file1 == 1) and (file2 == 0))
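
# Toy dense sketch (not the Cython extension exercised below) of the pairwise
# euclidean duplicate check: two hash vectors count as duplicates when the
# euclidean distance between them is at most `threshold`.
def _pairwise_duplicates_sketch(X: np.ndarray, threshold: float):
    distances = np.linalg.norm(
        X[:, None, :].astype("float64") - X[None, :, :].astype("float64"), axis=-1
    )
    i, j = np.where(np.triu(distances <= threshold, k=1))
    return list(zip(i.tolist(), j.tolist()))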
def test_compute_euclidean_pairwise_duplicates():
    # The purpose of this test is to verify that deduplication with files
    # that have multiple hashes works properly. This is particularly
    # important for video, where we are likely to have many hashes per file.
    X = np.array(
        [
            # File 1
            [0, 0, 0],
            [1, 1, 1],
            [2, 2, 2],
            # File 2
            [1, 1, 1],
            [2, 2, 2],
            [3, 3, 3],
            # File 3
            [3, 3, 3],
            [4, 4, 4],
            # File 4
            [5, 5, 5],
            [6, 6, 6],
        ]
    )
    # Use grouped files. With threshold 1, files 1 and 2 share two of their
    # three hashes ([1, 1, 1] and [2, 2, 2]), giving an overlap of 2/3 on
    # each side; files 2 and 3 share only [3, 3, 3], giving 1/3 and 1/2.
    counts = np.array([3, 3, 2, 2])
    expected = np.array(
        [[2 / 3, 2 / 3], [0, 0], [0, 0], [1 / 3, 1 / 2], [0, 0], [0, 0]]
    )
    actual = tools.extensions.compute_euclidean_pairwise_duplicates(
        X=X.astype("int32"),
        threshold=1,
        counts=counts.astype("uint32"),
        compute_overlap=True,
    )
    assert (expected == actual).all()

    # Use without computing overlap: raw match counts instead of fractions.
    expected = np.array([[2, 2], [0, 0], [0, 0], [1, 1], [0, 0], [0, 0]])
    actual = tools.extensions.compute_euclidean_pairwise_duplicates(
        X=X.astype("int32"),
        threshold=1,
        counts=counts.astype("uint32"),
        compute_overlap=False,
    )
    assert (expected == actual).all()

    # Use ungrouped files.
    X = np.array(
        [
            # Each row is treated as its own file.
            [0, 0, 0],
            [1, 1, 1],
            [2, 2, 2],
            [1, 1, 1],
        ]
    )
    expected = np.array([[0, 0], [0, 0], [0, 0], [0, 0], [1, 1], [0, 0]])
    actual = tools.extensions.compute_euclidean_pairwise_duplicates(
        X=X.astype("int32"), threshold=1, compute_overlap=True
    )
    assert (expected == actual).all()


def test_api_is_over_https():
    matcher_https = tools.SaferMatcher(api_key="foo", url="https://www.example.com/")
    assert matcher_https

    if "SAFER_MATCHING_SERVICE_DEV_ALLOW_HTTP" in os.environ:
        del os.environ["SAFER_MATCHING_SERVICE_DEV_ALLOW_HTTP"]
    with pytest.raises(ValueError):
        tools.SaferMatcher(api_key="foo", url="http://www.example.com/")

    os.environ["SAFER_MATCHING_SERVICE_DEV_ALLOW_HTTP"] = "1"
    matcher_http_with_escape_hatch = tools.SaferMatcher(
        api_key="foo", url="http://www.example.com/"
    )
    assert matcher_http_with_escape_hatch


def test_unletterbox():
    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    padded = np.zeros((image.shape[0] + 100, image.shape[1] + 50, 3), dtype="uint8")
    padded[50 : 50 + image.shape[0], 25 : 25 + image.shape[1]] = image
    result = hashers.tools.unletterbox(padded)
    assert result is not None
    (x1, x2), (y1, y2) = result
    assert y1 == 50
    assert y2 == 50 + image.shape[0]
    assert x1 == 25
    assert x2 == 25 + image.shape[1]


def test_unletterbox_crop():
    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    padded = np.zeros((image.shape[0] + 100, image.shape[1] + 50, 3), dtype="uint8")
    padded[50 : 50 + image.shape[0], 25 : 25 + image.shape[1]] = image
    cropped_image = hashers.tools.unletterbox_crop(padded)
    assert cropped_image is not None
    assert image.shape[0] == cropped_image.shape[0]
    assert image.shape[1] == cropped_image.shape[1]


def test_unletterbox_crop_meaningful_pixels():
    """Test the value of .5 min_fraction_meaningful_pixels in unletterbox_crop()."""
    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    h, w, _ = image.shape
    # Make a tall, skinny image with so much padding around the content
    # that it falls below the min_fraction_meaningful_pixels threshold.
    padding_size = int(5 * h)
    padded = np.r_[
        np.zeros((padding_size, w, 3)), image, np.zeros((padding_size, w, 3))
    ]
    assert None is hashers.tools.unletterbox_crop(
        padded, min_fraction_meaningful_pixels=0.5
    )
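
# Toy illustration (not hashers.tools.unletterbox itself) of the core idea
# the tests above and below rely on: find the bounding box of rows and
# columns that contain any non-black pixels.
def _nonblack_bounds_sketch(image: np.ndarray, tol: int = 2):
    mask = (image > tol).any(axis=-1)  # True wherever any channel is non-black
    ys = np.where(mask.any(axis=1))[0]
    xs = np.where(mask.any(axis=0))[0]
    if len(xs) == 0 or len(ys) == 0:
        return None
    return (int(xs[0]), int(xs[-1]) + 1), (int(ys[0]), int(ys[-1]) + 1)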
def test_unletterbox_color():
    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    padded = np.zeros((image.shape[0] + 100, image.shape[1] + 50, 3), dtype="uint8")
    padded[:, :] = (200, 0, 200)
    padded[50 : 50 + image.shape[0], 25 : 25 + image.shape[1]] = image

    # Should not unletterbox, since the padding is not black.
    results = hashers.tools.unletterbox(padded, only_remove_black=True)
    assert results is not None
    (x1, x2), (y1, y2) = results
    assert y1 == 0
    assert y2 == padded.shape[0]
    assert x1 == 0
    assert x2 == padded.shape[1]

    # Should unletterbox the colored padding:
    results = hashers.tools.unletterbox(padded, only_remove_black=False)
    assert results is not None
    (x1, x2), (y1, y2) = results
    assert y1 == 50
    assert y2 == 50 + image.shape[0]
    assert x1 == 25
    assert x2 == 25 + image.shape[1]


def test_unletterbox_aspect_ratio():
    """Test the value of .1 in unletterbox()."""
    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    h, w, _ = image.shape
    # Make tall, skinny images with non-trivial content just below and
    # just above the 10% threshold.
    base = int(4.5 * h)  # 2 * base + h = 100%
    h_fail, h_pass = base + 10, base - 10

    padded = np.r_[np.zeros((h_fail, w, 3)), image, np.zeros((h_fail, w, 3))]
    assert None is hashers.tools.unletterbox(padded)

    padded = np.r_[np.zeros((h_pass, w, 3)), image, np.zeros((h_pass, w, 3))]
    results = hashers.tools.unletterbox(padded)
    assert results is not None
    (x1, x2), (y1, y2) = results
    assert y1 == h_pass
    assert y2 == h_pass + image.shape[0]
    assert x1 == 0
    assert x2 == image.shape[1]


def test_unletterbox_noblackbars():
    image = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    results = hashers.tools.unletterbox(image)
    assert results is not None
    (x1, x2), (y1, y2) = results
    assert x1 == 0
    assert y1 == 0
    assert x2 == image.shape[1]
    assert y2 == image.shape[0]


def test_ffmpeg_video():
    """Check that the FFMPEG video parsing code provides substantially similar
    results to the OpenCV approach (which also uses FFMPEG under the hood but
    has different frame selection logic)."""
    frames_per_second = 2.3
    for filepath in testing.DEFAULT_TEST_VIDEOS:
        filename = os.path.basename(filepath)
        for (frame1, index1, timestamp1), (frame2, index2, timestamp2) in zip(
            hashers.tools.read_video_to_generator_ffmpeg(
                filepath, frames_per_second=frames_per_second
            ),
            hashers.tools.read_video_to_generator(
                filepath, frames_per_second=frames_per_second
            ),
        ):
            # Cast to int32 before subtracting; uint8 arithmetic would wrap
            # around and inflate the difference.
            diff = np.abs(frame1.astype("int32") - frame2.astype("int32")).flatten()
            assert index1 == index2, f"Index mismatch for {filename}"
            np.testing.assert_allclose(
                timestamp1, timestamp2, err_msg=f"Timestamp mismatch for {filename}"
            )
            assert np.percentile(diff, 75) < 25, f"Frame mismatch for {filename}"


def test_videos_with_extra_channels():
    frames_per_second = 1
    test_videos = [
        "perception/testing/videos/extra_channel_attached_pic.mp4",
        "perception/testing/videos/extra_channel_attached_pic_audio.mp4",
    ]
    expected_frames = 10
    for filepath in test_videos:
        filename = os.path.basename(filepath)
        frame_count = 0
        for frame1, index1, timestamp1 in hashers.tools.read_video_to_generator_ffmpeg(
            filepath, frames_per_second=frames_per_second
        ):
            frame_count += 1
        assert frame_count == expected_frames, f"Frame count mismatch for {filename}"


def test_image_input_types():
    image_expected = hashers.tools.read(testing.DEFAULT_TEST_IMAGES[0])
    with open(testing.DEFAULT_TEST_IMAGES[0], "rb") as f:
        image_data = f.read()

    image_bytes_io = hashers.tools.read(io.BytesIO(image_data))
    assert (image_expected == image_bytes_io).all()

    with tempfile.SpooledTemporaryFile() as f:
        f.write(image_data)
        f.seek(0)
        image_tempfile = hashers.tools.read(f)
    assert (image_expected == image_tempfile).all()
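
# Sketch of the input-handling pattern test_image_input_types exercises;
# illustrative only (hashers.tools.read is the real implementation and its
# internals may differ): decode an image from a path, raw bytes, or a
# file-like object, returning RGB to match the convention the tests rely on.
def _read_sketch(handle):
    import cv2

    if isinstance(handle, str):
        with open(handle, "rb") as f:
            data = f.read()
    elif hasattr(handle, "read"):
        data = handle.read()
    else:
        data = handle
    image = cv2.imdecode(np.frombuffer(data, dtype="uint8"), cv2.IMREAD_COLOR)
    return image[..., ::-1]  # OpenCV decodes to BGR; flip to RGB.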