Repository: thorn-oss/perception
Branch: main
Commit: b17dcb841435
Files: 75
Total size: 366.2 KB
Directory structure:
gitextract_4qwyzu2o/
├── .dockerignore
├── .git-blame-ignore-revs
├── .gitattributes
├── .github/
│   ├── dependabot.yaml
│   └── workflows/
│       ├── ci.yaml
│       ├── gh-pages.yaml
│       └── release.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── build.py
├── docs/
│   ├── api/
│   │   ├── benchmarking.rst
│   │   ├── hashers.rst
│   │   ├── index.rst
│   │   └── tools.rst
│   ├── conf.py
│   ├── examples/
│   │   ├── benchmarking.rst
│   │   ├── deduplication.rst
│   │   ├── detecting_csam.rst
│   │   └── index.rst
│   ├── index.rst
│   └── requirements.txt
├── perception/
│   ├── __init__.py
│   ├── approximate_deduplication/
│   │   ├── __init__.py
│   │   ├── _graph_backend.py
│   │   ├── debug.py
│   │   ├── index.py
│   │   └── serve.py
│   ├── benchmarking/
│   │   ├── __init__.py
│   │   ├── common.py
│   │   ├── extensions.pyx
│   │   ├── image.py
│   │   ├── image_transforms.py
│   │   ├── video.py
│   │   └── video_transforms.py
│   ├── extensions.pyx
│   ├── hashers/
│   │   ├── __init__.py
│   │   ├── hasher.py
│   │   ├── image/
│   │   │   ├── __init__.py
│   │   │   ├── average.py
│   │   │   ├── dhash.py
│   │   │   ├── opencv.py
│   │   │   ├── pdq.py
│   │   │   ├── phash.py
│   │   │   └── wavelet.py
│   │   ├── tools.py
│   │   └── video/
│   │       ├── __init__.py
│   │       ├── framewise.py
│   │       └── tmk.py
│   ├── local_descriptor_deduplication.py
│   ├── py.typed
│   ├── testing/
│   │   ├── __init__.py
│   │   ├── images/
│   │   │   └── README.md
│   │   ├── logos/
│   │   │   └── README.md
│   │   └── videos/
│   │       ├── README.md
│   │       ├── rgb.m4v
│   │       ├── v1.m4v
│   │       └── v2.m4v
│   ├── tools.py
│   └── utils.py
├── poetry.toml
├── pyproject.toml
├── setup.py
└── tests/
    ├── test_approximate_deduplication.py
    ├── test_benchmarking.py
    ├── test_hashers.py
    ├── test_local_descriptor_deduplication.py
    ├── test_tmk.py
    └── test_tools.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .dockerignore
================================================
notebooks
.venv/
================================================
FILE: .git-blame-ignore-revs
================================================
# Format with black
6c03f96a9335e548685ece233474125fe453c262
================================================
FILE: .gitattributes
================================================
perception/_version.py export-subst
================================================
FILE: .github/dependabot.yaml
================================================
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      # Check for updates to GitHub Actions every week.
      interval: "weekly"
================================================
FILE: .github/workflows/ci.yaml
================================================
name: ci
on:
  push:
    branches:
      - "**"
    tags-ignore:
      - v*
jobs:
  test:
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12", "3.13"]
        os: ["ubuntu-latest", "windows-latest", "macos-latest"]
    runs-on: ${{ matrix.os }}
    steps:
      - name: checkout
        uses: actions/checkout@v6
      - name: Setup Poetry
        uses: abatilo/actions-poetry@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
          cache: poetry
          cache-dependency-path: poetry.lock
      - name: Setup FFMPEG
        uses: FedericoCarboni/setup-ffmpeg@v3
        if: ${{ ! startsWith(matrix.os, 'macos') }}
      - name: Setup Dependencies with Homebrew
        if: startsWith(matrix.os, 'macos')
        run: |
          brew install llvm ffmpeg
          echo "CC=$(brew --prefix)/opt/llvm/bin/clang" >> $GITHUB_ENV
          echo "CXX=$(brew --prefix)/opt/llvm/bin/clang++" >> $GITHUB_ENV
      - name: Setup Project
        run: make init-project
      - name: Normalize OpenCV package
        run: |
          poetry run python -m pip uninstall -y opencv-python-headless
          poetry run python -m pip install --no-deps --force-reinstall opencv-contrib-python-headless
      - name: Run precommit
        run: make precommit
================================================
FILE: .github/workflows/gh-pages.yaml
================================================
name: Deploy Sphinx documentation to Pages
on:
  push:
    branches:
      - dunnack/sphinx-to-github-pages
      - main
    paths:
      - .github/workflows/gh-pages.yaml
      - docs/**
jobs:
  pages:
    runs-on: ubuntu-latest
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    permissions:
      contents: read
      pages: write
      id-token: write
    steps:
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - id: deployment
        uses: sphinx-notes/pages@v3
        with:
          checkout: false
          documentation_path: docs
          requirements_path: docs/requirements.txt
================================================
FILE: .github/workflows/release.yaml
================================================
name: release
on:
  release:
    types: [published]
  workflow_dispatch:
jobs:
  build-wheels:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12", "3.13"]
        os: ["ubuntu-latest", "windows-latest", "macos-latest"]
    name: Build for ${{ matrix.os }} on Python ${{ matrix.python-version }}
    steps:
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
      - name: Setup Poetry
        uses: abatilo/actions-poetry@v4
      - name: Setup FFMPEG
        uses: FedericoCarboni/setup-ffmpeg@v3
        if: ${{ ! startsWith(matrix.os, 'macos') }}
      - name: Setup Dependencies with Homebrew
        if: startsWith(matrix.os, 'macos')
        run: |
          brew install llvm ffmpeg
          echo "CC=$(brew --prefix)/opt/llvm/bin/clang" >> $GITHUB_ENV
          echo "CXX=$(brew --prefix)/opt/llvm/bin/clang++" >> $GITHUB_ENV
      - uses: actions/checkout@v6
        with:
          # Full clone for version calculation
          fetch-depth: 0
          fetch-tags: true
          ref: ${{ github.event_name == 'release' && format('refs/tags/{0}', github.event.release.tag_name) || github.ref }}
      - name: Build Project
        run: make build-wheel
      - uses: actions/upload-artifact@v7
        with:
          name: package-wheels-${{ matrix.os }}-${{ matrix.python-version }}
          path: dist/*
  build-sdist:
    runs-on: ubuntu-latest
    name: Build sdist
    steps:
      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.13"
      - name: Setup Poetry
        uses: abatilo/actions-poetry@v4
      - uses: actions/checkout@v6
        with:
          # Full clone for version calculation
          fetch-depth: 0
          fetch-tags: true
          ref: ${{ github.event_name == 'release' && format('refs/tags/{0}', github.event.release.tag_name) || github.ref }}
      - name: Build Project
        run: make build-sdist
      - uses: actions/upload-artifact@v7
        with:
          name: package-sdist
          path: dist/*
  publish:
    needs: [build-wheels, build-sdist]
    runs-on: ubuntu-latest
    if: ${{ github.repository_owner == 'thorn-oss' && github.event_name == 'release' }}
    steps:
      - uses: actions/checkout@v6
        with:
          # Full clone for version calculation
          fetch-depth: 0
          fetch-tags: true
          ref: refs/tags/${{ github.event.release.tag_name }}
      - uses: actions/setup-python@v6
        with:
          python-version: "3.13"
      - name: Setup Poetry
        uses: abatilo/actions-poetry@v4
      - name: Setup Dynamic Versioning
        run: poetry self add "poetry-dynamic-versioning[plugin]"
      - name: Download wheels
        uses: actions/download-artifact@v8
        with:
          path: dist
          pattern: package-*
          merge-multiple: true
      - name: Load PyPI Token
        uses: 1password/load-secrets-action@v4
        with:
          # Export loaded secrets as environment variables
          export-env: true
        env:
          OP_SERVICE_ACCOUNT_TOKEN: ${{ secrets.DATA_SCIENCE_OP_SERVICE_ACCOUNT_TOKEN }}
          POETRY_PYPI_TOKEN_PYPI: op://data-science-oss/perception-pypi-api-key/secret/value
      - name: Verify artifacts
        run: |
          mapfile -t artifacts < <(find dist -type f \( -name "*.whl" -o -name "*.tar.gz" \))
          if [ ${#artifacts[@]} -eq 0 ]; then
            echo "No artifacts found in dist"
            exit 1
          fi
          printf '%s\n' "${artifacts[@]}"
          if printf '%s\n' "${artifacts[@]}" | grep -E -- '-0\.0\.0([.-]|$)'; then
            echo "Refusing to publish placeholder version 0.0.0 artifacts"
            exit 1
          fi
      - name: Publish package
        run: poetry publish -n
================================================
FILE: .gitignore
================================================
# MacOS stuff
.DS_Store
# Python artifacts
*.egg-info
# Cache
.mypy_cache
.pytest_cache
__pycache__
.ipynb_checkpoints
dist
# Any temporary images or CSV files
notebooks
# Local environment
.venv
.python-version
# Coverage file
.coverage
# Versioneer artifacts
/versioneer.pyc
# Build artifacts
/build
# Docs build artifacts
/docs/_build
# Remove .vscode folder
.vscode
# Extension artifacts
*.c
*.cpp
*.so
debug-image*
================================================
FILE: .pre-commit-config.yaml
================================================
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
  - repo: https://github.com/psf/black
    rev: 26.3.1
    hooks:
      - id: black
        language_version: python3
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.11.13
    hooks:
      # Run the linter.
      - id: ruff
        args: [ --fix ]
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.8.0
    hooks:
      - id: mypy
================================================
FILE: .readthedocs.yaml
================================================
version: 2
# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/conf.py
formats: all
# Installs the package and the docs requirements.
python:
  version: 3.9
  install:
    - requirements: docs/requirements.txt
    - method: pip
      path: .
  system_packages: true
================================================
FILE: CHANGELOG.md
================================================
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.4.0] - 2020-10-17
This release switches from using false positive rates in benchmarking to reporting precision, which is more intuitive.
### Breaking changes
All references to fpr_threshold now refer to precision_threshold.
### Bug fixes
The PDQHash hasher now correctly returns the hash vector instead of the (vector, quality) tuple.
## [0.3.0] - 2020-04-27
This release adds significantly more support for video.
### Breaking changes
- Previously, `read_video` returned `(frame, index, timestamp)` tuples where `index` reflected the index of the yielded frame (i.e., it always increased by exactly 1). It now reflects the index of the frame in the original video. This means that, if the requested framerate is higher than the encoded video framerate, this index may repeat the same value, indicating that we have repeated the same frame.
### Enhancements
- We now include a `SimpleSceneDetection` hasher that can wrap other video hashers using scene detection.
- `compute_metrics` is much faster now for integer-valued hashes that use a euclidean distance metric.
- We now include an unsigned 8-bit integer version of `PHash`, called `PHashU8`. This provides a useful framewise hasher for averaging across frames (e.g., using TMK) while being more compact than `PHashF`.
- We include more thorough support for benchmarking video hashes.
### Bug fixes
- When using `hasher.vector_to_string` with hashers that return multiple hashes, the `hash_format` argument was not respected.
- The `compute_threshold_recall` and `show_histograms` functions did not work properly when `grouping=[]`.
## [0.2.0] - 2019-12-20
This release adds more support for hashing videos (including TMK L1 and TMK L2). As part of that, it also includes a refactor to separate `benchmarking.BenchmarkDataset` and `benchmarking.BenchmarkTransforms` into image and video variants.
## [0.1.0] - 2019-11-04
Initial release
================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to make participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies within all project spaces, and it also applies when
an individual is representing the project or its community in public spaces.
Examples of representing a project or community include using an official
project e-mail address, posting via an official social media account, or acting
as an appointed representative at an online or offline event. Representation of
a project may be further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at conduct@thorn.org. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
https://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Copyright 2019 Thorn
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MANIFEST.in
================================================
include perception/testing/images/*
include perception/testing/videos/*
include perception/testing/logos/*
include perception/**/*.pyx
include perception/*.pyx
include perception/py.typed
exclude tests/*
================================================
FILE: Makefile
================================================
TEST_SCOPE?=tests/
.PHONY: build build-wheel build-sdist verify-version init-project init test lint_check type_check format format_check precommit
init-project:
	poetry install --all-extras
init: init-project
	poetry run pre-commit install
test:
	poetry run pytest $(TEST_SCOPE)
lint_check:
	poetry run ruff check perception tests
type_check:
	poetry run mypy perception
format:
	poetry run black .
format_check:
	poetry run black --check . || (echo '\nUnexpected format.' && exit 1)
precommit:
	poetry check
	make lint_check
	make type_check
	make format_check
	make test
verify-version:
	@echo "Poetry: $$(poetry --version)"
	@echo "Poetry plugins:"
	poetry self show plugins
	@echo "Git describe: $$(git describe --tags --always)"
	@poetry self show plugins | grep -q "poetry-dynamic-versioning"
build-wheel:
	poetry run pip -q install repairwheel
	poetry self add "poetry-dynamic-versioning[plugin]"
	$(MAKE) verify-version
	poetry build --format="wheel" --output="dist-tmp"
	poetry run repairwheel -o dist dist-tmp/*.whl
	@find dist -name "*.whl" -type f | sed -n "s/\(.*\)\.linux.*\.whl$$/& \1.whl/p" | xargs -r -n 2 mv # Fix wheel name
	@rm -rf dist-tmp
build-sdist:
	poetry self add "poetry-dynamic-versioning[plugin]"
	$(MAKE) verify-version
	poetry build --format="sdist" --output="dist"
build: build-wheel build-sdist
================================================
FILE: README.md
================================================
# perception 
`perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use. See [the documentation](https://perception.thorn.engineering/en/latest/) for details.
## Background
`perception` was initially developed at [Thorn](https://www.thorn.org) as part of our work to eliminate child sexual abuse material from the internet. For more information on the issue, check out [our CEO's TED talk](https://www.thorn.org/blog/time-is-now-eliminate-csam/).
## Getting Started
### Installation
`pip install perception`
### Hashing
Hashing with different functions is simple with `perception`.
```python
from perception import hashers
file1, file2 = 'test1.jpg', 'test2.jpg'
hasher = hashers.PHash()
hash1, hash2 = hasher.compute(file1), hasher.compute(file2)
distance = hasher.compute_distance(hash1, hash2)
```
### Examples
See below for end-to-end examples for common use cases for perceptual hashes.
- [Detecting child sexual abuse material](https://perception.thorn.engineering/en/latest/examples/detecting_csam.html)
- [Deduplicating media](https://perception.thorn.engineering/en/latest/examples/deduplication.html)
- [Benchmarking perceptual hashes](https://perception.thorn.engineering/en/latest/examples/benchmarking.html)
## Supported Hashing Algorithms
`perception` currently ships with:
- pHash (DCT hash) (`perception.hashers.PHash`)
- Facebook's PDQ Hash (`perception.hashers.PDQ`)
- dHash (difference hash) (`perception.hashers.DHash`)
- aHash (average hash) (`perception.hashers.AverageHash`)
- Marr-Hildreth (`perception.hashers.MarrHildreth`)
- Color Moment (`perception.hashers.ColorMoment`)
- Block Mean (`perception.hashers.BlockMean`)
- wHash (wavelet hash) (`perception.hashers.WaveletHash`)
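
To get a feel for how the bundled hashers differ, you can run the same image pair through several of them. The snippet below is a minimal sketch using the `compute`/`compute_distance` pattern from the example above; `test1.jpg` and `test2.jpg` are placeholder filenames.

```python
from perception import hashers

# Placeholder filenames; substitute your own images.
file1, file2 = 'test1.jpg', 'test2.jpg'

candidates = {
    'phash': hashers.PHash(),
    'pdq': hashers.PDQ(),
    'dhash': hashers.DHash(),
    'ahash': hashers.AverageHash(),
    'wavelet': hashers.WaveletHash(),
}
for name, hasher in candidates.items():
    hash1, hash2 = hasher.compute(file1), hasher.compute(file2)
    # Lower distance means more similar; the absolute scale depends on the hasher.
    print(name, hasher.compute_distance(hash1, hash2))
```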
## Contributing
To work on the project, start by doing the following.
```bash
# Install local dependencies for
# code completion, etc.
make init
```

- To do a (close to) comprehensive check before committing code, you can use `make precommit`.
To implement new features, please first file an issue proposing your change for discussion.
To report problems, please file an issue with sample code, expected results, actual results, and a complete traceback.
## Alternatives
There are other packages worth checking out to see if they meet your needs for perceptual hashing. Here are some
examples.
- [dedupe](https://github.com/dedupeio/dedupe)
- [imagededup](https://idealo.github.io/imagededup/)
- [ImageHash](https://github.com/JohannesBuchner/imagehash)
- [PhotoHash](https://github.com/bunchesofdonald/photohash)
================================================
FILE: build.py
================================================
from Cython.Build import cythonize
import numpy as np

compiler_directives = {"language_level": 3, "embedsignature": True}


def build(setup_kwargs):
    setup_kwargs.update(
        {
            "ext_modules": cythonize(
                "perception/**/extensions.pyx", compiler_directives=compiler_directives
            ),
            "include_dirs": [np.get_include()],
        }
    )
================================================
FILE: docs/api/benchmarking.rst
================================================
Benchmarking
************
.. autoclass:: perception.benchmarking.BenchmarkImageDataset
   :members:
   :inherited-members:

.. autoclass:: perception.benchmarking.BenchmarkImageTransforms
   :members:
   :inherited-members:

.. autoclass:: perception.benchmarking.BenchmarkVideoDataset
   :members:
   :inherited-members:

.. autoclass:: perception.benchmarking.BenchmarkVideoTransforms
   :members:
   :inherited-members:

.. autoclass:: perception.benchmarking.BenchmarkHashes
   :members:
   :inherited-members:

Video Transforms
================

Transforming videos can be more complex, so we provide the following
tools for transforming videos.

.. automodule:: perception.benchmarking.video_transforms
   :members: get_simple_transform, get_black_frame_padding_transform, get_slideshow_transform
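
As a rough sketch of how these helpers are typically combined (the parameter values below are illustrative only and mirror the benchmarking example, not required defaults), a transform dictionary built with :code:`get_simple_transform` might look like this:

.. code-block:: python

    from perception.benchmarking import video_transforms

    # Illustrative values only. The width/height strings are ffmpeg-style size
    # expressions; clip_pct trims a fraction of the video from the start and end.
    transforms = {
        'shrink': video_transforms.get_simple_transform(
            width='ceil(0.7*iw/2)*2',
            height='ceil(0.7*ih/2)*2',
        ),
        'clip0.2': video_transforms.get_simple_transform(clip_pct=(0.2, 0.8)),
    }

See the video hashing example for a complete set of transforms, including uses of :code:`get_black_frame_padding_transform` and :code:`get_slideshow_transform`.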
================================================
FILE: docs/api/hashers.rst
================================================
Hashers
*******
All hashers inherit from the :code:`Hasher` class.
.. autoclass:: perception.hashers.hasher.Hasher
   :members:

Images
~~~~~~

All image hashers inherit from the :code:`ImageHasher` class.

.. autoclass:: perception.hashers.hasher.ImageHasher
   :members:

The following image hash functions are included in the package.

.. automodule:: perception.hashers.image
   :members:
   :imported-members:

Videos
~~~~~~

All video hashers inherit from the :code:`VideoHasher` class.

.. autoclass:: perception.hashers.hasher.VideoHasher
   :members:

The following video hash functions are included in the package.

.. automodule:: perception.hashers.video
   :members:
   :imported-members:

Tools
~~~~~

These utility functions are only used by the hashers but are documented
here for completeness.

.. automodule:: perception.hashers.tools
   :members:
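
For example, :code:`read_video` (the helper referenced in the changelog) yields :code:`(frame, frame_index, timestamp)` tuples. A minimal sketch of iterating over a file, assuming a placeholder path, looks like:

.. code-block:: python

    from perception.hashers.tools import read_video

    # 'my_video.mp4' is a placeholder path.
    for frame, frame_index, timestamp in read_video('my_video.mp4'):
        # frame is the decoded image, frame_index is its index in the
        # original video, and timestamp is its time offset.
        print(frame_index, timestamp, frame.shape)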
================================================
FILE: docs/api/index.rst
================================================
API
***
.. toctree::
   :maxdepth: 2
   :caption: Contents:

   hashers
   benchmarking
   tools
================================================
FILE: docs/api/tools.rst
================================================
Tools
*****
.. automodule:: perception.tools
   :members:
================================================
FILE: docs/conf.py
================================================
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# -- Project information -----------------------------------------------------
project = "perception"
copyright = "2019, thorn"
author = "thorn"
# The short X.Y version
version = ""
# The full version, including alpha/beta/rc tags
release = ""
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.imgmath",
    "sphinx.ext.napoleon",
    "sphinx_autodoc_typehints",
    "m2r",
]
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = ".rst"
# The master toctree document.
master_doc = "index"
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None
html_theme = "sphinx_rtd_theme"
html_theme_options = {"navigation_depth": 4, "collapse_navigation": False}
================================================
FILE: docs/examples/benchmarking.rst
================================================
Benchmarking
************
This package provides a fair amount of infrastructure for benchmarking different hashers to evaluate their performance.
Image Hashing
=============
The below example does the following:
- Download a benchmarking dataset (we provide a dataset with images that have compatible licensing for this example)
- Load the dataset. If you are using your own datasets, you may wish to call `deduplicate` on it to ensure no duplicates are included.
- Transform the dataset to generate synthetic images.
- Define a new custom hasher that we want to evaluate.
  It's not very good -- but demonstrates how you can evaluate your own custom hash functions.
- Compute all the hashes.
- Report metrics for each image category / hasher / transformation combination.
.. code-block:: python

    import os
    import glob
    import zipfile
    import urllib.request

    import cv2
    import albumentations
    import tabulate  # Optional: Only used for generating tables for the Sphinx documentation
    import numpy as np

    from perception import benchmarking, hashers
    from perception.hashers.image.pdq import PDQHash

    urllib.request.urlretrieve(
        "https://thorn-perception.s3.amazonaws.com/thorn-perceptual-benchmark-v0.zip",
        "thorn-perceptual-benchmark-v0.zip"
    )
    with zipfile.ZipFile('thorn-perceptual-benchmark-v0.zip') as f:
        f.extractall('.')

    # Load the dataset
    dataset = benchmarking.BenchmarkImageDataset.from_tuples(files=[
        (filepath, filepath.split(os.path.sep)[-2]) for filepath in glob.glob(
            os.path.join('thorn-perceptual-benchmark-v0', '**', '*.jpg')
        )
    ])

    # Define the transforms we want to use for
    # evaluating hash quality.
    def watermark(image):
        fontScale = 5
        thickness = 5
        text = "TEXT"
        fontFace = cv2.FONT_HERSHEY_SIMPLEX
        targetWidth = 0.2*image.shape[1]
        (textWidth, textHeight), _ = cv2.getTextSize(
            text="TEST",
            fontFace=fontFace,
            fontScale=fontScale,
            thickness=thickness
        )
        fontScaleCorr = targetWidth / textWidth
        textHeight *= fontScaleCorr
        textWidth *= fontScaleCorr
        fontScale *= fontScaleCorr

        org = ( textHeight, image.shape[0] - textHeight )
        org = tuple(map(int, org))
        color = (0, 0, 0, 200)
        placeholder = cv2.putText(
            img=np.zeros(image.shape[:2] + (4, ), dtype='uint8'),
            text="TEST",
            org=org,
            color=color,
            fontFace=fontFace,
            fontScale=fontScale,
            thickness=thickness
        ).astype('float32')
        augmented = (
            (image.astype('float32')[..., :3]*(255 - placeholder[..., 3:]) + placeholder[..., :3]*placeholder[..., 3:])
        ) / 255
        return augmented.astype('uint8')

    def vignette(image):
        height, width = image.shape[:2]
        a = cv2.getGaussianKernel(height, height/2)
        b = cv2.getGaussianKernel(width, width/2)
        c = (b.T*a)[..., np.newaxis]
        d = c/c.max()
        e = image*d
        return e.astype('uint8')

    transforms = {
        'watermark': watermark,
        'blur2': albumentations.GaussianBlur(sigma_limit=2.0, p=1),
        'vignette': vignette,
        'gamma2': albumentations.RandomGamma(gamma_limit=2, p=1),
        'jpeg95': albumentations.ImageCompression(quality=95, p=1),
        'pad0.2': albumentations.CropAndPad(percent=(0.2, 2), p=1),
        'crop0.05': albumentations.CropAndPad(percent=-0.05, p=1),
        'noise0.2': albumentations.GaussNoise(noise_scale_factor=0.2, p=1),
        'rotate4': albumentations.Affine(rotate=4, p=1),
        'noop': albumentations.NoOp(p=1),
    }

    # Compute the transformed versions of the images.
    # This takes a while but you can reload the
    # generated dataset without recomputing it (see next line).
    transformed = dataset.transform(
        transforms=transforms,
        storage_dir='transformed',
        errors="raise"
    )
    # We don't actually have to do this, but it shows
    # how to reload the transformed dataset later.
    transformed = benchmarking.BenchmarkImageTransforms.load(
        path_to_zip_or_directory='transformed', verify_md5=False
    )

    # Create a new hash that we want to evaluate.
    # perception will handle most of the plumbing but
    # we do have to specify a few things.
    class ShrinkHash(hashers.ImageHasher):
        """This is a simple hash to demonstrate how you
        can create your own hasher and compare it to others.
        It just shrinks images to 8x8 pixels and then flattens
        the result.
        """

        # We have to let perception know
        # the shape and type of our hash.
        hash_length = 64
        dtype = 'uint8'

        # We need to specify how distance is
        # computed between hashes.
        distance_metric = 'euclidean'

        def _compute(self, image):
            gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
            resized = cv2.resize(gray, dsize=(8, 8))
            return resized.flatten()

    hashers_dict = {
        'ahash': hashers.AverageHash(hash_size=16),
        'dhash': hashers.DHash(hash_size=16),
        'pdq': PDQHash(),
        'phash': hashers.PHash(hash_size=16),
        'marrhildreth': hashers.MarrHildreth(),
        'wavelet': hashers.WaveletHash(hash_size=16),
        'blockmean': hashers.BlockMean(),
        'shrinkhash': ShrinkHash()
    }

    # Compute the hashes
    hashes = transformed.compute_hashes(hashers=hashers_dict)

    # Get performance metrics (i.e., recall) for each hash function based on
    # a minimum precision threshold. Here we use 99.99%.
    precision_threshold = 99.99

    # The metrics are just pandas dataframes. We use tabulate here to obtain the tables
    # formatted for the documentation.
    metrics = hashes.compute_threshold_recall(precision_threshold=precision_threshold).reset_index()
    print(tabulate.tabulate(metrics, showindex=False, headers=metrics.columns, tablefmt='rst'))

    metrics_by_transform = hashes.compute_threshold_recall(grouping=['transform_name'], precision_threshold=precision_threshold).reset_index()
    print(tabulate.tabulate(metrics_by_transform, showindex=False, headers=metrics_by_transform.columns, tablefmt='rst'))

    metrics_simple = hashes.compute_threshold_recall(grouping=[], precision_threshold=precision_threshold).reset_index()
    print(tabulate.tabulate(metrics_simple, showindex=False, headers=metrics_simple.columns, tablefmt='rst'))
=========== ================ ============= ============ ======== =========== =============
category transform_name hasher_name threshold recall precision n_exemplars
=========== ================ ============= ============ ======== =========== =============
paintings blur2 ahash 0.0078125 51.724 100 2204
paintings blur2 blockmean 0.0123967 85.753 100 2204
paintings blur2 dhash 0.105469 100 100 2204
paintings blur2 marrhildreth 0.0989583 100 100 2204
paintings blur2 pdq 0.117188 100 100 2204
paintings blur2 phash 0.0390625 100 100 2204
paintings blur2 shrinkhash 60.8112 43.33 100 2204
paintings blur2 wavelet 0.0117188 66.379 100 2204
paintings crop0.05 ahash 0.00390625 0.045 100 2204
paintings crop0.05 blockmean 0.0123967 0.227 100 2204
paintings crop0.05 dhash 0.210938 7.577 100 2204
paintings crop0.05 marrhildreth 0.213542 3.584 100 2204
paintings crop0.05 pdq 0.257812 8.439 100 2204
paintings crop0.05 phash 0.226562 6.76 100 2204
paintings crop0.05 shrinkhash 95.0053 2.269 100 2204
paintings crop0.05 wavelet 0.0078125 0 nan 2204
paintings gamma2 ahash 0.00390625 0.998 100 2204
paintings gamma2 blockmean 0.0072314 1.724 100 2204
paintings gamma2 dhash 0.167969 98.639 100 2204
paintings gamma2 marrhildreth 0.159722 99.41 100 2204
paintings gamma2 pdq 0.164062 100 100 2204
paintings gamma2 phash 0.164062 100 100 2204
paintings gamma2 shrinkhash 46.5296 0 nan 2204
paintings gamma2 wavelet 0.0117188 18.512 100 2204
paintings jpeg95 ahash 0.00390625 4.22 100 2204
paintings jpeg95 blockmean 0.0134298 28.811 100 2204
paintings jpeg95 dhash 0.191406 94.782 100 2204
paintings jpeg95 marrhildreth 0.168403 82.985 100 2204
paintings jpeg95 pdq 0.257812 100 100 2204
paintings jpeg95 phash 0.234375 100 100 2204
paintings jpeg95 shrinkhash 66.053 55.172 100 2204
paintings jpeg95 wavelet 0 0 nan 2204
paintings noise0.2 ahash 0.00390625 2.677 100 2204
paintings noise0.2 blockmean 0.00826446 6.987 100 2204
paintings noise0.2 dhash 0.25 93.648 100 2204
paintings noise0.2 marrhildreth 0.170139 73.911 100 2204
paintings noise0.2 pdq 0.257812 99.229 100 2204
paintings noise0.2 phash 0.257812 100 100 2204
paintings noise0.2 shrinkhash 169.387 3.312 100 2204
paintings noise0.2 wavelet 0.0078125 1.407 100 2204
paintings noop ahash 0 100 100 2204
paintings noop blockmean 0 100 100 2204
paintings noop dhash 0 100 100 2204
paintings noop marrhildreth 0 100 100 2204
paintings noop pdq 0 100 100 2204
paintings noop phash 0 100 100 2204
paintings noop shrinkhash 0 100 100 2204
paintings noop wavelet 0 100 100 2204
paintings pad0.2 ahash 0.0703125 0 nan 2204
paintings pad0.2 blockmean 0.0795455 0 nan 2204
paintings pad0.2 dhash 0.210938 1.089 100 2204
paintings pad0.2 marrhildreth 0.177083 0 nan 2204
paintings pad0.2 pdq 0.289062 1.86 100 2204
paintings pad0.2 phash 0.273438 2.541 100 2204
paintings pad0.2 shrinkhash 146.325 0.181 100 2204
paintings pad0.2 wavelet 0.109375 0 nan 2204
paintings resize0.5 ahash 0.0078125 76.089 100 2204
paintings resize0.5 blockmean 0.0144628 98.185 100 2204
paintings resize0.5 dhash 0.0976562 100 100 2204
paintings resize0.5 marrhildreth 0.154514 99.819 100 2204
paintings resize0.5 pdq 0.1875 100 100 2204
paintings resize0.5 phash 0.09375 100 100 2204
paintings resize0.5 shrinkhash 56.9034 76.27 100 2204
paintings resize0.5 wavelet 0.0117188 84.71 100 2204
paintings rotate4 ahash 0.0390625 2.949 100 2204
paintings rotate4 blockmean 0.0382231 2.949 100 2204
paintings rotate4 dhash 0.207031 36.298 100 2204
paintings rotate4 marrhildreth 0.227431 61.978 100 2204
paintings rotate4 pdq 0.273438 56.08 100 2204
paintings rotate4 phash 0.257812 61.615 100 2204
paintings rotate4 shrinkhash 69.1737 2.813 100 2204
paintings rotate4 wavelet 0.03125 0.136 100 2204
paintings vignette ahash 0.0429688 6.171 100 2204
paintings vignette blockmean 0.0475207 8.122 100 2204
paintings vignette dhash 0.121094 32.305 100 2204
paintings vignette marrhildreth 0.177083 77.904 100 2204
paintings vignette pdq 0.132812 100 100 2204
paintings vignette phash 0.132812 100 100 2204
paintings vignette shrinkhash 102.186 3.267 100 2204
paintings vignette wavelet 0.046875 3.085 100 2204
paintings watermark ahash 0.00390625 20.054 100 2204
paintings watermark blockmean 0.0123967 45.145 100 2204
paintings watermark dhash 0.0585938 100 100 2204
paintings watermark marrhildreth 0.0625 100 100 2204
paintings watermark pdq 0.273438 98.866 100 2204
paintings watermark phash 0.28125 99.456 100 2204
paintings watermark shrinkhash 104.398 75.998 100 2204
paintings watermark wavelet 0.0117188 51.27 100 2204
photographs blur2 ahash 0.015625 76.727 100 1650
photographs blur2 blockmean 0.0330579 98 100 1650
photographs blur2 dhash 0.0859375 98.97 100 1650
photographs blur2 marrhildreth 0.107639 97.576 100 1650
photographs blur2 pdq 0.304688 100 100 1650
photographs blur2 phash 0.179688 100 100 1650
photographs blur2 shrinkhash 117.627 44 100 1650
photographs blur2 wavelet 0.0195312 79.879 100 1650
photographs crop0.05 ahash 0.0078125 0.182 100 1650
photographs crop0.05 blockmean 0.0258264 0.788 100 1650
photographs crop0.05 dhash 0.0976562 1.091 100 1650
photographs crop0.05 marrhildreth 0.173611 3.152 100 1650
photographs crop0.05 pdq 0.304688 30.606 100 1650
photographs crop0.05 phash 0.320312 63.697 100 1650
photographs crop0.05 shrinkhash 125.94 1.152 100 1650
photographs crop0.05 wavelet 0.015625 0.182 100 1650
photographs gamma2 ahash 0.015625 8.182 100 1650
photographs gamma2 blockmean 0.0268595 17.212 100 1650
photographs gamma2 dhash 0.101562 90.303 100 1650
photographs gamma2 marrhildreth 0.105903 90.909 100 1650
photographs gamma2 pdq 0.210938 100 100 1650
photographs gamma2 phash 0.234375 100 100 1650
photographs gamma2 shrinkhash 119.683 0.545 100 1650
photographs gamma2 wavelet 0.0195312 18.424 100 1650
photographs jpeg95 ahash 0.0117188 29.879 100 1650
photographs jpeg95 blockmean 0.0278926 76.788 100 1650
photographs jpeg95 dhash 0.121094 84.182 100 1650
photographs jpeg95 marrhildreth 0.104167 69.576 100 1650
photographs jpeg95 pdq 0.296875 99.879 100 1650
photographs jpeg95 phash 0.28125 99.879 100 1650
photographs jpeg95 shrinkhash 131.031 89.212 100 1650
photographs jpeg95 wavelet 0.0195312 40.242 100 1650
photographs noise0.2 ahash 0.015625 27.636 100 1650
photographs noise0.2 blockmean 0.036157 75.091 100 1650
photographs noise0.2 dhash 0.121094 54.121 100 1650
photographs noise0.2 marrhildreth 0.0989583 46.364 100 1650
photographs noise0.2 pdq 0.296875 99.697 100 1650
photographs noise0.2 phash 0.304688 99.818 100 1650
photographs noise0.2 shrinkhash 210.661 57.576 100 1650
photographs noise0.2 wavelet 0.0234375 27.03 100 1650
photographs noop ahash 0 100 100 1650
photographs noop blockmean 0 100 100 1650
photographs noop dhash 0 100 100 1650
photographs noop marrhildreth 0 100 100 1650
photographs noop pdq 0 100 100 1650
photographs noop phash 0 100 100 1650
photographs noop shrinkhash 0 100 100 1650
photographs noop wavelet 0 100 100 1650
photographs pad0.2 ahash 0.0429688 0.061 100 1650
photographs pad0.2 blockmean 0.0320248 0 nan 1650
photographs pad0.2 dhash 0.105469 0.545 100 1650
photographs pad0.2 marrhildreth 0.177083 0.121 100 1650
photographs pad0.2 pdq 0.28125 1.455 100 1650
photographs pad0.2 phash 0.289062 3.515 100 1650
photographs pad0.2 shrinkhash 114.721 0.061 100 1650
photographs pad0.2 wavelet 0.0820312 0 nan 1650
photographs resize0.5 ahash 0.015625 87.697 100 1650
photographs resize0.5 blockmean 0.0330579 99.152 100 1650
photographs resize0.5 dhash 0.0898438 98.485 100 1650
photographs resize0.5 marrhildreth 0.111111 95.394 100 1650
photographs resize0.5 pdq 0.328125 99.818 100 1650
photographs resize0.5 phash 0.234375 100 100 1650
photographs resize0.5 shrinkhash 132.117 80.242 100 1650
photographs resize0.5 wavelet 0.0195312 88.97 100 1650
photographs rotate4 ahash 0.0273438 1.818 100 1650
photographs rotate4 blockmean 0.0371901 3.879 100 1650
photographs rotate4 dhash 0.09375 2.97 100 1650
photographs rotate4 marrhildreth 0.149306 4.606 100 1650
photographs rotate4 pdq 0.304688 73.394 100 1650
photographs rotate4 phash 0.3125 89.818 100 1650
photographs rotate4 shrinkhash 130.211 4.424 100 1650
photographs rotate4 wavelet 0.0078125 0.061 100 1650
photographs vignette ahash 0.0273438 8.242 100 1650
photographs vignette blockmean 0.0320248 10 100 1650
photographs vignette dhash 0.0703125 22 100 1650
photographs vignette marrhildreth 0.0954861 38.727 100 1650
photographs vignette pdq 0.117188 100 100 1650
photographs vignette phash 0.125 100 100 1650
photographs vignette shrinkhash 138.989 11.939 100 1650
photographs vignette wavelet 0.0195312 4.242 100 1650
photographs watermark ahash 0.015625 42.667 100 1650
photographs watermark blockmean 0.0247934 60.788 100 1650
photographs watermark dhash 0.078125 100 100 1650
photographs watermark marrhildreth 0.112847 98.727 100 1650
photographs watermark pdq 0.3125 99.818 100 1650
photographs watermark phash 0.3125 99.758 100 1650
photographs watermark shrinkhash 142.046 79.576 100 1650
photographs watermark wavelet 0.0195312 53.455 100 1650
=========== ================ ============= ============ ======== =========== =============
================ ============= ============ ======== =========== =============
transform_name hasher_name threshold recall precision n_exemplars
================ ============= ============ ======== =========== =============
blur2 ahash 0.0078125 49.014 100 3854
blur2 blockmean 0.0123967 80.773 100 3854
blur2 dhash 0.0859375 99.196 100 3854
blur2 marrhildreth 0.107639 98.962 100 3854
blur2 pdq 0.234375 99.948 100 3854
blur2 phash 0.179688 100 100 3854
blur2 shrinkhash 60.8112 28.412 100 3854
blur2 wavelet 0.0117188 62.247 100 3854
crop0.05 ahash 0.00390625 0.052 100 3854
crop0.05 blockmean 0.0123967 0.208 100 3854
crop0.05 dhash 0.0976562 0.493 100 3854
crop0.05 marrhildreth 0.173611 1.635 100 3854
crop0.05 pdq 0.257812 9.03 100 3854
crop0.05 phash 0.226562 7.058 100 3854
crop0.05 shrinkhash 95.0053 1.427 100 3854
crop0.05 wavelet 0.0078125 0 nan 3854
gamma2 ahash 0.00390625 0.934 100 3854
gamma2 blockmean 0.0072314 1.713 100 3854
gamma2 dhash 0.101562 90.036 100 3854
gamma2 marrhildreth 0.105903 94.24 100 3854
gamma2 pdq 0.210938 100 100 3854
gamma2 phash 0.234375 100 100 3854
gamma2 shrinkhash 108.457 0.156 100 3854
gamma2 wavelet 0.0117188 14.997 100 3854
jpeg95 ahash 0.00390625 5.319 100 3854
jpeg95 blockmean 0.0134298 32.045 100 3854
jpeg95 dhash 0.121094 74.079 100 3854
jpeg95 marrhildreth 0.104167 59.263 100 3854
jpeg95 pdq 0.257812 99.896 100 3854
jpeg95 phash 0.234375 99.896 100 3854
jpeg95 shrinkhash 66.053 40.296 100 3854
jpeg95 wavelet 0.00390625 3.71 100 3854
noise0.2 ahash 0.00390625 2.984 100 3854
noise0.2 blockmean 0.00826446 8.563 100 3854
noise0.2 dhash 0.121094 40.088 100 3854
noise0.2 marrhildreth 0.0989583 33.083 100 3854
noise0.2 pdq 0.257812 99.222 100 3854
noise0.2 phash 0.273438 99.896 100 3854
noise0.2 shrinkhash 169.387 4.385 100 3854
noise0.2 wavelet 0.0078125 1.894 100 3854
noop ahash 0 100 100 3854
noop blockmean 0 100 100 3854
noop dhash 0 100 100 3854
noop marrhildreth 0 100 100 3854
noop pdq 0 100 100 3854
noop phash 0 100 100 3854
noop shrinkhash 0 100 100 3854
noop wavelet 0 100 100 3854
pad0.2 ahash 0.0429688 0.026 100 3854
pad0.2 blockmean 0.0320248 0 nan 3854
pad0.2 dhash 0.105469 0.234 100 3854
pad0.2 marrhildreth 0.177083 0.052 100 3854
pad0.2 pdq 0.28125 1.349 100 3854
pad0.2 phash 0.273438 2.387 100 3854
pad0.2 shrinkhash 114.721 0.052 100 3854
pad0.2 wavelet 0.0820312 0 nan 3854
resize0.5 ahash 0.0078125 70.784 100 3854
resize0.5 blockmean 0.0144628 95.226 100 3854
resize0.5 dhash 0.0898438 99.299 100 3854
resize0.5 marrhildreth 0.112847 97.846 100 3854
resize0.5 pdq 0.265625 99.844 100 3854
resize0.5 phash 0.234375 100 100 3854
resize0.5 shrinkhash 56.9034 51.453 100 3854
resize0.5 wavelet 0.0117188 80.747 100 3854
rotate4 ahash 0.0273438 1.297 100 3854
rotate4 blockmean 0.0371901 3.036 100 3854
rotate4 dhash 0.09375 1.401 100 3854
rotate4 marrhildreth 0.149306 3.762 100 3854
rotate4 pdq 0.273438 54.489 100 3854
rotate4 phash 0.257812 59.626 100 3854
rotate4 shrinkhash 69.1737 1.894 100 3854
rotate4 wavelet 0.0078125 0.026 100 3854
vignette ahash 0.0273438 4.67 100 3854
vignette blockmean 0.0320248 6.098 100 3854
vignette dhash 0.0703125 12.195 100 3854
vignette marrhildreth 0.0954861 30.54 100 3854
vignette pdq 0.132812 100 100 3854
vignette phash 0.132812 100 100 3854
vignette shrinkhash 103.005 4.541 100 3854
vignette wavelet 0.0195312 1.946 100 3854
watermark ahash 0.00390625 18.5 100 3854
watermark blockmean 0.0123967 41.593 100 3854
watermark dhash 0.078125 100 100 3854
watermark marrhildreth 0.112847 99.455 100 3854
watermark pdq 0.273438 99.014 100 3854
watermark phash 0.28125 99.377 100 3854
watermark shrinkhash 104.398 71.199 100 3854
watermark wavelet 0.0117188 46.912 100 3854
================ ============= ============ ======== =========== =============
============= =========== ======== =========== =============
hasher_name threshold recall precision n_exemplars
============= =========== ======== =========== =============
ahash 0.00390625 17.578 100 42394
blockmean 0.00826446 27.714 100 42394
dhash 0.0859375 51.981 99.9952 42394
marrhildreth 0.100694 55.942 99.9957 42394
pdq 0.257812 77.181 99.9969 42394
phash 0.273438 81.967 99.9942 42394
shrinkhash 56.9034 22.378 100 42394
wavelet 0.00390625 18.467 100 42394
============= =========== ======== =========== =============
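
Because the metrics above are plain pandas DataFrames (as noted in the code), you can slice them to answer specific questions. A small sketch, using the column names from the tables above, pulls out one hasher's per-transform recall:

.. code-block:: python

    # metrics_by_transform comes from the code above; filter to a single hasher.
    phash_metrics = metrics_by_transform[metrics_by_transform['hasher_name'] == 'phash']
    print(phash_metrics[['transform_name', 'threshold', 'recall']])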
Video Hashing
=============
The below example does the following:

- Download a benchmarking dataset. Here we use the `Charades <https://prior.allenai.org/projects/charades>`_ dataset, which contains over 9,000 videos.
- Load the dataset.
- Transform the dataset to generate synthetically altered videos. Our hashers are responsible for
  matching the altered videos with the originals.
- Define some hashers we want to evaluate.
- Compute all the hashes.
- Report metrics for each video category / hasher / transformation combination to see how well our hashers
  can match the altered videos to the original ("no-op" videos).
.. code-block:: python
import os
import zipfile
import urllib.request
import pandas as pd
import perception.benchmarking
import perception.hashers
if not os.path.isdir('Charades_v1_480'):
# Download the dataset since it appears we do not have it. Note that
# these are large files (> 13GB).
urllib.request.urlretrieve(
url='http://ai2-website.s3.amazonaws.com/data/Charades_v1_480.zip',
filename='Charades_v1_480.zip'
)
with zipfile.ZipFile('Charades_v1_480.zip') as zfile:
zfile.extractall('.')
urllib.request.urlretrieve(
url='http://ai2-website.s3.amazonaws.com/data/Charades.zip',
filename='Charades.zip'
)
with zipfile.ZipFile('Charades.zip') as zfile:
zfile.extractall('.')
# These are files that we've identified as having identical subsequences, typically
# when a person is out of frame and the backgrounds are the same.
duplicates = [
('0HVVN.mp4', 'UZRQD.mp4'), ('ZIOET.mp4', 'YGXX6.mp4'), ('82XPD.mp4', 'E7QDZ.mp4'),
('FQDS1.mp4', 'AIOTI.mp4'), ('PBV4T.mp4', 'XXYWL.mp4'), ('M0P0H.mp4', 'STY6W.mp4'),
('3Q92U.mp4', 'GHPO3.mp4'), ('NFIQM.mp4', 'I2DHG.mp4'), ('PIRMO.mp4', '0GFE8.mp4'),
('LRPBA.mp4', '9VK0J.mp4'), ('UI0QG.mp4', 'FHXKQ.mp4'), ('Y05U8.mp4', '4RVZB.mp4'),
('J6TVB.mp4', '2ZBL5.mp4'), ('A8T8V.mp4', 'IGOQK.mp4'), ('H8QM1.mp4', 'QYMWC.mp4'),
('O45BC.mp4', 'ZS7X6.mp4'), ('NOP6W.mp4', 'F7KFE.mp4'), ('4MPPQ.mp4', 'A3M94.mp4'),
('L8FFR.mp4', 'M8MP0.mp4'), ('EHYXP.mp4', 'O8PO3.mp4'), ('MGBLJ.mp4', 'RIEG6.mp4'),
('53FPM.mp4', 'BLFEV.mp4'), ('UIIF3.mp4', 'TKEKQ.mp4'), ('GVX7E.mp4', '7GPSY.mp4'),
('T7HZB.mp4', '6KGZA.mp4'), ('65M4K.mp4', 'UDGP2.mp4'), ('6SS4H.mp4', 'CK6OL.mp4'),
('OVHFT.mp4', 'GG1X2.mp4'), ('VEHER.mp4', 'XBPEJ.mp4'), ('WN38A.mp4', '2QI8F.mp4'),
('UMXKN.mp4', 'EOKJ0.mp4'), ('OSIKP.mp4', 'WT2C0.mp4'), ('H5V2Y.mp4', 'ZXN6A.mp4'),
('XS6PF.mp4', '1WJ6O.mp4'), ('S2XJW.mp4', 'YH0BX.mp4'), ('UO607.mp4', 'Z5JZD.mp4'),
('XN64E.mp4', 'CSRZM.mp4'), ('YXI7M.mp4', 'IKQLJ.mp4'), ('1B9C8.mp4', '004QE.mp4'),
('V1SQH.mp4', '48WOM.mp4'), ('107YZ.mp4', 'I049A.mp4'), ('3S6WL.mp4', 'SC5YW.mp4'),
('OY50Q.mp4', '5T607.mp4'), ('XKH7W.mp4', '028CE.mp4'), ('X8XQE.mp4', 'J0VXY.mp4'),
('STB0G.mp4', 'J0VXY.mp4'), ('UNXLF.mp4', 'J0VXY.mp4'), ('56PK0.mp4', 'M1TZR.mp4'),
('FVITB.mp4', 'R0M34.mp4'), ('BPZE3.mp4', 'R0M34.mp4'), ('VS7DA.mp4', '1X0M3.mp4'),
('I7MEA.mp4', 'YMM1Z.mp4'), ('9N76L.mp4', '0LDP7.mp4'), ('AXS82.mp4', 'W8WRK.mp4'),
('8TSU4.mp4', 'MXATD.mp4'), ('80FWF.mp4', '18HFG.mp4'), ('RO3A2.mp4', 'V4HY4.mp4'),
('HU409.mp4', 'BDWIX.mp4'), ('3YY88.mp4', 'EHHRS.mp4'), ('65RS3.mp4', 'SLIH4.mp4'),
('LR0L8.mp4', 'Y665P.mp4'), ('DVPL2.mp4', 'EI5M3.mp4'), ('0EGNU.mp4', 'CU3JE.mp4'),
('94KP4.mp4', '94KP4.mp4'), ('79QDP.mp4', '79QDP.mp4'), ('GKBX9.mp4', 'GKBX9.mp4'),
('RX6R8.mp4', 'RX6R8.mp4'), ('PMVT7.mp4', 'PMVT7.mp4'), ('XNXW6.mp4', 'XNXW6.mp4'),
('I005F.mp4', 'I005F.mp4'), ('TF95Y.mp4', 'TF95Y.mp4'), ('79QDP.mp4', '79QDP.mp4'),
('LQGMM.mp4', 'LQGMM.mp4'), ('QCAUL.mp4', 'QCAUL.mp4'), ('GFVSV.mp4', 'GFVSV.mp4'),
('4UYGY.mp4', '4UYGY.mp4'), ('BYDSE.mp4', 'BYDSE.mp4'), ('PV3KQ.mp4', 'PV3KQ.mp4'),
('1X0M3.mp4', '1X0M3.mp4'), ('T5FHD.mp4', 'T5FHD.mp4'), ('QRHJJ.mp4', 'QRHJJ.mp4'),
('JYBGS.mp4', 'JYBGS.mp4'), ('N2XCF.mp4', 'N2XCF.mp4'), ('OZPA9.mp4', 'OZPA9.mp4'),
('297S4.mp4', '297S4.mp4'), ('LHU7D.mp4', 'LHU7D.mp4'), ('TSKZL.mp4', 'TSKZL.mp4'),
('BCONW.mp4', 'BCONW.mp4'), ('KBPDM.mp4', 'KBPDM.mp4'), ('7FTBS.mp4', '7FTBS.mp4'),
('099Y1.mp4', '099Y1.mp4'), ('S2RIQ.mp4', 'S2RIQ.mp4'), ('22FJU.mp4', '22FJU.mp4'),
('99UA6.mp4', '99UA6.mp4'), ('WJ13E.mp4', 'WJ13E.mp4'), ('5OLVC.mp4', '5OLVC.mp4'),
('YQ6Z6.mp4', 'YQ6Z6.mp4'), ('T5MLJ.mp4', 'T5MLJ.mp4'), ('0VOQC.mp4', '0VOQC.mp4'),
('S2RIQ.mp4', 'S2RIQ.mp4'), ('2VNXF.mp4', '2VNXF.mp4'), ('G87XG.mp4', 'G87XG.mp4'),
('RRS54.mp4', 'RRS54.mp4'), ('TXJK7.mp4', 'TXJK7.mp4'), ('G4KE3.mp4', 'G4KE3.mp4'),
('3SNSC.mp4', '3SNSC.mp4'), ('U2FA5.mp4', 'U2FA5.mp4'), ('9AFQ7.mp4', '9AFQ7.mp4')
]
blacklist = [fp1 for fp1, fp2 in duplicates]
df = pd.concat([pd.read_csv('Charades/Charades_v1_test.csv'), pd.read_csv('Charades/Charades_v1_train.csv')])
df = df[~(df['id'] + '.mp4').isin(blacklist)]
df['filepath'] = df['id'].apply(lambda video_id: os.path.join('Charades_v1_480', video_id + '.mp4'))
assert df['filepath'].apply(os.path.isfile).all(), 'Some video files are missing.'
dataset = perception.benchmarking.BenchmarkVideoDataset.from_tuples(
files=df[['filepath', 'scene']].itertuples(index=False)
)
if not os.path.isdir('benchmarking_videos'):
# We haven't computed the transforms yet, so we do that
# now. Below, we create the following files for each of
# the videos in our dataset. Note that the only required
# transform is `noop` (see documentation for
# perception.benchmarking.BenchmarkVideoDataset.transform).
#
# noop: This is the base video we'll actually use in benchmarking, rather
# than using the raw video. It is the same as the raw video but downsampled
# to a size that is reasonable for hashing (240p). This is because all
# of our hashers downsample to a size smaller than this anyway, so there
# is no benefit to a higher resolution. Also, we limit the length to the
# first five minutes of the video, which speeds everything up significantly.
# shrink: Shrink the noop video down to 70% of its original size.
# clip0.2: Clip the first 20% and last 20% of the noop video off.
# slideshow: Create a slideshow version of the video that grabs frames periodically
# from the original.
# black_frames: Add black frames to the start and end of the video.
# gif: Create a GIF from the video (similar to slideshow but with re-encoding)
# black_padding: Add black bars to the top and bottom of the video.
pad_width = 240
pad_height = 320
transforms = {
'noop': perception.benchmarking.video_transforms.get_simple_transform(
width='ceil(min(240/max(iw, ih), 1)*iw/2)*2',
height='ceil(min(240/max(iw, ih), 1)*ih/2)*2',
codec='h264',
output_ext='.m4v',
sar='1/1',
clip_s=(None, 60*5)
),
'shrink': perception.benchmarking.video_transforms.get_simple_transform(
width='ceil(0.7*iw/2)*2',
height='ceil(0.7*ih/2)*2'
),
'clip0.2': perception.benchmarking.video_transforms.get_simple_transform(clip_pct=(0.2, 0.8)),
'slideshow': perception.benchmarking.video_transforms.get_slideshow_transform(
frame_input_rate=1/2.5, frame_output_rate=0.5, max_frames=10, offset=1.3),
'black_frames': perception.benchmarking.video_transforms.get_black_frame_padding_transform(0.5, 0.05),
'gif': perception.benchmarking.video_transforms.get_simple_transform(
output_ext='.gif', codec='gif', clip_s=(1.2, 10.2), fps=1/2.5
),
'black_padding': perception.benchmarking.video_transforms.get_simple_transform(
width=f'(iw*sar)*min({pad_width}/(iw*sar),{pad_height}/ih)', height=f'ih*min({pad_width}/(iw*sar),{pad_height}/ih)',
pad=f'{pad_width}:{pad_height}:({pad_width}-iw*min({pad_width}/iw,{pad_height}/ih))/2:({pad_height}-ih*min({pad_width}/iw,{pad_height}/ih))/2'
)
}
# Save the transforms for later.
transformed = dataset.transform(transforms=transforms, storage_dir='benchmarking_videos')
transformed = perception.benchmarking.BenchmarkVideoTransforms.load('benchmarking_videos', verify_md5=False)
phashu8 = perception.hashers.PHashU8(exclude_first_term=False, freq_shift=1, hash_size=12)
hashers = {
'phashu8_framewise': perception.hashers.FramewiseHasher(
frames_per_second=1, frame_hasher=phashu8, interframe_threshold=50, quality_threshold=90),
'phashu8_tmkl1': perception.hashers.TMKL1(
frames_per_second=5, frame_hasher=phashu8,
distance_metric='euclidean', dtype='uint8',
norm=None, quality_threshold=90)
}
if not os.path.isfile('hashes.csv'):
# We haven't computed the hashes, so we do that now.
hashes = transformed.compute_hashes(hashers=hashers, max_workers=5)
# Save the hashes for later. It took a long time after all!
hashes.save('hashes.csv')
hashes = perception.benchmarking.BenchmarkHashes.load('hashes.csv')
hashes.compute_threshold_recall(precision_threshold=99.9, grouping=['transform_name'])
================ ================= =========== ======== =========== =============
transform_name hasher_name threshold recall precision n_exemplars
================ ================= =========== ======== =========== =============
black_frames phashu8_framewise 51.0979 88.12 99.9069 278644
black_frames phashu8_tmkl1 55.7584 99.918 99.9079 403768
black_padding phashu8_framewise 74.6391 7.662 100 277399
black_padding phashu8_tmkl1 53.8702 99.898 99.9079 406899
clip0.2 phashu8_framewise 54.8635 90.741 99.9098 224264
clip0.2 phashu8_tmkl1 59.0424 99.724 99.9077 324251
gif phashu8_framewise 55.4437 68.21 99.9088 82232
gif phashu8_tmkl1 55.4887 81.029 99.9103 39757
noop phashu8_framewise 0 100 100 282658
noop phashu8_tmkl1 0 100 100 408871
shrink phashu8_framewise 24.7184 100 100 281731
shrink phashu8_tmkl1 49.8999 99.836 99.9078 400650
slideshow phashu8_framewise 56.9825 99.713 99.9076 172829
slideshow phashu8_tmkl1 56.8683 95.934 99.9035 90684
================ ================= =========== ======== =========== =============
================================================
FILE: docs/examples/deduplication.rst
================================================
Media Deduplication
*******************
Perceptual hashes can be used to deduplicate sets of images. Below we provide two examples (one simple, one larger scale).
**For most use cases, we recommend using PHash with** :code:`hash_size=16` **and
with 0.2 as the distance threshold as in the example below.** You may wish to adjust
this threshold up or down based on your tolerance for false negatives / positives.
In practice, deduplicating in memory on your machine by the methods below may be impractical.
For larger-scale applications, you may wish to use tools like
`FAISS <https://github.com/facebookresearch/faiss>`_,
`Annoy <https://github.com/spotify/annoy>`_, or databases with
functionality for querying based on distance such as
`MemSQL <https://docs.memsql.com/sql-reference/v6.8/euclidean_distance/>`_.
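As a rough illustration (this sketch is not part of :code:`perception`), a FAISS flat index can perform the same kind of search if you first unpack each hash into a vector of 0/1 values: for such vectors the squared Euclidean distance equals the Hamming distance, so the normalized threshold of 0.2 recommended below corresponds to a search radius of :code:`0.2 * 256` for a 256-bit (16x16) PHash. The array below is hypothetical stand-in data.
.. code-block:: python
    import faiss
    import numpy as np
    # Hypothetical stand-in for N unpacked 256-bit hashes (one 0/1 row per image).
    vectors = np.random.randint(0, 2, size=(1000, 256)).astype('float32')
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    # For 0/1 vectors, squared L2 distance equals the Hamming distance, so a
    # normalized threshold of 0.2 maps to a radius of 0.2 * 256 bits.
    lims, distances, neighbors = index.range_search(vectors, 0.2 * vectors.shape[1])
    # Collect candidate duplicate pairs (i < j) from the range search results.
    pairs = set()
    for i in range(vectors.shape[0]):
        for j in neighbors[lims[i]:lims[i + 1]]:
            if i < j:
                pairs.add((i, int(j)))
The same idea extends to an IVF index (as used in :code:`perception.approximate_deduplication`) when the number of hashes grows large.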
For the supported hashers, below are our recommended thresholds with expected false positive rates of <1%.
====================== ===========
hasher threshold
====================== ===========
ahash (hash_size=16) 0.008
blockmean 0.008
dhash (hash_size=16) 0.07
marrhildreth 0.1
pdq 0.2
phash (hash_size=16) 0.2
wavelet (hash_size=16) 0.02
====================== ===========
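As a quick, hypothetical check of a single pair of files against one of these thresholds (the file names below are placeholders):
.. code-block:: python
    from perception import hashers
    hasher = hashers.PHash(hash_size=16)
    hash1 = hasher.compute('image1.jpg')
    hash2 = hasher.compute('image2.jpg')
    # compute_distance returns a normalized distance; compare it against the
    # recommended threshold for the hasher (0.2 for phash with hash_size=16).
    is_duplicate = hasher.compute_distance(hash1, hash2) < 0.2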
Simple example
==============
In this example, we download a ZIP file containing 18 images. One of the images is duplicated
twice and another image is duplicated once.
.. code-block:: python
import os
import glob
import zipfile
import urllib.request
import tabulate
import pandas as pd
from perception import tools, hashers
urllib.request.urlretrieve(
"https://thorn-perception.s3.amazonaws.com/thorn-perceptual-deduplication-example.zip",
"thorn-perceptual-deduplication-example.zip"
)
with zipfile.ZipFile('thorn-perceptual-deduplication-example.zip') as f:
f.extractall('.')
filepaths = glob.glob('thorn-perceptual-deduplication-example/*.jpg')
duplicate_pairs = tools.deduplicate(files=filepaths, hashers=[(hashers.PHash(hash_size=16), 0.2)])
print(tabulate.tabulate(pd.DataFrame(duplicate_pairs), showindex=False, headers=['file1', 'file2'], tablefmt='rst'))
# Now we can do whatever we want with the duplicates. We could just delete
# the first entry in each pair or manually verify the pairs to ensure they
# are, in fact duplicates.
=============================================== ===============================================
file1 file2
=============================================== ===============================================
thorn-perceptual-deduplication-example/309b.jpg thorn-perceptual-deduplication-example/309.jpg
thorn-perceptual-deduplication-example/309b.jpg thorn-perceptual-deduplication-example/309a.jpg
thorn-perceptual-deduplication-example/309a.jpg thorn-perceptual-deduplication-example/309.jpg
thorn-perceptual-deduplication-example/315a.jpg thorn-perceptual-deduplication-example/315.jpg
=============================================== ===============================================
Real-world example
==================
In the example below, we use the
`Caltech 256 Categories <http://www.vision.caltech.edu/Image_Datasets/Caltech256>`_ dataset. Like
most other public image datasets, it contains a handful of duplicates in some categories.
The code below will:
1. Download the dataset
2. Group all the filepaths by category (the dataset is provided in folders)
3. Within each group, find duplicates using PHash. We will compare not just the
original images, but also the 8 isometric transformations for each image.
.. code-block:: python
import os
import tarfile
from glob import glob
import urllib.request
import tqdm
from perception import hashers, tools
urllib.request.urlretrieve(
"http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar",
"256_ObjectCategories.tar"
)
with tarfile.open('256_ObjectCategories.tar') as tfile:
tfile.extractall()
files = glob('256_ObjectCategories/**/*.jpg')
# To reduce the number of pairwise comparisons,
# we can deduplicate within each image category
# (i.e., we don't need to compare images of
# butterflies with images of chess boards).
filepath_group = [
(
filepath,
os.path.normpath(filepath).split(os.sep)[-2]
) for filepath in files
]
groups = list(set([group for _, group in filepath_group]))
# We consider any pair of images with a PHash distance of < 0.2 as
# as a duplicate.
comparison_hashers = [(hashers.PHash(hash_size=16), 0.2)]
duplicate_pairs = []
for current_group in groups:
current_filepaths = [
filepath for filepath, group in filepath_group if group == current_group
]
current_duplicate_pairs = tools.deduplicate(
files=current_filepaths,
hashers=comparison_hashers,
isometric=True,
progress=tqdm.tqdm
)
duplicate_pairs.extend(current_duplicate_pairs)
# Now we can do whatever we want with the duplicates. We could just delete
# the first entry in each pair or manually verify the pairs to ensure they
# are, in fact duplicates.
Video deduplication
===================
Video deduplication requires more thought depending on your tolerance for false positives and
how important temporal relationships are. Below is one example approach for deduplicating a
group of videos by taking frames from each video that are sufficiently different from each other
(to avoid keeping too many) and then using them all to find
pairs of videos that have matching frames.
.. code-block:: python
import urllib.request
import zipfile
import glob
import tqdm
import perception.hashers
import perception.tools
# Download some example videos.
urllib.request.urlretrieve(
"https://thorn-perception.s3.amazonaws.com/thorn-perceptual-video-deduplication-example.zip",
"thorn-perceptual-video-deduplication-example.zip"
)
with zipfile.ZipFile('thorn-perceptual-video-deduplication-example.zip') as f:
f.extractall('.')
frame_hasher = perception.hashers.PHash(hash_size=16)
hasher = perception.hashers.FramewiseHasher(frames_per_second=1,
frame_hasher=frame_hasher,
interframe_threshold=50,
quality_threshold=90)
# Set a threshold for matching frames within videos and across videos.
filepaths = glob.glob('thorn-perceptual-video-deduplication-example/*.m4v') + \
glob.glob('thorn-perceptual-video-deduplication-example/*.gif')
# Returns a list of dicts with a "filepath" and "hash" key. "hash" contains a
# list of hashes.
hashes = hasher.compute_parallel(filepaths=filepaths, progress=tqdm.tqdm)
# Flatten the hashes into a list of (filepath, hash) tuples.
hashes_flattened = perception.tools.flatten([
[(hash_group['filepath'], hash_string) for hash_string in hash_group['hash']]
for hash_group in hashes
])
duplicates = perception.tools.deduplicate_hashes(
hashes=hashes_flattened,
threshold=50,
hasher=hasher
)
================================================
FILE: docs/examples/detecting_csam.rst
================================================
Detecting Child Sexual Abuse Material
*************************************
Using `perception` and a subscription to Thorn's Safer service,
you can easily check for child sexual abuse material against a database of known bad content
**without** having to send any images to a third party. You do this by sending compact, irreversible
image hashes to get matches with a high degree of precision. We support matching using
16x16 PHash hashes and md5 hashes.
See the usage example below. Please contact info@getsafer.io to discuss Thorn's Safer service
and subscription options and visit `getsafer.io <https://getsafer.io/>`_ to learn more.
.. code-block:: python
from perception import tools
matcher = tools.SaferMatcher(
api_key='YOUR_API_KEY',
url='MATCHING_SERVICE_URL'
)
matches = matcher.match(['myfile.jpg'])
In some cases, you may have a username/password instead of an API key, in which case
you can pass those instead (see API documentation for details).
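For example, a sketch of what the username/password variant might look like (confirm the exact argument names against the API documentation):
.. code-block:: python
    from perception import tools
    matcher = tools.SaferMatcher(
        username='YOUR_USERNAME',
        password='YOUR_PASSWORD',
        url='MATCHING_SERVICE_URL'
    )
    matches = matcher.match(['myfile.jpg'])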
================================================
FILE: docs/examples/index.rst
================================================
Examples
********
.. toctree::
:maxdepth: 2
:caption: Contents:
deduplication
detecting_csam
benchmarking
================================================
FILE: docs/index.rst
================================================
perception
==========
:code:`perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing
research, development, and production use. It provides a common wrapper around existing, popular perceptual hashes
(such as those implemented by `ImageHash <https://pypi.org/project/ImageHash/>`_)
along with tools to compare their performance and use them for common tasks.
Perceptual hashes are used to create compact image "fingerprints" which are invariant to small alterations to
the original image. Typically, the representations are compact enough that they are irreversible, which makes
them useful for deduplication and detecting abusive content while preserving the privacy of content owners.
Installation
************
You can install :code:`perception` using pip. You must install OpenCV separately (e.g., with :code:`pip install opencv-python`).
.. code-block:: bash
# Install from PyPI
pip install perception
# Install from GitHub
pip install git+https://github.com/thorn-oss/perception.git#egg=perception
To install with the necessary dependencies for benchmarking, use:
.. code-block:: bash
# Install from PyPI
pip install perception[benchmarking]
# Install from GitHub
pip install opencv-python git+https://github.com/thorn-oss/perception.git#egg=perception[benchmarking]
Getting Started
***************
Please see the examples for code snippets for common use cases.
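As a minimal illustration (the file name below is a placeholder), computing a perceptual hash for a single image looks like this:
.. code-block:: python
    from perception import hashers
    hasher = hashers.PHash(hash_size=16)
    # compute accepts a file path (or a PIL/numpy image) and returns the hash
    # as a base64-encoded string by default.
    hash_string = hasher.compute('my_image.jpg')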
.. toctree::
:maxdepth: 2
:caption: Contents:
examples/index
api/index
================================================
FILE: docs/requirements.txt
================================================
sphinx-autodoc-typehints==3.2.0
# sphinx-autobuild==3.0.2
# sphinx==1.8.3
sphinx_rtd_theme==3.0.2
m2r==0.3.1
opencv-contrib-python-headless
tqdm
albumentations
ffmpeg-python
typing-extensions
faiss-cpu
aiohttp
python-json-logger
networkit
================================================
FILE: perception/__init__.py
================================================
from importlib import metadata
__version__ = metadata.version("perception")
================================================
FILE: perception/approximate_deduplication/__init__.py
================================================
import logging
import math
import os.path as op
import typing
import faiss
import numpy as np
import tqdm
import typing_extensions
from ._graph_backend import get_graph_backend
LOGGER = logging.getLogger(__name__)
DEFAULT_PCT_PROBE = 0
# For faiss training on datasets larger than 50,000 vectors, we take a random sub-sample.
TRAIN_LARGE_SIZE: int = 50_000
class ClusterAssignment(typing_extensions.TypedDict):
cluster: int
id: typing.Any
def build_index(
X: np.ndarray,
pct_probe: float = DEFAULT_PCT_PROBE,
approximate: bool = True,
use_gpu: bool = True,
):
"""Buid a FAISS index from a reference dataframe.
Args:
X: The vectors to add to the index.
pct_probe: The minimum fraction of nearest lists to search. If
the product of pct_probe and the number of lists is less
than 1, one list will be searched.
approximate: Whether to build an approximate or exact index.
Returns:
An (index, lookup) tuple where the lookup returns the filepath
for a given entry in the index.
"""
if X is None:
return None
X = X.astype("float32")
d = X.shape[1]
if approximate:
ntotal = X.shape[0]
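# Follow the FAISS guideline of roughly 4*sqrt(ntotal) lists, capped so that
# each list can receive at least 39 training points, with a floor of one list.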
nlist = int(max(min(4 * np.sqrt(ntotal), ntotal / 39), 1))
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)
gpu = False
if use_gpu:
try:
res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(res, 0, index)
gpu = True
except AttributeError:
LOGGER.info("Building approximate FAISS index on CPU.")
if X.shape[0] > TRAIN_LARGE_SIZE:
# Take a random sample of 50,000 vectors or 39 points per centroid, whichever is larger.
# 39 points per centroid is the minimum for not getting warnings.
# https://github.com/facebookresearch/faiss/wiki/FAQ#can-i-ignore-warning-clustering-xxx-points-to-yyy-centroids
sample_size = max(39 * nlist, TRAIN_LARGE_SIZE)
index.train(X[np.random.choice(X.shape[0], sample_size, replace=False)])
else:
index.train(X)
batch_size = 10_000
for i in range(0, X.shape[0], batch_size):
index.add(X[i : i + batch_size])
if gpu:
index = faiss.index_gpu_to_cpu(index)
nprobe = max(math.ceil(pct_probe * nlist), 1)
faiss.ParameterSpace().set_index_parameter(index, "nprobe", nprobe)
else:
index = faiss.IndexFlat(d)
index.add(X)
return index
def compute_euclidean_pairwise_duplicates_approx(
X,
counts,
threshold,
minimum_overlap,
Y=None,
y_counts=None,
pct_probe=0.1,
use_gpu: bool = True,
faiss_cache_path: str | None = None,
show_progress: bool = False,
):
"""Provides the same result as perception.extensions.compute_pairwise_duplicates_simple
but uses an approximate search instead of an exhaustive search, which can dramatically reduce
processing time.
Args:
X: An array of vectors to compute pairs for.
Y: if provided we search in X for Y vectors.
counts: A list of counts of vectors for the separate files
represented in X (should add up to the length of X).
threshold: The threshold for a match as a euclidean distance.
minimum_overlap: The minimum overlap between two files to qualify as a match.
pct_probe: The minimum percentage of sublists to search for matches. The larger the
value, the more exhaustive the search.
faiss_cache_path: If provided, load any existing faiss index from this path; if
it does not exist, save the generated faiss index to this path.
show_progress: Whether or not to show a progress bar while computing pairs
Returns:
A list of pairs of matching file indexes.
"""
assert (
counts.sum() == X.shape[0]
), "Length of counts incompatible with vectors shape."
assert (Y is None) == (
y_counts is None
), "Must provide both or neither for y, y_counts."
if X.dtype != "float32":
# Only make the copy if we have to.
X = X.astype("float32")
if Y is not None and Y.dtype != "float32":
# Only make the copy if we have to.
Y = Y.astype("float32")
lookup_ = []
for idx, count in enumerate(counts):
lookup_.extend([idx] * count)
lookup = np.array(lookup_)
if faiss_cache_path is not None and op.exists(faiss_cache_path):
LOGGER.debug("Loading cached FAISS index from %s", faiss_cache_path)
index = faiss.read_index(faiss_cache_path)
assert (
X.shape[0] == index.ntotal
), "Cached FAISS index does not match provided X."
else:
LOGGER.debug("Building FAISS index.")
index = build_index(X=X, pct_probe=pct_probe, approximate=True, use_gpu=use_gpu)
if faiss_cache_path is not None:
faiss.write_index(index, faiss_cache_path)
LOGGER.debug("FAISS index ready, start aprox search")
pairs = []
# Only use y_counts if present.
if y_counts is None:
iterator_counts = counts
M = X
else:
iterator_counts = y_counts
M = Y
for end, length, query in tqdm.tqdm(
zip(iterator_counts.cumsum(), iterator_counts, range(len(iterator_counts))),
total=len(iterator_counts),
disable=not show_progress,
desc="Vectors",
):
if length == 0:
continue
Xq = M[end - length : end]
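# FAISS range_search expects a squared-L2 radius, so square the euclidean threshold.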
lims, _, idxs = index.range_search(Xq, threshold**2)
lims = lims.astype("int32")
matched = [
match
for match in np.unique(lookup[list(set(idxs))]) # type: ignore
if match != query
or Y is not None # Only exclude self-matches when Y is not provided.
]
query_in_match: typing.Mapping[int, set] = {m: set() for m in matched}
match_in_query: typing.Mapping[int, set] = {m: set() for m in matched}
for query_idx in range(length):
for match_idx in idxs[lims[query_idx] : lims[query_idx + 1]]:
match = lookup[match_idx]
if (
match == query and Y is None
): # Skip self-matches when Y is not provided.
continue
match_in_query[match].add(match_idx)
query_in_match[match].add(query_idx)
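# The overlap for a candidate file is the smaller of the two directional
# fractions: matched query vectors / total query vectors, and matched
# candidate vectors / total candidate vectors.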
for match in matched:
overlap = min(
[
len(query_in_match[match]) / length,
len(match_in_query[match]) / counts[match],
]
)
if overlap >= minimum_overlap and overlap > 0:
if Y is None:
pairs.append(tuple(sorted([query, match])))
else:
pairs.append(tuple([query, match]))
return list(set(pairs))
def pairs_to_clusters(
ids: typing.Iterable[str],
pairs: typing.Iterable[tuple[str, str]],
strictness: typing_extensions.Literal[
"clique", "community", "component"
] = "clique",
max_clique_batch_size: int = 1000,
) -> list[ClusterAssignment]:
"""Given a list of pairs of matching files, compute sets
of cliques where all files in a clique are connected.
Args:
ids: A list of node ids (e.g., filepaths).
pairs: A list of pairs of node ids, each pair is assumed to have an edge
strictness: The level at which groups will be clustered. "component"
means that all clusters will be connected components. "community"
will select clusters of files within components that are clustered
together. "clique" will result in clusters where every file is
connected to every other file.
max_clique_batch_size: The maximum batch size for identifying
cliques.
Returns:
A list of cluster assignments (dicts with id and cluster
entries).
"""
assert strictness in ["component", "community", "clique"], "Invalid strictness."
list_ids = list(ids)
id_to_node_map = {v: i for i, v in enumerate(list_ids)}
node_to_id_map = {v: k for k, v in id_to_node_map.items()}
LOGGER.debug("Building graph.")
node_pairs = {(id_to_node_map[pair[0]], id_to_node_map[pair[1]]) for pair in pairs}
backend = get_graph_backend()
graph = backend.build_graph(len(list_ids), node_pairs)
assignments: list[ClusterAssignment] = []
cluster_index = 0
components = backend.connected_components(graph)
for component in components:
LOGGER.debug("Got component with size: %s", len(component))
if strictness == "component":
assignments.extend(
[{"id": node_to_id_map[n], "cluster": cluster_index} for n in component]
)
cluster_index += 1
continue
communities = backend.communities(graph, component)
for community_members in communities:
LOGGER.debug("Got community with size: %s", len(community_members))
if strictness == "community":
assignments.extend(
[
{"id": node_to_id_map[n], "cluster": cluster_index}
for n in community_members
]
)
cluster_index += 1
continue
for clique_members in backend.maximal_cliques(
graph,
community_members,
max_clique_batch_size=max_clique_batch_size,
):
assignments.extend(
[
{
"id": node_to_id_map[n],
"cluster": cluster_index,
}
for n in clique_members
]
)
cluster_index += 1
return assignments
================================================
FILE: perception/approximate_deduplication/_graph_backend.py
================================================
import sys
import typing
from abc import ABC, abstractmethod
class GraphBackend(ABC):
@abstractmethod
def build_graph(
self, node_count: int, edges: typing.Iterable[tuple[int, int]]
) -> typing.Any: ...
@abstractmethod
def connected_components(self, graph: typing.Any) -> list[list[int]]: ...
@abstractmethod
def communities(
self, graph: typing.Any, component: list[int]
) -> list[list[int]]: ...
@abstractmethod
def maximal_cliques(
self,
graph: typing.Any,
community_nodes: list[int],
max_clique_batch_size: int,
) -> list[list[int]]: ...
class NetworkitGraphBackend(GraphBackend):
def __init__(self):
import networkit as nk
self.nk = nk
def build_graph(
self, node_count: int, edges: typing.Iterable[tuple[int, int]]
) -> typing.Any:
graph = self.nk.Graph(node_count)
for start, end in edges:
graph.addEdge(start, end)
return graph
def connected_components(self, graph: typing.Any) -> list[list[int]]:
cc_query = self.nk.components.ConnectedComponents(graph)
cc_query.run()
return cc_query.getComponents()
def communities(self, graph: typing.Any, component: list[int]) -> list[list[int]]:
component_node_map = dict(enumerate(component))
subgraph = self.nk.graphtools.subgraphFromNodes(graph, component, compact=True)
algo = self.nk.community.PLP(subgraph, maxIterations=32)
algo.run()
communities = algo.getPartition()
return [
[component_node_map[node] for node in communities.getMembers(community)]
for community in communities.subsetSizeMap().keys()
]
def maximal_cliques(
self,
graph: typing.Any,
community_nodes: list[int],
max_clique_batch_size: int,
) -> list[list[int]]:
cliques: list[list[int]] = []
for start in range(0, len(community_nodes), max_clique_batch_size):
batch_nodes = community_nodes[start : start + max_clique_batch_size]
community_node_map = dict(enumerate(batch_nodes))
subgraph = self.nk.graphtools.subgraphFromNodes(
graph, batch_nodes, compact=True
)
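# Repeatedly peel off the largest remaining clique and remove its nodes
# until the batch subgraph is empty.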
while subgraph.numberOfNodes() > 0:
clique = self.nk.clique.MaximalCliques(subgraph, maximumOnly=True)
clique.run()
clique_members = clique.getCliques()[0]
cliques.append([community_node_map[node] for node in clique_members])
for node in clique_members:
subgraph.removeNode(node)
return cliques
class NetworkxGraphBackend(GraphBackend):
def __init__(self):
import networkx as nx
self.nx = nx
def build_graph(
self, node_count: int, edges: typing.Iterable[tuple[int, int]]
) -> typing.Any:
graph = self.nx.Graph()
graph.add_nodes_from(range(node_count))
graph.add_edges_from(edges)
return graph
def connected_components(self, graph: typing.Any) -> list[list[int]]:
return [list(component) for component in self.nx.connected_components(graph)]
def communities(self, graph: typing.Any, component: list[int]) -> list[list[int]]:
subgraph = graph.subgraph(component)
return [
list(community)
for community in self.nx.algorithms.community.asyn_lpa_communities(
subgraph, seed=0
)
]
def maximal_cliques(
self,
graph: typing.Any,
community_nodes: list[int],
max_clique_batch_size: int,
) -> list[list[int]]:
cliques: list[list[int]] = []
for start in range(0, len(community_nodes), max_clique_batch_size):
batch_nodes = community_nodes[start : start + max_clique_batch_size]
subgraph = graph.subgraph(batch_nodes).copy()
while subgraph.number_of_nodes() > 0:
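# Take the largest maximal clique, breaking ties deterministically by the
# sorted node tuple, then remove its nodes and repeat.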
clique_members = max(
self.nx.find_cliques(subgraph),
key=lambda clique: (
len(clique),
tuple(sorted(clique)),
),
)
cliques.append(list(clique_members))
subgraph.remove_nodes_from(clique_members)
return cliques
def get_graph_backend() -> GraphBackend:
if sys.platform == "darwin":
return NetworkxGraphBackend()
return NetworkitGraphBackend()
================================================
FILE: perception/approximate_deduplication/debug.py
================================================
import logging
import random
import cv2
import numpy as np
import perception.local_descriptor_deduplication as ldd
LOGGER = logging.getLogger(__name__)
# Set a fixed size for drawing, we don't have the real descriptor size.
KEYPOINT_SIZE: int = 8
def vizualize_pair(
features_1,
features_2,
ratio: float,
match_metadata=None,
local_path_col: str | None = None,
sanitized: bool = False,
include_all_points=False,
circle_size=KEYPOINT_SIZE,
):
"""Given two rows from a reference df vizualize their overlap.
Currently recalcs overlap using cv2 default logic.
Args:
features_1: The row from a reference df for one image.
features_2: The row from a reference df for the other image.
ratio: Value for ratio test, suggest re-using value from matching.
match_metadata: metadata returned from matching, if None will redo brute force matching.
local_path_col: column in df with path to the image. If None will
use the index: features_1.name and features_2.name
sanitized: if True images themselves will not be rendered, only the points.
include_all_points: if True will draw all points, not just matched points.
circle_size: size of the circle to draw around keypoints.
Returns:
An image of the two images concatenated together with matching keypoints drawn.
"""
# Set a fixed size for drawing, we don't have the real descriptor size.
if local_path_col is not None:
features_1_path = features_1[local_path_col]
features_2_path = features_2[local_path_col]
else:
features_1_path = features_1.name
features_2_path = features_2.name
img1 = np.zeros(
(features_1.dimensions[1], features_1.dimensions[0], 1), dtype="uint8"
)
img2 = np.zeros(
(features_2.dimensions[1], features_2.dimensions[0], 1), dtype="uint8"
)
if not sanitized:
try:
img1 = ldd.load_and_preprocess(
features_1_path, max_size=max(features_1.dimensions), grayscale=False
)
except Exception:
LOGGER.warning("Failed to load image %s", features_1_path)
try:
img2 = ldd.load_and_preprocess(
features_2_path, max_size=max(features_2.dimensions), grayscale=False
)
except Exception:
LOGGER.warning("Failed to load image %s", features_2_path)
if match_metadata is not None:
img_matched = viz_match_data(
features_1,
features_2,
img1,
img2,
match_metadata,
include_all_points=include_all_points,
circle_size=circle_size,
)
else:
LOGGER.warning("""No match_metadata provided, recalculating match points,
won't match perception match points.""")
img_matched = viz_brute_force(features_1, features_2, img1, img2, ratio=ratio)
return img_matched
def viz_match_data(
features_1,
features_2,
img1,
img2,
match_metadata,
include_all_points=False,
circle_size=KEYPOINT_SIZE,
):
"""Given match data viz matching points.
Args:
features_1: The row from a reference df for one image.
features_2: The row from a reference df for the other image.
img1: cv2 of first image
img2: cv2 of second image
match_metadata: metadata returned from matching, if None will redo
brute force matching.
include_all_points: if True will draw all points, not just matched points.
circle_size: size of the circle to draw around keypoints.
Returns:
cv2 img with matching keypoints drawn.
"""
# NOTE: could refactor to put matches into the correct format and use cv2.drawMatchesKnn,
# but the Python docs on the necessary class are not clear.
# Pad img1 or img2 vertically with black pixels to match the height of the other image
if img1.shape[0] > img2.shape[0]:
img2 = np.pad(
img2,
((0, img1.shape[0] - img2.shape[0]), (0, 0), (0, 0)),
mode="constant",
constant_values=0,
)
elif img1.shape[0] < img2.shape[0]:
img1 = np.pad(
img1,
((0, img2.shape[0] - img1.shape[0]), (0, 0), (0, 0)),
mode="constant",
constant_values=0,
)
# draw two images h concat:
img_matched = np.concatenate((img1, img2), axis=1)
overlay = img_matched.copy()
if include_all_points:
# draw all points in kp_1
for k in features_1["keypoints"]:
new_color = (
random.randint(0, 255),
random.randint(0, 255),
random.randint(0, 255),
)
# Draw semi transparent circle
cv2.circle(img_matched, (int(k[0]), int(k[1])), circle_size, new_color, 1)
# draw all points in kp_2
for k in features_2["keypoints"]:
new_color = (
random.randint(0, 255),
random.randint(0, 255),
random.randint(0, 255),
)
cv2.circle(
img_matched,
(int(k[0] + features_1.dimensions[0]), int(k[1])),
circle_size,
new_color,
1,
)
# draw lines between matching points
for i in range(len(match_metadata["final_matched_b_pts"])):
new_color = (
random.randint(0, 255),
random.randint(0, 255),
random.randint(0, 255),
)
a_pt = (
int(match_metadata["final_matched_a_pts"][i][0]),
int(match_metadata["final_matched_a_pts"][i][1]),
)
b_pt = (
int(match_metadata["final_matched_b_pts"][i][0] + features_1.dimensions[0]),
int(match_metadata["final_matched_b_pts"][i][1]),
)
cv2.circle(img_matched, a_pt, circle_size, new_color, 1)
cv2.circle(img_matched, b_pt, circle_size, new_color, 1)
cv2.line(
img_matched,
a_pt,
b_pt,
new_color,
1,
)
# Re-overlay original image to add some transparency effect to lines and circles.
alpha = 0.4 # Transparency factor.
# Following line overlays transparent rectangle over the image
img_matched = cv2.addWeighted(overlay, alpha, img_matched, 1 - alpha, 0)
return img_matched
def viz_brute_force(features_1, features_2, img1, img2, ratio: float):
"""
Given two rows from a reference df vizualize their overlap.
NOTE: It redoes matching using cv2 bruteforce, so will not match the same
as the perception matching code.
Args:
features_1: The row from a reference df for one image.
features_2: The row from a reference df for the other image.
img1: cv2 of first image
img2: cv2 of second image
ratio: Value for ratio test, suggest re-using value from matching.
Returns:
An image of the two images concatenated together with matching keypoints drawn.
"""
# Convert numpy keypoints to cv2.KeyPoints
kp1_fixed = []
for k in features_1["keypoints"]:
kp1_fixed.append(cv2.KeyPoint(k[0], k[1], KEYPOINT_SIZE))
kp2_fixed = []
for k in features_2["keypoints"]:
kp2_fixed.append(cv2.KeyPoint(k[0], k[1], KEYPOINT_SIZE))
brute_force_matcher = cv2.BFMatcher()
kn_matches = brute_force_matcher.knnMatch(
features_1["descriptors"], features_2["descriptors"], k=2
)
# Apply ratio test
good = []
for nearest_match, next_nearest_match in kn_matches:
if nearest_match.distance < ratio * next_nearest_match.distance:
good.append([nearest_match])
img_matched = cv2.drawMatchesKnn( # type: ignore[call-overload]
img1,
kp1_fixed,
img2,
kp2_fixed,
good,
None,
flags=cv2.DrawMatchesFlags_DRAW_RICH_KEYPOINTS,
)
return img_matched
================================================
FILE: perception/approximate_deduplication/index.py
================================================
import time
import typing
import warnings
import faiss
import numpy as np
import pandas as pd
import typing_extensions
import perception.hashers.tools as pht
class QueryInput(typing_extensions.TypedDict):
id: str
hash: str
class QueryMatch(typing_extensions.TypedDict):
id: typing.Any
matches: list[dict]
class TuningFailure(Exception):
pass
class QueryDecodingFailure(Exception):
pass
def build_query(table, ids, paramstyle, columns):
query = "SELECT {} FROM {} WHERE id in {}"
if paramstyle == "pyformat":
sql = query.format(",".join(columns), table, "%(ids)s")
params = {"ids": tuple(ids)}
elif paramstyle == "qmark":
params = ids
sql = query.format(",".join(columns), table, f"({','.join('?' * len(ids))})")
else:
raise NotImplementedError("Unsupported paramstyle.")
return sql, params
def query_by_id(con, table, ids, paramstyle, extra_columns=None) -> pd.DataFrame:
"""Get data from the database.
Args:
con: A connection to the database
table: The table in which to look up hashes
ids: The list of IDs to pull
paramstyle: The paramstyle for the database
extra_columns: A list of additional (non-ID) columns to pull.
"""
columns = ["id"]
if extra_columns is not None:
columns += extra_columns
if isinstance(ids, np.ndarray):
# If it's a numpy array, coerce to a list.
ids = ids.tolist()
dfs = []
batch_size = 1000
for start in range(0, len(ids), batch_size):
sql, params = build_query(
table=table,
ids=ids[start : start + batch_size],
paramstyle=paramstyle,
columns=columns,
)
dfs.append(pd.read_sql(con=con, sql=sql, params=params))
return pd.concat(dfs, ignore_index=True).set_index("id")
class ApproximateNearestNeighbors:
"""A wrapper for a FAISS index.
Args:
con: A database connection from which to obtain metadata for
matched hashes.
table: The table in the database that we should query for metadata.
paramstyle: The parameter style for the given database
index: A FAISS index (or filepath to a FAISS index)
hash_length: The length of the hash that is being matched against.
metadata_columns: The metadata that should be returned for queries.
dtype: The data type for the vectors
distance_metric: The distance metric for the vectors
"""
def __init__(
self,
con,
table,
paramstyle,
index,
hash_length,
metadata_columns=None,
dtype="uint8",
distance_metric="euclidean",
):
assert (
dtype == "uint8"
), "Only unsigned 8-bit integer hashes are supported at this time."
assert (
distance_metric == "euclidean"
), "Only euclidean distance is supported at this time."
if isinstance(index, str):
index = faiss.read_index(index)
self.con = con
self.index = index
self.distance_metric = distance_metric
self.hash_length = hash_length
self.dtype = dtype
self.table = table
self.metadata_columns = metadata_columns
self.paramstyle = paramstyle
assert (
self.index.d == self.hash_length
), "Index is incompatible with hash length."
@classmethod
def from_database(
cls,
con,
table,
paramstyle,
hash_length,
ids_train=None,
train_size=None,
chunksize=100000,
metadata_columns=None,
index=None,
gpu=False,
dtype="uint8",
distance_metric="euclidean",
):
"""Train and build a FAISS index from a database connection.
Args:
con: A database connection from which to obtain metadata for
matched hashes.
table: The table in the database that we should query for metadata.
paramstyle: The parameter style for the given database
hash_length: The length of the hash that is being matched against.
ids_train: The IDs for the vectors to train on.
train_size: The number of vectors to use for training. Will be
randomly selected from 1 to the number of vectors in the database.
Ignored if ids_train is not None.
chunksize: The chunks of data to draw from the database at a time
when adding vectors to the index.
metadata_columns: The metadata that should be returned for queries.
index: If a pretrained index is provided, training will be skipped,
any existing vectors will be discarded, and the index will be
repopulated with the current contents of the database.
gpu: If true, will attempt to carry out training on a GPU.
dtype: The data type for the vectors
distance_metric: The distance metric for the vectors
"""
assert (
dtype == "uint8"
), "Only unsigned 8-bit integer hashes are supported at this time."
assert (
distance_metric == "euclidean"
), "Only euclidean distance is supported at this time."
if index is None:
# Train the index using the practices from
# https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#if-below-1m-vectors-ivfx
ntotal = pd.read_sql(
sql="select count(*) as count from hashes", con=con
).iloc[0]["count"]
assert (
train_size <= ntotal
), "Cannot train on more hashes than are available."
nlist = int(min(4 * np.sqrt(ntotal), ntotal / 39))
min_train_size = 39 * nlist
if ids_train is not None:
train_size = len(ids_train)
if train_size is None:
train_size = min_train_size
assert (
train_size >= min_train_size
), f"Training an index used for {ntotal} hashes requires at least {min_train_size} training hashes."
if ids_train is None:
ids_train = np.random.choice(
np.arange(ntotal), size=train_size, replace=False
)
df_train = query_by_id(
con=con,
table=table,
ids=ids_train,
paramstyle=paramstyle,
extra_columns=["hash"],
)
x_train = np.array(
[np.frombuffer(h, dtype=dtype) for h in df_train["hash"]]
).astype("float32")
assert x_train.shape[1] == hash_length, "Hashes are of incorrect length."
index = faiss.IndexIVFFlat(
faiss.IndexFlatL2(hash_length), hash_length, nlist
)
if gpu:
res = faiss.StandardGpuResources()
gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
gpu_index.train(x_train)
index = faiss.index_gpu_to_cpu(gpu_index)
else:
index.train(x_train)
else:
index.reset()
# Add hashes to the index in chunks.
for df_add in pd.read_sql(
sql=f"SELECT id, hash FROM {table}", con=con, chunksize=chunksize
):
x_add = np.array(
[np.frombuffer(h, dtype=dtype) for h in df_add["hash"]]
).astype("float32")
index.add_with_ids(x_add, df_add["id"].values)
return cls(
con=con,
index=index,
hash_length=hash_length,
distance_metric=distance_metric,
dtype=dtype,
table=table,
paramstyle=paramstyle,
metadata_columns=metadata_columns,
)
def query_by_id(
self, ids, include_metadata=True, include_hashes=False
) -> pd.DataFrame:
"""Get data from the database.
Args:
ids: The hash IDs to get from the database.
include_metadata: Whether to include metadata columns.
include_hashes: Whether to include the hashes
"""
if not self.metadata_columns and include_metadata and not include_hashes:
# There won't be anything to return.
return pd.DataFrame()
extra_columns = []
if self.metadata_columns and include_metadata:
extra_columns += self.metadata_columns
if include_hashes:
extra_columns += ["hash"]
return query_by_id(
con=self.con,
table=self.table,
ids=ids,
paramstyle=self.paramstyle,
extra_columns=extra_columns,
)
def string_to_vector(self, s: str, hash_format="base64") -> np.ndarray:
"""Convert a string to vector form.
Args:
s: The hash string
hash_format: The format for the hash string
"""
return pht.string_to_vector(
s, hash_format=hash_format, dtype=self.dtype, hash_length=self.hash_length
)
def vector_to_string(self, vector, hash_format="base64") -> str | None:
"""Convert a vector back to string
Args:
vector: The hash vector
hash_format: The format for the hash
"""
return pht.vector_to_string(vector, dtype=self.dtype, hash_format=hash_format)
def search(
self,
queries: list[QueryInput],
threshold: int | None = None,
threshold_func: typing.Callable[[np.ndarray], np.ndarray] | None = None,
hash_format="base64",
k=1,
):
"""Search the index and return matches.
Args:
queries: A list of queries in the form of {"id": <id>, "hash": "<hash_string>"}
threshold: The threshold to use for matching. Takes precedence over threshold_func.
threshold_func: A function that, given a query vector, returns the desired match threshold for that query.
hash_format: The hash format used for the strings in the query.
k: The number of nearest neighbors to return.
Returns:
Matches in the form of a list of dicts of the form:
{ "id": <query ID>, "matches": [{"distance": <distance>, "id": <match ID>, "metadata": {}}]}
The metadata consists of the contents of the metadata columns specified for this matching
instance.
"""
try:
xq = np.array(
[
self.string_to_vector(h["hash"], hash_format=hash_format)
for h in queries
]
).astype("float32")
except Exception as exc:
raise QueryDecodingFailure("Failed to parse hash query.") from exc
thresholds: np.ndarray = np.ones((len(xq), 1)) * np.inf
if threshold:
thresholds = np.ones((len(xq), 1)) * threshold
elif threshold_func:
thresholds = threshold_func(xq)
distances, indices = self.index.search(xq, k=k)
distances = np.sqrt(distances)
metadata = (
None
if not self.metadata_columns
else self.query_by_id(ids=np.unique(indices[distances < thresholds]))
)
matches: list[QueryMatch] = []
for match_distances, match_ids, q, q_threshold in zip(
distances, indices, queries, thresholds
):
match_filter = match_distances < q_threshold
match_ids = match_ids[match_filter]
match_distances = match_distances[match_filter]
match: QueryMatch = {"id": q["id"], "matches": []}
for match_id, distance in zip(match_ids, match_distances):
entry = {"distance": float(distance), "id": match_id}
if metadata is not None:
entry["metadata"] = metadata.loc[match_id].to_dict()
match["matches"].append(entry)
matches.append(match)
return matches
def tune(self, n_query=100, min_recall=99, max_noise=3):
"""Obtain minimum value for nprobe that achieves a target level of recall.
Args:
n_query: The number of hashes to use as test hashes.
min_recall: The minimum desired recall for the index.
max_noise: The maximum amount of noise to add to each test hash
Returns:
A tuple of recall, latency (in ms), and nprobe where the nprobe
value is the one that achieved the resulting recall.
Raises:
TuningFailure if no suitable nprobe value is found.
"""
assert (
n_query <= self.ntotal
), "Cannot use a test larger than ntotal (total number of hashes)."
# Pick a random set of query hashes
ids = np.random.choice(
np.arange(1, self.ntotal + 1), size=n_query, replace=False
)
df = self.query_by_id(ids, include_metadata=False, include_hashes=True)
xq = np.array(
[np.frombuffer(v, dtype=self.dtype) for v in df["hash"]], dtype=np.uint8
)
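# Perturb each hash element by up to max_noise, with the bounds clipped so
# the result stays within the valid uint8 range.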
noise = np.random.randint(
low=(-xq.astype("int32")).clip(-max_noise, max_noise),
high=(255 - xq.astype("float32")).clip(-max_noise, max_noise),
)
xq = (xq.astype("int32") + noise).astype("uint8").astype("float32")
if min_recall == 100:
warnings.warn(
"100% recall can only be ensured with exhaustive search.", UserWarning
)
self.set_nprobe(self.nlist)
start = time.time()
self.index.search(xq, k=1)
latency = time.time() - start
return (100, 1000 * latency, self.nlist)
# Make the search exhaustive so we get ground truth.
self.set_nprobe(self.nlist)
_, expected = self.index.search(xq, k=1)
for nprobe in range(1, self.nlist):
self.set_nprobe(nprobe)
start = time.time()
_, actual = self.index.search(xq, k=1)
latency = time.time() - start
recall = 100 * (actual[:, 0] == expected).sum() / xq.shape[0]
if recall >= min_recall:
break
else:
# If we never break, it means we never reached the target recall
# for this query.
raise TuningFailure(
"Failed to find suitable parameters for selected recall."
)
return recall, 1000 * latency, nprobe
def save(self, filepath):
"""Save an index to disk.
Args:
filepath: Where to save the index.
"""
faiss.write_index(self.index, filepath)
def set_nprobe(self, nprobe) -> int:
"""Set the value of nprobe.
Args:
nprobe: The new value for nprobe
"""
faiss.ParameterSpace().set_index_parameter(self.index, "nprobe", nprobe)
return faiss.downcast_index(self.index).nprobe
@property
def nlist(self):
"""The number of lists in the index."""
return faiss.downcast_index(self.index).nlist
@property
def nprobe(self):
"""The current value of nprobe."""
return faiss.downcast_index(self.index).nprobe
@property
def ntotal(self):
"""The number of vectors in the index."""
return self.index.ntotal
================================================
FILE: perception/approximate_deduplication/serve.py
================================================
import asyncio
import functools
import json
import logging
import typing
import aiohttp.web
import numpy as np
from pythonjsonlogger import jsonlogger
import perception.hashers.tools as pht
from .index import ApproximateNearestNeighbors
def is_similarity_valid(data, index: ApproximateNearestNeighbors):
"""Validates input to the similarity endpoint."""
hash_format = data.get("hash_format", "base64")
expected_string_length = pht.get_string_length(
hash_length=index.hash_length, dtype=index.dtype, hash_format=hash_format
)
return (
isinstance(data, dict)
and "queries" in data
and isinstance(data["queries"], list)
and all(isinstance(x.get("hash", None), str) for x in data["queries"])
and hash_format in ["hex", "base64"]
and all(
len(x.get("hash", None)) == expected_string_length for x in data["queries"]
)
)
async def similarity(request):
"""Responds to a vector similarity query of the form:
```
{
"queries": [{"id": str, "hash": "base64_encoded_hash1"}, ...],
"k": int,
"threshold": float,
"hash_format": "base64"
}
```
with information about similar vectors in the index in the form:
```
{
"queries": [{"id": str, "matches": [{"metadata": {json metadata}, "distance": float},...],...]
}
```
"""
try:
request_data = await request.json()
except json.JSONDecodeError:
return aiohttp.web.json_response({"reason": "Malformed JSON"}, status=400)
index = request.app["index"]
try:
assert is_similarity_valid(request_data, index)
except Exception:
return aiohttp.web.json_response({"reason": "Invalid JSON request"}, status=400)
async with request.app["query_semaphore"]:
matches = await asyncio.get_event_loop().run_in_executor(
None,
functools.partial(
index.search,
queries=request_data["queries"],
threshold=request_data.get(
"threshold", request.app["default_threshold"]
),
threshold_func=request.app["default_threshold_func"],
k=request_data.get("k", request.app["default_k"]),
hash_format=request_data.get("hash_format", "base64"),
),
)
matches = json.loads(json.dumps({"queries": matches}))
return aiohttp.web.json_response(matches)
def get_logger(name, log_level):
logger = logging.Logger(name=name, level=log_level)
handler = logging.StreamHandler()
handler.setFormatter(
jsonlogger.JsonFormatter(
"%(asctime)s:%(levelname)s:%(name)s:%(message)s%(exc_info)"
)
)
logger.addHandler(handler)
return logger
async def serve(
index: ApproximateNearestNeighbors,
default_threshold: int | None = None,
default_threshold_func: typing.Callable[[np.ndarray], np.ndarray] | None = None,
default_k: int = 1,
concurrency: int = 2,
log_level=logging.INFO,
host="localhost",
port=8080,
):
"""Serve an index as a web API. This function does not block.
If you wish to use the function in a blocking manner, you can
do something like
.. code-block:: python
loop = asyncio.get_event_loop()
loop.run_until_complete(serve(...))
loop.run_forever()
You can query the API with something like:
.. code-block:: bash
curl --header "Content-Type: application/json" \\
--request POST \\
--data '{"queries": [{"hash": "<hash string>", "id": "bar"}], "threshold": 1200}' \\
http://localhost:8080/v1/similarity
Args:
index: The underlying index
default_threshold: The default threshold for matches
default_threshold_func: A function that, given the query vectors, returns the match threshold for each query
default_k: The default number of nearest neighbors to look for
concurrency: The number of concurrent requests served
log_level: The log level to use for the logger
host: The host for the service
port: The port for the service
"""
logger = get_logger(name="serve", log_level=log_level)
logger.info("Initializing web service")
app = aiohttp.web.Application()
app.router.add_post("/v1/similarity", similarity, name="similarity")
# Store globals in the application object
app["default_threshold"] = default_threshold
app["logger"] = logger
app["default_k"] = default_k
app["default_threshold_func"] = default_threshold_func
app["index"] = index
app["query_semaphore"] = asyncio.Semaphore(concurrency)
logger.info("Entering web service listener loop.")
runner = aiohttp.web.AppRunner(app, logger=logger)
await runner.setup()
site = aiohttp.web.TCPSite(runner, host, port)
await site.start()
return site
================================================
FILE: perception/benchmarking/__init__.py
================================================
from perception.benchmarking import video_transforms
from perception.benchmarking import video
from perception.benchmarking import image
from perception.benchmarking.image import (
BenchmarkImageDataset,
BenchmarkImageTransforms,
)
from perception.benchmarking.video import (
BenchmarkVideoDataset,
BenchmarkVideoTransforms,
)
from perception.benchmarking.common import BenchmarkHashes
__all__ = [
"BenchmarkImageDataset",
"BenchmarkImageTransforms",
"BenchmarkVideoDataset",
"BenchmarkVideoTransforms",
"BenchmarkHashes",
"video_transforms",
"video",
"image",
]
================================================
FILE: perception/benchmarking/common.py
================================================
import itertools
import logging
import os
import shutil
import tempfile
import uuid
import warnings
import zipfile
from abc import ABC
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from scipy import spatial, stats
from ..hashers.tools import compute_md5, string_to_vector
try:
from . import extensions # type: ignore
except ImportError:
warnings.warn(
"C extensions were not built. Some metrics will be computed more slowly. "
"Please install from wheels or set up a compiler prior to installation "
"from source to use extensions."
)
extensions = None
log = logging.getLogger(__name__)
def create_mask(transformed_guids, noop_guids):
"""Given a list of transformed guids and noop guids,
computes an MxN array indicating whether noop n has the same guid
as transform m. Used for applying a mask to a distance matrix
for efficient computation of recall at different thresholds.
Args:
transformed_guids: An iterable of transformed guids
noop_guids: An iterable of noop guids
Returns:
A boolean array of shape
`(len(transformed_guids), len(noop_guids))`
"""
n_noops = len(noop_guids)
previous_guid = None
start = None
end = 0
mask = np.zeros((len(transformed_guids), len(noop_guids)), dtype="bool")
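# The sliding window below assumes both guid sequences are grouped by guid in
# the same order, so each transform's matching noop columns form a single
# contiguous block.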
for current_guid, row in zip(transformed_guids, mask):
if previous_guid is None or current_guid != previous_guid:
start = end
end = start + next(
(
other_index
for other_index, guid in enumerate(noop_guids[start:])
if guid != current_guid
),
n_noops,
)
previous_guid = current_guid
row[start:end] = True
return mask
def compute_threshold_precision_recall(pos, neg, precision_threshold=99.9):
# Sort both arrays according to the positive distance
neg = neg[pos.argsort()]
pos = pos[pos.argsort()]
# Compute false-positive rate for every value in pos
tp = np.arange(1, len(pos) + 1)
fp = np.array([(neg <= t).sum() for t in pos])
precision = 100 * tp / (tp + fp)
# Choose the optimal threshold
bad_threshold_idxs = np.where(precision < precision_threshold)[0]
if len(bad_threshold_idxs) > 0 and bad_threshold_idxs[0] > 0:
optimal_threshold = pos[bad_threshold_idxs[0] - 1]
recovered = (pos <= optimal_threshold).sum()
if recovered == 0:
optimal_precision = np.nan
else:
optimal_precision = precision[pos <= optimal_threshold].min()
optimal_recall = round(100 * recovered / len(pos), 3)
elif len(bad_threshold_idxs) > 0:
# The closest hash was a false positive.
optimal_threshold = pos[0]
optimal_recall = 0
optimal_precision = np.nan
else:
optimal_precision = 100
optimal_threshold = pos.max()
optimal_recall = 100
return optimal_threshold, optimal_precision, optimal_recall
class Filterable(ABC):
_df: pd.DataFrame
expected_columns: list
def __init__(self, df):
assert sorted(df.columns) == sorted(
self.expected_columns
), f"Column mismatch: Expected {sorted(self.expected_columns)}, found {sorted(df.columns)}."
self._df = df
@property
def categories(self):
"""The categories included in the dataset"""
return self._df["category"].unique()
def filter(self, **kwargs):
"""Obtain a new dataset filtered with the given
keyword arguments."""
df = self._df.copy()
for field, included in kwargs.items():
existing = self._df[field].unique()
if not all(inc in existing for inc in included):
missing = ", ".join(
[str(inc) for inc in included if inc not in existing]
)
message = f"Did not find {missing} in column {field} dataset."
warnings.warn(message, UserWarning)
df = df[df[field].isin(included)]
return self.__class__(df.copy())
class Saveable(Filterable):
@classmethod
def load(
cls,
path_to_zip_or_directory: str,
storage_dir: str | None = None,
verify_md5=True,
):
"""Load a dataset from a ZIP file or directory.
Args:
path_to_zip_or_directory: Pretty self-explanatory
storage_dir: If providing a ZIP file, where to extract
the contents. If None, contents will be extracted to
a folder with the same name as the ZIP file in the
same directory as the ZIP file.
verify_md5: Verify md5s when loading
"""
# Load index whether from inside ZIP file or from directory.
if os.path.splitext(path_to_zip_or_directory)[1] == ".zip":
if storage_dir is None:
storage_dir = os.path.join(
os.path.dirname(os.path.abspath(path_to_zip_or_directory)),
os.path.splitext(os.path.basename(path_to_zip_or_directory))[0],
)
os.makedirs(storage_dir, exist_ok=True)
with zipfile.ZipFile(path_to_zip_or_directory, "r") as z:
# Try extracting only the index at first so we can
# compare md5.
z.extract("index.csv", os.path.join(storage_dir))
index: pd.DataFrame = pd.read_csv(
os.path.join(storage_dir, "index.csv")
)
index["filepath"] = index["filename"].apply(
lambda fn: (
os.path.join(storage_dir, fn) if not pd.isnull(fn) else None
)
)
do_zip_extraction = True
if index["filepath"].apply(os.path.isfile).all():
if verify_md5:
do_zip_extraction = not all(
row["md5"] == compute_md5(row["filepath"])
for _, row in tqdm.tqdm(
index.iterrows(), desc="Checking cache"
)
)
else:
do_zip_extraction = False
if do_zip_extraction:
z.extractall(storage_dir)
else:
log.info("Found all files already extracted. Skipping extraction.")
verify_md5 = False
else:
assert (
storage_dir is None
), "Storage directory only valid if path is to ZIP file."
index = pd.read_csv(os.path.join(path_to_zip_or_directory, "index.csv"))
index["filepath"] = index["filename"].apply(
lambda fn: (
os.path.join(path_to_zip_or_directory, fn)
if not pd.isnull(fn)
else None
)
)
if verify_md5:
assert all(
row["md5"] == compute_md5(row["filepath"])
for _, row in tqdm.tqdm(
index.iterrows(),
desc="Performing final md5 integrity check.",
total=len(index.index),
)
), "An md5 mismatch has occurred."
return cls(index.drop(["filename", "md5"], axis=1))
def save(self, path_to_zip_or_directory):
"""Save a dataset to a directory or ZIP file.
Args:
path_to_zip_or_directory: Path to the output ZIP archive or directory.
"""
df = self._df
assert "filepath" in df.columns, "Index dataframe must contain filepath."
# Build index using filename instead of filepath.
index = df.copy()
index["filename"] = df["filepath"].apply(
lambda filepath: (
os.path.basename(filepath) if not pd.isnull(filepath) else None
)
)
if index["filename"].dropna().duplicated().sum() > 0:
warnings.warn("Changing filenames to UUID due to duplicates.", UserWarning)
index["filename"] = [
(
str(uuid.uuid4()) + os.path.splitext(row["filename"])[1]
if not pd.isnull(row["filename"])
else None
)
for _, row in index.iterrows()
]
index["md5"] = [
compute_md5(filepath) if not pd.isnull(filepath) else None
for filepath in tqdm.tqdm(index["filepath"], desc="Computing md5s.")
]
# Add all files as well as the dataframe index to
# a ZIP file if path is to ZIP file or to the directory if it is
# not a ZIP file.
if os.path.splitext(path_to_zip_or_directory)[1] == ".zip":
with zipfile.ZipFile(path_to_zip_or_directory, "w") as f:
with tempfile.TemporaryFile(mode="w+") as index_file:
index.drop("filepath", axis=1).to_csv(index_file, index=False)
index_file.seek(0)
f.writestr("index.csv", index_file.read())
for _, row in tqdm.tqdm(
index.iterrows(), desc="Saving files", total=len(df)
):
if pd.isnull(row["filepath"]):
# There was an error associated with this file.
continue
f.write(row["filepath"], row["filename"])
else:
os.makedirs(path_to_zip_or_directory, exist_ok=True)
index.drop("filepath", axis=1).to_csv(
os.path.join(path_to_zip_or_directory, "index.csv"), index=False
)
for _, row in tqdm.tqdm(
index.iterrows(), desc="Saving files", total=len(df)
):
if pd.isnull(row["filepath"]):
# There was an error associated with this file.
continue
if row["filepath"] == os.path.join(
path_to_zip_or_directory, row["filename"]
):
# The source file is the same as the target file.
continue
shutil.copy(
row["filepath"],
os.path.join(path_to_zip_or_directory, row["filename"]),
)
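# A minimal usage sketch (illustrative only, not part of the library): the
# archive path is hypothetical. Saving writes an index.csv plus the files;
# loading extracts next to the archive unless storage_dir is given and
# re-verifies md5s by default.
def _example_saveable_round_trip(dataset: "Saveable"):
    dataset.save("benchmark_dataset.zip")
    return type(dataset).load("benchmark_dataset.zip", verify_md5=True)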
class BenchmarkHashes(Filterable):
"""A dataset of hashes for transformed images. It is essentially
a wrapper around a `pandas.DataFrame` with the following columns:
- guid
- error
- filepath
- input_filepath
- category
- transform_name
- hasher_name
- hasher_dtype
- hasher_distance_metric
- hasher_hash_length
- hash
"""
expected_columns = [
"error",
"filepath",
"hash",
"hasher_name",
"hasher_dtype",
"hasher_distance_metric",
"category",
"guid",
"input_filepath",
"transform_name",
"hasher_hash_length",
]
def __init__(self, df: pd.DataFrame):
super().__init__(df)
self._metrics: pd.DataFrame | None = None
def __add__(self, other):
return BenchmarkHashes(df=pd.concat([self._df, other._df]).drop_duplicates())
def __radd__(self, other):
return self.__add__(other)
@classmethod
def load(cls, filepath: str):
return cls(pd.read_csv(filepath))
def save(self, filepath):
self._df.to_csv(filepath, index=False)
def compute_metrics(
self, custom_distance_metrics: dict | None = None
) -> pd.DataFrame:
if self._metrics is not None:
return self._metrics
metrics = []
hashsets = self._df.sort_values("guid")
n_dropped = hashsets["hash"].isnull().sum()
if n_dropped > 0:
hashsets = hashsets.dropna(subset=["hash"])
warnings.warn(f"Dropping {n_dropped} invalid / empty hashes.", UserWarning)
for (hasher_name, transform_name, category), hashset in tqdm.tqdm(
hashsets.groupby(["hasher_name", "transform_name", "category"]),
desc="Computing metrics.",
):
# Note the guid filtering below. We need to include only guids
# for which we have both the transformed hash *and* the noop hash.
# One of them may have been dropped due to being invalid.
noops = hashsets[
(hashsets["transform_name"] == "noop")
& (hashsets["hasher_name"] == hasher_name)
& (hashsets["guid"].isin(hashset["guid"]))
]
valid_hashset = hashset[hashset["guid"].isin(noops["guid"])]
dtype, distance_metric, hash_length = valid_hashset.iloc[0][
["hasher_dtype", "hasher_distance_metric", "hasher_hash_length"]
]
n_noops = len(noops.guid)
n_hashset = len(valid_hashset.guid)
noop_guids = noops.guid.values
mask = create_mask(valid_hashset.guid.values, noops.guid.values)
if distance_metric != "custom":
X_trans = np.array(
valid_hashset.hash.apply(
string_to_vector, # type: ignore[arg-type]
hash_length=int(hash_length),
dtype=dtype,
hash_format="base64",
).tolist()
)
X_noop = np.array(
noops.hash.apply(
string_to_vector, # type: ignore[arg-type]
dtype=dtype,
hash_format="base64",
hash_length=int(hash_length),
).tolist()
)
if (
distance_metric != "euclidean"
or "int" not in dtype
or extensions is None
):
distance_matrix = spatial.distance.cdist(
XA=X_trans, XB=X_noop, metric=distance_metric
)
distance_to_closest_image = distance_matrix.min(axis=1)
distance_to_correct_image = np.ma.masked_array(
distance_matrix, np.logical_not(mask)
).min(axis=1)
distance_matrix_incorrect_image: np.ndarray = np.ma.masked_array(
distance_matrix, mask
)
distance_to_incorrect_image = distance_matrix_incorrect_image.min(
axis=1
)
closest_incorrect_guid = noop_guids[
distance_matrix_incorrect_image.argmin(axis=1)
]
else:
distances, indexes = extensions.compute_euclidean_metrics(
X_noop.astype("int32"), X_trans.astype("int32"), mask
)
distance_to_correct_image = distances[:, 1]
distance_to_incorrect_image = distances[:, 0]
distance_to_closest_image = distances.min(axis=1)
closest_incorrect_guid = [noop_guids[idx] for idx in indexes[:, 0]]
else:
assert (
custom_distance_metrics is not None
and hasher_name in custom_distance_metrics
), f"You must provide a custom distance metric for {hasher_name}."
noops_hash_values = noops.hash.values
hashset_hash_values = valid_hashset.hash.values
distance_matrix = np.zeros((n_hashset, n_noops))
distance_function = custom_distance_metrics[hasher_name]
for i1, i2 in itertools.product(range(n_hashset), range(n_noops)):
distance_matrix[i1, i2] = distance_function(
hashset_hash_values[i1], noops_hash_values[i2]
)
distance_to_closest_image = distance_matrix.min(axis=1)
distance_to_correct_image = np.ma.masked_array(
distance_matrix, np.logical_not(mask)
).min(axis=1)
distance_matrix_incorrect_image = np.ma.masked_array(
distance_matrix, mask
)
distance_to_incorrect_image = distance_matrix_incorrect_image.min(
axis=1
)
closest_incorrect_guid = noop_guids[
distance_matrix_incorrect_image.argmin(axis=1)
]
metrics.append(
pd.DataFrame(
{
"guid": valid_hashset["guid"].values,
"transform_name": transform_name,
"hasher_name": hasher_name,
"category": category,
"distance_to_closest_correct_image": distance_to_correct_image,
"distance_to_closest_incorrect_image": distance_to_incorrect_image,
"distance_to_closest_image": distance_to_closest_image,
"closest_incorrect_guid": closest_incorrect_guid,
}
)
)
metrics_df = pd.concat(metrics)
self._metrics = metrics_df
return metrics_df
def show_histograms(self, grouping=None, precision_threshold=99.9, **kwargs):
"""Plot histograms for true and false positives, similar
to https://tech.okcupid.com/evaluating-perceptual-image-hashes-okcupid/
Additional arguments passed to compute_metrics.
Args:
grouping: List of fields to group by. By default, all fields are used
(category and transform_name).
"""
if grouping is None:
grouping = ["category", "transform_name"]
metrics = self.compute_metrics(**kwargs)
hasher_names = metrics["hasher_name"].unique().tolist()
bounds = (
metrics.groupby("hasher_name")[
["distance_to_closest_image", "distance_to_closest_incorrect_image"]
]
.max()
.max(axis=1)
)
if grouping:
group_names = [
":".join(map(str, row.values))
for idx, row in metrics[grouping].drop_duplicates().iterrows()
]
else:
group_names = [""]
ncols = len(hasher_names)
nrows = len(group_names)
fig, axs = plt.subplots(
ncols=ncols, nrows=nrows, figsize=(ncols * 4, nrows * 3), sharey=True
)
for group_name, subset in metrics.groupby(["hasher_name"] + grouping):
# Get names of group and hasher
if grouping:
hasher_name = group_name[0]
group_name = ":".join(map(str, group_name[1:]))
else:
hasher_name = group_name
group_name = ""
# Get the correct axis.
colIdx = hasher_names.index(hasher_name)
rowIdx = group_names.index(group_name)
if ncols > 1 and nrows > 1:
ax = axs[rowIdx, colIdx]
elif ncols == 1 and nrows == 1:
ax = axs
else:
ax = axs[rowIdx if nrows > 1 else colIdx]
# Plot the charts
inner_keys = ["guid"] + (
["transform_name"] if "transform_name" in subset.columns else []
)
pos, neg = (
subset.groupby(inner_keys)[
[
"distance_to_closest_correct_image",
"distance_to_closest_incorrect_image",
]
]
.min()
.values.T
)
optimal_threshold, _, optimal_recall = compute_threshold_precision_recall(
pos=pos, neg=neg, precision_threshold=precision_threshold
)
optimal_threshold = optimal_threshold.round(3)
emd = stats.wasserstein_distance(pos, neg).round(2)
ax.hist(neg, label="neg", bins=10)
ax.hist(pos, label="pos", bins=10)
ax.text(
0.5,
0.5,
f"Recall: {optimal_recall:.0f}% @ {optimal_threshold}\nemd: {emd:.2f}",
horizontalalignment="center",
color="black",
verticalalignment="center",
transform=ax.transAxes,
fontsize=12,
fontweight=1000,
)
ax.set_xlim(-0.05 * bounds[hasher_name], bounds[hasher_name])
if rowIdx == 0:
ax.set_title(hasher_name)
ax.legend()
if colIdx == 0:
ax.set_ylabel(group_name)
fig.tight_layout()
def compute_threshold_recall(
self, precision_threshold=99.9, grouping=None, **kwargs
) -> pd.DataFrame:
"""Compute a table for threshold and recall for each category, hasher,
and transformation combinations. Additional arguments passed to compute_metrics.
Args:
precision_threshold: The precision threshold to use
for choosing a distance threshold for each hasher.
grouping: List of fields to group by. By default, all fields are used
(category and transform_name).
Returns:
A pandas DataFrame indexed by the grouping fields and hasher_name.
The key columns are threshold (the optimal distance threshold for
detecting a match for this combination), recall (the number of correct
matches divided by the number of possible matches), and precision
(the number of correct matches divided by the total number of matches,
whether correct or incorrect).
"""
if grouping is None:
grouping = ["category", "transform_name"]
def group_func(subset):
inner_keys = ["guid"] + (
["transform_name"] if "transform_name" in subset.columns else []
)
pos, neg = (
subset.groupby(inner_keys)[
[
"distance_to_closest_correct_image",
"distance_to_closest_incorrect_image",
]
]
.min()
.values.T
)
(
optimal_threshold,
optimal_precision,
optimal_recall,
) = compute_threshold_precision_recall(
pos=pos, neg=neg, precision_threshold=precision_threshold
)
return pd.Series(
{
"threshold": optimal_threshold,
"recall": optimal_recall,
"precision": optimal_precision,
"n_exemplars": len(subset),
}
)
return (
self.compute_metrics(**kwargs)
.groupby(grouping + ["hasher_name"])
.apply(group_func)
)
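# A minimal usage sketch (illustrative only, not part of the library):
# summarize recall at a target precision, grouped only by transform.
def _example_threshold_recall(hashes: "BenchmarkHashes"):
    return hashes.compute_threshold_recall(
        precision_threshold=99.9, grouping=["transform_name"]
    )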
class BenchmarkDataset(Saveable):
"""A dataset of images separated into
categories. It is essentially a wrapper around a pandas
dataframe with the following columns:
- filepath
- category
"""
expected_columns = ["filepath", "category"]
@classmethod
def from_tuples(cls, files: list[tuple[str, str]]):
"""Build dataset from a set of files.
Args:
files: A list of tuples where each entry is a pair
filepath and category.
"""
df = pd.DataFrame.from_records(
[{"filepath": f, "category": c} for f, c in files]
)
return cls(df)
def transform(self, transforms, storage_dir, errors):
raise NotImplementedError()
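# A minimal usage sketch (illustrative only, not part of the library): the
# file paths and categories are hypothetical. In practice this is called on a
# concrete subclass such as BenchmarkImageDataset.
def _example_from_tuples():
    files = [("images/cat.jpg", "animals"), ("images/beach.jpg", "landscapes")]
    return BenchmarkDataset.from_tuples(files=files)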
class BenchmarkTransforms(Saveable):
"""A dataset of transformed images. Essentially wraps a DataFrame with the
following columns:
- guid
- filepath
- category
- transform_name
- input_filepath (for memo purposes only)
"""
expected_columns = [
"filepath",
"category",
"transform_name",
"input_filepath",
"guid",
]
def compute_hashes(self, hashers, max_workers):
raise NotImplementedError()
================================================
FILE: perception/benchmarking/extensions.pyx
================================================
# cython: language_level=3
import cython
import numpy as np
from cython.parallel import parallel, prange
cimport numpy as np
from libc.math cimport sqrt
from libc.stdlib cimport abort, free, malloc
cdef extern from "limits.h":
int INT_MAX
ctypedef np.uint8_t uint8
@cython.boundscheck(False)
@cython.wraparound(False)
def compute_euclidean_metrics(int[:, :] X_noop, int[:, :] X_tran, uint8[:, :] mask):
"""Compute the positive / negative distance metrics between two sets of vectors
using euclidean distance. This function obtains the necessary metrics roughly
10x faster than using scipy.spatial.distance.cdist and numpy functions.
Args:
X_noop: The vectors for the noop hashes with shape (N, K)
X_tran: The vectors for the transformed instances with shape (M, K)
mask: A (M, N) array indicating whether noop n corresponds to transform m
Returns:
distances: An M by 2 array with the closest false positive and closest
true positive for each transform.
indexes: An M by 2 array with the index for the closest false positive
noop and the closest true positive noop.
"""
cdef Py_ssize_t n_noop = X_noop.shape[0]
cdef Py_ssize_t d_noop = X_noop.shape[1]
cdef Py_ssize_t n_tran = X_tran.shape[0]
cdef Py_ssize_t d_tran = X_tran.shape[1]
cdef Py_ssize_t n_mask_tran = mask.shape[0]
cdef Py_ssize_t n_mask_noop = mask.shape[1]
cdef Py_ssize_t i_mask_tran
cdef Py_ssize_t i_mask_noop
cdef int n_pos
cdef int current_distance
cdef int current_closest_fp
cdef int current_closest_tp
cdef int[:] x
cdef int[:] y
cdef uint8 is_pos
cdef Py_ssize_t i_noop, i_tran, i_d
cdef Py_ssize_t i_closest_fp = 0
cdef Py_ssize_t i_closest_tp = 1
cdef Py_ssize_t i_closest_fp_idx = 0
cdef Py_ssize_t i_closest_tp_idx = 1
cdef int * local_buf
cdef size_t size = 5
cdef float NAN
NAN = float("NaN")
assert d_noop == d_tran, "Dimensionality of vectors must match."
assert n_mask_tran == n_tran, "Dimension 0 of mask must correspond to n_transforms."
assert n_mask_noop == n_noop, "Dimension 1 of mask must correspond to n_noops."
for i_mask_tran in range(n_mask_tran):
n_pos = 0
for i_mask_noop in range(n_mask_noop):
if mask[i_mask_tran, i_mask_noop] == True:
n_pos += 1
assert n_pos > 0, "All transforms must have at least one positive noop."
assert n_pos < n_mask_noop, "All transforms must have at least one negative noop."
distances = np.zeros((n_tran, 2), dtype=np.float32)
indexes = np.zeros((n_tran, 2), dtype=np.int32)
cdef np.float32_t[:, :] distances_view = distances
cdef int[:, :] indexes_view = indexes
with nogil, parallel():
local_buf = <int *> malloc(sizeof(int) * size)
if local_buf is NULL:
abort()
for i_tran in prange(n_tran):
local_buf[1] = INT_MAX # Smallest false positive distance
local_buf[2] = INT_MAX # Smallest true positive distance
local_buf[3] = 0 # Smallest false positive index
local_buf[4] = 0 # Smallest true positive index
for i_noop in range(n_noop):
local_buf[0] = 0 # Current distance
is_pos = mask[i_tran, i_noop] == True
for i_d in range(d_noop):
local_buf[0] += (X_noop[i_noop, i_d] - X_tran[i_tran, i_d]) ** 2
if is_pos and (local_buf[0] < local_buf[2]):
local_buf[2] = local_buf[0]
local_buf[4] = i_noop
if not is_pos and (local_buf[0] < local_buf[1]):
local_buf[1] = local_buf[0]
local_buf[3] = i_noop
# I do not think that an <int *> can ever actually be
# greater than INT_MAX but we'll leave the check in.
if local_buf[1] < INT_MAX:
distances_view[i_tran, i_closest_fp] = sqrt(local_buf[1])
else:
distances_view[i_tran, i_closest_fp] = NAN
if local_buf[2] < INT_MAX:
distances_view[i_tran, i_closest_tp] = sqrt(local_buf[2])
else:
distances_view[i_tran, i_closest_tp] = NAN
indexes_view[i_tran, i_closest_fp_idx] = local_buf[3]
indexes_view[i_tran, i_closest_tp_idx] = local_buf[4]
free(local_buf)
return distances, indexes
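# A minimal usage sketch (illustrative only, not part of the library): random
# int32 hash vectors with an identity mask, meaning transform i corresponds
# to noop i, mirroring how BenchmarkHashes.compute_metrics calls this.
def _example_compute_euclidean_metrics():
    X_noop = np.random.randint(0, 255, size=(10, 64)).astype("int32")
    X_tran = (X_noop + np.random.randint(0, 3, size=(10, 64))).astype("int32")
    mask = np.eye(10, dtype="uint8")
    distances, indexes = compute_euclidean_metrics(X_noop, X_tran, mask)
    # distances[:, 0] / indexes[:, 0] describe the closest incorrect (false
    # positive) noop; distances[:, 1] / indexes[:, 1] the closest correct one.
    return distances, indexes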
================================================
FILE: perception/benchmarking/image.py
================================================
import logging
import os
import uuid
import warnings
import cv2
import albumentations
import pandas as pd
from tqdm import tqdm
from ..hashers import tools
from ..hashers.hasher import ImageHasher
from ..tools import deduplicate, flatten
from .common import BenchmarkDataset, BenchmarkHashes, BenchmarkTransforms
log = logging.getLogger(__name__)
class BenchmarkImageTransforms(BenchmarkTransforms):
def compute_hashes(
self, hashers: dict[str, ImageHasher], max_workers: int = 5
) -> BenchmarkHashes:
"""Compute hashes for a series of files given some set of hashers.
Args:
hashers: A dictionary of hashers.
max_workers: Maximum number of workers for parallel hash
computation.
Returns:
metrics: A BenchmarkHashes object.
"""
hashsets = []
filepaths = self._df["filepath"]
for hasher_name, hasher in hashers.items():
hash_dicts = hasher.compute_parallel(
filepaths,
progress=tqdm,
progress_desc=f"Computing hashes for {hasher_name}",
max_workers=max_workers,
)
if not hasher.returns_multiple:
hashes_df = pd.DataFrame.from_records(hash_dicts)
else:
hash_groups = [
hash_dict["hash"] if hash_dict["error"] is None else [None]
for hash_dict in hash_dicts
]
hash_group_sizes = [len(hash_group) for hash_group in hash_groups]
current_hashes = flatten(hash_groups)
current_filepaths = flatten(
[
[hash_dict["filepath"]] * hash_group_size
for hash_dict, hash_group_size in zip(
hash_dicts, hash_group_sizes
)
]
)
current_errors = flatten(
[
[hash_dict["error"]] * hash_group_size
for hash_dict, hash_group_size in zip(
hash_dicts, hash_group_sizes
)
]
)
hashes_df = pd.DataFrame(
{
"error": current_errors,
"filepath": current_filepaths,
"hash": current_hashes,
}
)
hashset = hashes_df.assign(
hasher_name=hasher_name,
hasher_hash_length=hasher.hash_length,
hasher_dtype=hasher.dtype,
hasher_distance_metric=hasher.distance_metric,
)
hashset = hashset.merge(self._df, on="filepath")
hashsets.append(hashset)
return BenchmarkHashes(pd.concat(hashsets, sort=True))
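# A minimal usage sketch (illustrative only, not part of the library): hashers
# are passed as a name -> ImageHasher mapping; the names become the
# hasher_name column of the resulting BenchmarkHashes. Assumes PHash and
# AverageHash are importable from perception.hashers.
def _example_compute_image_hashes(transformed: BenchmarkImageTransforms):
    from perception import hashers as perception_hashers
    return transformed.compute_hashes(
        hashers={
            "phash": perception_hashers.PHash(),
            "average": perception_hashers.AverageHash(),
        },
        max_workers=4,
    )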
class BenchmarkImageDataset(BenchmarkDataset):
def deduplicate(
self, hasher: ImageHasher, threshold=0.001, isometric=False
) -> tuple["BenchmarkImageDataset", set[tuple[str, str]]]:
"""Remove duplicate files from dataset.
Args:
files: A list of file paths
hasher: A hasher to use for finding a duplicate
threshold: The threshold required for a match
isometric: Whether to compute the rotated versions of the images
Returns:
A list where each entry is a list of files that are
duplicates of each other. We keep only the last entry.
"""
pairs: set[tuple[str, str]] = set()
for _, group in tqdm(
self._df.groupby(["category"]), desc="Deduplicating categories."
):
pairs = pairs.union(
set(
deduplicate(
files=group["filepath"].tolist(),
hashers=[(hasher, threshold)],
isometric=isometric,
)
)
)
removed = [pair[0] for pair in pairs]
return (
BenchmarkImageDataset(self._df[~self._df["filepath"].isin(removed)].copy()),
pairs,
)
def transform(
self,
transforms: dict[str, albumentations.BasicTransform],
storage_dir: str,
errors: str = "raise",
) -> BenchmarkImageTransforms:
"""Prepare files to be used as part of benchmarking run.
Args:
transforms: A dictionary of transformations. The only required
key is `noop` which determines how the original, untransformed
image is saved. For a true copy, simply make the `noop` key
`albumentations.NoOp`
storage_dir: A directory to store all the images along with
their transformed counterparts.
errors: How to handle errors reading files. If "raise", exceptions are
raised. If "warn", the error is printed as a warning.
Returns:
transforms: A BenchmarkImageTransforms object
"""
assert (
"noop" in transforms
), "You must provide a no-op transform such as `lambda img: img`."
os.makedirs(storage_dir, exist_ok=True)
files = self._df.copy()
files["guid"] = [str(uuid.uuid4()) for n in range(len(files))]
def apply_transform(files, transform_name):
transform = transforms[transform_name]
transformed_arr = []
for _, row in tqdm(
files.iterrows(),
desc=f"Creating files for {transform_name}",
total=len(files),
):
filepath, guid, category = row[["filepath", "guid", "category"]]
try:
image = tools.read(filepath)
except Exception as exception:
message = f"An error occurred reading {filepath}."
if errors == "raise":
raise exception
warnings.warn(message, UserWarning)
continue
try:
transformed = transform(image=image)
# If albumentations, output is a dict with 'image' key
if isinstance(transformed, dict) and "image" in transformed:
transformed = transformed["image"]
except Exception as e:
raise RuntimeError(
f"An exception occurred while processing {filepath} "
f"with transform {transform_name}."
) from e
transformed_path = os.path.join(
storage_dir, f"{guid}_{transform_name}.jpg"
)
cv2.imwrite(
transformed_path, cv2.cvtColor(transformed, cv2.COLOR_RGB2BGR)
)
transformed_arr.append(
{
"guid": guid,
"transform_name": transform_name,
"input_filepath": filepath,
"filepath": transformed_path,
"category": category,
}
)
return pd.DataFrame.from_records(transformed_arr)
results = [apply_transform(files, transform_name="noop")]
for transform_name in transforms.keys():
if transform_name == "noop":
continue
results.append(apply_transform(results[0], transform_name=transform_name))
benchmark_transforms = BenchmarkImageTransforms(
df=pd.concat(results, axis=0, ignore_index=True)
)
benchmark_transforms.save(storage_dir)
return benchmark_transforms
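# A minimal usage sketch (illustrative only, not part of the library): the
# storage directory is hypothetical. The transforms mapping must include a
# "noop" entry; every other transform is applied to the saved noop images.
def _example_image_benchmark(dataset: BenchmarkImageDataset):
    transforms = {
        "noop": albumentations.NoOp(p=1.0),
        "blur": albumentations.Blur(blur_limit=7, p=1.0),
        "rotate90": albumentations.RandomRotate90(p=1.0),
    }
    return dataset.transform(
        transforms=transforms, storage_dir="transformed_images", errors="warn"
    )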
================================================
FILE: perception/benchmarking/image_transforms.py
================================================
import cv2
import numpy as np
def apply_watermark(watermark, alpha: float = 1.0, size: float = 1.0):
"""Apply a watermark to the bottom right of
images. Based on the work provided at
https://www.pyimagesearch.com/2016/04/25/watermarking-images-with-opencv-and-python/
Args:
watermark: The watermark to overlay
alpha: The strength of the overlay
size: The maximum proportion of the image
taken by the watermark.
"""
assert watermark.shape[-1] == 4, "Watermark must have an alpha channel."
# Mask each color channel with the alpha channel so that fully transparent
# pixels contribute no color when the overlay is blended below.
B, G, R, A = cv2.split(watermark)
B = cv2.bitwise_and(B, B, mask=A)
G = cv2.bitwise_and(G, G, mask=A)
R = cv2.bitwise_and(R, R, mask=A)
watermark = cv2.merge([B, G, R, A])
def transform(image):
# Add alpha channel
h, w = image.shape[:2]
wh, ww = watermark.shape[:2]
scale = size * min(h / wh, w / ww)
image = np.dstack([image, np.ones((h, w), dtype="uint8") * 255])
# Construct an overlay that is the same size as the input.
overlay = np.zeros((h, w, 4), dtype="uint8")
scaled = cv2.resize(watermark, (int(scale * ww), int(scale * wh)))
sh, sw = scaled.shape[:2]
overlay[max(h - sh, 0) :, max(w - sw, 0) : w] = scaled
# Blend the two images together using transparent overlays
output = image.copy()
cv2.addWeighted(overlay, alpha, output, 1.0, 0, output)
return cv2.cvtColor(output, cv2.COLOR_RGBA2RGB)
return transform
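# A minimal usage sketch (illustrative only, not part of the library): the
# logo path is hypothetical. The watermark must be loaded with its alpha
# channel intact (four channels), hence IMREAD_UNCHANGED.
def _example_apply_watermark(image):
    watermark = cv2.imread("logo_with_alpha.png", cv2.IMREAD_UNCHANGED)
    transform = apply_watermark(watermark, alpha=0.5, size=0.25)
    return transform(image)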
================================================
FILE: perception/benchmarking/video.py
================================================
import concurrent.futures
import os
import typing
import uuid
import pandas as pd
import tqdm
from ..hashers import VideoHasher, tools
from ..tools import flatten
from .common import BenchmarkDataset, BenchmarkHashes, BenchmarkTransforms
def _process_row(row, hashers, framerates):
error = None
try:
assert not pd.isnull(row["filepath"]), "No filepath provided."
hashes = tools.compute_synchronized_video_hashes(
filepath=row["filepath"],
hashers=hashers,
framerates=framerates,
hash_format="base64",
)
except Exception as exception:
error = str(exception)
hashes = {
hasher_name: [None] if hasher.returns_multiple else None
for hasher_name, hasher in hashers.items()
}
base_dict = {
"guid": row["guid"],
"filepath": row["filepath"],
"error": error,
"category": row["category"],
"transform_name": row["transform_name"],
"input_filepath": row["input_filepath"],
}
hash_dicts = []
for hasher_name, hasher in hashers.items():
base_hash_dict = {
"hasher_name": hasher_name,
"hasher_dtype": hasher.dtype,
"hasher_distance_metric": hasher.distance_metric,
"hasher_hash_length": hasher.hash_length,
}
if not hasher.returns_multiple:
hash_dicts.append(
{
**{
"hash": hashes[hasher_name],
},
**base_hash_dict,
}
)
else:
for hash_value in hashes[hasher_name]:
hash_dicts.append(
{
**{
"hash": hash_value,
},
**base_hash_dict,
}
)
return [{**hash_dict, **base_dict} for hash_dict in hash_dicts]
class BenchmarkVideoDataset(BenchmarkDataset):
def transform(
self,
transforms: dict[str, typing.Callable],
storage_dir: str,
errors: str = "raise",
):
"""Prepare files to be used as part of benchmarking run.
Args:
transforms: A dictionary of transformations. The only required
key is `noop` which determines how the original, untransformed
video is saved. Each transform should be a callable that accepts
`input_filepath` and `output_filepath` arguments and returns the
`output_filepath` (which may have a different extension appended
by the transform function).
storage_dir: A directory to store all the videos along with
their transformed counterparts.
errors: How to handle errors reading files. If "raise", exceptions are
raised. If "warn", the error is printed as a warning.
Returns:
transforms: A BenchmarkVideoTransforms object
"""
assert "noop" in transforms, "You must provide a no-op transform."
os.makedirs(storage_dir, exist_ok=True)
files = self._df.copy()
files["guid"] = [str(uuid.uuid4()) for n in range(len(files))]
def apply_transform_to_file(input_filepath, guid, transform_name, category):
if input_filepath is None:
# This can happen if the noop transform did not yield
# a file. We don't want to drop the records so we
# keep them.
return {
"guid": guid,
"error": "No source file provided",
"transform_name": transform_name,
"input_filepath": input_filepath,
"filepath": None,
"category": category,
}
try:
output_filepath = transforms[transform_name](
input_filepath,
output_filepath=os.path.join(
storage_dir, f"{guid}_{transform_name}"
),
)
error = None
except Exception as e:
output_filepath = None
error = str(e)
return {
"guid": guid,
"error": error,
"transform_name": transform_name,
"input_filepath": input_filepath,
"filepath": output_filepath,
"category": category,
}
def apply_transform_to_files(files, transform_name):
return pd.DataFrame.from_records(
[
apply_transform_to_file(
input_filepath=row["filepath"],
guid=row["guid"],
transform_name=transform_name,
category=row["category"],
)
for _, row in tqdm.tqdm(
files.iterrows(),
desc=f"Creating files for {transform_name}",
total=len(files),
)
]
)
results = [apply_transform_to_files(files, transform_name="noop")]
for transform_name in transforms.keys():
if transform_name == "noop":
continue
results.append(
apply_transform_to_files(results[0], transform_name=transform_name)
)
benchmark_transforms = BenchmarkVideoTransforms(
df=pd.concat(results, axis=0, ignore_index=True)
)
benchmark_transforms.save(storage_dir)
return benchmark_transforms
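# A minimal usage sketch (illustrative only, not part of the library): the
# storage directory is hypothetical. Transforms are plain callables taking
# input_filepath / output_filepath, such as the factories provided in
# perception.benchmarking.video_transforms.
def _example_video_benchmark(dataset: BenchmarkVideoDataset):
    from perception.benchmarking import video_transforms
    transforms = {
        "noop": video_transforms.get_simple_transform(sar="1/1"),
        "shrink": video_transforms.get_simple_transform(width=160),
        "slideshow": video_transforms.get_slideshow_transform(
            frame_input_rate=1, frame_output_rate=0.5, max_frames=10
        ),
    }
    return dataset.transform(
        transforms=transforms, storage_dir="transformed_videos", errors="warn"
    )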
class BenchmarkVideoTransforms(BenchmarkTransforms):
expected_columns = [
"filepath",
"category",
"transform_name",
"input_filepath",
"guid",
"error",
]
def compute_hashes(
self, hashers: dict[str, VideoHasher], max_workers: int = 5
) -> BenchmarkHashes:
"""Compute hashes for a series of files given some set of hashers.
Args:
hashers: A dictionary of hashers.
max_workers: Maximum number of workers for parallel hash
computation.
Returns:
hashes: A BenchmarkHashes object.
"""
id_rates = {
hasher_name: hasher.frames_per_second
for hasher_name, hasher in hashers.items()
if hasher.frames_per_second is not None
}
if id_rates:
framerates = tools.get_common_framerates(
{
hasher_name: hasher.frames_per_second
for hasher_name, hasher in hashers.items()
if hasher.frames_per_second is not None
}
)
else:
framerates = {}
with concurrent.futures.ProcessPoolExecutor(
max_workers=max_workers
) as executor:
futures = [
executor.submit(
_process_row, row=row, framerates=framerates, hashers=hashers
)
for index, row in self._df.iterrows()
]
return BenchmarkHashes(
pd.DataFrame.from_records(
flatten(
[
future.result()
for future in tqdm.tqdm(
concurrent.futures.as_completed(futures),
desc="Computing hashes.",
total=len(self._df),
)
]
)
)
)
================================================
FILE: perception/benchmarking/video_transforms.py
================================================
import os
import cv2
import ffmpeg
from ..hashers.tools import read_video
def probe(filepath):
"""Get the output of ffprobe."""
return ffmpeg.probe(filepath)
def sanitize_output_filepath(input_filepath, output_filepath, output_ext=None):
"""Get a suitable output filepath with an extension based on
an input filepath.
Args:
input_filepath: The filepath for the source file.
output_filepath: The filepath for the output file.
output_ext: A new extension to add (e.g., '.gif')
"""
_, input_ext = os.path.splitext(input_filepath)
if not output_filepath.lower().endswith(output_ext or input_ext):
output_filepath += output_ext or input_ext
return output_filepath
def get_simple_transform(
width: str | int = -1,
height: str | int = -1,
pad: str | None = None,
codec: str | None = None,
clip_pct: tuple[float, float] | None = None,
clip_s: tuple[float, float] | None = None,
sar=None,
fps=None,
output_ext=None,
):
"""Resize to a specific size and re-encode.
Args:
width: The target width (-1 to maintain aspect ratio)
height: The target height (-1 to maintain aspect ratio)
pad: An ffmpeg pad argument provided as a string.
codec: The codec for encoding the video.
fps: The new frame rate for the video.
clip_pct: The video start and end in percentages of video duration.
clip_s: The video start and end in seconds (used over clip_pct if both
are provided).
sar: A sample aspect ratio to apply to all videos (e.g., to force
square pixels, set this to '1/1').
output_ext: The extension to use when re-encoding (used to select
video format). It should include the leading '.'.
"""
def transform(input_filepath, output_filepath):
output_filepath = sanitize_output_filepath(
input_filepath, output_filepath, output_ext
)
data = None
if codec is None:
data = data or probe(input_filepath)
output_codec = [s for s in data["streams"] if s["codec_type"] == "video"][
0
]["codec_name"]
else:
output_codec = codec
format_kwargs = {"codec:v": output_codec}
if clip_pct is not None or clip_s is not None:
pct_start, pct_end, pos_start, pos_end = None, None, None, None
if clip_pct is not None:
pct_start, pct_end = clip_pct
if clip_s is not None:
pos_start, pos_end = clip_s
if pct_start is not None:
assert 0 <= pct_start <= 1, "Start position must be between 0 and 1."
if pct_end is not None:
assert 0 <= pct_end <= 1, "End position must be between 0 and 1."
if pct_start is not None and pct_end is not None:
assert pct_start < pct_end, "End must be greater than start."
if (pct_start is not None and pos_start is None) or (
pct_end is not None and pos_end is None
):
# We only want to get the duration for the video if we need
# it.
data = data or probe(input_filepath)
duration = float(data["streams"][0]["duration"])
if pct_start is not None or pos_start is not None:
format_kwargs["ss"] = pos_start or pct_start * duration # type: ignore
if pct_end is not None or pos_end is not None:
format_kwargs["t"] = pos_end or pct_end * duration # type: ignore
stream = ffmpeg.input(input_filepath)
if not (width == -1 and height == -1):
stream = stream.filter("scale", width, height)
if pad is not None:
stream = stream.filter("pad", *pad.split(":"))
if fps is not None:
stream = stream.filter("fps", fps)
if sar is not None:
stream = stream.filter("setsar", sar)
stream = stream.output(output_filepath, **format_kwargs).overwrite_output()
ffmpeg.run(stream)
if os.path.isfile(output_filepath):
return output_filepath
return None
return transform
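# A minimal usage sketch (illustrative only, not part of the library): the
# file paths are hypothetical. The factory returns a callable; the output
# extension is appended automatically when it is missing.
def _example_simple_transform():
    to_gif = get_simple_transform(width=128, fps=8, output_ext=".gif")
    return to_gif("input.m4v", output_filepath="output")  # "output.gif" or None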
def get_slideshow_transform(
frame_input_rate, frame_output_rate, max_frames=None, offset=0
):
"""Get a slideshow transform to create slideshows from
videos.
Args:
frame_input_rate: The rate at which frames will be sampled
from the source video (e.g., a rate of 1 means we collect
one frame per second of the input video).
frame_output_rate: The rate at which the sampled frames are played
in the slideshow (e.g., a rate of 0.5 means each frame will
appear for 2 seconds).
max_frames: The maximum number of frames to write.
offset: The number of seconds to wait before beginning the slide show.
"""
def transform(input_filepath, output_filepath):
output_filepath = sanitize_output_filepath(
input_filepath, output_filepath, output_ext=".avi"
)
writer = None
frame_count = 0
try:
for frame, _, timestamp in read_video(
filepath=input_filepath, frames_per_second=frame_input_rate
):
if timestamp < offset:
continue
if writer is None:
writer = cv2.VideoWriter(
filename=output_filepath,
fourcc=cv2.VideoWriter_fourcc(*"MJPG"), # type: ignore[attr-defined]
fps=frame_output_rate,
frameSize=tuple(frame.shape[:2][::-1]),
isColor=True,
)
writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
frame_count += 1
if max_frames is not None and frame_count >= max_frames:
break
finally:
if writer is not None:
writer.release()
if os.path.isfile(output_filepath):
return output_filepath
return None
return transform
def get_black_frame_padding_transform(duration_s=0, duration_pct=0):
"""Get a transform that adds black frames at the start and end
of a video.
Args:
duration_s: The duration of the black frames in seconds.
duration_pct: The duration of the black frames
as a percentage of video duration. If both duration_s
and duration_pct are provided, the maximum value
is used.
"""
def transform(input_filepath, output_filepath):
output_filepath = sanitize_output_filepath(input_filepath, output_filepath)
stream = next(
stream
for stream in probe(input_filepath)["streams"]
if stream["codec_type"] == "video"
)
assert stream["sample_aspect_ratio"] == "1:1", "SAR is not 1:1."
width = stream["width"]
height = stream["height"]
duration = max(duration_s, duration_pct * float(stream["duration"]))
ffmpeg.input(input_filepath).output(
output_filepath,
vf=(
"color=c=black:s={width}x{height}:d={duration} [pre] ; "
"color=c=black:s={width}x{height}:d={duration} [post] ; "
"[pre] [in] [post] concat=n=3"
).format(width=width, height=height, duration=duration),
fps_mode="vfr",
).overwrite_output().run()
if os.path.isfile(output_filepath):
return output_filepath
return None
return transform
================================================
FILE: perception/extensions.pyx
================================================
# cython: language_level=3
# cython: language=c++
import math
import sys
import cython
import numpy as np
from cython.parallel import parallel, prange
cimport numpy as np
from libc.stdlib cimport abort, free, malloc
from libcpp cimport bool as cppbool
from libcpp.vector cimport vector
cdef extern from "limits.h":
int INT_MAX
ctypedef np.uint8_t uint8
@cython.boundscheck(False)
@cython.wraparound(False)
def compute_euclidean_pairwise_duplicates(int[:, :] X, float threshold, counts: np.uint32_t[:] = None, compute_overlap=False):
"""Find the pairwise overlap within an array of vectors, where there may be multiple
vectors for the same file. This function is faster than using scipy.spatial.distance
because it computes distances in parallel, avoids computing full distances when they're
not necessary, skips computing distances for pairs of hashes that are for the
same file, and skips computing distances for vectors if both have already been matched.
Args:
X: The vectors with shape (N, D). Vectors for the same file need to be
supplied sequentially so that we can use the counts argument
to determine which vectors are for the same file.
threshold: The maximum euclidean distance between two vectors for
them to be counted as a match.
counts: For each file, the number of sequential vectors in X. If not
provided, each vector is assumed to be for a different file (i.e.,
this is equivalent to `counts = np.ones(N)`).
compute_overlap: If True, the values returned will be divided by the number
of hashes in each file. If False, the raw duplicate counts will
be returned.
Returns:
duplicates: An array of shape (M!/(2*((M-2)!)), 2) indicating
the fraction of vectors for each file found in another file.
The indexing matches that of scipy.spatial.pdist. M is the number of files.
So if M = 4, the array will represent comparisons of the file indexes as follows:
[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]. So (assuming compute_overlap=True),
a possible return would be [(1.0, 1.0), (0, 0), (0, 0), (0.66, 1.0), (0, 0), (0.5, 0.25)]
which means that:
- There was 100% overlap between file 0 and file 1
- 66% of file 1 was in file 2 and 100% of file 2 was in file 1
- 50% of file 2 was in file 3 and 25% of file 3 was in file 2
"""
if counts is None:
counts = np.ones(X.shape[0], dtype=np.uint32)
cdef Py_ssize_t n = X.shape[0]
cdef Py_ssize_t m = counts.shape[0]
cdef Py_ssize_t d = X.shape[1]
n_pairs_python = int(math.factorial(m)/(2*math.factorial(m-2)))
assert n_pairs_python < sys.maxsize, 'Too many files were provided for deduplication.'
cdef Py_ssize_t n_pairs = n_pairs_python
cdef Py_ssize_t max_counts = np.max(counts)
cdef int compute_overlap_int = 0
if compute_overlap:
compute_overlap_int = 1
# i_1 is the index of file1, i_2 is the index of file2, i_d is the
# index of the vector dimension we're on, i_i is used to compute
# the starting index in the flattened vector in the different threads.
# i_1_sub is the index of the hash on file1, i_2_sub is
# the index of the hash on file2.
cdef Py_ssize_t i_1, i_2, i_d, i_i, i_1_sub, i_2_sub, i_1_offset
duplicate_arr = np.zeros((n_pairs, 2), dtype=np.double)
cdef double[:, :] duplicate = duplicate_arr
offsets_arr = np.zeros(m, dtype=np.int32)
cdef np.int32_t[:] offsets = offsets_arr
for i_1 in range(m):
for i_i in range(i_1):
offsets[i_1] += counts[i_i]
# local_buf will contain distance, flattened array offset, index_offset_1, index_offset_2
cdef size_t local_buf_size = 4
cdef float threshold2 = threshold ** 2
with nogil, parallel():
local_buf = <np.uint64_t *> malloc(sizeof(np.uint64_t) * local_buf_size)
# An array of flags indicating whether a vector in file 1 was
# matched.
matched_1 = <int *> malloc(sizeof(int) * max_counts)
# An array of flags indicating whether a vector in file 2 was
# matched.
matched_2 = <int *> malloc(sizeof(int) * max_counts)
if local_buf is NULL or matched_1 is NULL or matched_2 is NULL:
abort()
# Iterate over all of the files.
for i_1 in prange(m-1):
local_buf[1] = 0
local_buf[2] = offsets[i_1]
# Compute the index of the output vector
# where we will count the number of duplicates.
for i_i in range(i_1):
local_buf[1] += m - i_i - 1
# Iterate over all the other files to compare.
for i_2 in range(i_1 + 1, m):
local_buf[3] = offsets[i_2]
# Initialize all match flags to zero for
# both file 1 and file 2.
for i_1_sub in range(counts[i_1]):
matched_1[i_1_sub] = 0
for i_2_sub in range(counts[i_2]):
matched_2[i_2_sub] = 0
# Iterate over all the hashes in file1
for i_1_sub in range(counts[i_1]):
# Iterate over all the hashes in file2
for i_2_sub in range(counts[i_2]):
local_buf[0] = 0
if matched_1[i_1_sub] == 1 and matched_2[i_2_sub] == 1:
# Both the vectors in this pair have already been matched, so
# there is nothing to gain from this comparison.
continue
for i_d in range(d):
local_buf[0] += (X[local_buf[2] + i_1_sub, i_d] - X[local_buf[3] + i_2_sub, i_d]) ** 2
if local_buf[0] > threshold2:
# If we're already beyond the distance threshold,
# we don't need to continue computing squared
# distances.
break
if local_buf[0] < threshold2:
# A match was found. Set flags for both vectors
# to 1.
matched_1[i_1_sub] = 1
matched_2[i_2_sub] = 1
# Add up the number of matches for file 1.
for i_1_sub in range(counts[i_1]):
duplicate[local_buf[1], 0] += matched_1[i_1_sub]
# Add up the number of matches for file 2.
for i_2_sub in range(counts[i_2]):
duplicate[local_buf[1], 1] += matched_2[i_2_sub]
# Divide by the total number of vectors for each file.
if compute_overlap_int:
duplicate[local_buf[1], 0] /= counts[i_1]
duplicate[local_buf[1], 1] /= counts[i_2]
# Advance to the next pair index.
local_buf[1] += 1
free(local_buf)
free(matched_1)
free(matched_2)
return duplicate_arr
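# A minimal usage sketch (illustrative only, not part of the library): three
# files with two, one and one vectors respectively. Overlap is reported per
# file pair in scipy.spatial.pdist order: (0, 1), (0, 2), (1, 2).
def _example_pairwise_duplicates():
    X = np.array([[0, 0], [5, 5], [0, 1], [9, 9]], dtype="int32")
    counts = np.array([2, 1, 1], dtype="uint32")
    return compute_euclidean_pairwise_duplicates(
        X, threshold=2.0, counts=counts, compute_overlap=True
    )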
@cython.boundscheck(False)
@cython.wraparound(False)
def compute_euclidean_pairwise_duplicates_simple(int[:, :] X, float threshold, np.uint32_t[:] counts = None, float minimum_overlap = 0):
"""Find the pairwise overlap within an array of vectors, where there may be multiple
vectors for the same file. This function is similar to compute_euclidean_pairwise_duplicates
but uses much less memory.
Args:
X: The vectors with shape (N, D). Vectors for the same file need to be
supplied sequentially so that we can use the counts argument
to determine which vectors are for the same file.
threshold: The maximum distance between two vectors to allow for
a match.
counts: For each of the M files, the number of sequential vectors in X.
If not provided, each vector is assumed to be for a different file (i.e.,
this is equivalent to `counts = np.ones(N)` which also implies M == N).
Otherwise, assumed to have length M. The counts should add up to N.
minimum_overlap: The minimum overlap between two groups of hashes to
call it a match.
Returns:
pairs: Pairs of indexes that met the matching criteria.
"""
if counts is None:
counts_arr = np.ones(X.shape[0], dtype=np.uint32)
counts = counts_arr
cdef Py_ssize_t n = X.shape[0]
cdef Py_ssize_t m = counts.shape[0]
cdef Py_ssize_t d = X.shape[1]
n_pairs_python = int(math.factorial(m)/(2*math.factorial(m-2)))
assert n_pairs_python < sys.maxsize, 'Too many files were provided for deduplication.'
cdef Py_ssize_t n_pairs = n_pairs_python
cdef Py_ssize_t max_counts = np.max(counts)
# i_1 is the index of file1, i_2 is the index of file2, i_d is the
# index of the vector dimension we're on, i_i is used to compute
# the starting index in the flattened vector in the different threads.
# i_1_sub is the index of the hash on file1, i_2_sub is
# the index of the hash on file2.
cdef Py_ssize_t i_1, i_2, i_d, i_i, i_1_sub, i_2_sub
cdef vector[cppbool] duplicate
duplicate.resize(n_pairs)
offsets_arr = np.zeros(m, dtype=np.uint64)
cdef np.uint64_t[:] offsets = offsets_arr
cdef np.int32_t expected_n = 0
for i_1 in range(m):
for i_i in range(i_1):
offsets[i_1] += counts[i_i]
expected_n += counts[i_1]
assert expected_n == n, "Provided value for counts is inconsistent with X."
# local_buf will contain:
# distance, flattened array offset,
# index_offset_1, index_offset_2
cdef size_t local_buf_si
SYMBOL INDEX (254 symbols across 31 files)
FILE: build.py
function build (line 7) | def build(setup_kwargs):
FILE: perception/approximate_deduplication/__init__.py
class ClusterAssignment (line 21) | class ClusterAssignment(typing_extensions.TypedDict):
function build_index (line 26) | def build_index(
function compute_euclidean_pairwise_duplicates_approx (line 85) | def compute_euclidean_pairwise_duplicates_approx(
function pairs_to_clusters (line 201) | def pairs_to_clusters(
FILE: perception/approximate_deduplication/_graph_backend.py
class GraphBackend (line 6) | class GraphBackend(ABC):
method build_graph (line 8) | def build_graph(
method connected_components (line 13) | def connected_components(self, graph: typing.Any) -> list[list[int]]: ...
method communities (line 16) | def communities(
method maximal_cliques (line 21) | def maximal_cliques(
class NetworkitGraphBackend (line 29) | class NetworkitGraphBackend(GraphBackend):
method __init__ (line 30) | def __init__(self):
method build_graph (line 35) | def build_graph(
method connected_components (line 43) | def connected_components(self, graph: typing.Any) -> list[list[int]]:
method communities (line 48) | def communities(self, graph: typing.Any, component: list[int]) -> list...
method maximal_cliques (line 59) | def maximal_cliques(
class NetworkxGraphBackend (line 84) | class NetworkxGraphBackend(GraphBackend):
method __init__ (line 85) | def __init__(self):
method build_graph (line 90) | def build_graph(
method connected_components (line 98) | def connected_components(self, graph: typing.Any) -> list[list[int]]:
method communities (line 101) | def communities(self, graph: typing.Any, component: list[int]) -> list...
method maximal_cliques (line 110) | def maximal_cliques(
function get_graph_backend (line 135) | def get_graph_backend() -> GraphBackend:
FILE: perception/approximate_deduplication/debug.py
function vizualize_pair (line 15) | def vizualize_pair(
function viz_match_data (line 89) | def viz_match_data(
function viz_brute_force (line 194) | def viz_brute_force(features_1, features_2, img1, img2, ratio: float):
FILE: perception/approximate_deduplication/index.py
class QueryInput (line 13) | class QueryInput(typing_extensions.TypedDict):
class QueryMatch (line 18) | class QueryMatch(typing_extensions.TypedDict):
class TuningFailure (line 23) | class TuningFailure(Exception):
class QueryDecodingFailure (line 27) | class QueryDecodingFailure(Exception):
function build_query (line 31) | def build_query(table, ids, paramstyle, columns):
function query_by_id (line 44) | def query_by_id(con, table, ids, paramstyle, extra_columns=None) -> pd.D...
class ApproximateNearestNeighbors (line 73) | class ApproximateNearestNeighbors:
method __init__ (line 88) | def __init__(
method from_database (line 120) | def from_database(
method query_by_id (line 229) | def query_by_id(
method string_to_vector (line 255) | def string_to_vector(self, s: str, hash_format="base64") -> np.ndarray:
method vector_to_string (line 266) | def vector_to_string(self, vector, hash_format="base64") -> str | None:
method search (line 276) | def search(
method tune (line 340) | def tune(self, n_query=100, min_recall=99, max_noise=3):
method save (line 403) | def save(self, filepath):
method set_nprobe (line 411) | def set_nprobe(self, nprobe) -> int:
method nlist (line 421) | def nlist(self):
method nprobe (line 426) | def nprobe(self):
method ntotal (line 431) | def ntotal(self):
FILE: perception/approximate_deduplication/serve.py
function is_similarity_valid (line 16) | def is_similarity_valid(data, index: ApproximateNearestNeighbors):
function similarity (line 34) | async def similarity(request):
function get_logger (line 84) | def get_logger(name, log_level):
function serve (line 96) | async def serve(
FILE: perception/benchmarking/common.py
function create_mask (line 32) | def create_mask(transformed_guids, noop_guids):
function compute_threshold_precision_recall (line 67) | def compute_threshold_precision_recall(pos, neg, precision_threshold=99.9):
class Filterable (line 100) | class Filterable(ABC):
method __init__ (line 104) | def __init__(self, df):
method categories (line 111) | def categories(self):
method filter (line 115) | def filter(self, **kwargs):
class Saveable (line 131) | class Saveable(Filterable):
method load (line 133) | def load(
method save (line 210) | def save(self, path_to_zip_or_directory):
class BenchmarkHashes (line 280) | class BenchmarkHashes(Filterable):
method __init__ (line 310) | def __init__(self, df: pd.DataFrame):
method __add__ (line 314) | def __add__(self, other):
method __radd__ (line 317) | def __radd__(self, other):
method load (line 321) | def load(cls, filepath: str):
method save (line 324) | def save(self, filepath):
method compute_metrics (line 327) | def compute_metrics(
method show_histograms (line 449) | def show_histograms(self, grouping=None, precision_threshold=99.9, **k...
method compute_threshold_recall (line 544) | def compute_threshold_recall(
class BenchmarkDataset (line 605) | class BenchmarkDataset(Saveable):
method from_tuples (line 617) | def from_tuples(cls, files: list[tuple[str, str]]):
method transform (line 629) | def transform(self, transforms, storage_dir, errors):
class BenchmarkTransforms (line 633) | class BenchmarkTransforms(Saveable):
method compute_hashes (line 652) | def compute_hashes(self, hashers, max_workers):
FILE: perception/benchmarking/image.py
class BenchmarkImageTransforms (line 19) | class BenchmarkImageTransforms(BenchmarkTransforms):
method compute_hashes (line 20) | def compute_hashes(
class BenchmarkImageDataset (line 85) | class BenchmarkImageDataset(BenchmarkDataset):
method deduplicate (line 86) | def deduplicate(
method transform (line 120) | def transform(
FILE: perception/benchmarking/image_transforms.py
function apply_watermark (line 5) | def apply_watermark(watermark, alpha: float = 1.0, size: float = 1.0):
FILE: perception/benchmarking/video.py
function _process_row (line 14) | def _process_row(row, hashers, framerates):
class BenchmarkVideoDataset (line 68) | class BenchmarkVideoDataset(BenchmarkDataset):
method transform (line 69) | def transform(
class BenchmarkVideoTransforms (line 163) | class BenchmarkVideoTransforms(BenchmarkTransforms):
method compute_hashes (line 173) | def compute_hashes(
FILE: perception/benchmarking/video_transforms.py
function probe (line 9) | def probe(filepath):
function sanitize_output_filepath (line 14) | def sanitize_output_filepath(input_filepath, output_filepath, output_ext...
function get_simple_transform (line 29) | def get_simple_transform(
function get_slideshow_transform (line 111) | def get_slideshow_transform(
function get_black_frame_padding_transform (line 162) | def get_black_frame_padding_transform(duration_s=0, duration_pct=0):
FILE: perception/hashers/hasher.py
class Hasher (line 14) | class Hasher(ABC):
method string_to_vector (line 36) | def string_to_vector(self, hash_string: str, hash_format: str = "base6...
method vector_to_string (line 50) | def vector_to_string(
method compute_distance (line 61) | def compute_distance(
method _compute_distance (line 106) | def _compute_distance(self, vector1, vector2):
method compute_parallel (line 110) | def compute_parallel(
class ImageHasher (line 174) | class ImageHasher(Hasher):
method _compute (line 176) | def _compute(self, image: np.ndarray) -> np.ndarray:
method compute_isometric_from_hash (line 185) | def compute_isometric_from_hash(self, hash_string_or_vector, hash_form...
method compute_isometric (line 213) | def compute_isometric(self, image: tools.ImageInputType):
method compute (line 231) | def compute(
method compute_with_quality (line 257) | def compute_with_quality(
method _compute_with_quality (line 285) | def _compute_with_quality(self, image: np.ndarray) -> tuple[np.ndarray...
class VideoHasher (line 289) | class VideoHasher(Hasher):
method process_frame (line 295) | def process_frame(
method hash_from_final_state (line 314) | def hash_from_final_state(self, state: dict) -> np.ndarray:
method compute (line 322) | def compute(
FILE: perception/hashers/image/average.py
class AverageHash (line 7) | class AverageHash(ImageHasher):
method __init__ (line 16) | def __init__(self, hash_size=8):
method _compute (line 21) | def _compute(self, image):
method _compute_isometric_from_hash (line 29) | def _compute_isometric_from_hash(self, vector):
FILE: perception/hashers/image/dhash.py
class DHash (line 6) | class DHash(ImageHasher):
method __init__ (line 15) | def __init__(self, hash_size=8):
method _compute (line 20) | def _compute(self, image):
FILE: perception/hashers/image/opencv.py
class OpenCVHasher (line 7) | class OpenCVHasher(ImageHasher):
method __init__ (line 10) | def __init__(self):
class MarrHildreth (line 17) | class MarrHildreth(OpenCVHasher):
method __init__ (line 25) | def __init__(self):
method _compute (line 29) | def _compute(self, image):
class ColorMoment (line 33) | class ColorMoment(OpenCVHasher):
method __init__ (line 41) | def __init__(self):
method _compute (line 45) | def _compute(self, image):
class BlockMean (line 49) | class BlockMean(OpenCVHasher):
method __init__ (line 57) | def __init__(self):
method _compute (line 61) | def _compute(self, image):
FILE: perception/hashers/image/pdq.py
class PDQHash (line 6) | class PDQHash(ImageHasher):
method _compute (line 15) | def _compute(self, image):
method _compute_with_quality (line 18) | def _compute_with_quality(self, image):
method _compute_isometric (line 22) | def _compute_isometric(self, image):
class PDQHashF (line 28) | class PDQHashF(PDQHash):
method _compute (line 33) | def _compute(self, image):
FILE: perception/hashers/image/phash.py
class PHash (line 9) | class PHash(ImageHasher):
method __init__ (line 27) | def __init__(
method _compute_dct (line 42) | def _compute_dct(self, image):
method _dct_to_hash (line 54) | def _dct_to_hash(self, dct):
method _compute (line 60) | def _compute(self, image):
method _compute_isometric (line 64) | def _compute_isometric(self, image):
class PHashF (line 73) | class PHashF(PHash):
method _dct_to_hash (line 81) | def _dct_to_hash(self, dct):
class PHashU8 (line 90) | class PHashU8(PHash):
method _dct_to_hash (line 100) | def _dct_to_hash(self, dct):
FILE: perception/hashers/image/wavelet.py
class WaveletHash (line 8) | class WaveletHash(ImageHasher):
method __init__ (line 17) | def __init__(self, hash_size=8, image_scale=None, mode="haar"):
method _compute (line 31) | def _compute(self, image):
FILE: perception/hashers/tools.py
function get_ffprobe (line 56) | def get_ffprobe():
function get_ffmpeg (line 60) | def get_ffmpeg():
function compute_quality (line 64) | def compute_quality(image) -> int:
function compute_md5 (line 79) | def compute_md5(filepath) -> str:
function get_string_length (line 90) | def get_string_length(hash_length: int, dtype: str, hash_format="hex") -...
function vector_to_string (line 110) | def vector_to_string(vector: np.ndarray, dtype: str, hash_format: str) -...
function string_to_vector (line 140) | def string_to_vector(
function hex_to_b64 (line 180) | def hex_to_b64(
function b64_to_hex (line 204) | def b64_to_hex(
function to_image_array (line 228) | def to_image_array(image: ImageInputType, require_color=True) -> np.ndar...
function get_common_framerates (line 243) | def get_common_framerates(id_rates: dict):
function get_isometric_transforms (line 316) | def get_isometric_transforms(image: ImageInputType, require_color=True) ...
function get_isometric_dct_transforms (line 330) | def get_isometric_dct_transforms(dct: np.ndarray):
function read (line 352) | def read(filepath_or_buffer: ImageInputType, timeout=None) -> np.ndarray:
function _get_keyframes (line 392) | def _get_keyframes(filepath):
function get_video_properties (line 425) | def get_video_properties(filepath):
function read_video_to_generator_ffmpeg (line 454) | def read_video_to_generator_ffmpeg(
function read_video_to_generator (line 655) | def read_video_to_generator(
function read_video_into_queue (line 775) | def read_video_into_queue(*args, video_queue, terminate, func, **kwargs):
function read_video (line 787) | def read_video(
function compute_synchronized_video_hashes (line 887) | def compute_synchronized_video_hashes(
function unletterbox (line 971) | def unletterbox(
function unletterbox_crop (line 1136) | def unletterbox_crop(
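
A hedged sketch of a few of the utilities above. read() and compute_md5() have their signatures shown in the index; the dtype and hash_format labels passed to vector_to_string(), the frames_per_second keyword for read_video(), and the (frame, frame_index, timestamp) tuple it is assumed to yield are not shown and are assumptions here. "photo.jpg" and "clip.m4v" are placeholder paths.

    import numpy as np
    from perception.hashers import tools

    image = tools.read("photo.jpg")            # decode an image file into a numpy array
    checksum = tools.compute_md5("photo.jpg")  # file-level MD5, useful for exact-duplicate checks

    vector = np.random.rand(64) > 0.5          # a toy 64-bit binary hash
    hex_hash = tools.vector_to_string(vector, dtype="bool", hash_format="hex")

    # Iterate over sampled video frames; the yielded tuple shape is an assumption.
    for frame, frame_index, timestamp in tools.read_video("clip.m4v", frames_per_second=1):
        print(frame_index, timestamp, frame.shape)
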
FILE: perception/hashers/video/framewise.py
class FramewiseHasher (line 7) | class FramewiseHasher(VideoHasher):
method __init__ (line 13) | def __init__(
method process_frame (line 33) | def process_frame(self, frame, frame_index, frame_timestamp, state=None):
method compute_batches (line 60) | def compute_batches(
method hash_from_final_state (line 99) | def hash_from_final_state(self, state):
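
A sketch for FramewiseHasher, which hashes sampled frames with an image hasher. The constructor keywords below (frame_hasher, frames_per_second, interframe_threshold) are assumptions, since the arguments are truncated in the index above.

    from perception import hashers
    from perception.hashers.video import FramewiseHasher

    hasher = FramewiseHasher(
        frame_hasher=hashers.PHash(hash_size=16),
        frames_per_second=1,
        interframe_threshold=0.1,
    )
    frame_hashes = hasher.compute("clip.m4v")  # assumed to return one hash per retained frame
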
FILE: perception/hashers/video/tmk.py
class TMKL2 (line 11) | class TMKL2(VideoHasher):
method __init__ (line 17) | def __init__(
method process_frame (line 70) | def process_frame(self, frame, frame_index, frame_timestamp, state=None):
method hash_from_final_state (line 77) | def hash_from_final_state(self, state):
method _compute_distance (line 94) | def _compute_distance(self, vector1, vector2):
method _score_pair (line 103) | def _score_pair(self, fv_a, fv_b, offsets=None, normalization="matrix"):
class TMKL1 (line 171) | class TMKL1(VideoHasher):
method __init__ (line 174) | def __init__(
method process_frame (line 194) | def process_frame(self, frame, frame_index, frame_timestamp, state=None):
method hash_from_final_state (line 211) | def hash_from_final_state(self, state):
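
A sketch for the TMK video hashers, using default construction because the constructor arguments are not visible above. compute_distance() is assumed to be provided by the shared hasher base class, delegating to the _compute_distance shown for TMKL2.

    from perception.hashers.video import TMKL1, TMKL2

    hasher = TMKL2()
    hash1 = hasher.compute("v1.m4v")
    hash2 = hasher.compute("v2.m4v")
    print(hasher.compute_distance(hash1, hash2))
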
FILE: perception/local_descriptor_deduplication.py
class Descriptors (line 34) | class Descriptors(typing_extensions.TypedDict):
class MatchStats (line 43) | class MatchStats(typing_extensions.TypedDict):
class LocalHasher (line 54) | class LocalHasher(ABC):
method __init__ (line 61) | def __init__(
method compute (line 79) | def compute(self, image) -> tuple[np.ndarray, np.ndarray]:
method validate_match (line 82) | def validate_match(
class SIFT (line 250) | class SIFT(LocalHasher):
method __init__ (line 253) | def __init__(
class AKAZE (line 264) | class AKAZE(LocalHasher):
method __init__ (line 267) | def __init__(
function load_and_preprocess (line 279) | def load_and_preprocess(filepath, max_size=DEFAULT_MAX_SIZE, grayscale=T...
function generate_image_descriptors (line 308) | def generate_image_descriptors(
function build_reference_df (line 363) | def build_reference_df(
function hasher_name (line 419) | def hasher_name(df: pd.DataFrame) -> str:
function check_hasher (line 423) | def check_hasher(df1: pd.DataFrame, df2: pd.DataFrame):
function compute_pairs (line 429) | def compute_pairs(
function compute_area (line 490) | def compute_area(box):
function compute_intersection (line 500) | def compute_intersection(kps, filter_arr):
function compute_minimum_intersection (line 517) | def compute_minimum_intersection(kp1, kp2, filter_arr1, filter_arr2):
function deduplicate_sift_dfs (line 532) | def deduplicate_sift_dfs(*args, **kwargs):
function deduplicate_dfs (line 538) | def deduplicate_dfs(
function deduplicate (line 640) | def deduplicate(
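
A minimal sketch of the top-level deduplicate() entry point above. Only a positional list of file paths is passed; tuning keywords and the choice between the SIFT and AKAZE LocalHasher classes are omitted because their argument names are not shown in the index, and the exact shape of the returned matches is an assumption.

    from perception import local_descriptor_deduplication as ldd

    matches = ldd.deduplicate(["a.jpg", "b.jpg", "c.jpg"])
    for match in matches:
        print(match)  # each entry describes a pair of files judged to be near-duplicates
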
FILE: perception/testing/__init__.py
function get_low_detail_image (line 18) | def get_low_detail_image():
function test_opencv_hasher (line 74) | def test_opencv_hasher(hasher: hashers.ImageHasher, image1: str, image2:...
function hash_dicts_to_df (line 92) | def hash_dicts_to_df(hash_dicts, returns_multiple):
function test_hasher_parallelization (line 108) | def test_hasher_parallelization(hasher, test_filepaths):
function test_video_hasher_integrity (line 129) | def test_video_hasher_integrity(
function test_image_hasher_integrity (line 135) | def test_image_hasher_integrity(
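
The testing helpers above ship with the package alongside the bundled sample images and videos. A sketch using the only zero-argument helper, assuming the returned image is an input type the image hashers accept directly.

    from perception import hashers, testing

    image = testing.get_low_detail_image()
    print(hashers.AverageHash().compute(image))
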
FILE: perception/tools.py
function _multiple_hashes_for_ids (line 26) | def _multiple_hashes_for_ids(hashes: list[tuple[str, str | np.ndarray]]):
function deduplicate_hashes (line 37) | def deduplicate_hashes(
function deduplicate (line 160) | def deduplicate(
class SaferMatcher (line 229) | class SaferMatcher:
method __init__ (line 268) | def __init__(
method match (line 319) | def match(
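
A sketch of the two main entry points above. The deduplicate() call follows the README-style usage (a list of files plus (hasher, threshold) pairs); the SaferMatcher keywords (api_key, url) follow the docs example and require Safer Matching Service credentials, so placeholder strings are used here.

    from perception import hashers, tools

    duplicate_pairs = tools.deduplicate(
        files=["a.jpg", "b.jpg", "c.jpg"],
        hashers=[(hashers.PHash(hash_size=16), 0.2)],
    )

    matcher = tools.SaferMatcher(api_key="...", url="...")
    matches = matcher.match(["myfile.jpg"])
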
FILE: perception/utils.py
function flatten (line 1) | def flatten(list_of_lists):
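
flatten() is the package's only generic utility; it concatenates one level of nesting.

    from perception.utils import flatten

    assert flatten([[1, 2], [3], []]) == [1, 2, 3]
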
FILE: tests/test_approximate_deduplication.py
function get_cluster_members (line 4) | def get_cluster_members(assignments):
function test_pairs_to_clusters_component_strictness (line 11) | def test_pairs_to_clusters_component_strictness():
function test_pairs_to_clusters_community_strictness (line 21) | def test_pairs_to_clusters_community_strictness():
function test_pairs_to_clusters_clique_strictness (line 31) | def test_pairs_to_clusters_clique_strictness():
FILE: tests/test_benchmarking.py
function test_deduplicate (line 20) | def test_deduplicate():
function test_bad_dataset (line 32) | def test_bad_dataset():
function test_benchmark_dataset (line 52) | def test_benchmark_dataset():
function test_benchmark_transforms (line 71) | def test_benchmark_transforms():
function convert_hash_string_to_vector (line 97) | def convert_hash_string_to_vector(hash_string):
function test_video_benchmark_dataset (line 102) | def test_video_benchmark_dataset():
function test_euclidean_extension (line 187) | def test_euclidean_extension():
FILE: tests/test_hashers.py
function test_image_hashing_common (line 28) | def test_image_hashing_common(
function test_video_hashing_common (line 39) | def test_video_hashing_common():
function test_video_reading (line 49) | def test_video_reading():
function test_common_framerate (line 63) | def test_common_framerate():
function test_synchronized_hashing (line 81) | def test_synchronized_hashing():
function test_hex_b64_conversion (line 107) | def test_hex_b64_conversion():
FILE: tests/test_local_descriptor_deduplication.py
function test_deduplication (line 32) | def test_deduplication(hasher):
function test_deduplication_across_sets (line 81) | def test_deduplication_across_sets(hasher):
function test_validation_for_overlapping_case (line 117) | def test_validation_for_overlapping_case(hasher):
function test_handling_bad_file_case (line 141) | def test_handling_bad_file_case(caplog, hasher):
function test_handling_hasher_mismatch (line 183) | def test_handling_hasher_mismatch():
function test_viz_pair (line 200) | def test_viz_pair():
function test_viz_pair_symmetry (line 242) | def test_viz_pair_symmetry():
FILE: tests/test_tmk.py
function test_tmk_parity (line 15) | def test_tmk_parity():
FILE: tests/test_tools.py
function test_deduplicate (line 12) | def test_deduplicate():
function test_deduplicate_u8 (line 32) | def test_deduplicate_u8():
function test_deduplicate_hashes_multiple (line 54) | def test_deduplicate_hashes_multiple():
function test_compute_euclidean_pairwise_duplicates (line 82) | def test_compute_euclidean_pairwise_duplicates():
function test_api_is_over_https (line 146) | def test_api_is_over_https():
function test_unletterbox (line 162) | def test_unletterbox():
function test_unletterbox_crop (line 175) | def test_unletterbox_crop():
function test_unletterbox_crop_meaningful_pixels (line 185) | def test_unletterbox_crop_meaningful_pixels():
function test_unletterbox_color (line 202) | def test_unletterbox_color():
function test_unletterbox_aspect_ratio (line 226) | def test_unletterbox_aspect_ratio():
function test_unletterbox_noblackbars (line 251) | def test_unletterbox_noblackbars():
function test_ffmpeg_video (line 263) | def test_ffmpeg_video():
function test_videos_with_extra_channels (line 286) | def test_videos_with_extra_channels():
function test_image_input_types (line 303) | def test_image_input_types():