Repository: ma7555/evalify Branch: main Commit: c0d5d6c9e78e Files: 34 Total size: 77.4 KB Directory structure: gitextract_pm2nxnd5/ ├── .coveragerc ├── .github/ │ ├── ISSUE_TEMPLATE.md │ └── workflows/ │ ├── codeql-analysis.yml │ ├── dev.yml │ └── release.yml ├── .gitignore ├── AUTHORS.md ├── CITATION.cff ├── CONTRIBUTING.md ├── HISTORY.md ├── LICENSE ├── README.md ├── codecov.yml ├── docs/ │ ├── api.md │ ├── authors.md │ ├── contributing.md │ ├── history.md │ ├── index.md │ ├── installation.md │ └── usage.md ├── evalify/ │ ├── __init__.py │ ├── evalify.py │ ├── metrics.py │ └── utils.py ├── examples/ │ └── LFW.py ├── mkdocs.yml ├── pyproject.toml ├── tests/ │ ├── __init__.py │ ├── data/ │ │ └── LFW.npz │ ├── test_evalify.py │ ├── test_experiment_real_data.py │ ├── test_metrics.py │ └── test_utils.py └── tox.ini ================================================ FILE CONTENTS ================================================ ================================================ FILE: .coveragerc ================================================ [run] # uncomment the following to omit files during running #omit = [report] exclude_lines = pragma: no cover def __repr__ if self.debug: if settings.DEBUG raise AssertionError raise NotImplementedError if 0: if __name__ == .__main__.: def main ================================================ FILE: .github/ISSUE_TEMPLATE.md ================================================ * evalify version: * Python version: * Operating System: ### Description Describe what you were trying to get done. Tell us what happened, what went wrong, and what you expected to happen. ### What I Did ``` Paste the command(s) you ran and the output. If there was a crash, please include the traceback here. ``` ================================================ FILE: .github/workflows/codeql-analysis.yml ================================================ # For most projects, this workflow file will not need changing; you simply need # to commit it to your repository. 
# # You may wish to alter this file to override the set of languages analyzed, # or to provide custom queries or build logic. # # ******** NOTE ******** # We have attempted to detect the languages in your repository. Please check # the `language` matrix defined below to confirm you have the correct set of # supported CodeQL languages. # name: "CodeQL" on: push: branches: [ main ] pull_request: # The branches below must be a subset of the branches above branches: [ main ] schedule: - cron: '41 19 * * 2' jobs: analyze: name: Analyze runs-on: ubuntu-latest permissions: actions: read contents: read security-events: write strategy: fail-fast: false matrix: language: [ 'python' ] # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] # Learn more about CodeQL language support at https://git.io/codeql-language-support steps: - name: Checkout repository uses: actions/checkout@v2 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@v1 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. # queries: ./path/to/local/query, your-org/your-repo/queries@main # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild uses: github/codeql-action/autobuild@v1 # ℹ️ Command-line programs to run using the OS shell. 
# 📚 https://git.io/JvXDl # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines # and modify them (or add more) to build your code if your project # uses a compiled language #- run: | # make bootstrap # make release - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v1 ================================================ FILE: .github/workflows/dev.yml ================================================ name: build on: push: branches: [main] pull_request: branches: [main] workflow_dispatch: jobs: test: strategy: matrix: python-versions: ["3.9", "3.10", "3.11", "3.12"] os: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-versions }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install poetry tox tox-gh-actions - name: test with tox run: tox - name: list files run: ls -l . publish_dev_build: needs: test runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: 3.12 - name: Install dependencies run: | python -m pip install --upgrade pip pip install poetry tox tox-gh-actions - name: test with tox run: tox - name: list files run: ls -l . 
- uses: codecov/codecov-action@v4 with: fail_ci_if_error: false files: coverage.xml token: ${{ secrets.CODECOV_TOKEN }} - name: Build wheels and source tarball run: | poetry version $(poetry version --short)-dev.$GITHUB_RUN_NUMBER poetry version --short poetry build - name: publish to Test PyPI uses: pypa/gh-action-pypi-publish@v1.12.2 with: user: __token__ password: ${{ secrets.TEST_PYPI_API_TOKEN}} repository-url: https://test.pypi.org/legacy/ skip-existing: true ================================================ FILE: .github/workflows/release.yml ================================================ name: release & publish workflow on: push: tags: - "v1.*.*" workflow_dispatch: jobs: release: name: Create Release runs-on: ubuntu-latest strategy: matrix: python-versions: [3.12] steps: - name: Checks-out uses: actions/checkout@v4 - name: "Build Changelog" id: build_changelog uses: mikepenz/release-changelog-builder-action@v5.0.0 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-versions }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install tox-gh-actions poetry - name: pre-publish documentation run: | poetry install -E doc poetry run mkdocs build - name: publish documentation uses: peaceiris/actions-gh-pages@v4 with: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: ./site - name: Build wheels and source tarball run: >- poetry build - name: show temporary files run: >- ls -l - name: create github release id: create_release uses: softprops/action-gh-release@v2.0.9 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: body: ${{steps.build_changelog.outputs.changelog}} # body_path: ./CHANGELOG.md files: dist/*.whl draft: false prerelease: false - name: create pypi release uses: pypa/gh-action-pypi-publish@v1.12.2 with: user: __token__ password: ${{ secrets.PYPI_API_TOKEN }} ================================================ FILE: .gitignore 
================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # dotenv .env # virtualenv .venv venv/ ENV/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # IDE settings .vscode/ # mkdocs build dir site/ # logo logo/ poetry.lock .ruff_cache/ ================================================ FILE: AUTHORS.md ================================================ # Credits ## Development Lead * Mahmoud Bahaa ## Contributors None yet. Why not be the first? ## Others * This package was created with [Cookiecutter](https://github.com/audreyr/cookiecutter) and the [zillionare/cookiecutter-pypackage](https://github.com/zillionare/cookiecutter-pypackage) project template. 
* Logo was created using font [GlacialIndifference-Regular](https://hanken.co/product/hk-grotesk/) by [Hanken Design Co.](https://hanken.co/) * Logo icon designed by Mauro Lucchesi ================================================ FILE: CITATION.cff ================================================ cff-version: 1.2.0 title: evalify message: " If you use this software, please cite it using the metadata from this file." type: software authors: - given-names: Mahmoud family-names: Bahaa email: evalify@ma7555.anonaddy.com affiliation: Nile University orcid: "https://orcid.org/0000-0001-8688-6495" doi: 10.5281/zenodo.6181723 date-released: 2022-02-20 ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing Contributions are welcomed, and they are greatly appreciated! Every little bit helps, and credit will always be given. You can contribute in many ways: ## Types of Contributions ### Report Bugs Report bugs at https://github.com/ma7555/evalify/issues. If you are reporting a bug, please include: * Your operating system name and version. * Any details about your local setup that might be helpful in troubleshooting. * Detailed steps to reproduce the bug. ### Fix Bugs Look through the GitHub issues for bugs. Anything tagged with "bug" and "help wanted" is open to whoever wants to implement it. ### Implement Features Look through the GitHub issues for features. Anything tagged with "enhancement" and "help wanted" is open to whoever wants to implement it. ### Write Documentation evalify could always use more documentation, whether as part of the official evalify docs, in docstrings, or even on the web in blog posts, articles, and such. ### Submit Feedback The best way to send feedback is to file an issue at https://github.com/ma7555/evalify/issues. If you are proposing a feature: * Explain in detail how it would work. * Keep the scope as narrow as possible, to make it easier to implement. 
* Remember that this is a volunteer-driven project, and that contributions are welcome :) ## Get Started! Ready to contribute? Here's how to set up `evalify` for local development. 1. Fork the `evalify` repo on GitHub. 2. Clone your fork locally ```bash git clone git@github.com:your_name_here/evalify.git ``` 3. Ensure [poetry](https://python-poetry.org/docs/) is installed. 4. Install dependencies and start your virtualenv: ```bash poetry install -E test -E doc -E dev ``` 5. Create a branch for local development: ```bash git checkout -b name-of-your-bugfix-or-feature ``` Now you can make your changes locally. 6. When you're done making changes, check that your changes pass the tests, including testing other Python versions, with tox: ```bash tox ``` 7. Commit your changes and push your branch to GitHub: ```bash git add . git commit -m "Your detailed description of your changes." git push origin name-of-your-bugfix-or-feature ``` 8. Submit a pull request through the GitHub website. ## Pull Request Guidelines Before you submit a pull request, check that it meets these guidelines: 1. The pull request should include tests. 2. If the pull request adds functionality, the docs should be updated. Put your new functionality into a function with a docstring, and add the feature to the list in README.md. 3. The pull request should work for Python 3.9, 3.10, 3.11, 3.12 and for PyPy. Check https://github.com/ma7555/evalify/actions and make sure that the tests pass for all supported Python versions. ## ```bash python -m unittest ``` or ```bash pytest ``` To run a subset of tests. ## Deploying A reminder for the maintainers on how to deploy. Make sure all your changes are committed (including an entry in HISTORY.md). Then run: ```bash git push git push --tags ``` Github Actions will then deploy to PyPI if tests pass. 
================================================ FILE: HISTORY.md ================================================ # History ## 0.1.0 (2022-02-20) * First release on PyPI. ## 0.1.1 (2022-02-22) * Run time enhancement. ## 0.1.2 (2022-02-23) * Various enhancements and refactoring. ## 0.1.3 (2022-02-24) * Add pearson similarity as a metric ## 0.1.4 (2022-02-24) * Add EER calculation function. * Drop support for python 3.7 ## 1.0.0 (2024-11-08) * Bump dependencies. * Drop support for python 3.8 * Add support for TAR @ FAR ================================================ FILE: LICENSE ================================================ BSD 3-Clause License Copyright (c) 2022, Mahmoud Bahaa All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: README.md ================================================ # evalify

Logo

License DOI Python 3.7 | 3.8 | 3.9 | 3 Release Status CI Status Documentation Status Code style: Ruff PyPI Downloads/Month

**Evaluate Biometric Authentication Models Literally in Seconds.** ## Installation #### Stable release: ```bash pip install evalify ``` #### Bleeding edge: ```bash pip install git+https://github.com/ma7555/evalify.git ``` ## Used for Evaluating all biometric authentication models, where the model output is high-level embeddings, known as feature vectors for visual or behaviour biometrics or d-vectors for auditory biometrics. ## Usage ```python import numpy as np from evalify import Experiment rng = np.random.default_rng() nphotos = 500 emb_size = 32 nclasses = 10 X = rng.random((nphotos, emb_size)) y = rng.integers(nclasses, size=nphotos) experiment = Experiment() experiment.run(X, y) print(experiment.roc_auc()) print(experiment.threshold_at_fpr(0.01)) ``` ## How it works * When you run an experiment, evalify tries all the possible combinations between individuals for authentication based on the `X` and `y` parameters and returns the results including FPR, TPR, FNR, TNR and ROC AUC. `X` is an array of embeddings and `y` is an array of corresponding targets. * Evalify can find the optimal threshold based on your agreed FPR and desired similarity or distance metric. ## Documentation: * ## Features * Blazing fast implementation for metrics calculation through optimized einstein sum and vectorized calculations. * Many operations are dispatched to canonical BLAS, cuBLAS, or other specialized routines. * Smart sampling options using direct indexing from pre-calculated arrays with total control over sampling strategy and sampling numbers. * Supports most evaluation metrics: - `cosine_similarity` - `pearson_similarity` - `cosine_distance` - `euclidean_distance` - `euclidean_distance_l2` - `minkowski_distance` - `manhattan_distance` - `chebyshev_distance` * Computation time for 4 metrics 4.2 million samples experiment is **24 seconds vs 51 minutes** if looping using `scipy.spatial.distance` implementations. 
## TODO * Safer memory allocation. I did not have issues but if you ran out of memory please manually set the `batch_size` argument. ## Contribution * Contributions are welcomed, and they are greatly appreciated! Every little bit helps, and credit will always be given. * Please check [CONTRIBUTING.md](https://github.com/ma7555/evalify/blob/main/CONTRIBUTING.md) for guidelines. ## Citation * If you use this software, please cite it using the metadata from [CITATION.cff](https://github.com/ma7555/evalify/blob/main/CITATION.cff) ================================================ FILE: codecov.yml ================================================ coverage: status: project: default: target: 90% patch: default: target: 85% ================================================ FILE: docs/api.md ================================================ ::: evalify.evalify handler: python ================================================ FILE: docs/authors.md ================================================ {% include-markdown "../AUTHORS.md" %} ================================================ FILE: docs/contributing.md ================================================ {% include-markdown "../CONTRIBUTING.md" %} ================================================ FILE: docs/history.md ================================================ {% include-markdown "../HISTORY.md" %} ================================================ FILE: docs/index.md ================================================ {% include-markdown "../README.md" %} ================================================ FILE: docs/installation.md ================================================ # Installation ## Stable release To install evalify, run this command in your terminal: ```bash pip install evalify ``` This is the preferred method to install evalify, as it will always install the most recent stable release. If you don't have [pip][] installed, this [Python installation guide][] can guide you through the process. 
## From source The source for evalify can be downloaded from the [Github repo][]. You can either clone the public repository: ```bash git clone https://github.com/ma7555/evalify ``` Or download the [tarball][]: ```bash curl -OJL https://github.com/ma7555/evalify/tarball/main ``` Once you have a copy of the source, you can install it with: ```bash pip install . ``` [pip]: https://pip.pypa.io [Python installation guide]: http://docs.python-guide.org/en/latest/starting/installation/ [Github repo]: https://github.com/ma7555/evalify [tarball]: https://github.com/ma7555/evalify/tarball/main ================================================ FILE: docs/usage.md ================================================ # Usage To use evalify in a project: ```python import numpy as np from evalify import Experiment rng = np.random.default_rng() nphotos = 500 emb_size = 32 nclasses = 10 X = rng.random((nphotos, emb_size)) y = rng.integers(nclasses, size=nphotos) experiment = Experiment() experiment.run(X, y) print(experiment.roc_auc()) ``` For a working experiment using real face embeddings, please refer to `LFW.py` under `./examples`. ```bash python ./examples/LFW.py ``` ``` Total available embeddings 2921 resulted in 4264660 samples for the experiment. 
Metrics calculations executed in 24.05 seconds ROC AUC: OrderedDict([('euclidean_distance', 0.9991302819624498), ('cosine_distance', 0.9991302818953706), ('euclidean_distance_l2', 0.9991302818953706), ('manhattan_distance', 0.9991260462584446)]) ``` ================================================ FILE: evalify/__init__.py ================================================ """Top-level package for evalify.""" from evalify.evalify import Experiment as Experiment __author__ = """Mahmoud Bahaa""" __email__ = "evalify@ma7555.anonaddy.com" __version__ = "0.1.0" ================================================ FILE: evalify/evalify.py ================================================ """Evalify main module used for creating the verification experiments. Creates experiments with embedding pairs to compare for face verification tasks including positive pairs, negative pairs and metrics calculations using a very optimized einstein sum. Many operations are dispatched to canonical BLAS, cuBLAS, or other specialized routines. Extremely large arrays are split into smaller batches, every batch would consume the roughly the maximum available memory. Typical usage example: ``` experiment = Experiment() experiment.run(X, y) ``` """ import itertools import sys from collections import OrderedDict from typing import Any, List, Optional, Sequence, Tuple, Union import numpy as np import pandas as pd from sklearn.metrics import auc, confusion_matrix, roc_curve from evalify.metrics import ( DISTANCE_TO_SIMILARITY, METRICS_NEED_NORM, METRICS_NEED_ORDER, REVERSE_DISTANCE_TO_SIMILARITY, metrics_caller, ) from evalify.utils import _validate_vectors, calculate_best_batch_size StrOrInt = Union[str, int] StrIntSequence = Union[str, int, Sequence[Union[str, int]]] class Experiment: """Defines an experiment for evalifying. Args: metrics: The list of metrics to use. 
Can be one or more of the following: `cosine_similarity`, `pearson_similarity`, `cosine_distance`, `euclidean_distance`, `euclidean_distance_l2`, `minkowski_distance`, `manhattan_distance` and `chebyshev_distance` same_class_samples: - 'full': Samples all possible images within each class to create all all possible positive pairs. - int: Samples specific number of images for every class to create nC2 pairs where n is passed integer. different_class_samples: - 'full': Samples one image from every class with all possible pairs of different classes. This can grow exponentially as the number of images increase. (N, M) = (1, "full") - 'minimal': Samples one image from every class with one image of all other classes. (N, M) = (1, 1). (Default) - int: Samples one image from every class with provided number of images of every other class. - tuple or list: (N, M) Samples N images from every class with M images of every other class. seed: Optional random seed for reproducibility. Notes: - `same_class_samples`: If the provided number is greater than the achievable for the class, the maximum possible combinations are used. - `different_class_samples`: If the provided number is greater than the achievable for the class, the maximum possible combinations are used. (N, M) can also be ('full', 'full') but this will calculate all possible combinations between all posibile negative samples. If the dataset is not small this will probably result in an extremely large array!. 
""" def __init__( self, metrics: Union[str, Sequence[str]] = "cosine_similarity", same_class_samples: StrOrInt = "full", different_class_samples: StrIntSequence = "minimal", seed: Optional[int] = None, ) -> None: self.experiment_success = False self.cached_predicted_as_similarity = {} self.metrics = (metrics,) if isinstance(metrics, str) else metrics self.same_class_samples = same_class_samples self.different_class_samples = different_class_samples self.seed = seed def __call__(self, *args: Any, **kwds: Any) -> Any: return self.run(*args, **kwds) @staticmethod def _validate_args( metrics: Sequence[str], same_class_samples: StrOrInt, different_class_samples: StrIntSequence, batch_size: Optional[StrOrInt], p, ) -> None: """Validates passed arguments to Experiment.run() method.""" if same_class_samples != "full" and not isinstance(same_class_samples, int): msg = ( "`same_class_samples` argument must be one of 'full' or an integer " f"Received: same_class_samples={same_class_samples}" ) raise ValueError( msg, ) if different_class_samples not in ("full", "minimal"): if not isinstance(different_class_samples, (int, list, tuple)): msg = ( "`different_class_samples` argument must be one of 'full', " "'minimal', an integer, a list or tuple of integers or keyword " "'full'." f"Received: different_class_samples={different_class_samples}." ) raise ValueError( msg, ) if isinstance(different_class_samples, (list, tuple)) and ( not ( all( isinstance(i, int) or i == "full" for i in different_class_samples ) ) or (len(different_class_samples)) != 2 ): msg = ( "When passing `different_class_samples` as a tuple or list, " "elements must be exactly two of integer type or keyword 'full' " "(N, M). " f"Received: different_class_samples={different_class_samples}." 
) raise ValueError( msg, ) if ( batch_size != "best" and not isinstance(batch_size, int) and batch_size is not None ): msg = ( '`batch_size` argument must be either "best" or of type integer ' f"Received: batch_size={batch_size} with type {type(batch_size)}." ) raise ValueError( msg, ) if any(metric not in metrics_caller for metric in metrics): msg = ( f"`metric` argument must be one of {tuple(metrics_caller.keys())} " f"Received: metric={metrics}" ) raise ValueError( msg, ) if p < 1: msg = f"`p` must be an int and at least 1. Received: p={p}" raise ValueError(msg) def _get_pairs( self, y, same_class_samples, different_class_samples, target, ) -> List[Tuple]: """Generates experiment pairs.""" same_ixs_full = np.argwhere(y == target).ravel() if isinstance(same_class_samples, int): same_class_samples = min(len(same_ixs_full), same_class_samples) same_ixs = self.rng.choice(same_ixs_full, same_class_samples) elif same_class_samples == "full": same_ixs = same_ixs_full same_pairs = itertools.combinations(same_ixs, 2) same_pairs = [(a, b, target, target, 1) for a, b in same_pairs] different_ixs = np.argwhere(y != target).ravel() diff_df = pd.DataFrame( data={"sample_idx": different_ixs, "target": y[different_ixs]}, ) diff_df = diff_df.sample(frac=1, random_state=self.seed) if different_class_samples in ["full", "minimal"] or isinstance( different_class_samples, int, ): N = 1 if different_class_samples == "minimal": diff_df = diff_df.drop_duplicates(subset=["target"]) else: N, M = different_class_samples N = len(same_ixs_full) if N == "full" else min(N, len(same_ixs_full)) if M != "full": diff_df = ( diff_df.groupby("target") .apply(lambda x: x[:M], include_groups=False) .droplevel(0) ) different_ixs = diff_df.sample_idx.to_numpy() different_pairs = itertools.product( self.rng.choice(same_ixs_full, N, replace=False), different_ixs, ) different_pairs = [(a, b, target, y[b], 0) for a, b in different_pairs if a < b] return same_pairs + different_pairs def run( self, X: 
np.ndarray, y: np.ndarray, batch_size: Optional[StrOrInt] = "best", shuffle: bool = False, return_embeddings: bool = False, p: int = 3, ) -> pd.DataFrame: """Runs an experiment for face verification Args: X: Embeddings array y: Targets for X as integers batch_size: - 'best': Let the program decide based on available memory such that every batch will fit into the available memory. (Default) - int: Manually decide the batch_size. - None: No batching. All experiment and intermediate results must fit entirely into memory or a MemoryError will be raised. shuffle: Shuffle the returned experiment dataframe. Default: False. return_embeddings: Whether to return the embeddings instead of indexes. Default: False p: The order of the norm of the difference. Should be `p >= 1`, Only valid with minkowski_distance as a metric. Default = 3. Returns: pandas.DataFrame: A DataFrame representing the experiment results. Raises: ValueError: An error occurred with the provided arguments. """ self._validate_args( self.metrics, self.same_class_samples, self.different_class_samples, batch_size, p, ) X, y = _validate_vectors(X, y) all_targets = np.unique(y) all_pairs = [] metric_fns = list(map(metrics_caller.get, self.metrics)) self.rng = np.random.default_rng(self.seed) for target in all_targets: all_pairs += self._get_pairs( y, self.same_class_samples, self.different_class_samples, target, ) self.df = pd.DataFrame( data=all_pairs, columns=["emb_a", "emb_b", "target_a", "target_b", "target"], ) experiment_size = len(self.df) if shuffle: self.df = self.df.sample(frac=1, random_state=self.seed) if batch_size == "best": batch_size = calculate_best_batch_size(X) elif batch_size is None: batch_size = experiment_size kwargs = {} if any(metric in METRICS_NEED_NORM for metric in self.metrics): kwargs["norms"] = np.linalg.norm(X, axis=1) if any(metric in METRICS_NEED_ORDER for metric in self.metrics): kwargs["p"] = p emb_a = self.df.emb_a.to_numpy() emb_b = self.df.emb_b.to_numpy() emb_a_s = 
np.array_split(emb_a, np.ceil(experiment_size / batch_size)) emb_b_s = np.array_split(emb_b, np.ceil(experiment_size / batch_size)) for metric, metric_fn in zip(self.metrics, metric_fns): self.df[metric] = np.hstack( [metric_fn(X, i, j, **kwargs) for i, j in zip(emb_a_s, emb_b_s)], ) if return_embeddings: self.df["emb_a"] = X[emb_a].tolist() self.df["emb_b"] = X[emb_b].tolist() self.experiment_success = True return self.df def find_optimal_cutoff(self) -> dict: """Finds the optimal cutoff threshold for each metric based on the ROC curve. This function calculates the optimal threshold for each metric by finding the point on the Receiver Operating Characteristic (ROC) curve where the difference between the True Positive Rate (TPR) and the False Positive Rate (FPR) is minimized. Returns: dict: A dictionary with metrics as keys and their corresponding optimal threshold as values. """ self.check_experiment_run() self.optimal_cutoff = {} for metric in self.metrics: fpr, tpr, threshold = roc_curve(self.df["target"], self.df[metric]) i = np.arange(len(tpr)) roc = pd.DataFrame( { "tf": pd.Series(tpr - (1 - fpr), index=i), "threshold": pd.Series(threshold, index=i), }, ) roc_t = roc.iloc[(roc.tf - 0).abs().argsort()[:1]] self.optimal_cutoff[metric] = roc_t["threshold"].item() return self.optimal_cutoff def threshold_at_fpr(self, fpr: float) -> dict: """Find the threshold at a specified False Positive Rate (FPR) for each metric. The function calculates the threshold at the specified FPR for each metric by using the Receiver Operating Characteristic (ROC) curve. If the desired FPR is 0 or 1, or no exact match is found, the closest thresholds are used. Args: fpr (float): Desired False Positive Rate. Must be between 0 and 1. Returns: dict: A dictionary where keys are the metrics and values are dictionaries containing FPR, TPR, and threshold at the specified FPR. Raises: ValueError: If the provided `fpr` is not between 0 and 1. 
def evaluate_at_threshold(self, threshold: float, metric: str) -> dict:
    """Evaluate performance of a single metric at a specific threshold.

    Args:
        threshold: Cut-off threshold.
        metric: Metric to use. Must be one of the metrics the experiment
            was run with.

    Returns:
        dict: A dict containing all evaluation metrics for `metric`:
            TPR, TNR, PPV, NPV, FPR, FNR, FDR, FOR and F1.

    Raises:
        NotImplementedError: If no experiment has been run yet.
        ValueError: If `metric` was not used while running the experiment.
    """
    self.metrics_evaluation = {}
    self.check_experiment_run(metric)

    # Evaluate only the requested metric. Looping over every experiment
    # metric here would shadow the `metric` argument and return the
    # evaluation of whichever metric happens to be last.
    predicted = self.get_binary_prediction(metric, threshold)
    # Explicit labels keep the matrix 2x2 even when the targets or the
    # predictions contain a single class, so the unpacking below is safe.
    cm = confusion_matrix(self.df["target"], predicted, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    TPR = tp / (tp + fn)  # recall / true positive rate
    TNR = tn / (tn + fp)  # true negative rate
    PPV = tp / (tp + fp)  # precision / positive predicted value
    NPV = tn / (tn + fn)  # negative predictive value
    FPR = fp / (fp + tn)  # false positive rate
    FNR = 1 - TPR  # false negative rate
    FDR = 1 - PPV  # false discovery rate
    FOR = 1 - NPV  # false omission rate
    F1 = 2 * (PPV * TPR) / (PPV + TPR)

    evaluation = {
        "TPR": TPR,
        "TNR": TNR,
        "PPV": PPV,
        "NPV": NPV,
        "FPR": FPR,
        "FNR": FNR,
        "FDR": FDR,
        "FOR": FOR,
        "F1": F1,
    }
    # Keep the latest evaluation available on the instance, keyed by metric.
    self.metrics_evaluation[metric] = evaluation
    return evaluation
def eer(self) -> OrderedDict:
    """Calculates the Equal Error Rate (EER) for each metric.

    The EER is taken at the ROC point where the False Negative Rate and the
    False Positive Rate are closest, and reported as their average. For
    distance metrics the threshold is converted back from the similarity
    scale to the original distance scale.

    The result is stored on ``self.eer_results``. It is deliberately not
    stored on ``self.eer``, because that would replace this method on the
    instance and make a second call fail.

    Returns:
        OrderedDict: A dictionary containing the EER value and threshold for
            each metric. The metrics are sorted in ascending order based on
            the EER values.
            Example: {'metric1': {'EER': 0.123, 'threshold': 0.456}, ...}

    Raises:
        NotImplementedError: If the experiment has not been run yet.
    """
    self.check_experiment_run()
    results = {}
    for metric in self.metrics:
        predicted = self.predicted_as_similarity(metric)
        fpr, tpr, thresholds = roc_curve(
            self.df["target"],
            predicted,
            pos_label=1,
            drop_intermediate=False,
        )
        fnr = 1 - tpr
        # Index of the operating point where FNR and FPR are closest.
        eer_ix = np.nanargmin(np.absolute(fnr - fpr))
        eer_threshold = thresholds[eer_ix].item()
        if metric in REVERSE_DISTANCE_TO_SIMILARITY:
            eer_threshold = REVERSE_DISTANCE_TO_SIMILARITY[metric](eer_threshold)
        results[metric] = {
            "EER": (fpr[eer_ix].item() + fnr[eer_ix].item()) / 2,
            "threshold": eer_threshold,
        }
    self.eer_results = OrderedDict(
        sorted(results.items(), key=lambda item: item[1]["EER"]),
    )
    return self.eer_results
def _inner1d(A, B):
    """Compute the row-wise inner product of two arrays of vectors.

    Args:
        A (numpy.ndarray): 2D array of shape (n_samples, n_features)
        B (numpy.ndarray): 2D array of shape (n_samples, n_features)

    Returns:
        numpy.ndarray: 1D array of shape (n_samples,) whose i-th element is
            the inner product of row i of A with row i of B.
    """
    # Multiply matching elements and sum over the feature axis only.
    rowwise_dot = "ij,ij->i"
    return np.einsum(rowwise_dot, A, B, optimize="optimal")
def euclidean_distance_l2(embs, ix, iy, norms, **kwargs):
    """Compute the Euclidean distance between L2-normalized vector pairs.

    Args:
        embs (numpy.ndarray): 2D array of shape (n_samples, n_features).
        ix (numpy.ndarray): 1D array of row indices selecting the first
            vector of every pair.
        iy (numpy.ndarray): 1D array of row indices selecting the second
            vector of every pair.
        norms (numpy.ndarray): 1D array holding the L2 norm of each row in
            embs.

    Returns:
        numpy.ndarray: 1D array with one distance per (ix, iy) pair, measured
            after scaling both vectors to unit length.
    """
    # Turn the selected norms into column vectors so they divide row by row.
    unit_first = embs[ix] / norms[ix][:, np.newaxis]
    unit_second = embs[iy] / norms[iy][:, np.newaxis]
    return np.linalg.norm(unit_first - unit_second, axis=1)
def pearson_similarity(embs, ix, iy, **kwargs):
    """Compute the Pearson correlation coefficient for pairs of vectors.

    Args:
        embs (numpy.ndarray): 2D array of shape (n_samples, n_features)
        ix (numpy.ndarray): 1D array of row indices selecting the first
            vector of every pair
        iy (numpy.ndarray): 1D array of row indices selecting the second
            vector of every pair

    Returns:
        numpy.ndarray: 1D array with one Pearson correlation coefficient per
            (ix, iy) pair.
    """
    first, second = embs[ix], embs[iy]
    # Centre every vector around its own mean.
    first_centered = first - first.mean(axis=1, keepdims=True)
    second_centered = second - second.mean(axis=1, keepdims=True)
    # Per-row sums of squares, kept as (n, 1) columns for _inner1d.
    first_ss = (first_centered**2).sum(axis=1, keepdims=True)
    second_ss = (second_centered**2).sum(axis=1, keepdims=True)
    covariance = _inner1d(first_centered, second_centered)
    return covariance / np.sqrt(_inner1d(first_ss, second_ss))
def calculate_best_batch_size(X, available_mem=None):
    """Calculate maximum rows to fetch per batch without going out of memory.

    We need 3 big arrays to be held in memory (A, B, A*B).

    When more than 2 GiB is available, 1 GiB is kept as headroom and the
    remaining bytes are split across the 3 arrays. Otherwise all available
    bytes are used with a more conservative divisor of 5.

    Args:
        X (numpy.ndarray): 2D embeddings array; ``X[0].nbytes`` is used as
            the size of one row in bytes.
        available_mem (int, optional): Memory budget in bytes. Defaults to
            the memory reported by ``_calc_available_memory()``.

    Returns:
        numpy.float64: Number of rows per batch, never less than 1 so that
            callers can safely divide by it.
    """
    if available_mem is None:
        available_mem = _calc_available_memory()
    row_bytes = X[0].nbytes
    if available_mem > 2 * GB_TO_BYTE:
        # Subtract the headroom in bytes *before* converting bytes to rows.
        max_total_rows = np.floor((available_mem - GB_TO_BYTE) / row_bytes)
        return np.maximum(max_total_rows // 3, 1.0)
    max_total_rows = np.floor(available_mem / row_bytes)
    return np.maximum(max_total_rows // 5, 1.0)
) print(f"Metrics calculations executed in {time.time()-start_time:.2f} seconds") print("ROC AUC:") print(experiment.roc_auc()) print("threshold @ FPR:") print(experiment.threshold_at_fpr(0.01)) print("EER:") print(experiment.eer()) print("TAR@FAR:") print(experiment.tar_at_far([0.01, 0.001])) ================================================ FILE: mkdocs.yml ================================================ site_name: evalify repo_url: https://github.com/ma7555/evalify repo_name: evalify nav: - home: index.md - installation: installation.md - usage: usage.md - modules: api.md - contributing: contributing.md - authors: authors.md - history: history.md theme: name: material language: en logo: https://user-images.githubusercontent.com/7144929/154332210-fa1fee34-faae-4567-858a-49fa53e99a2b.svg palette: - media: "(prefers-color-scheme: light)" scheme: default toggle: icon: material/weather-night name: Switch to dark mode - media: "(prefers-color-scheme: dark)" scheme: slate toggle: icon: material/weather-sunny name: Switch to light mode features: - navigation.indexes - navigation.tabs - navigation.instant - navigation.tabs.sticky markdown_extensions: - pymdownx.emoji: emoji_index: !!python/name:material.extensions.emoji.twemoji emoji_generator: !!python/name:material.extensions.emoji.to_svg - pymdownx.critic - pymdownx.caret - pymdownx.mark - pymdownx.tilde - pymdownx.tabbed - attr_list - pymdownx.arithmatex: generic: true - pymdownx.highlight: linenums: true - pymdownx.superfences - pymdownx.details - admonition - toc: baselevel: 2 permalink: true - meta plugins: - include-markdown - search: lang: en - mkdocstrings extra: social: - icon: fontawesome/brands/github link: https://github.com/ma7555/evalify name: Github - icon: material/email link: "mailto:evalify@ma7555.anonaddy.com" ================================================ FILE: pyproject.toml ================================================ [tool.poetry] name = "evalify" version = "1.0.0" homepage = 
"https://github.com/ma7555/evalify" description = "Evaluate your face or voice verification models literally in seconds." authors = ["Mahmoud Bahaa "] keywords = ["biometric verification", "biometric authentication", "evaluation"] readme = "README.md" license = "BSD-3-Clause" classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Natural Language :: English", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", ] packages = [ { include = "evalify" }, ] [tool.poetry.dependencies] python = ">=3.9,<4.0" pandas = "^2.0.0" numpy = "^2.0.0" psutil = "^5.9.0" scikit-learn = "^1.2.0" # Optional Dependencies ruff = { version = ">=0.7.2", optional = true } pytest = { version = "^7.2.0", optional = true } pytest-cov = { version = "^4.0.0", optional = true } scipy = { version = ">=1.10.0", optional = true } tox = { version = "^4.7.0", optional = true } virtualenv = { version = ">=20.24.0", optional = true } pip = { version = ">=23.2.0", optional = true } mkdocs = { version = ">=1.4.0", optional = true } mkdocs-material = { version = "^9.2.0", optional = true } mkdocstrings = { version = ">=0.26.0", optional = true } mkdocstrings-python = { version = ">=1.12.2", optional = true } mkdocs-include-markdown-plugin = { version = ">=6.0.0", optional = true } twine = { version = "^5.0.0", optional = true } toml = { version = ">0.8.0", optional = true } pyreadline3 = { version = "^3.4.1", optional = true } poetry = { version = "^1.8.0", optional = true } [tool.poetry.extras] test = [ "pytest", "ruff", "pytest-cov", "pyreadline3", "scipy", ] dev = [ "tox", "virtualenv", "pip", "twine", "toml", "poetry", ] doc = [ "mkdocs", "mkdocs-material", "mkdocstrings", "mkdocstrings-python", "mkdocs-include-markdown-plugin", ] [build-system] requires = ["poetry-core>=1.8.0"] 
build-backend = "poetry.core.masonry.api" [tool.ruff] line-length = 88 indent-width = 4 [tool.ruff.lint] select = [ "E", # pycodestyle error "F", # Pyflakes "I", # isort ] dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" [tool.ruff.format] quote-style = "double" [tool.ruff.lint.isort] known-first-party = ["evalify"] ================================================ FILE: tests/__init__.py ================================================ """Unit test package for evalify.""" ================================================ FILE: tests/test_evalify.py ================================================ #!/usr/bin/env python """Tests for `evalify` package.""" import unittest import numpy as np from scipy.special import comb from evalify import Experiment from evalify.metrics import metrics_caller class TestEvalify(unittest.TestCase): """Tests for `evalify` package.""" def setUp(self): """Set up test fixtures, if any.""" rng = np.random.default_rng(555) self.nphotos = 500 self.emb_size = 8 self.nclasses = 10 self.embs = rng.random((self.nphotos, self.emb_size), dtype=np.float32) self.targets = rng.integers(self.nclasses, size=self.nphotos) def test_run_euclidean_distance(self): """Test run with euclidean_distance""" experiment = Experiment(metrics="euclidean_distance") df = experiment.run(self.embs, self.targets) experiment = Experiment(metrics="euclidean_distance_l2") df_l2 = experiment.run(self.embs, self.targets) self.assertGreater(df.euclidean_distance.max(), 0) self.assertGreater(df_l2.euclidean_distance_l2.max(), 0) def test_run_cosine_similarity(self): """Test run with cosine_similarity""" experiment = Experiment(metrics="cosine_similarity") df = experiment.run(self.embs, self.targets) self.assertLessEqual(df.cosine_similarity.max(), 1) def test_run_all_metrics_separated(self): for metric in metrics_caller.keys(): experiment = Experiment(metrics=metric) df = experiment.run(self.embs, self.targets) self.assertTrue(metric in df.columns) def 
def test_run_full_class_samples(self):
    """Test run with full same-class and different-class sampling"""
    experiment = Experiment(
        same_class_samples="full",
        different_class_samples=("full", "full"),
    )
    df = experiment.run(
        self.embs,
        self.targets,
    )
    # Expect one row per unordered pair of photos, i.e. C(nphotos, 2) rows.
    self.assertEqual(len(df), comb(self.nphotos, 2))
experiment.evaluate_at_threshold(0.5, "cosine_similarity") # self.assertEqual(len(evaluations), len(metrics)) self.assertEqual(len(evaluations), 9) def test_run_find_optimal_cutoff(self): """Test run with find_optimal_cutoff""" metrics = ["cosine_similarity", "euclidean_distance_l2"] experiment = Experiment(metrics=metrics) experiment.run( self.embs, self.targets, ) evaluations = experiment.find_optimal_cutoff() self.assertEqual(len(evaluations), len(metrics)) self.assertTrue(all(evaluation in metrics for evaluation in evaluations)) def test_run_get_roc_auc(self): """Test run with get_roc_auc""" metrics = ["cosine_similarity", "euclidean_distance_l2"] experiment = Experiment(metrics=metrics) experiment.run( self.embs, self.targets, ) roc_auc = experiment.roc_auc() # self.assertEqual(len(evaluations), len(metrics)) self.assertEqual(len(roc_auc), len(metrics)) self.assertTrue(all(auc in metrics for auc in roc_auc)) def test_run_predicted_as_similarity(self): """Test run with predicted_as_similarity""" experiment = Experiment(metrics=["cosine_similarity", "cosine_distance"]) experiment.run( self.embs, self.targets, ) result = experiment.predicted_as_similarity("cosine_similarity") result_2 = experiment.predicted_as_similarity("cosine_distance") self.assertTrue(np.allclose(result, result_2)) def test_run_find_threshold_at_fpr(self): """Test run with find_threshold_at_fpr""" metric = "cosine_similarity" experiment = Experiment( metrics=metric, different_class_samples=("full", "full"), ) experiment.run( self.embs, self.targets, ) fpr_d01 = experiment.threshold_at_fpr(0.1) fpr_d1 = experiment.threshold_at_fpr(1) fpr_d0 = experiment.threshold_at_fpr(0) self.assertEqual(len(fpr_d01[metric]), 3) self.assertAlmostEqual(fpr_d01[metric]["threshold"], 0.8939142, 3) self.assertAlmostEqual(fpr_d0[metric]["threshold"], 0.9953355, 3) self.assertAlmostEqual(fpr_d1[metric]["threshold"], 0.2060538, 3) def test_run_calculate_eer(self): """Test run with calculate_eer""" metric = 
"cosine_similarity" experiment = Experiment( metrics=metric, different_class_samples=("full", "full"), ) experiment.run( self.embs, self.targets, ) eer = experiment.eer() self.assertTrue("EER" in eer[metric]) def test__call__(self): """Test run with __call__""" experiment = Experiment(seed=555) result = experiment.run(self.embs, self.targets) result_2 = experiment(self.embs, self.targets) self.assertTrue(np.array_equal(result.to_numpy(), result_2.to_numpy())) def test_run_errors(self): """Test run errors""" with self.assertRaisesRegex( ValueError, "`same_class_samples` argument must be one of 'full' or an integer ", ): experiment = Experiment(same_class_samples=54.4) experiment.run(self.embs, self.targets) with self.assertRaisesRegex( ValueError, "`different_class_samples` argument must be one of 'full', 'minimal'", ): experiment = Experiment(different_class_samples="all") experiment.run(self.embs, self.targets) with self.assertRaisesRegex( ValueError, "When passing `different_class_samples` as a tuple or list. ", ): experiment = Experiment(different_class_samples=(1, 2, 3)) experiment.run( self.embs, self.targets, ) with self.assertRaisesRegex( ValueError, '`batch_size` argument must be either "best" or of type integer', ): experiment = Experiment() experiment.run(self.embs, self.targets, batch_size="all") with self.assertRaisesRegex(ValueError, "`metric` argument must be one of "): experiment = Experiment(metrics="dot_prod") experiment.run(self.embs, self.targets) with self.assertRaisesRegex( ValueError, "`p` must be an int and at least 1. 
Received: p=", ): experiment = Experiment() experiment.run(self.embs, self.targets, p=0.1) with self.assertRaisesRegex( NotImplementedError, "`evaluate_at_threshold` function can only be run after running " "`run_experiment`.", ): experiment = Experiment() experiment.evaluate_at_threshold(0.5, "euclidean_distance") with self.assertRaisesRegex( ValueError, "`evaluate_at_threshold` function can only be called with `metric` from ", ): experiment = Experiment(metrics="euclidean_distance") experiment.run(self.embs, self.targets) experiment.evaluate_at_threshold(0.5, "cosine_similarity") with self.assertRaisesRegex( ValueError, "`fpr` must be between 0 and 1. Received wanted_fpr=", ): experiment = Experiment(metrics="euclidean_distance") experiment.run(self.embs, self.targets) experiment.threshold_at_fpr(-1.1) ================================================ FILE: tests/test_experiment_real_data.py ================================================ # tests/test_experiment_real_data_small.py import os import pathlib import unittest from collections import OrderedDict import numpy as np from evalify import Experiment class TestExperimentRealDataSmall(unittest.TestCase): """Tests for Experiment class using a subset of the LFW dataset""" def setUp(self): """Set up test fixtures.""" # Path to LFW.npz, assuming it's in the tests/data/ directory self.lfw_npz = os.path.join(pathlib.Path(__file__).parent, "data", "LFW.npz") if not os.path.exists(self.lfw_npz): self.fail(f"LFW.npz not found at {self.lfw_npz}") X_y_array = np.load(self.lfw_npz) self.X = X_y_array["X"][:1000] self.y = X_y_array["y"][:1000] self.metrics = [ "cosine_similarity", "pearson_similarity", "euclidean_distance_l2", ] self.experiment = Experiment( metrics=self.metrics, same_class_samples="full", different_class_samples=("full", "full"), seed=555, # To ensure reproducibility ) # Run the experiment once during setup to reuse the results in multiple tests self.df = self.experiment.run(self.X, self.y) def 
test_number_of_samples(self): """Test that the number of generated samples matches the expected count.""" expected_num_samples = 499500 actual_num_samples = len(self.df) self.assertEqual( actual_num_samples, expected_num_samples, f"Expected {expected_num_samples} samples, got {actual_num_samples}.", ) def test_roc_auc(self): """Test that ROC AUC values match the expected results.""" expected_roc_auc = OrderedDict( { "euclidean_distance_l2": 0.9998640116393942, "cosine_similarity": 0.9998640114481793, "pearson_similarity": 0.999858162377461, } ) actual_roc_auc = self.experiment.roc_auc() self.assertEqual( len(actual_roc_auc), len(self.metrics), f"Expected ROC AUC for {len(self.metrics)} metrics, got " f"{len(actual_roc_auc)}.", ) for metric in self.metrics: self.assertIn( metric, actual_roc_auc, f"ROC AUC for metric '{metric}' not found." ) self.assertAlmostEqual( actual_roc_auc[metric], expected_roc_auc[metric], places=6, msg=f"ROC AUC for metric '{metric}' does not match.", ) def test_threshold_at_fpr(self): """Test that thresholds at a specified FPR match expected values.""" far = 0.01 expected_threshold_at_fpr = { "cosine_similarity": { "FPR": 0.010001841326240518, "TPR": 0.9973539973539973, "threshold": 0.37717896699905396, }, "pearson_similarity": { "FPR": 0.010001841326240518, "TPR": 0.9973539973539973, "threshold": 0.37802454829216003, }, "euclidean_distance_l2": { "FPR": 0.010001841326240518, "TPR": 0.9973539973539973, "threshold": 1.1160835027694702, }, } actual_threshold_at_fpr = self.experiment.threshold_at_fpr(far) self.assertEqual( len(actual_threshold_at_fpr), len(self.metrics), f"Expected Threshold @ FPR for {len(self.metrics)} metrics, got " f"{len(actual_threshold_at_fpr)}.", ) for metric in self.metrics: self.assertIn( metric, actual_threshold_at_fpr, f"Threshold @ FPR for metric '{metric}' not found.", ) expected = expected_threshold_at_fpr[metric] actual = actual_threshold_at_fpr[metric] self.assertAlmostEqual( actual["FPR"], expected["FPR"], 
places=6, msg=f"FPR for metric '{metric}' does not match.", ) self.assertAlmostEqual( actual["TPR"], expected["TPR"], places=6, msg=f"TPR for metric '{metric}' does not match.", ) self.assertAlmostEqual( actual["threshold"], expected["threshold"], places=6, msg=f"Threshold for metric '{metric}' at FAR={far} does not match.", ) def test_eer(self): """Test that EER values and thresholds match the expected results.""" expected_eer = OrderedDict( { "cosine_similarity": { "EER": 0.004724863226023654, "threshold": 0.4244731664657593, }, "euclidean_distance_l2": { "EER": 0.004724863226023654, "threshold": 1.0728718042373657, }, "pearson_similarity": { "EER": 0.004914464785693375, "threshold": 0.4228288531303406, }, } ) actual_eer = self.experiment.eer() self.assertEqual( len(actual_eer), len(self.metrics), f"Expected EER for {len(self.metrics)} metrics, got {len(actual_eer)}.", ) for metric in self.metrics: self.assertIn(metric, actual_eer, f"EER for metric '{metric}' not found.") expected = expected_eer[metric] actual = actual_eer[metric] self.assertAlmostEqual( actual["EER"], expected["EER"], places=6, msg=f"EER for metric '{metric}' does not match.", ) self.assertAlmostEqual( actual["threshold"], expected["threshold"], places=6, msg=f"Threshold for EER of metric '{metric}' does not match.", ) def test_tar_at_far(self): """Test the tar_at_far method with specific FAR values.""" # Define FAR values to test far_values = [0.01, 0.001] # Define expected TAR values based on the recent experiment expected_tar_at_far = OrderedDict( { "cosine_similarity": { 0.01: 0.9973539973539973, 0.001: 0.9795879795879796, }, "pearson_similarity": { 0.01: 0.9973539973539973, 0.001: 0.9793989793989794, }, "euclidean_distance_l2": { 0.01: 0.9973539973539973, 0.001: 0.9795879795879796, }, } ) # Call tar_at_far with the FAR values actual_tar_at_far = self.experiment.tar_at_far(far_values) # Assert the returned TAR@FAR matches expected values self.assertEqual( len(actual_tar_at_far), 
len(self.metrics), f"Expected TAR@FAR for {len(self.metrics)} metrics, got " f"{len(actual_tar_at_far)}.", ) for metric in self.metrics: self.assertIn( metric, actual_tar_at_far, f"TAR@FAR for metric '{metric}' not found." ) for far in far_values: self.assertIn( far, actual_tar_at_far[metric], f"TAR@FAR for metric '{metric}' at FAR={far} not found.", ) expected_tar = expected_tar_at_far[metric][far] actual_tar = actual_tar_at_far[metric][far] self.assertAlmostEqual( actual_tar, expected_tar, places=6, msg=f"TAR@FAR for metric '{metric}' at FAR={far} does not match.", ) # if __name__ == '__main__': # unittest.main() ================================================ FILE: tests/test_metrics.py ================================================ #!/usr/bin/env python """Tests for `evalify` package.""" import unittest import numpy as np from scipy.spatial import distance from scipy.stats import pearsonr from evalify import metrics class TestMetrics(unittest.TestCase): """Tests for `evalify` package.""" def setUp(self): """Set up test fixtures, if any.""" rng = np.random.default_rng(555) self.nphotos = 500 self.emb_size = 8 self.slice_size = 100 self.embs = rng.random((self.nphotos, self.emb_size), dtype=np.float32) self.norms = np.linalg.norm(self.embs, axis=1) self.ix = rng.integers(self.nphotos, size=self.slice_size) self.iy = rng.integers(self.nphotos, size=self.slice_size) def test_cosine_similarity(self): """Test cosine_similarity""" result = metrics.cosine_similarity(self.embs, self.ix, self.iy, self.norms) result_2 = 1 - np.array( [ distance.cosine(self.embs[ix], self.embs[iy]) for (ix, iy) in zip(self.ix, self.iy) ], ) self.assertEqual(result.shape, (self.slice_size,)) self.assertTrue(np.allclose(result, result_2)) def test_pearson_similarity(self): """Test pearson_similarity""" result = metrics.pearson_similarity(self.embs, self.ix, self.iy) result_2 = np.array( [ pearsonr(self.embs[ix], self.embs[iy])[0] for (ix, iy) in zip(self.ix, self.iy) ], ) 
def test_minkowski_distance_distance(self):
    """Test minkowski_distance with p=3 against scipy's reference result"""
    result = metrics.metrics_caller.get("minkowski_distance")(
        self.embs,
        self.ix,
        self.iy,
        p=3,
    )
    # Reference values computed pair by pair with scipy.
    result_2 = np.array(
        [
            distance.minkowski(self.embs[ix], self.embs[iy], p=3)
            for (ix, iy) in zip(self.ix, self.iy)
        ],
    )
    self.assertEqual(result.shape, (self.slice_size,))
    self.assertTrue(np.allclose(result, result_2))
self.assertEqual(result.shape, (self.slice_size,)) self.assertTrue(np.allclose(result, result_2)) ================================================ FILE: tests/test_utils.py ================================================ #!/usr/bin/env python """Tests for `evalify` package.""" import unittest import numpy as np from evalify import utils class TestUtils(unittest.TestCase): """Tests for `evalify` package.""" def setUp(self): """Set up test fixtures, if any.""" self.rng = np.random.default_rng(555) self.nphotos = 100 self.emb_size = 8 self.nclasses = 10 self.embs = self.rng.random((self.nphotos, self.emb_size), dtype=np.float32) self.targets = self.rng.integers(self.nclasses, size=self.nphotos) def tearDown(self): """Tear down test fixtures, if any.""" def test_validate_vectors(self): """Test _validate_vectors""" embs = self.embs.tolist() targets = self.targets.tolist() X, y = utils._validate_vectors(embs, targets) self.assertEqual(X.shape, (self.nphotos, self.emb_size)) self.assertEqual(y.shape, (self.nphotos,)) def test_calculate_best_batch_size(self): """Test calculate_best_batch_size""" batch_size = utils.calculate_best_batch_size(self.embs, 4 * utils.GB_TO_BYTE) self.assertEqual(batch_size, 1420470954) def test_run_errors(self): """Test run errors""" with self.assertRaisesRegex(ValueError, "Embeddings vector should be 2-D."): _ = utils._validate_vectors( X=self.rng.random(5), y=self.rng.integers(10, size=5), ) with self.assertRaisesRegex(ValueError, "Target vector should be 1-D."): _ = utils._validate_vectors( X=self.rng.random((5, 5)), y=self.rng.integers(10, size=(5, 2)), ) ================================================ FILE: tox.ini ================================================ [tox] isolated_build = true envlist = py39, py310, py311, py312, lint [gh-actions] python = 3.12: py312 3.11: py311 3.10: py310 3.9: py39 [testenv:lint] allowlist_externals = python deps = .[test, doc, dev] commands = python -m ruff check evalify tests --fix python -m poetry build 
python -m mkdocs build python -m twine check dist/* [testenv] allowlist_externals = pytest setenv = PYTHONPATH = {toxinidir} PYTHONWARNINGS = ignore deps = .[test] commands = pytest -s --cov=evalify --cov-append --cov-report=xml --cov-report term-missing tests