[
  {
    "path": ".gitchangelog.rc",
    "content": "# -*- coding: utf-8; mode: python -*-\n##\n## Format\n##\n##   ACTION: [AUDIENCE:] COMMIT_MSG [!TAG ...]\n##\n## Description\n##\n##   ACTION is one of 'chg', 'fix', 'new'\n##\n##       Is WHAT the change is about.\n##\n##       'chg' is for refactor, small improvement, cosmetic changes...\n##       'fix' is for bug fixes\n##       'new' is for new features, big improvement\n##\n##   AUDIENCE is optional and one of 'dev', 'usr', 'pkg', 'test', 'doc'\n##\n##       Is WHO is concerned by the change.\n##\n##       'dev'  is for developpers (API changes, refactors...)\n##       'usr'  is for final users (UI changes)\n##       'pkg'  is for packagers   (packaging changes)\n##       'test' is for testers     (test only related changes)\n##       'doc'  is for doc guys    (doc only changes)\n##\n##   COMMIT_MSG is ... well ... the commit message itself.\n##\n##   TAGs are additionnal adjective as 'refactor' 'minor' 'cosmetic'\n##\n##       They are preceded with a '!' or a '@' (prefer the former, as the\n##       latter is wrongly interpreted in github.) Commonly used tags are:\n##\n##       'refactor' is obviously for refactoring code only\n##       'minor' is for a very meaningless change (a typo, adding a comment)\n##       'cosmetic' is for cosmetic driven change (re-indentation, 80-col...)\n##       'wip' is for partial functionality but complete subfunctionality.\n##\n## Example:\n##\n##   new: usr: support of bazaar implemented\n##   chg: re-indentend some lines !cosmetic\n##   new: dev: updated code to be compatible with last version of killer lib.\n##   fix: pkg: updated year of licence coverage.\n##   new: test: added a bunch of test around user usability of feature X.\n##   fix: typo in spelling my name in comment. !minor\n##\n##   Please note that multi-line commit message are supported, and only the\n##   first line will be considered as the \"summary\" of the commit message. So\n##   tags, and other rules only applies to the summary.  
The body of the commit\n##   message will be displayed in the changelog without reformatting.\n\n\n##\n## ``ignore_regexps`` is a line of regexps\n##\n## Any commit having its full commit message matching any regexp listed here\n## will be ignored and won't be reported in the changelog.\n##\nignore_regexps = [\n    r'@minor', r'!minor',\n    r'@cosmetic', r'!cosmetic',\n    r'@refactor', r'!refactor',\n    r'@wip', r'!wip',\n    r'^([cC]hg|[fF]ix|[nN]ew)\\s*:\\s*[p|P]kg:',\n    r'^([cC]hg|[fF]ix|[nN]ew)\\s*:\\s*[d|D]ev:',\n    r'^(.{3,3}\\s*:)?\\s*[fF]irst commit.?\\s*$',\n    r'^$',  ## ignore commits with empty messages\n]\n\n\n## ``section_regexps`` is a list of 2-tuples associating a string label and a\n## list of regexp\n##\n## Commit messages will be classified in sections thanks to this. Section\n## titles are the label, and a commit is classified under this section if any\n## of the regexps associated is matching.\n##\n## Please note that ``section_regexps`` will only classify commits and won't\n## make any changes to the contents. 
So you'll probably want to go check\n## ``subject_process`` (or ``body_process``) to do some changes to the subject,\n## whenever you are tweaking this variable.\n##\nsection_regexps = [\n    ('New', [\n        r'^[nN]ew\\s*:\\s*((dev|use?r|pkg|test|doc)\\s*:\\s*)?([^\\n]*)$',\n     ]),\n    ('Changes', [\n        r'^[cC]hg\\s*:\\s*((dev|use?r|pkg|test|doc)\\s*:\\s*)?([^\\n]*)$',\n     ]),\n    ('Fix', [\n        r'^[fF]ix\\s*:\\s*((dev|use?r|pkg|test|doc)\\s*:\\s*)?([^\\n]*)$',\n     ]),\n\n    ('Other', None ## Match all lines\n     ),\n\n]\n\n\n## ``body_process`` is a callable\n##\n## This callable will be given the original body and result will\n## be used in the changelog.\n##\n## Available constructs are:\n##\n##   - any python callable that take one txt argument and return txt argument.\n##\n##   - ReSub(pattern, replacement): will apply regexp substitution.\n##\n##   - Indent(chars=\"  \"): will indent the text with the prefix\n##     Please remember that template engines gets also to modify the text and\n##     will usually indent themselves the text if needed.\n##\n##   - Wrap(regexp=r\"\\n\\n\"): re-wrap text in separate paragraph to fill 80-Columns\n##\n##   - noop: do nothing\n##\n##   - ucfirst: ensure the first letter is uppercase.\n##     (usually used in the ``subject_process`` pipeline)\n##\n##   - final_dot: ensure text finishes with a dot\n##     (usually used in the ``subject_process`` pipeline)\n##\n##   - strip: remove any spaces before or after the content of the string\n##\n##   - SetIfEmpty(msg=\"No commit message.\"): will set the text to\n##     whatever given ``msg`` if the current text is empty.\n##\n## Additionally, you can `pipe` the provided filters, for instance:\n#body_process = Wrap(regexp=r'\\n(?=\\w+\\s*:)') | Indent(chars=\"  \")\n#body_process = Wrap(regexp=r'\\n(?=\\w+\\s*:)')\n#body_process = noop\nbody_process = ReSub(r'((^|\\n)[A-Z]\\w+(-\\w+)*: .*(\\n\\s+.*)*)+$', r'') | strip\n\n\n## ``subject_process`` is a 
callable\n##\n## This callable will be given the original subject and result will\n## be used in the changelog.\n##\n## Available constructs are those listed in ``body_process`` doc.\nsubject_process = (strip |\n    ReSub(r'^([cC]hg|[fF]ix|[nN]ew)\\s*:\\s*((dev|use?r|pkg|test|doc)\\s*:\\s*)?([^\\n@]*)(@[a-z]+\\s+)*$', r'\\4') |\n    SetIfEmpty(\"No commit message.\") | ucfirst | final_dot)\n\n\n## ``tag_filter_regexp`` is a regexp\n##\n## Tags that will be used for the changelog must match this regexp.\n##\ntag_filter_regexp = r'^[0-9]+\\.[0-9]+(\\.[0-9]+)?$'\n\n\n## ``unreleased_version_label`` is a string or a callable that outputs a string\n##\n## This label will be used as the changelog Title of the last set of changes\n## between last valid tag and HEAD if any.\nunreleased_version_label = \"(unreleased)\"\n\n\n## ``output_engine`` is a callable\n##\n## This will change the output format of the generated changelog file\n##\n## Available choices are:\n##\n##   - rest_py\n##\n##        Legacy pure python engine, outputs ReSTructured text.\n##        This is the default.\n##\n##   - mustache(<template_name>)\n##\n##        Template name could be any of the available templates in\n##        ``templates/mustache/*.tpl``.\n##        Requires python package ``pystache``.\n##        Examples:\n##           - mustache(\"markdown\")\n##           - mustache(\"restructuredtext\")\n##\n##   - makotemplate(<template_name>)\n##\n##        Template name could be any of the available templates in\n##        ``templates/mako/*.tpl``.\n##        Requires python package ``mako``.\n##        Examples:\n##           - makotemplate(\"restructuredtext\")\n##\noutput_engine = rest_py\n#output_engine = mustache(\"restructuredtext\")\n#output_engine = mustache(\"markdown\")\n#output_engine = makotemplate(\"restructuredtext\")\n\n\n## ``include_merge`` is a boolean\n##\n## This option tells git-log whether to include merge commits in the log.\n## The default is to include 
them.\ninclude_merge = True\n\n\n## ``log_encoding`` is a string identifier\n##\n## This option tells gitchangelog what encoding is outputed by ``git log``.\n## The default is to be clever about it: it checks ``git config`` for\n## ``i18n.logOutputEncoding``, and if not found will default to git's own\n## default: ``utf-8``.\n#log_encoding = 'utf-8'\n\n\n## ``publish`` is a callable\n##\n## Sets what ``gitchangelog`` should do with the output generated by\n## the output engine. ``publish`` is a callable taking one argument\n## that is an interator on lines from the output engine.\n##\n## Some helper callable are provided:\n##\n## Available choices are:\n##\n##   - stdout\n##\n##        Outputs directly to standard output\n##        (This is the default)\n##\n##   - FileInsertAtFirstRegexMatch(file, pattern, idx=lamda m: m.start())\n##\n##        Creates a callable that will parse given file for the given\n##        regex pattern and will insert the output in the file.\n##        ``idx`` is a callable that receive the matching object and\n##        must return a integer index point where to insert the\n##        the output in the file. Default is to return the position of\n##        the start of the matched string.\n##\n##   - FileRegexSubst(file, pattern, replace, flags)\n##\n##        Apply a replace inplace in the given file. Your regex pattern must\n##        take care of everything and might be more complex. Check the README\n##        for a complete copy-pastable example.\n##\n# publish = FileInsertIntoFirstRegexMatch(\n#     \"CHANGELOG.rst\",\n#     r'/(?P<rev>[0-9]+\\.[0-9]+(\\.[0-9]+)?)\\s+\\([0-9]+-[0-9]{2}-[0-9]{2}\\)\\n--+\\n/',\n#     idx=lambda m: m.start(1)\n# )\n#publish = stdout\n\n\n## ``revs`` is a list of callable or a list of string\n##\n## callable will be called to resolve as strings and allow dynamical\n## computation of these. The result will be used as revisions for\n## gitchangelog (as if directly stated on the command line). 
This allows\n## to filter exaclty which commits will be read by gitchangelog.\n##\n## To get a full documentation on the format of these strings, please\n## refer to the ``git rev-list`` arguments. There are many examples.\n##\n## Using callables is especially useful, for instance, if you\n## are using gitchangelog to generate incrementally your changelog.\n##\n## Some helpers are provided, you can use them::\n##\n##   - FileFirstRegexMatch(file, pattern): will return a callable that will\n##     return the first string match for the given pattern in the given file.\n##     If you use named sub-patterns in your regex pattern, it'll output only\n##     the string matching the regex pattern named \"rev\".\n##\n##   - Caret(rev): will return the rev prefixed by a \"^\", which is a\n##     way to remove the given revision and all its ancestor.\n##\n## Please note that if you provide a rev-list on the command line, it'll\n## replace this value (which will then be ignored).\n##\n## If empty, then ``gitchangelog`` will act as it had to generate a full\n## changelog.\n##\n## The default is to use all commits to make the changelog.\n#revs = [\"^1.0.3\", ]\n#revs = [\n#    Caret(\n#        FileFirstRegexMatch(\n#            \"CHANGELOG.rst\",\n#            r\"(?P<rev>[0-9]+\\.[0-9]+(\\.[0-9]+)?)\\s+\\([0-9]+-[0-9]{2}-[0-9]{2}\\)\\n--+\\n\")),\n#    \"HEAD\"\n#]\nrevs = []\n\ninclude_merge = False\n"
  },
  {
    "path": ".github/pull_request_template.md",
    "content": "### Related Issues\n\n- fixes #issue-number\n\n### Proposed Changes:\n\n <!--- In case of a bug: Describe what caused the issue and how you solved it -->\n <!--- In case of a feature: Describe what did you add and how it works -->\n\n### How did you test it?\n\n<!-- unit tests, integration tests, manual verification, instructions for manual tests -->\n\n### Notes for the reviewer\n\n<!-- E.g. point out section where the reviewer  -->\n\n### Checklist\n\n- I have read the [contributors guidelines](https://github.com/deepset-ai/haystack/blob/main/CONTRIBUTING.md) and the [code of conduct](https://github.com/deepset-ai/haystack/blob/main/code_of_conduct.txt)\n- I have updated the related issue with new insights and changes\n- I added unit tests and updated the docstrings\n- I've used one of the [conventional commit types](https://www.conventionalcommits.org/en/v1.0.0/) for my PR title: `fix:`, `feat:`, `build:`, `chore:`, `ci:`, `docs:`, `style:`, `refactor:`, `perf:`, `test:` and added `!` in case the PR includes breaking changes.\n- I documented my code\n- I ran [pre-commit hooks](https://github.com/deepset-ai/haystack/blob/main/CONTRIBUTING.md#installation) and fixed any issue\n"
  },
  {
    "path": ".github/workflows/CI-checks.yml",
    "content": "name: Linting, Type Checking, and Testing\n\non:\n  push:\n    branches: [ main ]\n  pull_request:\n    branches: [ main ]\n\njobs:\n  test:\n    runs-on: ${{ matrix.os }}\n    strategy:\n      matrix:\n        os: [ubuntu-latest, windows-latest, macos-latest]\n        python-version: [\"3.11\"]\n\n    steps:\n    - uses: actions/checkout@v4\n    \n    - name: Set up Python ${{ matrix.python-version }}\n      uses: actions/setup-python@v5\n      with:\n        python-version: ${{ matrix.python-version }}\n    \n    - name: Install Hatch\n      run: |\n        python -m pip install --upgrade pip\n        pip install hatch\n    \n    - name: Running linters\n      run: |\n        hatch -e dev run lint\n\n    - name: Type checking with mypy\n      run: |\n        hatch -e dev run typing\n\n    - name: Running tests\n      run: |\n        hatch -e dev run test"
  },
  {
    "path": ".gitignore",
    "content": "**.coverage\n**.ipynb_checkpoints/\n**.mypy_cache/\n**/.python-version\n**__pycache__/\n.tox/\n.venv/\nbuild/\ncoverage.xml\ndist/\nnervaluate.egg-info/\n**/.DS_Store\n.idea\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n\n- repo: https://github.com/pre-commit/pre-commit-hooks\n  rev: v4.1.0\n  hooks:\n  - id: check-yaml\n\n- repo: https://github.com/psf/black\n  rev: 22.3.0\n  hooks:\n  - id: black\n    args: [-t, py38, -l 120]\n\n- repo: local\n  hooks:\n  - id: pylint\n    name: pylint\n    entry: pylint\n    language: system\n    types: [ python ]\n    args: [--rcfile=pylint.cfg]\n\n- repo: local\n  hooks:\n  - id: flake8\n    name: flake8\n    entry: flake8\n    language: system\n    types: [ python ]\n    args: [--config=setup.cfg]\n\n- repo: local\n  hooks:\n    - id: mypy\n      name: mypy\n      entry: mypy\n      language: python\n      language_version: python3.8\n      types: [python]\n      exclude: examples|tests\n      require_serial: true  # use require_serial so that script is only called once per commit\n      verbose: true  # print the number of files as a sanity-check\n      args: [--config, setup.cfg]"
  },
  {
    "path": "CHANGELOG.rst",
    "content": "Changelog\n=========\n\n\n(unreleased)\n------------\n- Adding tests + updating README.md. [David S. Batista]\n- Fix partial and ent_type precision/recall when merging multi-document\n  results. [David S. Batista]\n\n  _merge_results() was calling compute_metrics() with no arguments after\n  merging counts, so partial_or_type defaulted to False and strict\n  formula (COR/ACT, COR/POS) was used for all strategies. That overwrote\n  the correct partial/ent_type P/R (COR+0.5*PAR)/ACT and (COR+0.5*PAR)/POS.\n\n  Now pass strategy_name into _merge_results and call\n  compute_metrics(partial_or_type=True) for 'partial' and 'ent_type'\n  so merged results keep the SemEval partial-match formula.\n\n  Fixes the bug where partial (and ent_type) reported same P/R as strict\n  (e.g. README example showed 0.40 instead of 0.70 for partial).\n\n\n1.2.0 (2026-03-09)\n------------------\n- 1.2.0 release. [David S. Batista]\n- Updating CHANGELOG. [David S. Batista]\n- Bumping version to 1.2.0. [David S. Batista]\n- Adding more tests. [David S. Batista]\n- Explaining new behaviour docstring + README.md. [David S. Batista]\n- Fixing typo. [David S. Batista]\n- Fixing typo + linting error. [David S. Batista]\n- Refactor: better naming. [DinizNicolas]\n- Docs: update docstring. [DinizNicolas]\n- Docs: docstring update. [DinizNicolas]\n- Style: lint. [DinizNicolas]\n- Refactor: change ugly ifs. [DinizNicolas]\n- Feat: nested entities support. [DinizNicolas]\n\n  Change partial evaluation to resolve nested entities edge cases.\n- Test: nested entities support. [DinizNicolas]\n\n  Add tests for nested entities partial evaluation\n- Feat: nested entities support. [DinizNicolas]\n\n  Change exact evaluation to resolve nested entities edge cases.\n\n  Same changes as strict evaluation\n- Test: nested entities support. [DinizNicolas]\n\n  Add tests for nested entities exact evaluation\n- Feat: nested entities support. 
[DinizNicolas]\n\n  Change entity type evaluation to resolve nested entities edge cases.\n\n  Before : when a sufficient overlap is found between to entities of the same label, it was counted as correct.\n\n  Now : we search for the best match to count a correct entity. The best match being the one with minimum gap between predicted and true entities boundaries.\n- Add tests for nested entities entity type evaluation. [diniznicol]\n- Feat: nested entities support Change strict evaluation to resolve\n  nested entities edge cases. Before: without a perfect match beetwen a\n  true/pred pair, if an overlap was found, it was directly counted as\n  incorrect. Now : Only when no perfect match is found in every true\n  entity, the first overlapping pred entity found is counted as\n  incorrect. [DinizNicolas]\n- Test: nested entities support test correction. [DinizNicolas]\n- Test: nested entities support. [DinizNicolas]\n\n  Add tests for nested entities strict evaluation\n\n\n1.1.0 (2025-09-06)\n------------------\n- 1.1.0 release. [David S. Batista]\n- 1.1.0 release. [David S. Batista]\n- Testing for single character entities. [David S. Batista]\n- Fixing linting issues. [David S. Batista]\n- Fixing linting issues. [David S. Batista]\n- Defining a min ground truth percentage to be considered an overlap.\n  [David S. Batista]\n- Chore: removing script to compare old and new version outputs. [David\n  S. Batista]\n\n\n1.0.0 (2025-08-18)\n------------------\n- 1.0.0 release. [David S. Batista]\n- Bumping version. [David S. Batista]\n- Removing pandas dependency. [David S. Batista]\n- Relaxing tests invalid mode and scenario. [David S. Batista]\n- Saving to CSV file or return CSV string. [David S. Batista]\n- Adds tests for result indices in all strategies. [Jack Boylan]\n- Adds indices tests for `ent_type` strategy. [Jack Boylan]\n- Linting import statment. [David S. Batista]\n- Wip: using hatch in contributing. [David S. Batista]\n- Updating CITATION and removing flake. 
[David S. Batista]\n- Renaming evaluation_strategies to strategies and improving README.\n  [David S. Batista]\n- Removing old files. [David S. Batista]\n- Removing old files. [David S. Batista]\n- Updating README.MD. [David S. Batista]\n- One more use case. [David S. Batista]\n- Comparative indices report overall. [David S. Batista]\n- Wip: fixing report indices. [David S. Batista]\n- Wip. [David S. Batista]\n- Wip: fixing report for entities. [David S. Batista]\n- Adding function to generate synthetic data. [David S. Batista]\n- Wip: fixing report for entities. [David S. Batista]\n- Wip: fixing report for entities. [David S. Batista]\n- Only showing entities report for entities that actually apper on\n  either true or pred data. [David S. Batista]\n- Wip: checking summary with aggregated entities and a specific\n  scenario. [David S. Batista]\n- Wip: checking summary with aggregated entities and a specific\n  scenario. [David S. Batista]\n- Updating evaluation strategies tests. [David S. Batista]\n- Correcting and fixing type strategy. [David S. Batista]\n- Correcting and fixing strict strategy. [David S. Batista]\n- Correcting and fixing partial strategy. [David S. Batista]\n- Adding partial to evaluation strategies. [David S. Batista]\n- Fixing docs lenghts tests. [David S. Batista]\n- Fixing docs lenghts tests. [David S. Batista]\n- Working on comparative example. [David S. Batista]\n- Fixes. [David S. Batista]\n- Fxing empty entities. [David S. Batista]\n- Fixing imports. [David S. Batista]\n- Moving reporting to the Evaluator class. [David S. Batista]\n- Working on new versions of summary reports. [David S. Batista]\n- Cleaning up README.MD. [David S. Batista]\n- Adding missed pyproject.toml. [David S. Batista]\n- Type checking. [David S. Batista]\n- Fixing all tests. [David S. Batista]\n- Adding refactored code. [David S. Batista]\n- Separating new and old evaluator logic. [David S. Batista]\n- Fixing loaders. [David S. 
Batista]\n- Fixing loading test_conll_loader. [David S. Batista]\n- Fixing loading test_dict_loader. [David S. Batista]\n- Fixing loading test_list_loader. [David S. Batista]\n- Adding tests. [David S. Batista]\n\n\n0.3.1 (2025-06-05)\n------------------\n- Fixing pandas dependency. [David S. Batista]\n- Fixing pandas dependency. [David S. Batista]\n\n\n0.3.0 (2025-06-05)\n------------------\n\nChanges\n~~~~~~~\n- Update changelog for 0.2.0 release. [Matthew Upson]\n\nFix\n~~~\n- Mypy configuration error. [angelo-digian]\n- Typo in type annotation. [angelo-digian]\n- Switched order of imports. [angelo-digian]\n\nOther\n~~~~~\n- 0.3.0 release. [David S. Batista]\n- Adding deprecation warnings. [David S. Batista]\n- Create pull_request_template.md. [David S. Batista]\n- Upgrading dev tools versions. [David S. Batista]\n- Initial import. [David S. Batista]\n- Adding scenario type for summary report. [David S. Batista]\n- Update README.md. [David S. Batista]\n- Updating README.MD. [David S. Batista]\n- Removing unused variable. [David S. Batista]\n- Update src/nervaluate/reporting.py. [Copilot, David S. Batista]\n- Update src/nervaluate/reporting.py. [Copilot, David S. Batista]\n- Removing Makefile. [David S. Batista]\n- Drafting CONTRIBUTE.md. [David S. Batista]\n- Drafting CONTRIBUTE.md. [David S. Batista]\n- Removing flake8. [David S. Batista]\n- Removing old config files. [David S. Batista]\n- Running on ubuntu, windows and macos. [David S. Batista]\n- Reverting to ubuntu only. [David S. Batista]\n- Adding new file. [David S. Batista]\n- Removing old workflow file. [David S. Batista]\n- Adding windows and macos to CI. [David S. Batista]\n- Streamlining CI checks. [David S. Batista]\n- Disabling old github workflow and triggering new one. [David S.\n  Batista]\n- Changing github workflow. [David S. Batista]\n- Fixing linting and typing issues. [David S. Batista]\n- Adding pytest-cov as dependency. [David S. 
Batista]\n- Adding hatch as project manager; linting and typing. [David S.\n  Batista]\n- Fixing type hints. [David S. Batista]\n- Wip. [David S. Batista]\n- Adding docstrings. [David S. Batista]\n- Adding more tests. [David S. Batista]\n- Adding more tests. [David S. Batista]\n- Adding docstrings and increasing test coverage. [David S. Batista]\n- Removing requirements_dev.txt. [David S. Batista]\n- Blackening for py311. [David S. Batista]\n- Fixing pyprojec.toml dependencies. [David S. Batista]\n- Fixing pyprojec.toml dependencies. [David S. Batista]\n- Fixing pyprojec.toml dependencies. [David S. Batista]\n- Fixing pyprojec.toml dependencies. [David S. Batista]\n- Fixing pyprojec.toml dependencies. [David S. Batista]\n- Refactor: move dev dependencies to pyproject.toml and update CI\n  workflow. [David S. Batista]\n- Adding wrongly removed pre-commit. [David S. Batista]\n- Fixing type hints. [David S. Batista]\n- Removing unused imports and mutuable default arguments. [David S.\n  Batista]\n- Update README.md. [Tim Miller]\n- Update README.md. [adgianv]\n- Update README.md - change the pdf link. [adgianv]\n- Added type annotations to functions. [angelo-digian]\n- Pandas version downgraded to 2.0.1 because incompatible with python\n  version. [angelo-digian]\n- Fixed pandas version to 2.2.1. [angelo-digian]\n- Add pandas as a dependency in pyproject.toml. [angelo-digian]\n- Adding pandas in the requirements file. [angelo-digian]\n- Update tests/test_evaluator.py. [David S. Batista]\n- Modified results_to_df method and added test. [angelo-digian]\n- Expanded evaluator class: added method to return results of the nested\n  dictionary as a dataframe. [angelo-digian]\n\n\n0.2.0 (2024-04-10)\n------------------\n\nNew\n~~~\n- Add pre-commit. [Matthew Upson]\n- Add CITATION.cff file. [Matthew Upson]\n- Upload artefacts to codecov. [Matthew Upson]\n- Run tests on windows instance. [Matthew Upson]\n\nChanges\n~~~~~~~\n- Add codecov config. 
[Matthew Upson]\n- Remove .travis.yml. [Matthew Upson]\n- Update tox.ini. [Matthew Upson]\n- Update versions to test. [Matthew Upson]\n- Add tox tests as github action. [Matthew Upson]\n\nFix\n~~~\n- Grant write permission to CICD workflow. [Matthew Upson]\n- Run on windows and linux matrix. [Matthew Upson]\n\nOther\n~~~~~\n- Updates README to reflect new functionality. [Jack Boylan]\n- Removes extra 'indices' printed. [Jack Boylan]\n- Bump black from 23.3.0 to 24.3.0. [dependabot[bot]]\n\n  Bumps [black](https://github.com/psf/black) from 23.3.0 to 24.3.0.\n  - [Release notes](https://github.com/psf/black/releases)\n  - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md)\n  - [Commits](https://github.com/psf/black/compare/23.3.0...24.3.0)\n\n  ---\n  updated-dependencies:\n  - dependency-name: black\n    dependency-type: direct:development\n  ...\n- Fixed Typo in README. [Giovanni Casari]\n- Reformats quotes in `test_nervaluate.py` [Jack Boylan]\n- Initial import. [David S. Batista]\n- Handles case when `predictions` is empty. [Jack Boylan]\n- Adds unit tests for evaluation indices output. [Jack Boylan]\n- Adds summary print functions for overall indices and per-entity\n  indices results. [Jack Boylan]\n- Adds `within_instance_index` to evaluation indices outputs. [Jack\n  Boylan]\n- Ensures compatibility with existing unit tests. [Jack Boylan]\n- Adheres to code quality checks. [Jack Boylan]\n- Adds more descriptive variable names. [Jack Boylan]\n- Adds correct indices to result indices output. [Jack Boylan]\n- Moves evaluation indices to separate data structures. [Jack Boylan]\n- Adds index lists to output for examples with incorrect, partial,\n  spurious, and missed entities. [Jack Boylan]\n- Docs: fix typo \"spurius\" > \"spurious\" [DanShatford]\n- Added test for issue #40. [g.casari]\n- Solved issue #40. [g.casari]\n- Update README.md. [David S. Batista]\n- Cleaning README.MD. [David S. Batista]\n- Attending PR comments. [David S. 
Batista]\n- Fixing links on README.MD. [David S. Batista]\n- Updating pyproject.toml. [David S. Batista]\n- Updating pyproject.toml. [David S. Batista]\n- Updating README.MD and bumping version to 0.2.0. [David S. Batista]\n- Updating README.MD. [David S. Batista]\n- Reverting to Python 3.8. [David S. Batista]\n- Adding some badges to the README. [David S. Batista]\n- Initial commit. [David S. Batista]\n- Wip: adding poetry. [David S. Batista]\n- Full working example. [David S. Batista]\n- Nit. [David S. Batista]\n- Wip: adding summary report and examples. [David S. Batista]\n- Wip: adding summary report and examples. [David S. Batista]\n- Wip: adding summary report and examples. [David S. Batista]\n- Wip: adding summary report and examples. [David S. Batista]\n- Wip: adding summary report and examples. [David S. Batista]\n- Wip: adding summary report. [David S. Batista]\n- Wip: adding summary report. [David S. Batista]\n- Removed codecov from requirements.txt. [David S. Batista]\n- Removing duplicated code and fixing type hit. [David S. Batista]\n- Updated Makefile: install package in editable mode. [David S. Batista]\n- Updated name. [David S. Batista]\n- Minimum version Python 3.8. [David S. Batista]\n- Fixing Makefile and pre-commit. [David S. Batista]\n- Adding DS_Store and .idea to gitignore. [David S. Batista]\n- Updating Makefile. [David S. Batista]\n- WIP: pre-commit. [David S. Batista]\n- WIP: pre-commit. [David S. Batista]\n- WIP: pre-commit. [David S. Batista]\n- WIP: pre-commit. [David S. Batista]\n- WIP: pre-commit. [David S. Batista]\n- WIP: pre-commit. [David S. Batista]\n- WIP: pre-commit. [David S. Batista]\n- WIP: pre-commit. [David S. Batista]\n- Fixing types. [David S. Batista]\n- Finished adding type hints, some were skipped, code needs refactoring.\n  [David S. Batista]\n- WIP: adding type hints. [David S. Batista]\n- WIP: adding type hints. [David S. Batista]\n- WIP: adding type hints. [David S. Batista]\n- WIP: adding type hints. [David S. 
Batista]\n- Adding some execptions, code needs refactoring. [David S. Batista]\n- Fixing pyling and flake8 issues. [David S. Batista]\n- Replaced setup.py with pyproject.toml. [David S. Batista]\n- Reverting utils import. [David S. Batista]\n- Fixing types and wrappint at 120 characters. [David S. Batista]\n- Update CITATION.cff. [David S. Batista]\n\n  updating orcid\n- Fix recall formula readme. [fgh95]\n- Update LICENSE. [ivyleavedtoadflax]\n- Update LICENSE. [ivyleavedtoadflax]\n- Delete .python-version. [ivyleavedtoadflax]\n\n\n0.1.8 (2020-10-16)\n------------------\n\nNew\n~~~\n- Add test for whole span length entities (see #32) [Matthew Upson]\n- Summarise blog post in README. [Matthew Upson]\n\nChanges\n~~~~~~~\n- Bump version in setup.py. [Matthew Upson]\n- Update CHANGELOG (#36) [ivyleavedtoadflax]\n- Fix tests to match #32. [Matthew Upson]\n\nFix\n~~~\n- Correct catch sequence of just one entity. [Matthew Upson]\n\n  Incorporate edits in #28 but includes tests.\n\nOther\n~~~~~\n- Add code coverage. [ivyleavedtoadflax]\n- Crucial fixes for evaluation. [Alex Flückiger]\n- Update utils.py. [ivyleavedtoadflax]\n\n  Tiny change to kick off CI\n- Fix to catch last entites Small change to catch entities that go up\n  until last character when there is no tag. [pim]\n\n\n0.1.7 (2019-12-07)\n------------------\n\nNew\n~~~\n- Add tests. [Matthew Upson]\n\n  * Linting\n  * Rename existing tests to disambiguate\n- Add loaders to nervaluate. [Matthew Upson]\n\n  * Add list and conll formats\n\nChanges\n~~~~~~~\n- Update README. [Matthew Upson]\n\nFix\n~~~\n- Issue with setup.py. [Matthew Upson]\n\n  * Add docstring to __version__.py\n\n\n0.1.6 (2019-12-07)\n------------------\n\nNew\n~~~\n- Add gitchangelog and Makefile recipe. [Matthew Upson]\n\nChanges\n~~~~~~~\n- Bump version to 0.1.6. [Matthew Upson]\n- Remove examples. [Matthew Upson]\n\n  These are not accessible from the package in any case.\n- Add dev requirements. 
[Matthew Upson]\n\n\n0.1.5 (2019-12-06)\n------------------\n\nChanges\n~~~~~~~\n- Bump version to 0.1.5. [Matthew Upson]\n- Update setup.py. [Matthew Upson]\n- Update package url to point at pypi. [Matthew Upson]\n\n\n0.1.4 (2019-12-06)\n------------------\n\nNew\n~~~\n- Add dist to .gitignore. [Matthew Upson]\n- Create pypi friendly README/long description. [Matthew Upson]\n- Clean entity dicts of extraneous keys. [Matthew Upson]\n\n  * Failing to do this can cause problems in evaluations\n  * Add tests\n\nChanges\n~~~~~~~\n- Bump version to 0.1.4. [Matthew Upson]\n- Make setup.py pypi compliant. [Matthew Upson]\n\n\n0.1.2 (2019-12-04)\n------------------\n\nNew\n~~~\n- Add missing prodigy format tests. [Matthew Upson]\n- Pass argument when using list. [Matthew Upson]\n- Setup module structure. [Matthew Upson]\n- Add get_tags() and tests. [Matthew Upson]\n\n  Adds function to extract all the NER tags from a list of sentences.\n- Add Evaluator class. [Matthew Upson]\n\n  * Add some logging statements\n  * Add input checks on number of documents and tokens per document\n  * Allow target labels to be passed as argument to compute_metrics. Note\n      that if a label is predicted and it is not in this list, then it\n      will be classed as spurious for the aggregated scores, and on each\n      entity level result (because it is unclear where the spurious value\n      should be applied, it is applied to all)\n  * linting\n  * Add many new tests\n- Don't evaluate precision and recall for each sentence. [Matthew Upson]\n\n  Rather than automatically calculate precision and recall at the sentence\n  level, this change adds a new function compute_precision_recall_wrapper\n  which can be run after all the metrics whether for 1 document, or 1000,\n  have been calculated. This has the benefit that we can reuse the same\n  code for calculating precision/recall, and allows us to calculate entity\n  level precision/recall if required.\n- Calculate entity level score. 
[Matthew Upson]\n- Add compute_actual_possible function. [Matthew Upson]\n- Record results for each entity type. [Matthew Upson]\n- Add scenario comments matching blog table. [Matthew Upson]\n- Test results at individual entity level. [Matthew Upson]\n- Add .gitinore file. [Matthew Upson]\n- Add requirements.txt. [Matthew Upson]\n\nChanges\n~~~~~~~\n- Bump version to 0.1.2. [Matthew Upson]\n- Bump version number to 0.1.1. [Matthew Upson]\n- Reduce logging verbosity. [ivyleavedtoadflax]\n- Add example to README.md. [Matthew Upson]\n- Create virtualenv recipe. [Matthew Upson]\n\n  * Move example dependencies to requirements_example.txt\n  * Add virtualenv recipe to Makefile\n  * Update .gitignore\n- Remove unused dependencies. [Matthew Upson]\n\n  * Dependencies for the examples should not be included in setup.py, instead\n  move them to requirements_examples.txt\n- Update example notebook. [Matthew Upson]\n- Remove unwanted tags from pred_named_entities. [Matthew Upson]\n- Remove superfluous get_tags() function. [Matthew Upson]\n- Update notebook. [Matthew Upson]\n- Update notebook. [Matthew Upson]\n- Update tests. [Matthew Upson]\n- Update .gitignore. [Matthew Upson]\n- Replace spurius with spurious. [Matthew Upson]\n- Update README with requirements and test info. [Matthew Upson]\n- Update setup.cfg with source and omit paths. [Matthew Upson]\n- Use pytest instead of unittest. [Matthew Upson]\n\nOther\n~~~~~\n- Revert \"Remove tox and use pytest\" [Matthew Upson]\n\n  * Better to keep tox for local testing in the Makefile and resolve\n    issues running tox on the developers machine.\n\n  This reverts commit 8578795e62ca384adf054c1b85a1c1d7f0d089d5.\n- Remove tox and use pytest. [Elizabeth Gallagher]\n- Add f1 output to nervaluate and update all tests. [Elizabeth\n  Gallagher]\n- Update .travis.yml. [ivyleavedtoadflax]\n- Update README.md. [Matt Upson]\n- Build(deps): bump nltk from 3.4.4 to 3.4.5. 
[dependabot[bot]]\n\n  Bumps [nltk](https://github.com/nltk/nltk) from 3.4.4 to 3.4.5.\n  - [Release notes](https://github.com/nltk/nltk/releases)\n  - [Changelog](https://github.com/nltk/nltk/blob/develop/ChangeLog)\n  - [Commits](https://github.com/nltk/nltk/compare/3.4.4...3.4.5)\n- Update __version__.py. [Matt Upson]\n- PEPed8 things a bit. [David Soares Batista]\n- Update README.md. [David S. Batista]\n- Update README.md. [David S. Batista]\n- Notebook. [David Soares Batista]\n- Updated notebook. [David Soares Batista]\n- Update README.md. [David S. Batista]\n- Update README.md. [David S. Batista]\n- Renamed notebook. [David Soares Batista]\n- Bug fixing. [David Soares Batista]\n- Test. [David Soares Batista]\n- Typo in comment. [David Soares Batista]\n- Use find_overlap to find all overlap cases. [Matthew Upson]\n\n  Adds the find_overlap function which captures the three possible overlap\n  scenarios (Total, Start, and End). This is examplained in graph below.\n\n  Character Offset:   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |\n  True:               |   |   |   |LOC|LOC|LOC|LOC|LOC|   |   |\n  Total Overlap:      |   |   |LOC|LOC|LOC|LOC|LOC|LOC|LOC|   |\n  Start Overlap:      |   |   |LOC|LOC|LOC|   |   |   |   |   |\n  End Overlap:        |   |   |   |   |   |   |LOC|LOC|LOC|   |\n- Removed debug stamt. [David Soares Batista]\n- Added partial and exact evaluation and tests. [David Soares Batista]\n- Update. [David Soares Batista]\n- Updated README. [David Soares Batista]\n- - fixed bugs and added tests - added pytest. [David Soares Batista]\n- Update ner_evaluation.py. [David S. Batista]\n- Redefined evaluation according to discussion here:\n  https://github.com/davidsbatista/NER-Evaluation/issues/2. [David\n  Soares Batista]\n- Fixed a BUG in collect_named_entites() issued by\n  rjlotok.dblma@gmail.com. [David Soares Batista]\n- Update README.md. [David S. Batista]\n- Update README.md. [David S. Batista]\n- Major refactoring. 
[David Soares Batista]\n- Create README.md. [David S. Batista]\n- Initial import. [David Soares Batista]\n- Initial commit. [David S. Batista]\n\n\n"
  },
  {
    "path": "CITATION.cff",
    "content": "cff-version: 1.2.1\nmessage: \"If you use this software, please cite it as below.\"\ntitle: \"nervaluate\"\ndate-released: 2026-03-12\nurl: \"https://github.com/mantisnlp/nervaluate\"\nversion: 1.2.1\nauthors:\n- family-names: \"Batista\"\n  given-names: \"David\"\n  orcid: \"https://orcid.org/0000-0002-9324-5773\"\n- family-names: \"Upson\"\n  given-names: \"Matthew Antony\"\n  orcid: \"https://orcid.org/0000-0002-1040-8048\"\n\n\n\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contributing to `nervaluate`\n\nThank you for your interest in contributing to `nervaluate`! This document provides guidelines and instructions for contributing to the project.\n\n## Development Setup\n\n1. Fork the repository\n2. Clone your fork:\n   ```bash\n   git clone https://github.com/your-username/nervaluate.git\n   cd nervaluate\n   ```\n3. Make sure you have hatch installed, then create a virtual environment:\n   # ToDo\n\n## Adding Tests\n\n`nervaluate` uses pytest for testing. Here are the guidelines for adding tests:\n\n1. All new features and bug fixes should include tests\n2. Tests should be placed in the `tests/` directory\n3. Test files should be named `test_*.py`\n4. Test functions should be named `test_*`\n5. Use pytest fixtures when appropriate for test setup and teardown\n6. Run tests locally before submitting a pull request:\n   ```bash\n   hatch -e \n   ```\n\n\n## Changelog Management\n\n`nervaluate` uses gitchangelog to maintain the CHANGELOG.rst file. Here's how to use it:\n\n1. Make your changes in a new branch\n2. Write your commit messages following these conventions:\n   - Use present tense (\"Add feature\" not \"Added feature\")\n   - Use imperative mood (\"Move cursor to...\" not \"Moves cursor to...\")\n   - Limit the first line to 72 characters or less\n   - Reference issues and pull requests liberally after the first line\n\n3. The commit message format should be:\n   ```\n   type(scope): subject\n\n   body\n   ```\n\n   Where type can be:\n   - feat: A new feature\n   - fix: A bug fix\n   - docs: Documentation changes\n   - style: Changes that do not affect the meaning of the code\n   - refactor: A code change that neither fixes a bug nor adds a feature\n   - perf: A code change that improves performance\n   - test: Adding missing tests or correcting existing tests\n   - chore: Changes to the build process or auxiliary tools\n\n4. 
After committing your changes, you can generate the changelog:\n   ```bash\n   gitchangelog > CHANGELOG.rst\n   ```\n\n## Pull Request Process\n\n1. Update the README.md with details of changes if needed\n2. Update the CHANGELOG.rst using gitchangelog\n3. The PR will be merged once you have the sign-off of at least one other developer\n4. Make sure all tests pass and there are no linting errors\n\n## Code Style\n\n- Follow PEP 8 guidelines\n- Use type hints\n\n## Questions?\n\nFeel free to open an issue if you have any questions about contributing to `nervaluate`. "
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2020 David S. Batista and Matthew A. Upson\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "[![python](https://img.shields.io/badge/Python-3.11-3776AB.svg?style=flat&logo=python&logoColor=white)](https://www.python.org)\n&nbsp;\n[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)\n&nbsp;\n[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)\n&nbsp;\n![GitHub](https://img.shields.io/github/license/ivyleavedtoadflax/nervaluate)\n&nbsp;\n![Pull Requests Welcome](https://img.shields.io/badge/pull%20requests-welcome-brightgreen.svg)\n&nbsp;\n![PyPI](https://img.shields.io/pypi/v/nervaluate)\n\n# nervaluate\n\n`nervaluate` is a module for evaluating Named Entity Recognition (NER) models as defined in the SemEval 2013 - 9.1 task.\n\nThe evaluation metrics output by nervaluate go beyond a simple token/tag based schema, and consider different scenarios \nbased on whether all the tokens that belong to a named entity were classified or not, and also whether the correct \nentity type was assigned.\n\nThis full problem is described in detail in the [original blog](http://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/) \npost by [David Batista](https://github.com/davidsbatista), and this package extends the code in the [original repository](https://github.com/davidsbatista/NER-Evaluation) \nwhich accompanied the blog post.\n\nThe code draws heavily on the papers:\n\n* [SemEval-2013 Task 9 : Extraction of Drug-Drug Interactions from Biomedical Texts (DDIExtraction 2013)](https://www.aclweb.org/anthology/S13-2056)\n\n* [SemEval-2013 Task 9.1 - Evaluation Metrics](https://davidsbatista.net/assets/documents/others/semeval_2013-task-9_1-evaluation-metrics.pdf)\n\n# Usage example\n\n```\npip install nervaluate\n```\n\nA possible input format are lists of NER labels, where each list corresponds to a sentence and each label is a token label.\nInitialize the `Evaluator` class with the true labels and predicted labels, and specify the entity types 
we want to evaluate.\n\n```python\nfrom nervaluate.evaluator import Evaluator\n\ntrue = [\n    ['O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG'],  # \"The John Smith who works at Google Inc\"\n    ['O', 'B-LOC', 'B-PER', 'I-PER', 'O', 'O', 'B-DATE'],      # \"In Paris Marie Curie lived in 1895\"\n]\n  \npred = [\n    ['O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-ORG', 'I-ORG'],\n    ['O', 'B-LOC', 'I-LOC', 'B-PER', 'O', 'O', 'B-DATE'],\n]\n   \nevaluator = Evaluator(true, pred, tags=['PER', 'ORG', 'LOC', 'DATE'], loader=\"list\")\n```\n\nPrint the summary report for the evaluation, which will show the metrics for each entity type and evaluation scenario:\n\n```python\n\nprint(evaluator.summary_report())\n\nScenario: all\n\n              correct   incorrect     partial      missed    spurious   precision      recall    f1-score\n\nent_type            5           0           0           0           0        1.00        1.00        1.00\n   exact            2           3           0           0           0        0.40        0.40        0.40\n partial            2           0           3           0           0        0.70        0.70        0.70\n  strict            2           3           0           0           0        0.40        0.40        0.40\n```  \n\nor aggregated by entity type under a specific evaluation scenario:\n\n```python\nprint(evaluator.summary_report(mode='entities'))  \n  \nScenario: strict\n\n             correct   incorrect     partial      missed    spurious   precision      recall    f1-score\n\n   DATE            1           0           0           0           0        1.00        1.00        1.00\n    LOC            0           1           0           0           0        0.00        0.00        0.00\n    ORG            1           0           0           0           0        1.00        1.00        1.00\n    PER            0           2           0           0           0        0.00        0.00        0.00\n```\n\n# Evaluation 
Scenarios\n\n## Token level evaluation for NER is too simplistic\n\nWhen running machine learning models for NER, it is common to report metrics at the individual token level. This may \nnot be the best approach, as a named entity can be made up of multiple tokens, so a full-entity accuracy would be \ndesirable.\n\nWhen comparing the golden standard annotations with the output of a NER system different scenarios might occur:\n\n__I. Surface string and entity type match__\n\n| Token | Gold  | Prediction |\n|-------|-------|------------|\n| in    | O     | O          |\n| New   | B-LOC | B-LOC      |\n| York  | I-LOC | I-LOC      |\n| .     | O     | O          |\n\n__II. System hypothesized an incorrect entity__\n\n| Token    | Gold | Prediction |\n|----------|------|------------|\n| an       | O    | O          |\n| Awful    | O    | B-ORG      |\n| Headache | O    | I-ORG      |\n| in       | O    | O          |\n\n__III. System misses an entity__\n\n| Token | Gold  | Prediction |\n|-------|-------|------------|\n| in    | O     | O          |\n| Palo  | B-LOC | O          |\n| Alto  | I-LOC | O          |\n| ,     | O     | O          |\n\nBased on these three scenarios we have a simple classification evaluation that can be measured in terms of false \npositives, true positives and false negatives, and subsequently compute precision, recall and \nF1-score for each named-entity type.\n\nHowever, this simple schema ignores the possibility of partial matches or other scenarios when the NER system gets\nthe named-entity surface string correct but the type wrong. We might also want to evaluate these scenarios \nagain at a full-entity level.\n\nFor example:\n\n__IV. 
System identifies the surface string but assigns the wrong entity type__\n\n| Token | Gold  | Prediction |\n|-------|-------|------------|\n| I     | O     | O          |\n| live  | O     | O          |\n| in    | O     | O          |\n| Palo  | B-LOC | B-ORG      |\n| Alto  | I-LOC | I-ORG      |\n| ,     | O     | O          |\n\n__V. System gets the boundaries of the surface string wrong__\n\n| Token   | Gold  | Prediction |\n|---------|-------|------------|\n| Unless  | O     | B-PER      |\n| Karl    | B-PER | I-PER      |\n| Smith   | I-PER | I-PER      |\n| resigns | O     | O          |\n\n__VI. System gets the boundaries and entity type wrong__\n\n| Token   | Gold  | Prediction |\n|---------|-------|------------|\n| Unless  | O     | B-ORG      |\n| Karl    | B-PER | I-ORG      |\n| Smith   | I-PER | I-ORG      |\n| resigns | O     | O          |\n\n\n## Defining evaluation metrics\n\nHow can we incorporate these described scenarios into evaluation metrics? See the [original blog](http://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/) \nfor a great explanation, a summary is included here.\n\nWe can define the following five metrics to consider different categories of errors:\n\n| Error type      | Explanation                                                              |\n|-----------------|--------------------------------------------------------------------------|\n| Correct (COR)   | both are the same                                                        |\n| Incorrect (INC) | the output of a system and the golden annotation don’t match             |\n| Partial (PAR)   | system and the golden annotation are somewhat “similar” but not the same |\n| Missing (MIS)   | a golden annotation is not captured by a system                          |\n| Spurious (SPU)  | system produces a response which doesn’t exist in the golden annotation  |\n\nThese five metrics can be measured in four different ways:\n\n| Evaluation schema | Explanation         
                                                              |\n|-------------------|-----------------------------------------------------------------------------------|\n| Strict            | exact boundary surface string match and entity type                               |\n| Exact             | exact boundary match over the surface string, regardless of the type              |\n| Partial           | partial boundary match over the surface string, regardless of the type            |\n| Type              | some overlap between the system tagged entity and the gold annotation is required |\n\nThese five errors and four evaluation schema interact in the following ways:\n\n| Scenario | Gold entity | Gold string    | Pred entity | Pred string         | Type | Partial | Exact | Strict |\n|----------|-------------|----------------|-------------|---------------------|------|---------|-------|--------|\n| III      | BRAND       | tikosyn        |             |                     | MIS  | MIS     | MIS   | MIS    |\n| II       |             |                | BRAND       | healthy             | SPU  | SPU     | SPU   | SPU    |\n| V        | DRUG        | warfarin       | DRUG        | of warfarin         | COR  | PAR     | INC   | INC    |\n| IV       | DRUG        | propranolol    | BRAND       | propranolol         | INC  | COR     | COR   | INC    |\n| I        | DRUG        | phenytoin      | DRUG        | phenytoin           | COR  | COR     | COR   | COR    |\n| VI       | GROUP       | contraceptives | DRUG        | oral contraceptives | INC  | PAR     | INC   | INC    |\n\nThen precision, recall and f1-score are calculated for each different evaluation schema. 
In order to compute these, \ntwo more quantities need to be calculated:\n\n```\nPOSSIBLE (POS) = COR + INC + PAR + MIS = TP + FN\nACTUAL (ACT) = COR + INC + PAR + SPU = TP + FP\n```\n\nThen we can compute precision, recall and f1-score, where, roughly speaking, precision is the percentage of correct \nnamed-entities found by the NER system, and recall is the percentage of the named-entities in the golden annotations \nthat are retrieved by the NER system. \n\nThis is computed in two different ways depending on whether we want an exact match (i.e., strict and exact) or a \npartial match (i.e., partial and type) scenario:\n\n__Exact Match (i.e., strict and exact)__\n```\nPrecision = (COR / ACT) = TP / (TP + FP)\nRecall = (COR / POS) = TP / (TP+FN)\n```\n\n__Partial Match (i.e., partial and type)__\n```\nPrecision = (COR + 0.5 × PAR) / ACT = TP / (TP + FP)\nRecall = (COR + 0.5 × PAR) / POS = TP / (TP + FN)\n```\n\n__Putting all together:__\n\n| Measure   | Type | Partial | Exact | Strict |\n|-----------|------|---------|-------|--------|\n| Correct   | 3    | 3       | 3     | 2      |\n| Incorrect | 2    | 0       | 2     | 3      |\n| Partial   | 0    | 2       | 0     | 0      |\n| Missed    | 1    | 1       | 1     | 1      |\n| Spurious  | 1    | 1       | 1     | 1      |\n| Precision | 0.5  | 0.66    | 0.5   | 0.33   |\n| Recall    | 0.5  | 0.66    | 0.5   | 0.33   |\n| F1        | 0.5  | 0.66    | 0.5   | 0.33   |\n\n\n## Notes:\n\nIn scenarios IV and VI the entity type of the `true` and `pred` does not match, in both cases we only scored against \nthe true entity, not the predicted one. 
You can argue that the predicted entity could also be scored as spurious, \nbut according to the definition of `spurious`:\n\n* Spurious (SPU) : system produces a response which does not exist in the golden annotation;\n\nIn this case there exists an annotation, but with a different entity type, so we assume it's only incorrect.\n\nFor the **Type** (ent_type) strategy, if multiple true entities of the same label overlap a\nprediction, the match is resolved by closest boundaries. This can change which\n``(instance_index, entity_index)`` appears in ``missed_indices`` compared to list order,\nwhile aggregate counts stay the same.\n\n\n## Contributing to the `nervaluate` package\n\n### Extending the package to accept more formats\n\nThe `Evaluator` accepts the following formats:\n\n* Nested lists containing NER labels\n* CoNLL style tab delimited strings\n* [prodi.gy](https://prodi.gy) style lists of spans\n\nAdditional formats can easily be added by creating a new loader class in `nervaluate/loaders.py`. The loader class \nshould inherit from the `DataLoader` base class and implement the `load` method. \n\nThe `load` method should return a list of entity lists, where each entity is represented as a dictionary \nwith `label`, `start`, and `end` keys.\n\nThe new loader can then be added to the `_setup_loaders` method in the `Evaluator` class, and can be selected with the\n`loader` argument when instantiating the `Evaluator` class.\n\nHere is a list of formats we intend to [include](https://github.com/MantisAI/nervaluate/issues/3).\n\n### General Contributing\n\nImprovements, adding new features and bug fixes are welcome. If you wish to participate in the development of `nervaluate` \nplease read the guidelines in the [CONTRIBUTING.md](CONTRIBUTING.md) file.\n\n---\n\nGive a ⭐️ if this project helped you!\n"
  },
  {
    "path": "examples/example_no_loader.py",
    "content": "import nltk\nimport sklearn_crfsuite\nfrom sklearn.metrics import classification_report\n\nfrom nervaluate import Evaluator, collect_named_entities, summary_report_ent, summary_report_overall\n\n\ndef word2features(sent, i):\n    word = sent[i][0]\n    postag = sent[i][1]\n\n    features = {\n        \"bias\": 1.0,\n        \"word.lower()\": word.lower(),\n        \"word[-3:]\": word[-3:],\n        \"word[-2:]\": word[-2:],\n        \"word.isupper()\": word.isupper(),\n        \"word.istitle()\": word.istitle(),\n        \"word.isdigit()\": word.isdigit(),\n        \"postag\": postag,\n        \"postag[:2]\": postag[:2],\n    }\n    if i > 0:\n        word1 = sent[i - 1][0]\n        postag1 = sent[i - 1][1]\n        features.update(\n            {\n                \"-1:word.lower()\": word1.lower(),\n                \"-1:word.istitle()\": word1.istitle(),\n                \"-1:word.isupper()\": word1.isupper(),\n                \"-1:postag\": postag1,\n                \"-1:postag[:2]\": postag1[:2],\n            }\n        )\n    else:\n        features[\"BOS\"] = True\n\n    if i < len(sent) - 1:\n        word1 = sent[i + 1][0]\n        postag1 = sent[i + 1][1]\n        features.update(\n            {\n                \"+1:word.lower()\": word1.lower(),\n                \"+1:word.istitle()\": word1.istitle(),\n                \"+1:word.isupper()\": word1.isupper(),\n                \"+1:postag\": postag1,\n                \"+1:postag[:2]\": postag1[:2],\n            }\n        )\n    else:\n        features[\"EOS\"] = True\n\n    return features\n\n\ndef sent2features(sent):\n    return [word2features(sent, i) for i in range(len(sent))]\n\n\ndef sent2labels(sent):\n    return [label for token, postag, label in sent]\n\n\ndef sent2tokens(sent):\n    return [token for token, postag, label in sent]\n\n\ndef main():\n    print(\"Loading CoNLL 2002 NER Spanish data\")\n    nltk.corpus.conll2002.fileids()\n    train_sents = 
list(nltk.corpus.conll2002.iob_sents(\"esp.train\"))\n    test_sents = list(nltk.corpus.conll2002.iob_sents(\"esp.testb\"))\n\n    x_train = [sent2features(s) for s in train_sents]\n    y_train = [sent2labels(s) for s in train_sents]\n\n    x_test = [sent2features(s) for s in test_sents]\n    y_test = [sent2labels(s) for s in test_sents]\n\n    print(\"Train a CRF on the CoNLL 2002 NER Spanish data\")\n    crf = sklearn_crfsuite.CRF(algorithm=\"lbfgs\", c1=0.1, c2=0.1, max_iterations=10, all_possible_transitions=True)\n    try:\n        crf.fit(x_train, y_train)\n    except AttributeError:\n        pass\n\n    y_pred = crf.predict(x_test)\n    labels = list(crf.classes_)\n    labels.remove(\"O\")  # remove 'O' label from evaluation\n    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))  # group B- and I- results\n    y_test_flat = [y for msg in y_test for y in msg]\n    y_pred_flat = [y for msg in y_pred for y in msg]\n    print(classification_report(y_test_flat, y_pred_flat, labels=sorted_labels))\n\n    test_sents_labels = []\n    for sentence in test_sents:\n        sentence = [token[2] for token in sentence]\n        test_sents_labels.append(sentence)\n\n    pred_collected = [collect_named_entities(msg) for msg in y_pred]\n    test_collected = [collect_named_entities(msg) for msg in y_test]\n\n    evaluator = Evaluator(test_collected, pred_collected, [\"LOC\", \"MISC\", \"PER\", \"ORG\"])\n    results, results_agg = evaluator.evaluate()\n\n    print(\"\\n\\nOverall\")\n    print(summary_report_overall(results))\n    print(\"\\n\\n'Strict'\")\n    print(summary_report_ent(results_agg, scenario=\"strict\"))\n    print(\"\\n\\n'Ent_Type'\")\n    print(summary_report_ent(results_agg, scenario=\"ent_type\"))\n    print(\"\\n\\n'Partial'\")\n    print(summary_report_ent(results_agg, scenario=\"partial\"))\n    print(\"\\n\\n'Exact'\")\n    print(summary_report_ent(results_agg, scenario=\"exact\"))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/run_example.sh",
    "content": "#!/bin/bash\n\npip install nltk\npip install sklearn\npip install sklearn_crfsuite\npython -m nltk.downloader conll2002\npython example_no_loader.py\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[build-system]\nrequires = [\"setuptools\", \"setuptools-scm\"]\nbuild-backend = \"setuptools.build_meta\"\n\n[project]\nname = \"nervaluate\"\nversion = \"1.2.1\"\nauthors = [\n    { name=\"David S. Batista\"},\n    { name=\"Matthew Upson\"}\n]\ndescription = \"NER evaluation considering partial match scoring\"\nreadme = \"README.md\"\nrequires-python = \">=3.11\"\nkeywords = [\"named-entity-recognition\", \"ner\", \"evaluation-metrics\", \"partial-match-scoring\", \"nlp\"]\nlicense = {text = \"MIT License\"}\nclassifiers = [\n    \"Programming Language :: Python :: 3\",\n    \"Operating System :: OS Independent\"\n]\n\n[project.optional-dependencies]\ndev = [\n    \"black>=25.1.0\",\n    \"coverage>=7.8.0\",\n    \"gitchangelog\",\n    \"mypy>=1.15.0\",\n    \"pre-commit==3.3.1\",\n    \"pylint>=3.3.7\",\n    \"pytest>=8.3.5\",\n    \"pytest-cov>=6.1.1\",\n]\n\n[project.urls]\n\"Homepage\" = \"https://github.com/MantisAI/nervaluate\"\n\"Bug Tracker\" = \"https://github.com/MantisAI/nervaluate/issues\"\n\n[tool.pytest.ini_options]\ntestpaths = [\"tests\"]\npython_files = [\"test_*.py\"]\naddopts = \"--cov=nervaluate --cov-report=term-missing\"\n\n[tool.coverage.run]\nsource = [\"nervaluate\"]\nomit = [\"*__init__*\"]\n\n[tool.coverage.report]\nshow_missing = true\nprecision = 2\nsort = \"Miss\"\n\n[tool.black]\nline-length = 120\ntarget-version = [\"py311\"]\n\n[tool.pylint.messages_control]\ndisable = [\n    \"C0111\",  # missing-docstring\n    \"C0103\",  # invalid-name\n    \"W0511\",  # fixme\n    \"W0603\",  # global-statement\n    \"W1202\",  # logging-format-interpolation\n    \"W1203\",  # logging-fstring-interpolation\n    \"E1126\",  # invalid-sequence-index\n    \"E1137\",  # invalid-slice-index\n    \"I0011\",  # bad-option-value\n    \"I0020\",  # bad-option-value\n    \"R0801\",  # duplicate-code\n    \"W9020\",  # bad-option-value\n    \"W0621\",  # redefined-outer-name\n    \"W0212\",  # 
protected-access\n]\n\n[tool.pylint.'DESIGN']\nmax-args = 38           # Default is 5\nmax-attributes = 28     # Default is 7\nmax-branches = 14       # Default is 12\nmax-locals = 45         # Default is 15\nmax-module-lines = 2468 # Default is 1000\nmax-nested-blocks = 9   # Default is 5\nmax-statements = 206    # Default is 50\nmin-public-methods = 1  # Allow classes with just one public method\n\n[tool.pylint.format]\nmax-line-length = 120\n\n[tool.pylint.basic]\naccept-no-param-doc = true\naccept-no-raise-doc = true\naccept-no-return-doc = true\naccept-no-yields-doc = true\ndefault-docstring-type = \"numpy\"\n\n[tool.pylint.master]\nload-plugins = [\"pylint.extensions.docparams\"]\nignore-paths = [\"./examples/.*\"]\n\n[tool.mypy]\npython_version = \"3.11\"\nignore_missing_imports = true\ndisallow_any_unimported = true\ndisallow_untyped_defs = true\nwarn_redundant_casts = true\nwarn_unused_ignores = true\nwarn_unused_configs = true\n\n[[tool.mypy.overrides]]\nmodule = \"examples.*\"\nfollow_imports = \"skip\"\n\n[tool.hatch.envs.dev]\ndependencies = [\n    \"black==24.3.0\",\n    \"coverage==7.2.5\",\n    \"gitchangelog\",\n    \"mypy==1.3.0\",\n    \"pre-commit==3.3.1\",\n    \"pylint==2.17.4\",\n    \"pytest==7.3.1\",\n    \"pytest-cov==4.1.0\",\n]\n\n[tool.hatch.envs.dev.scripts]\nlint = [\n    \"black -t py311 -l 120 src tests\",\n    \"pylint src tests\"\n]\ntyping = \"mypy src\"\ntest = \"pytest\"\nclean = \"rm -rf dist src/nervaluate.egg-info .coverage .mypy_cache .pytest_cache\"\nchangelog = \"gitchangelog > CHANGELOG.rst\"\nall = [\n    \"clean\",\n    \"lint\",\n    \"typing\",\n    \"test\"\n]\n"
  },
  {
    "path": "src/nervaluate/__init__.py",
    "content": "from .evaluator import Evaluator\nfrom .utils import collect_named_entities, conll_to_spans, list_to_spans, split_list\n"
  },
  {
    "path": "src/nervaluate/entities.py",
    "content": "from dataclasses import dataclass\nfrom typing import List, Tuple\n\n\n@dataclass\nclass Entity:\n    \"\"\"Represents a named entity with its position and label.\"\"\"\n\n    label: str\n    start: int\n    end: int\n\n    def __eq__(self, other: object) -> bool:\n        if not isinstance(other, Entity):\n            return NotImplemented\n        return self.label == other.label and self.start == other.start and self.end == other.end\n\n    def __hash__(self) -> int:\n        return hash((self.label, self.start, self.end))\n\n\n@dataclass\nclass EvaluationResult:\n    \"\"\"Represents the evaluation metrics for a single entity type or overall.\"\"\"\n\n    correct: int = 0\n    incorrect: int = 0\n    partial: int = 0\n    missed: int = 0\n    spurious: int = 0\n    precision: float = 0.0\n    recall: float = 0.0\n    f1: float = 0.0\n    actual: int = 0\n    possible: int = 0\n\n    def compute_metrics(self, partial_or_type: bool = False) -> None:\n        \"\"\"Compute precision, recall and F1 score.\"\"\"\n        self.actual = self.correct + self.incorrect + self.partial + self.spurious\n        self.possible = self.correct + self.incorrect + self.partial + self.missed\n\n        if partial_or_type:\n            precision = (self.correct + 0.5 * self.partial) / self.actual if self.actual > 0 else 0\n            recall = (self.correct + 0.5 * self.partial) / self.possible if self.possible > 0 else 0\n        else:\n            precision = self.correct / self.actual if self.actual > 0 else 0\n            recall = self.correct / self.possible if self.possible > 0 else 0\n\n        self.precision = precision\n        self.recall = recall\n        self.f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0\n\n\n@dataclass\nclass EvaluationIndices:\n    \"\"\"Represents the indices of entities in different evaluation categories.\"\"\"\n\n    correct_indices: List[Tuple[int, int]] = None  # type: ignore\n    
incorrect_indices: List[Tuple[int, int]] = None  # type: ignore\n    partial_indices: List[Tuple[int, int]] = None  # type: ignore\n    missed_indices: List[Tuple[int, int]] = None  # type: ignore\n    spurious_indices: List[Tuple[int, int]] = None  # type: ignore\n\n    def __post_init__(self) -> None:\n        if self.correct_indices is None:\n            self.correct_indices = []\n        if self.incorrect_indices is None:\n            self.incorrect_indices = []\n        if self.partial_indices is None:\n            self.partial_indices = []\n        if self.missed_indices is None:\n            self.missed_indices = []\n        if self.spurious_indices is None:\n            self.spurious_indices = []\n"
  },
  {
    "path": "src/nervaluate/evaluator.py",
    "content": "from typing import List, Dict, Any, Union, Optional\nimport csv\nimport io\n\nfrom .entities import EvaluationResult, EvaluationIndices\nfrom .strategies import (\n    EvaluationStrategy,\n    StrictEvaluation,\n    PartialEvaluation,\n    EntityTypeEvaluation,\n    ExactEvaluation,\n)\nfrom .loaders import DataLoader, ConllLoader, ListLoader, DictLoader\nfrom .entities import Entity\n\n\nclass Evaluator:\n    \"\"\"Main evaluator class for NER evaluation.\"\"\"\n\n    def __init__(\n        self, true: Any, pred: Any, tags: List[str], loader: str = \"default\", min_overlap_percentage: float = 1.0\n    ) -> None:\n        \"\"\"\n        Initialize the evaluator.\n\n        Args:\n            true: True entities in any supported format\n            pred: Predicted entities in any supported format\n            tags: List of valid entity tags\n            loader: Name of the loader to use\n            min_overlap_percentage: Minimum overlap percentage for partial matches (1-100)\n        \"\"\"\n        self.tags = tags\n        self.min_overlap_percentage = min_overlap_percentage\n        self._setup_loaders()\n        self._load_data(true, pred, loader)\n        self._setup_evaluation_strategies()\n\n    def _setup_loaders(self) -> None:\n        \"\"\"Setup available data loaders.\"\"\"\n        self.loaders: Dict[str, DataLoader] = {\"conll\": ConllLoader(), \"list\": ListLoader(), \"dict\": DictLoader()}\n\n    def _setup_evaluation_strategies(self) -> None:\n        \"\"\"Setup evaluation strategies with overlap threshold.\"\"\"\n        self.strategies: Dict[str, EvaluationStrategy] = {\n            \"strict\": StrictEvaluation(self.min_overlap_percentage),\n            \"partial\": PartialEvaluation(self.min_overlap_percentage),\n            \"ent_type\": EntityTypeEvaluation(self.min_overlap_percentage),\n            \"exact\": ExactEvaluation(self.min_overlap_percentage),\n        }\n\n    def _load_data(self, true: Any, pred: Any, loader: 
str) -> None:\n        \"\"\"Load the true and predicted data.\"\"\"\n        if loader == \"default\":\n            # Try to infer the loader based on input type\n            if isinstance(true, str):\n                loader = \"conll\"\n            elif isinstance(true, list) and true and isinstance(true[0], list):\n                if isinstance(true[0][0], dict):\n                    loader = \"dict\"\n                else:\n                    loader = \"list\"\n            else:\n                raise ValueError(\"Could not infer loader from input type\")\n\n        if loader not in self.loaders:\n            raise ValueError(f\"Unknown loader: {loader}\")\n\n        # For list loader, check document lengths before loading\n        if loader == \"list\":\n            if len(true) != len(pred):\n                raise ValueError(\"Number of predicted documents does not equal true\")\n\n            # Check that each document has the same length\n            for i, (true_doc, pred_doc) in enumerate(zip(true, pred)):\n                if len(true_doc) != len(pred_doc):\n                    raise ValueError(f\"Document {i} has different lengths: true={len(true_doc)}, pred={len(pred_doc)}\")\n\n        self.true = self.loaders[loader].load(true)\n        self.pred = self.loaders[loader].load(pred)\n\n        if len(self.true) != len(self.pred):\n            raise ValueError(\"Number of predicted documents does not equal true\")\n\n    def evaluate(self) -> Dict[str, Any]:\n        \"\"\"\n        Run the evaluation.\n\n        Returns:\n            Dictionary containing evaluation results for each strategy and entity type\n        \"\"\"\n        results = {}\n        # Get unique tags that appear in either true or predicted data\n        used_tags = set()  # type: ignore\n        for doc in self.true:\n            used_tags.update(e.label for e in doc)\n        for doc in self.pred:\n            used_tags.update(e.label for e in doc)\n        # Only keep tags that 
are both used and in the allowed tags list\n        used_tags = used_tags.intersection(set(self.tags))\n\n        entity_results: Dict[str, Dict[str, EvaluationResult]] = {tag: {} for tag in used_tags}\n        indices = {}\n        entity_indices: Dict[str, Dict[str, EvaluationIndices]] = {tag: {} for tag in used_tags}\n\n        # Evaluate each document\n        for doc_idx, (true_doc, pred_doc) in enumerate(zip(self.true, self.pred)):\n            # Filter entities by valid tags\n            true_doc = [e for e in true_doc if e.label in self.tags]\n            pred_doc = [e for e in pred_doc if e.label in self.tags]\n\n            # Evaluate with each strategy\n            for strategy_name, strategy in self.strategies.items():\n                result, doc_indices = strategy.evaluate(true_doc, pred_doc, self.tags, doc_idx)\n\n                # Update overall results\n                if strategy_name not in results:\n                    results[strategy_name] = result\n                    indices[strategy_name] = doc_indices\n                else:\n                    self._merge_results(results[strategy_name], result, strategy_name)\n                    self._merge_indices(indices[strategy_name], doc_indices)\n\n                # Update entity-specific results\n                for tag in used_tags:\n                    # Filter entities for this specific tag\n                    true_tag_doc = [e for e in true_doc if e.label == tag]\n                    pred_tag_doc = [e for e in pred_doc if e.label == tag]\n\n                    # Evaluate only entities of this tag\n                    tag_result, tag_indices = strategy.evaluate(true_tag_doc, pred_tag_doc, [tag], doc_idx)\n\n                    if tag not in entity_results:\n                        entity_results[tag] = {}\n                        entity_indices[tag] = {}\n                    if strategy_name not in entity_results[tag]:\n                        entity_results[tag][strategy_name] = tag_result\n  
                      entity_indices[tag][strategy_name] = tag_indices\n                    else:\n                        self._merge_results(entity_results[tag][strategy_name], tag_result, strategy_name)\n                        self._merge_indices(entity_indices[tag][strategy_name], tag_indices)\n\n        return {\n            \"overall\": results,\n            \"entities\": entity_results,\n            \"overall_indices\": indices,\n            \"entity_indices\": entity_indices,\n        }\n\n    @staticmethod\n    def _merge_results(\n        target: EvaluationResult, source: EvaluationResult, strategy_name: str\n    ) -> None:\n        \"\"\"Merge two evaluation results.\"\"\"\n        target.correct += source.correct\n        target.incorrect += source.incorrect\n        target.partial += source.partial\n        target.missed += source.missed\n        target.spurious += source.spurious\n        use_partial_formula = strategy_name in (\"partial\", \"ent_type\")\n        target.compute_metrics(partial_or_type=use_partial_formula)\n\n    @staticmethod\n    def _merge_indices(target: EvaluationIndices, source: EvaluationIndices) -> None:\n        \"\"\"Merge two evaluation indices.\"\"\"\n        target.correct_indices.extend(source.correct_indices)\n        target.incorrect_indices.extend(source.incorrect_indices)\n        target.partial_indices.extend(source.partial_indices)\n        target.missed_indices.extend(source.missed_indices)\n        target.spurious_indices.extend(source.spurious_indices)\n\n    def results_to_csv(\n        self, mode: str = \"overall\", scenario: str = \"strict\", file_path: Optional[str] = None\n    ) -> Union[str, None]:\n        \"\"\"\n        Convert results to CSV format.\n\n        Args:\n            mode: Either 'overall' for overall metrics or 'entities' for per-entity metrics\n            scenario: The scenario to report on (only used when mode is 'entities')\n            file_path: Optional path to save CSV file. 
If None, returns CSV as string\n\n        Returns:\n            CSV content as string if file_path is None, otherwise None (saves to file)\n        \"\"\"\n        valid_modes = {\"overall\", \"entities\"}\n        valid_scenarios = {\"strict\", \"ent_type\", \"partial\", \"exact\"}\n\n        if mode not in valid_modes:\n            raise ValueError(f\"Invalid mode: must be one of {valid_modes}\")\n\n        if mode == \"entities\" and scenario not in valid_scenarios:\n            raise ValueError(f\"Invalid scenario: must be one of {valid_scenarios}\")\n\n        results = self.evaluate()\n\n        if mode == \"overall\":\n            # For overall mode, include all scenarios\n            csv_data = [\n                [\"Strategy\", \"Correct\", \"Incorrect\", \"Partial\", \"Missed\", \"Spurious\", \"Precision\", \"Recall\", \"F1-Score\"]\n            ]\n            results_data = results[\"overall\"]\n            for strategy_name, strategy_result in results_data.items():\n                csv_data.append(\n                    [\n                        strategy_name,\n                        strategy_result.correct,\n                        strategy_result.incorrect,\n                        strategy_result.partial,\n                        strategy_result.missed,\n                        strategy_result.spurious,\n                        strategy_result.precision,\n                        strategy_result.recall,\n                        strategy_result.f1,\n                    ]\n                )\n        else:\n            csv_data = [\n                [\"Entity\", \"Correct\", \"Incorrect\", \"Partial\", \"Missed\", \"Spurious\", \"Precision\", \"Recall\", \"F1-Score\"]\n            ]\n            results_data = results[\"entities\"]\n            for entity_type, entity_results in results_data.items():\n                if scenario in entity_results:\n                    strategy_result = entity_results[scenario]\n                    csv_data.append(\n       
                 [\n                            entity_type,\n                            strategy_result.correct,\n                            strategy_result.incorrect,\n                            strategy_result.partial,\n                            strategy_result.missed,\n                            strategy_result.spurious,\n                            strategy_result.precision,\n                            strategy_result.recall,\n                            strategy_result.f1,\n                        ]\n                    )\n\n        if file_path:\n            with open(file_path, \"w\", newline=\"\", encoding=\"utf-8\") as csvfile:\n                writer = csv.writer(csvfile)\n                writer.writerows(csv_data)\n            return None\n\n        output = io.StringIO()\n        writer = csv.writer(output)\n        writer.writerows(csv_data)\n        return output.getvalue()\n\n    def summary_report(self, mode: str = \"overall\", scenario: str = \"strict\", digits: int = 2) -> str:\n        \"\"\"\n        Generate a summary report of the evaluation results.\n\n        Args:\n            mode: Either 'overall' for overall metrics or 'entities' for per-entity metrics.\n            scenario: The scenario to report on. 
Only used when mode is 'entities'.\n                      Must be one of:\n                        - 'strict' exact boundary surface string match and entity type;\n                        - 'exact': exact boundary match over the surface string and entity type;\n                        - 'partial': partial boundary match over the surface string, regardless of the type;\n                        - 'ent_type': exact boundary match over the surface string, regardless of the type;\n            digits: The number of digits to round the results to.\n\n        Returns:\n            A string containing the summary report.\n\n        Raises:\n            ValueError: If the scenario or mode is invalid.\n        \"\"\"\n        valid_scenarios = {\"strict\", \"ent_type\", \"partial\", \"exact\"}\n        valid_modes = {\"overall\", \"entities\"}\n\n        if mode not in valid_modes:\n            raise ValueError(f\"Invalid mode: must be one of {valid_modes}\")\n\n        if mode == \"entities\" and scenario not in valid_scenarios:\n            raise ValueError(f\"Invalid scenario: must be one of {valid_scenarios}\")\n\n        headers = [\"correct\", \"incorrect\", \"partial\", \"missed\", \"spurious\", \"precision\", \"recall\", \"f1-score\"]\n        rows = [headers]\n\n        results = self.evaluate()\n        if mode == \"overall\":\n            # Process overall results - show all scenarios\n            results_data = results[\"overall\"]\n            for eval_schema in sorted(valid_scenarios):  # Sort to ensure consistent order\n                if eval_schema not in results_data:\n                    continue\n                results_schema = results_data[eval_schema]\n                rows.append(\n                    [\n                        eval_schema,\n                        results_schema.correct,\n                        results_schema.incorrect,\n                        results_schema.partial,\n                        results_schema.missed,\n                   
     results_schema.spurious,\n                        results_schema.precision,\n                        results_schema.recall,\n                        results_schema.f1,\n                    ]\n                )\n        else:\n            # Process entity-specific results for the specified scenario only\n            results_data = results[\"entities\"]\n            target_names = sorted(results_data.keys())\n            for ent_type in target_names:\n                if scenario not in results_data[ent_type]:\n                    continue  # Skip if scenario not available for this entity type\n\n                results_ent = results_data[ent_type][scenario]\n                rows.append(\n                    [\n                        ent_type,\n                        results_ent.correct,\n                        results_ent.incorrect,\n                        results_ent.partial,\n                        results_ent.missed,\n                        results_ent.spurious,\n                        results_ent.precision,\n                        results_ent.recall,\n                        results_ent.f1,\n                    ]\n                )\n\n        # Format the report\n        name_width = max(len(str(row[0])) for row in rows)\n        width = max(name_width, digits)\n        head_fmt = \"{:>{width}s} \" + \" {:>11}\" * len(headers)\n        report = f\"Scenario: {scenario if mode == 'entities' else 'all'}\\n\\n\" + head_fmt.format(\n            \"\", *headers, width=width\n        )\n        report += \"\\n\\n\"\n        row_fmt = \"{:>{width}s} \" + \" {:>11}\" * 5 + \" {:>11.{digits}f}\" * 3 + \"\\n\"\n\n        for row in rows[1:]:\n            report += row_fmt.format(*row, width=width, digits=digits)\n\n        return report\n\n    def summary_report_indices(  # pylint: disable=too-many-branches\n        self, mode: str = \"overall\", scenario: str = \"strict\", colors: bool = False\n    ) -> str:\n        \"\"\"\n        Generate a summary report of 
the evaluation indices.\n\n        Args:\n            mode: Either 'overall' for overall metrics or 'entities' for per-entity metrics.\n            scenario: The scenario to report on. Must be one of: 'strict', 'ent_type', 'partial', 'exact'.\n                     Only used when mode is 'entities'. Defaults to 'strict'.\n            colors: Whether to use colors in the output. Defaults to False.\n\n        Returns:\n            A string containing the summary report of indices.\n\n        Raises:\n            ValueError: If the scenario or mode is invalid.\n        \"\"\"\n        valid_scenarios = {\"strict\", \"ent_type\", \"partial\", \"exact\"}\n        valid_modes = {\"overall\", \"entities\"}\n\n        if mode not in valid_modes:\n            raise ValueError(f\"Invalid mode: must be one of {valid_modes}\")\n\n        if mode == \"entities\" and scenario not in valid_scenarios:\n            raise ValueError(f\"Invalid scenario: must be one of {valid_scenarios}\")\n\n        # ANSI color codes\n        COLORS = {\n            \"reset\": \"\\033[0m\",\n            \"bold\": \"\\033[1m\",\n            \"red\": \"\\033[91m\",\n            \"green\": \"\\033[92m\",\n            \"yellow\": \"\\033[93m\",\n            \"blue\": \"\\033[94m\",\n            \"magenta\": \"\\033[95m\",\n            \"cyan\": \"\\033[96m\",\n            \"white\": \"\\033[97m\",\n        }\n\n        def colorize(text: str, color: str) -> str:\n            \"\"\"Helper function to colorize text if colors are enabled.\"\"\"\n            if colors:\n                return f\"{COLORS[color]}{text}{COLORS['reset']}\"\n            return text\n\n        def get_prediction_info(pred: Union[Entity, str]) -> str:\n            \"\"\"Helper function to get prediction info based on pred type.\"\"\"\n            if isinstance(pred, Entity):\n                return f\"Label={pred.label}, Start={pred.start}, End={pred.end}\"\n            # String (BIO tag)\n            return f\"Tag={pred}\"\n\n    
    results = self.evaluate()\n        report = \"\"\n\n        # Create headers for the table\n        headers = [\"Category\", \"Instance\", \"Entity\", \"Details\"]\n        header_fmt = \"{:<20} {:<10} {:<8} {:<25}\"\n        row_fmt = \"{:<20} {:<10} {:<8} {:<10}\"\n\n        if mode == \"overall\":\n            # Get the indices from the overall results\n            indices_data = results[\"overall_indices\"][scenario]\n            report += f\"\\n{colorize('Indices for error schema', 'bold')} '{colorize(scenario, 'cyan')}':\\n\\n\"\n            report += colorize(header_fmt.format(*headers), \"bold\") + \"\\n\"\n            report += colorize(\"-\" * 78, \"white\") + \"\\n\"\n\n            for category, indices in indices_data.__dict__.items():\n                if not category.endswith(\"_indices\"):\n                    continue\n                category_name = category.replace(\"_indices\", \"\").replace(\"_\", \" \").capitalize()\n\n                # Color mapping for categories\n                category_colors = {\n                    \"Correct\": \"green\",\n                    \"Incorrect\": \"red\",\n                    \"Partial\": \"yellow\",\n                    \"Missed\": \"magenta\",\n                    \"Spurious\": \"blue\",\n                }\n\n                if indices:\n                    for instance_index, entity_index in indices:\n                        if self.pred != [[]]:\n                            pred = self.pred[instance_index][entity_index]\n                            prediction_info = get_prediction_info(pred)\n                            report += (\n                                row_fmt.format(\n                                    colorize(category_name, category_colors.get(category_name, \"white\")),\n                                    f\"{instance_index}\",\n                                    f\"{entity_index}\",\n                                    prediction_info,\n                                )\n             
                   + \"\\n\"\n                            )\n                        else:\n                            report += (\n                                row_fmt.format(\n                                    colorize(category_name, category_colors.get(category_name, \"white\")),\n                                    f\"{instance_index}\",\n                                    f\"{entity_index}\",\n                                    \"No prediction info\",\n                                )\n                                + \"\\n\"\n                            )\n                else:\n                    report += (\n                        row_fmt.format(\n                            colorize(category_name, category_colors.get(category_name, \"white\")), \"-\", \"-\", \"None\"\n                        )\n                        + \"\\n\"\n                    )\n        else:\n            # Get the indices from the entity-specific results\n            for entity_type, entity_results in results[\"entity_indices\"].items():\n                report += f\"\\n{colorize('Entity Type', 'bold')}: {colorize(entity_type, 'cyan')}\\n\"\n                report += f\"{colorize('Error Schema', 'bold')}: '{colorize(scenario, 'cyan')}'\\n\\n\"\n                report += colorize(header_fmt.format(*headers), \"bold\") + \"\\n\"\n                report += colorize(\"-\" * 78, \"white\") + \"\\n\"\n\n                error_data = entity_results[scenario]\n                for category, indices in error_data.__dict__.items():\n                    if not category.endswith(\"_indices\"):\n                        continue\n                    category_name = category.replace(\"_indices\", \"\").replace(\"_\", \" \").capitalize()\n\n                    # Color mapping for categories\n                    category_colors = {\n                        \"Correct\": \"green\",\n                        \"Incorrect\": \"red\",\n                        \"Partial\": \"yellow\",\n            
            \"Missed\": \"magenta\",\n                        \"Spurious\": \"blue\",\n                    }\n\n                    if indices:\n                        for instance_index, entity_index in indices:\n                            if self.pred != [[]]:\n                                pred = self.pred[instance_index][entity_index]\n                                prediction_info = get_prediction_info(pred)\n                                report += (\n                                    row_fmt.format(\n                                        colorize(category_name, category_colors.get(category_name, \"white\")),\n                                        f\"{instance_index}\",\n                                        f\"{entity_index}\",\n                                        prediction_info,\n                                    )\n                                    + \"\\n\"\n                                )\n                            else:\n                                report += (\n                                    row_fmt.format(\n                                        colorize(category_name, category_colors.get(category_name, \"white\")),\n                                        f\"{instance_index}\",\n                                        f\"{entity_index}\",\n                                        \"No prediction info\",\n                                    )\n                                    + \"\\n\"\n                                )\n                    else:\n                        report += (\n                            row_fmt.format(\n                                colorize(category_name, category_colors.get(category_name, \"white\")), \"-\", \"-\", \"None\"\n                            )\n                            + \"\\n\"\n                        )\n\n        return report\n"
  },
  {
    "path": "src/nervaluate/loaders.py",
from abc import ABC, abstractmethod
from typing import List, Dict, Any

from .entities import Entity


class DataLoader(ABC):
    """Abstract base class for data loaders.

    A loader converts one external annotation format into the internal
    representation: a list of documents, each a list of Entity spans.
    """

    @abstractmethod
    def load(self, data: Any) -> List[List[Entity]]:
        """Load data into a list of entity lists."""


class ConllLoader(DataLoader):
    """Loader for CoNLL format data.

    Expects a single string: documents separated by blank lines, one token
    per line, with the BIO tag in the SECOND tab-separated column.
    Entity offsets are token line indices within the document (inclusive).
    """

    def load(self, data: str) -> List[List[Entity]]:  # pylint: disable=too-many-branches
        """Load CoNLL format data into a list of Entity lists.

        Raises:
            ValueError: If the input is not a string, a line has no tab
                separator, or a non-"O" tag lacks a B-/I- prefix.
        """
        if not isinstance(data, str):
            raise ValueError("ConllLoader expects string input")

        if not data:
            return []

        result: List[List[Entity]] = []
        # Strip trailing whitespace and newlines to avoid empty documents
        documents = data.rstrip().split("\n\n")

        for doc in documents:
            if not doc.strip():
                result.append([])
                continue

            current_doc = []
            # State for the BIO scan: the currently-open entity, if any.
            start_offset = None
            end_offset = None
            ent_type = None
            has_entities = False

            # NOTE: enumerate runs over ALL lines, so blank lines (skipped
            # below) still consume an offset.
            for offset, line in enumerate(doc.split("\n")):
                if not line.strip():
                    continue

                parts = line.split("\t")
                if len(parts) < 2:
                    raise ValueError(f"Invalid CoNLL format: line '{line}' does not contain a tab separator")

                # Tag is the second tab-separated column; extra columns ignored.
                token_tag = parts[1]

                if token_tag == "O":
                    # "O" closes any open entity at the previous token.
                    if ent_type is not None and start_offset is not None:
                        end_offset = offset - 1
                        if isinstance(start_offset, int) and isinstance(end_offset, int):
                            current_doc.append(Entity(label=ent_type, start=start_offset, end=end_offset))
                        start_offset = None
                        end_offset = None
                        ent_type = None

                elif ent_type is None:
                    # No entity open: any B-/I- tag opens one (a dangling I-
                    # is treated like B- rather than rejected).
                    if not (token_tag.startswith("B-") or token_tag.startswith("I-")):
                        raise ValueError(f"Invalid tag format: {token_tag}")
                    ent_type = token_tag[2:]  # Remove B- or I- prefix
                    start_offset = offset
                    has_entities = True

                elif ent_type != token_tag[2:] or (ent_type == token_tag[2:] and token_tag[:1] == "B"):
                    # Entity open but the type changed, or an explicit B- of the
                    # same type starts a new adjacent entity: close the old one.
                    end_offset = offset - 1
                    if isinstance(start_offset, int) and isinstance(end_offset, int):
                        current_doc.append(Entity(label=ent_type, start=start_offset, end=end_offset))

                    # start of a new entity
                    if not (token_tag.startswith("B-") or token_tag.startswith("I-")):
                        raise ValueError(f"Invalid tag format: {token_tag}")
                    ent_type = token_tag[2:]
                    start_offset = offset
                    end_offset = None
                    has_entities = True

            # Catches an entity that goes up until the last token
            if ent_type is not None and start_offset is not None and end_offset is None:
                if isinstance(start_offset, int):
                    # End at the document's last line index.
                    current_doc.append(Entity(label=ent_type, start=start_offset, end=len(doc.split("\n")) - 1))
                has_entities = True

            # has_entities is redundant with current_doc being empty, but kept
            # for explicitness.
            result.append(current_doc if has_entities else [])

        return result


class ListLoader(DataLoader):
    """Loader for list format data.

    Expects a list of documents, each a list of BIO tag strings
    ("O", "B-TYPE", "I-TYPE"). Entity offsets are token indices (inclusive).
    """

    def load(self, data: List[List[str]]) -> List[List[Entity]]:  # pylint: disable=too-many-branches
        """Load list format data into a list of entity lists.

        Raises:
            ValueError: If the input is not a list of lists of strings, or a
                non-"O" tag lacks a B-/I- prefix.
        """
        if not isinstance(data, list):
            raise ValueError("ListLoader expects list input")

        if not data:
            return []

        result = []

        for doc in data:
            if not isinstance(doc, list):
                raise ValueError("Each document must be a list of tags")

            current_doc = []
            # State for the BIO scan: the currently-open entity, if any.
            start_offset = None
            end_offset = None
            ent_type = None

            for offset, token_tag in enumerate(doc):
                if not isinstance(token_tag, str):
                    raise ValueError(f"Invalid tag type: {type(token_tag)}")

                if token_tag == "O":
                    # "O" closes any open entity at the previous token.
                    if ent_type is not None and start_offset is not None:
                        end_offset = offset - 1
                        if isinstance(start_offset, int) and isinstance(end_offset, int):
                            current_doc.append(Entity(label=ent_type, start=start_offset, end=end_offset))
                        start_offset = None
                        end_offset = None
                        ent_type = None

                elif ent_type is None:
                    # No entity open: any B-/I- tag opens one (a dangling I-
                    # is treated like B- rather than rejected).
                    if not (token_tag.startswith("B-") or token_tag.startswith("I-")):
                        raise ValueError(f"Invalid tag format: {token_tag}")
                    ent_type = token_tag[2:]  # Remove B- or I- prefix
                    start_offset = offset

                elif ent_type != token_tag[2:] or (ent_type == token_tag[2:] and token_tag[:1] == "B"):
                    # Entity open but the type changed, or an explicit B- of the
                    # same type starts a new adjacent entity: close the old one.
                    end_offset = offset - 1
                    if isinstance(start_offset, int) and isinstance(end_offset, int):
                        current_doc.append(Entity(label=ent_type, start=start_offset, end=end_offset))

                    # start of a new entity
                    if not (token_tag.startswith("B-") or token_tag.startswith("I-")):
                        raise ValueError(f"Invalid tag format: {token_tag}")
                    ent_type = token_tag[2:]
                    start_offset = offset
                    end_offset = None

            # Catches an entity that goes up until the last token
            if ent_type is not None and start_offset is not None and end_offset is None:
                if isinstance(start_offset, int):
                    current_doc.append(Entity(label=ent_type, start=start_offset, end=len(doc) - 1))

            result.append(current_doc)

        return result


class DictLoader(DataLoader):
    """Loader for dictionary format data.

    Expects a list of documents, each a list of span dicts with integer
    "start"/"end" offsets and a string "label".
    """

    def load(self, data: List[List[Dict[str, Any]]]) -> List[List[Entity]]:
        """Load dictionary format data into a list of entity lists.

        Raises:
            ValueError: If the structure is not a list of lists of dicts, a
                dict is missing "label"/"start"/"end", or the values have the
                wrong types.
        """
        if not isinstance(data, list):
            raise ValueError("DictLoader expects list input")

        if not data:
            return []

        result = []

        for doc in data:
            if not isinstance(doc, list):
                raise ValueError("Each document must be a list of entity dictionaries")

            current_doc = []
            for entity in doc:
                if not isinstance(entity, dict):
                    raise ValueError(f"Invalid entity type: {type(entity)}")

                required_keys = {"label", "start", "end"}
                if not all(key in entity for key in required_keys):
                    raise ValueError(f"Entity missing required keys: {required_keys}")

                if not isinstance(entity["label"], str):
                    raise ValueError("Entity label must be a string")

                if not isinstance(entity["start"], int) or not isinstance(entity["end"], int):
                    raise ValueError("Entity start and end must be integers")

                current_doc.append(Entity(label=entity["label"], start=entity["start"], end=entity["end"]))
            result.append(current_doc)

        return result
  },
  {
    "path": "src/nervaluate/strategies.py",
    "content": "from abc import ABC, abstractmethod\nfrom typing import List, Tuple\n\nfrom .entities import Entity, EvaluationResult, EvaluationIndices\n\n\nclass EvaluationStrategy(ABC):\n    \"\"\"Abstract base class for evaluation strategies.\"\"\"\n\n    def __init__(self, min_overlap_percentage: float = 1.0):\n        \"\"\"\n        Initialize strategy with minimum overlap threshold.\n\n        Args:\n            min_overlap_percentage: Minimum overlap percentage required (1-100)\n        \"\"\"\n        if not 1.0 <= min_overlap_percentage <= 100.0:\n            raise ValueError(\"min_overlap_percentage must be between 1.0 and 100.0\")\n        self.min_overlap_percentage = min_overlap_percentage\n\n    @staticmethod\n    def _calculate_overlap_percentage(pred: Entity, true: Entity) -> float:\n        \"\"\"\n        Calculate the percentage overlap between predicted and true entities.\n\n        Returns:\n            Overlap percentage based on true entity span (0-100)\n        \"\"\"\n        # Check if there's any overlap first\n        if pred.start > true.end or pred.end < true.start:\n            return 0.0\n\n        # Calculate overlap boundaries\n        overlap_start = max(pred.start, true.start)\n        overlap_end = min(pred.end, true.end)\n\n        # Calculate spans (adding 1 because end is inclusive)\n        overlap_span = overlap_end - overlap_start + 1\n        true_span = true.end - true.start + 1\n\n        # Calculate percentage based on true entity span\n        return (overlap_span / true_span) * 100.0\n\n    @staticmethod\n    def _calculate_boundaries_distance(pred: Entity, true: Entity) -> float:\n        \"\"\"\n        Calculate distance between predicted and true entities boundaries.\n\n        Returns:\n            Distance between predicted and true boundaries\n        \"\"\"\n        # Calculate boundaries gaps\n        distance_starts = abs(pred.start - true.start)\n        distance_ends = abs(pred.end - true.end)\n\n       
 return distance_starts + distance_ends\n\n    def _has_sufficient_overlap(self, pred: Entity, true: Entity) -> bool:\n        \"\"\"Check if entities have sufficient overlap based on threshold.\"\"\"\n        overlap_percentage = EvaluationStrategy._calculate_overlap_percentage(pred, true)\n        return overlap_percentage >= self.min_overlap_percentage\n\n    @abstractmethod\n    def evaluate(\n        self, true_entities: List[Entity], pred_entities: List[Entity], tags: List[str], instance_index: int = 0\n    ) -> Tuple[EvaluationResult, EvaluationIndices]:\n        \"\"\"Evaluate the predicted entities against the true entities.\"\"\"\n\n\nclass StrictEvaluation(EvaluationStrategy):\n    \"\"\"\n    Strict evaluation strategy - entities must match exactly.\n\n    If there's a predicted entity that perfectly matches a true entity and they have the same label\n    we mark it as correct.\n    If there's a predicted entity that doesn't perfectly match any true entity, we mark it as spurious.\n    If there's a true entity that doesn't perfecly match any predicted entity, we mark it as missed.\n    All other cases are marked as incorrect.\n    \"\"\"\n\n    def evaluate(\n        self, true_entities: List[Entity], pred_entities: List[Entity], tags: List[str], instance_index: int = 0\n    ) -> Tuple[EvaluationResult, EvaluationIndices]:\n        \"\"\"\n        Evaluate the predicted entities against the true entities using strict matching.\n        \"\"\"\n        result = EvaluationResult()\n        indices = EvaluationIndices()\n        matched_true = set()\n\n        for pred_idx, pred in enumerate(pred_entities):\n            found_match = False\n            found_incorrect = False\n\n            for true_idx, true in enumerate(true_entities):\n                if true_idx in matched_true:\n                    continue\n\n                # Check for perfect match (same boundaries and label)\n                if pred.label == true.label and pred.start == true.start 
and pred.end == true.end:\n                    result.correct += 1\n                    indices.correct_indices.append((instance_index, pred_idx))\n                    matched_true.add(true_idx)\n                    found_match = True\n                    break\n                # Check for sufficient overlap with min threshold\n                if self._has_sufficient_overlap(pred, true) and not found_incorrect:\n                    incorrect_true_idx = true_idx\n                    incorrect_pred_idx = pred_idx\n                    found_incorrect = True\n\n            if not found_match:\n                if found_incorrect:\n                    result.incorrect += 1\n                    indices.incorrect_indices.append((instance_index, incorrect_pred_idx))\n                    matched_true.add(incorrect_true_idx)\n                else:\n                    result.spurious += 1\n                    indices.spurious_indices.append((instance_index, pred_idx))\n\n        for true_idx, true in enumerate(true_entities):\n            if true_idx not in matched_true:\n                result.missed += 1\n                indices.missed_indices.append((instance_index, true_idx))\n\n        result.compute_metrics()\n        return result, indices\n\n\nclass PartialEvaluation(EvaluationStrategy):\n    \"\"\"\n    Partial evaluation strategy - allows for partial matches.\n\n    If there's a predicted entity that perfectly matches a true entity, we mark it as correct.\n    If there's a predicted entity that doesn't match any true entity and that has some minimum\n    overlap with a true entity we mark it as partial.\n    If there's a predicted entity that doesn't match any true entity, we mark it as spurious.\n    If there's a true entity that doesn't match any predicted entity, we mark it as missed.\n\n    There's never entity type/label checking in this strategy, and there's never an entity marked as incorrect.\n    \"\"\"\n\n    def evaluate(\n        self, true_entities: 
List[Entity], pred_entities: List[Entity], tags: List[str], instance_index: int = 0\n    ) -> Tuple[EvaluationResult, EvaluationIndices]:\n        result = EvaluationResult()\n        indices = EvaluationIndices()\n        matched_true = set()\n\n        for pred_idx, pred in enumerate(pred_entities):\n            found_match = False\n            found_partial = False\n\n            for true_idx, true in enumerate(true_entities):\n                if true_idx in matched_true:\n                    continue\n\n                # Check for sufficient overlap with min threshold\n                if self._has_sufficient_overlap(pred, true):\n                    if pred.start == true.start and pred.end == true.end:\n                        result.correct += 1\n                        indices.correct_indices.append((instance_index, pred_idx))\n                        matched_true.add(true_idx)\n                        found_match = True\n                        break\n                    if not found_partial:\n                        partial_pred_idx = pred_idx\n                        partial_true_idx = true_idx\n                        found_partial = True\n\n            if not found_match:\n                if found_partial:\n                    result.partial += 1\n                    indices.partial_indices.append((instance_index, partial_pred_idx))\n                    matched_true.add(partial_true_idx)\n                else:\n                    result.spurious += 1\n                    indices.spurious_indices.append((instance_index, pred_idx))\n\n        for true_idx, true in enumerate(true_entities):\n            if true_idx not in matched_true:\n                result.missed += 1\n                indices.missed_indices.append((instance_index, true_idx))\n\n        result.compute_metrics(partial_or_type=True)\n        return result, indices\n\n\nclass EntityTypeEvaluation(EvaluationStrategy):\n    \"\"\"\n    Entity type evaluation strategy - only checks entity 
types.\n\n    In this strategy, we check for overlap between the predicted entity and the true entity.\n\n    If there's a predicted entity that perfectly matches or has only some minimum overlap with a\n    true entity, and the same label, we mark it as correct. If there are multiple entities\n    with at least some minimum overlap, we mark as correct the one with boundaries closest to\n    a true entity.\n    If there's a predicted entity that doesn't match any true entity and that has some minimum\n    overlap or perfectly matches but has the wrong label we mark it as incorrect.\n    If there's a predicted entity that doesn't match any true entity, we mark it as spurious.\n    If there's a true entity that doesn't match any predicted entity, we mark it as missed.\n\n    When multiple true entities of the same label overlap a prediction, the match is chosen by\n    closest boundaries (minimum sum of start and end offset differences), so which true entity\n    is considered \"missed\" may differ from list order.\n    \"\"\"\n\n\n    def evaluate(\n        self, true_entities: List[Entity], pred_entities: List[Entity], tags: List[str], instance_index: int = 0\n    ) -> Tuple[EvaluationResult, EvaluationIndices]:\n        result = EvaluationResult()\n        indices = EvaluationIndices()\n        matched_true = set()\n\n        for pred_idx, pred in enumerate(pred_entities):\n            found_match = False\n            found_incorrect = False\n            current_match_boundaries_distance = None\n\n            for true_idx, true in enumerate(true_entities):\n                if true_idx in matched_true:\n                    continue\n\n                # Check for sufficient overlap with min threshold\n                if self._has_sufficient_overlap(pred, true):\n                    boundaries_distance = self._calculate_boundaries_distance(pred, true)\n                    if pred.label == true.label:\n                        if (\n                            
current_match_boundaries_distance is None\n                            or boundaries_distance < current_match_boundaries_distance\n                        ):\n                            correct_true_idx = true_idx\n                            correct_pred_idx = pred_idx\n                            current_match_boundaries_distance = boundaries_distance\n                            found_match = True\n\n                    elif not found_incorrect:\n                        incorrect_true_idx = true_idx\n                        incorrect_pred_idx = pred_idx\n                        found_incorrect = True\n\n            if found_match:\n                result.correct += 1\n                indices.correct_indices.append((instance_index, correct_pred_idx))\n                matched_true.add(correct_true_idx)\n            else:\n                if found_incorrect:\n                    result.incorrect += 1\n                    indices.incorrect_indices.append((instance_index, incorrect_pred_idx))\n                    matched_true.add(incorrect_true_idx)\n                else:\n                    result.spurious += 1\n                    indices.spurious_indices.append((instance_index, pred_idx))\n\n        for true_idx, true in enumerate(true_entities):\n            if true_idx not in matched_true:\n                result.missed += 1\n                indices.missed_indices.append((instance_index, true_idx))\n\n        result.compute_metrics(partial_or_type=True)\n        return result, indices\n\n\nclass ExactEvaluation(EvaluationStrategy):\n    \"\"\"\n    Exact evaluation strategy - exact boundary match over the surface string, regardless of the type.\n\n    If there's a predicted entity that perfectly matches a true entity, regardless of the label, we mark it as correct.\n    If there's a predicted entity that doesn't match any true entity and that has only some minimum\n    overlap with a true entity, we mark it as incorrect.\n    If there's a predicted entity that 
doesn't match any true entity, we mark it as spurious.\n    If there's a true entity that doesn't match any predicted entity, we mark it as missed.\n    \"\"\"\n\n    def evaluate(\n        self, true_entities: List[Entity], pred_entities: List[Entity], tags: List[str], instance_index: int = 0\n    ) -> Tuple[EvaluationResult, EvaluationIndices]:\n        \"\"\"\n        Evaluate the predicted entities against the true entities using exact boundary matching.\n        Entity type is not considered in the matching.\n        \"\"\"\n        result = EvaluationResult()\n        indices = EvaluationIndices()\n        matched_true = set()\n\n        for pred_idx, pred in enumerate(pred_entities):\n            found_match = False\n            found_incorrect = False\n\n            for true_idx, true in enumerate(true_entities):\n                if true_idx in matched_true:\n                    continue\n\n                # Check for exact boundary match (regardless of label)\n                if pred.start == true.start and pred.end == true.end:\n                    result.correct += 1\n                    indices.correct_indices.append((instance_index, pred_idx))\n                    matched_true.add(true_idx)\n                    found_match = True\n                    break\n                # Check for sufficient overlap with min threshold\n                if self._has_sufficient_overlap(pred, true) and not found_incorrect:\n                    incorrect_true_idx = true_idx\n                    incorrect_pred_idx = pred_idx\n                    found_incorrect = True\n\n            if not found_match:\n                if found_incorrect:\n                    result.incorrect += 1\n                    indices.incorrect_indices.append((instance_index, incorrect_pred_idx))\n                    matched_true.add(incorrect_true_idx)\n                else:\n                    result.spurious += 1\n                    indices.spurious_indices.append((instance_index, 
pred_idx))\n\n        for true_idx, true in enumerate(true_entities):\n            if true_idx not in matched_true:\n                result.missed += 1\n                indices.missed_indices.append((instance_index, true_idx))\n\n        result.compute_metrics()\n        return result, indices\n"
  },
  {
    "path": "src/nervaluate/utils.py",
    "content": "def split_list(token: list[str], split_chars: list[str] | None = None) -> list[list[str]]:\n    \"\"\"\n    Split a list into sublists based on a list of split characters.\n\n    If split_chars is None, the list is split on empty strings.\n\n    :param token: The list to split.\n    :param split_chars: The characters to split on.\n\n    :returns:\n        A list of lists.\n    \"\"\"\n    if split_chars is None:\n        split_chars = [\"\"]\n    out = []\n    chunk = []\n    for i, item in enumerate(token):\n        if item not in split_chars:\n            chunk.append(item)\n            if i + 1 == len(token):\n                out.append(chunk)\n        else:\n            out.append(chunk)\n            chunk = []\n    return out\n\n\ndef conll_to_spans(doc: str) -> list[list[dict]]:\n    \"\"\"\n    Convert a CoNLL-formatted string to a list of spans.\n\n    :param doc: The CoNLL-formatted string.\n\n    :returns:\n        A list of spans.\n    \"\"\"\n    out = []\n    doc_parts = split_list(doc.split(\"\\n\"), split_chars=None)\n\n    for example in doc_parts:\n        labels = []\n        for token in example:\n            token_parts = token.split(\"\\t\")\n            label = token_parts[1]\n            labels.append(label)\n        out.append(labels)\n\n    spans = list_to_spans(out)\n\n    return spans\n\n\ndef list_to_spans(doc: list[list[str]]) -> list[list[dict]]:\n    \"\"\"\n    Convert a list of tags to a list of spans.\n\n    :param doc: The list of tags.\n\n    :returns:\n        A list of spans.\n    \"\"\"\n    spans = [collect_named_entities(tokens) for tokens in doc]\n    return spans\n\n\ndef collect_named_entities(tokens: list[str]) -> list[dict]:\n    \"\"\"\n    Creates a list of entity dicts, storing the entity type and the start and end offsets of the entity.\n\n    :param tokens: a list of tags\n\n    :returns:\n        A list of entity dicts.\n    \"\"\"\n\n    named_entities = []\n    start_offset = None\n 
   end_offset = None\n    ent_type = None\n\n    for offset, token_tag in enumerate(tokens):\n        if token_tag == \"O\":\n            if ent_type is not None and start_offset is not None:\n                end_offset = offset - 1\n                named_entities.append({\"label\": ent_type, \"start\": start_offset, \"end\": end_offset})\n                start_offset = None\n                end_offset = None\n                ent_type = None\n\n        elif ent_type is None:\n            ent_type = token_tag[2:]\n            start_offset = offset\n\n        elif ent_type != token_tag[2:] or (ent_type == token_tag[2:] and token_tag[:1] == \"B\"):\n            end_offset = offset - 1\n            named_entities.append({\"label\": ent_type, \"start\": start_offset, \"end\": end_offset})\n\n            # start of a new entity\n            ent_type = token_tag[2:]\n            start_offset = offset\n            end_offset = None\n\n    # Catches an entity that goes up until the last token\n    if ent_type is not None and start_offset is not None and end_offset is None:\n        named_entities.append({\"label\": ent_type, \"start\": start_offset, \"end\": len(tokens) - 1})\n\n    return named_entities\n\n\ndef find_overlap(true_range: range, pred_range: range) -> set:\n    \"\"\"\n    Find the overlap between two ranges.\n\n    :param true_range: The true range.\n    :param pred_range: The predicted range.\n\n    :returns:\n        A set of overlapping values.\n\n    Examples:\n        >>> find_overlap(range(1, 3), range(2, 4))\n        {2}\n        >>> find_overlap(range(1, 3), range(3, 5))\n        set()\n    \"\"\"\n\n    true_set = set(true_range)\n    pred_set = set(pred_range)\n    overlaps = true_set.intersection(pred_set)\n\n    return overlaps\n\n\ndef clean_entities(ent: dict) -> dict:\n    \"\"\"\n    Returns just the useful keys if additional keys are present in the entity\n    dict.\n\n    This may happen if passing a list of spans directly from prodigy, 
which\n    typically may include 'token_start' and 'token_end'.\n    \"\"\"\n    return {\"start\": ent[\"start\"], \"end\": ent[\"end\"], \"label\": ent[\"label\"]}\n"
  },
  {
    "path": "tests/__init__.py",
    "content": "import sys\n\nsys.path.append(\"../src/nervaluate\")\n"
  },
  {
    "path": "tests/test_entities.py",
    "content": "from nervaluate.entities import Entity, EvaluationResult\n\n\ndef test_entity_equality():\n    \"\"\"Test Entity equality comparison.\"\"\"\n    entity1 = Entity(label=\"PER\", start=0, end=1)\n    entity2 = Entity(label=\"PER\", start=0, end=1)\n    entity3 = Entity(label=\"ORG\", start=0, end=1)\n\n    assert entity1 == entity2\n    assert entity1 != entity3\n    assert entity1 != \"not an entity\"\n\n\ndef test_entity_hash():\n    \"\"\"Test Entity hashing.\"\"\"\n    entity1 = Entity(label=\"PER\", start=0, end=1)\n    entity2 = Entity(label=\"PER\", start=0, end=1)\n    entity3 = Entity(label=\"ORG\", start=0, end=1)\n\n    assert hash(entity1) == hash(entity2)\n    assert hash(entity1) != hash(entity3)\n\n\ndef test_evaluation_result_compute_metrics():\n    \"\"\"Test computation of evaluation metrics.\"\"\"\n    result = EvaluationResult(correct=5, incorrect=2, partial=1, missed=1, spurious=1)\n\n    # Test strict metrics\n    result.compute_metrics(partial_or_type=False)\n    assert result.precision == 5 / 9  # 5/(5+2+1+1)\n    assert result.recall == 5 / (5 + 2 + 1 + 1)\n\n    # Test partial metrics\n    result.compute_metrics(partial_or_type=True)\n    assert result.precision == 5.5 / 9  # (5+0.5*1)/(5+2+1+1)\n    assert result.recall == (5 + 0.5 * 1) / (5 + 2 + 1 + 1)\n\n\ndef test_evaluation_result_zero_cases():\n    \"\"\"Test evaluation metrics with zero values.\"\"\"\n    result = EvaluationResult()\n    result.compute_metrics()\n    assert result.precision == 0\n    assert result.recall == 0\n    assert result.f1 == 0\n"
  },
  {
    "path": "tests/test_evaluator.py",
    "content": "import csv\nimport io\nimport pytest\nfrom nervaluate.evaluator import Evaluator\n\n\n@pytest.fixture\ndef sample_data():\n    true = [\n        [\"O\", \"B-PER\", \"O\", \"B-ORG\", \"I-ORG\", \"B-LOC\"],\n        [\"O\", \"B-PER\", \"O\", \"B-ORG\"],\n    ]\n\n    pred = [\n        [\"O\", \"B-PER\", \"O\", \"B-ORG\", \"O\", \"B-PER\"],\n        [\"O\", \"B-PER\", \"O\", \"B-LOC\"],\n    ]\n\n    return true, pred\n\n\ndef test_evaluator_initialization(sample_data):\n    \"\"\"Test evaluator initialization.\"\"\"\n    true, pred = sample_data\n    evaluator = Evaluator(true, pred, [\"PER\", \"ORG\", \"LOC\"], loader=\"list\")\n\n    assert len(evaluator.true) == 2\n    assert len(evaluator.pred) == 2\n    assert evaluator.tags == [\"PER\", \"ORG\", \"LOC\"]\n\n\ndef test_evaluator_evaluation(sample_data):\n    \"\"\"Test evaluation process.\"\"\"\n    true, pred = sample_data\n    evaluator = Evaluator(true, pred, [\"PER\", \"ORG\", \"LOC\"], loader=\"list\")\n    results = evaluator.evaluate()\n\n    # Check that we have results for all strategies\n    assert \"overall\" in results\n    assert \"entities\" in results\n    assert \"strict\" in results[\"overall\"]\n    assert \"partial\" in results[\"overall\"]\n    assert \"ent_type\" in results[\"overall\"]\n\n    # Check that we have results for each entity type\n    for entity in [\"PER\", \"ORG\", \"LOC\"]:\n        assert entity in results[\"entities\"]\n        assert \"strict\" in results[\"entities\"][entity]\n        assert \"partial\" in results[\"entities\"][entity]\n        assert \"ent_type\" in results[\"entities\"][entity]\n\n\ndef test_evaluator_with_invalid_tags(sample_data):\n    \"\"\"Test evaluator with invalid tags.\"\"\"\n    true, pred = sample_data\n    evaluator = Evaluator(true, pred, [\"INVALID\"], loader=\"list\")\n    results = evaluator.evaluate()\n\n    for strategy in [\"strict\", \"partial\", \"ent_type\"]:\n        assert results[\"overall\"][strategy].correct == 
0\n        assert results[\"overall\"][strategy].incorrect == 0\n        assert results[\"overall\"][strategy].partial == 0\n        assert results[\"overall\"][strategy].missed == 0\n        assert results[\"overall\"][strategy].spurious == 0\n\n\ndef test_partial_and_ent_type_metrics_use_partial_formula_after_merge():\n    \"\"\"\n    Test that partial and ent_type strategies use (COR + 0.5*PAR)/ACT for precision/recall\n    after merging multi-document results, not the strict formula COR/ACT.\n\n    Uses the README usage example: 2 documents, partial has correct=2, partial=3.\n    SemEval partial formula gives P=R=(2+0.5*3)/5=0.7; strict would give 0.4.\n    This test would have caught the bug where _merge_results called compute_metrics()\n    without partial_or_type=True, overwriting partial/ent_type metrics with strict values.\n    \"\"\"\n    # README usage example (2 documents so _merge_results is exercised)\n    true = [\n        [\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"B-ORG\", \"I-ORG\"],\n        [\"O\", \"B-LOC\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"B-DATE\"],\n    ]\n    pred = [\n        [\"O\", \"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"B-ORG\", \"I-ORG\"],\n        [\"O\", \"B-LOC\", \"I-LOC\", \"B-PER\", \"O\", \"O\", \"B-DATE\"],\n    ]\n    evaluator = Evaluator(true, pred, tags=[\"PER\", \"ORG\", \"LOC\", \"DATE\"], loader=\"list\")\n    results = evaluator.evaluate()\n\n    strict_res = results[\"overall\"][\"strict\"]\n    partial_res = results[\"overall\"][\"partial\"]\n    ent_type_res = results[\"overall\"][\"ent_type\"]\n\n    # Partial has correct=2, partial=3, no incorrect/missed/spurious -> ACT=POS=5\n    assert partial_res.correct == 2\n    assert partial_res.partial == 3\n    assert partial_res.incorrect == 0\n    assert partial_res.missed == 0\n    assert partial_res.spurious == 0\n    assert partial_res.actual == 5\n    assert partial_res.possible == 5\n\n    # SemEval partial formula: (COR + 0.5*PAR) / ACT and / 
POS\n    expected_partial_precision = (partial_res.correct + 0.5 * partial_res.partial) / partial_res.actual\n    expected_partial_recall = (partial_res.correct + 0.5 * partial_res.partial) / partial_res.possible\n    assert expected_partial_precision == pytest.approx(0.7)\n    assert expected_partial_recall == pytest.approx(0.7)\n\n    # Partial strategy must report these values (not strict 0.4)\n    assert partial_res.precision == pytest.approx(expected_partial_precision)\n    assert partial_res.recall == pytest.approx(expected_partial_recall)\n    assert partial_res.precision != strict_res.precision\n    assert partial_res.recall != strict_res.recall\n\n    # ent_type for this example has no partial/incorrect, so P/R=1.0; ensure it used partial formula path\n    assert ent_type_res.precision == pytest.approx(1.0)\n    assert ent_type_res.recall == pytest.approx(1.0)\n\n\ndef test_evaluator_different_document_lengths():\n    \"\"\"Test that Evaluator raises ValueError when documents have different lengths.\"\"\"\n    true = [\n        [\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"B-ORG\", \"I-ORG\"],  # 8 tokens\n        [\"O\", \"B-LOC\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"B-DATE\"],  # 7 tokens\n    ]\n    pred = [\n        [\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"B-ORG\", \"I-ORG\"],  # 8 tokens\n        [\"O\", \"B-LOC\", \"I-LOC\", \"O\", \"B-PER\", \"I-PER\", \"O\", \"B-DATE\", \"I-DATE\", \"O\"],  # 10 tokens\n    ]\n    tags = [\"PER\", \"ORG\", \"LOC\", \"DATE\"]\n\n    # Test that ValueError is raised\n    with pytest.raises(ValueError, match=\"Document 1 has different lengths: true=7, pred=10\"):\n        evaluator = Evaluator(true=true, pred=pred, tags=tags, loader=\"list\")\n        evaluator.evaluate()\n\n\ndef test_results_to_csv(sample_data, tmp_path):\n\n    true, pred = sample_data\n    evaluator = Evaluator(true, pred, [\"PER\", \"ORG\", \"LOC\"], loader=\"list\")\n\n    overall_csv_str = 
evaluator.results_to_csv(mode=\"overall\")\n    assert isinstance(overall_csv_str, str)\n\n    csv_reader = csv.reader(io.StringIO(overall_csv_str))\n    overall_csv = list(csv_reader)\n\n    assert len(overall_csv) > 1  # should have header + at least one row\n    assert overall_csv[0] == [\n        \"Strategy\",\n        \"Correct\",\n        \"Incorrect\",\n        \"Partial\",\n        \"Missed\",\n        \"Spurious\",\n        \"Precision\",\n        \"Recall\",\n        \"F1-Score\",\n    ]\n\n    # check that all strategies are present\n    strategies = {row[0] for row in overall_csv[1:]}\n    assert strategies == {\"strict\", \"partial\", \"ent_type\", \"exact\"}\n\n    # test entities mode - return as string\n    entities_csv_str = evaluator.results_to_csv(mode=\"entities\", scenario=\"strict\")\n    assert isinstance(entities_csv_str, str)\n\n    # parse CSV string to check content\n    csv_reader = csv.reader(io.StringIO(entities_csv_str))\n    entities_csv = list(csv_reader)\n\n    assert len(entities_csv) > 1  # should have header + at least one row\n    assert entities_csv[0] == [\n        \"Entity\",\n        \"Correct\",\n        \"Incorrect\",\n        \"Partial\",\n        \"Missed\",\n        \"Spurious\",\n        \"Precision\",\n        \"Recall\",\n        \"F1-Score\",\n    ]\n\n    # check that all entity types are present\n    entity_types = {row[0] for row in entities_csv[1:]}\n    assert entity_types == {\"PER\", \"ORG\", \"LOC\"}\n\n    # test file saving - overall mode\n    overall_file = tmp_path / \"overall_results.csv\"\n    result = evaluator.results_to_csv(mode=\"overall\", file_path=str(overall_file))\n    assert result is None  # Should return None when saving to file\n    assert overall_file.exists()\n\n    # verify file content\n    with open(overall_file, \"r\", encoding=\"utf-8\") as f:\n        saved_csv = list(csv.reader(f))\n    assert len(saved_csv) > 1\n    assert saved_csv[0][0] == \"Strategy\"\n\n    # test file 
saving - entities mode\n    entities_file = tmp_path / \"entities_results.csv\"\n    result = evaluator.results_to_csv(mode=\"entities\", scenario=\"partial\", file_path=str(entities_file))\n    assert result is None  # Should return None when saving to file\n    assert entities_file.exists()\n\n    # verify file content\n    with open(entities_file, \"r\", encoding=\"utf-8\") as f:\n        saved_csv = list(csv.reader(f))\n    assert len(saved_csv) > 1\n    assert saved_csv[0][0] == \"Entity\"\n\n    # test invalid mode\n    with pytest.raises(ValueError, match=\"Invalid mode: must be one of\"):\n        evaluator.results_to_csv(mode=\"invalid\")\n\n    # test invalid scenario for entities mode\n    with pytest.raises(ValueError, match=\"Invalid scenario: must be one of\"):\n        evaluator.results_to_csv(mode=\"entities\", scenario=\"invalid\")\n\n\ndef test_evaluator_with_min_overlap_percentage():\n    \"\"\"Test Evaluator class with minimum overlap percentage parameter.\"\"\"\n\n    # Test data: true entity spans positions 0-9 (10 tokens)\n    true_entities = [[{\"label\": \"PER\", \"start\": 0, \"end\": 9}]]  # 10-token entity\n\n    # Predicted entities with different overlap percentages\n    pred_entities = [[{\"label\": \"PER\", \"start\": 0, \"end\": 2}]]  # 30% overlap\n\n    # Test with default 1% threshold - should be partial match\n    evaluator_default = Evaluator(true=true_entities, pred=pred_entities, tags=[\"PER\"], loader=\"dict\")\n    results_default = evaluator_default.evaluate()\n    partial_default = results_default[\"overall\"][\"partial\"]\n    assert partial_default.partial == 1\n    assert partial_default.spurious == 0\n\n    # Test with 50% threshold - should be spurious\n    evaluator_50 = Evaluator(\n        true=true_entities, pred=pred_entities, tags=[\"PER\"], loader=\"dict\", min_overlap_percentage=50.0\n    )\n    results_50 = evaluator_50.evaluate()\n    partial_50 = results_50[\"overall\"][\"partial\"]\n    assert 
partial_50.partial == 0\n    assert partial_50.spurious == 1\n\n\ndef test_evaluator_min_overlap_validation():\n    \"\"\"Test that Evaluator validates minimum overlap percentage.\"\"\"\n    true_entities = [[{\"label\": \"PER\", \"start\": 0, \"end\": 5}]]\n    pred_entities = [[{\"label\": \"PER\", \"start\": 0, \"end\": 5}]]\n\n    # Valid values should work\n    Evaluator(true_entities, pred_entities, [\"PER\"], \"dict\", min_overlap_percentage=1.0)\n    Evaluator(true_entities, pred_entities, [\"PER\"], \"dict\", min_overlap_percentage=50.0)\n    Evaluator(true_entities, pred_entities, [\"PER\"], \"dict\", min_overlap_percentage=100.0)\n\n    # Invalid values should raise ValueError during strategy initialization\n    with pytest.raises(ValueError, match=\"min_overlap_percentage must be between 1.0 and 100.0\"):\n        Evaluator(true_entities, pred_entities, [\"PER\"], \"dict\", min_overlap_percentage=0.5)\n\n    with pytest.raises(ValueError, match=\"min_overlap_percentage must be between 1.0 and 100.0\"):\n        Evaluator(true_entities, pred_entities, [\"PER\"], \"dict\", min_overlap_percentage=101.0)\n\n\ndef test_evaluator_min_overlap_affects_all_strategies():\n    \"\"\"Test that minimum overlap percentage affects all evaluation strategies.\"\"\"\n    true_entities = [[{\"label\": \"PER\", \"start\": 0, \"end\": 9}]]  # 10 tokens\n\n    pred_entities = [[{\"label\": \"PER\", \"start\": 0, \"end\": 2}]]  # 30% overlap\n\n    evaluator = Evaluator(\n        true=true_entities, pred=pred_entities, tags=[\"PER\"], loader=\"dict\", min_overlap_percentage=50.0\n    )\n\n    results = evaluator.evaluate()\n\n    # All strategies should respect the 50% threshold\n    # 30% overlap < 50% threshold, so should be spurious for all strategies\n\n    # Partial strategy\n    partial_result = results[\"overall\"][\"partial\"]\n    assert partial_result.spurious == 1\n    assert partial_result.correct == 0\n    assert partial_result.partial == 0\n\n    # Strict 
strategy\n    strict_result = results[\"overall\"][\"strict\"]\n    assert strict_result.spurious == 1\n    assert strict_result.correct == 0\n    assert strict_result.incorrect == 0\n\n    # Entity type strategy\n    ent_type_result = results[\"overall\"][\"ent_type\"]\n    assert ent_type_result.spurious == 1\n    assert ent_type_result.correct == 0\n    assert ent_type_result.incorrect == 0\n\n    # Exact strategy\n    exact_result = results[\"overall\"][\"exact\"]\n    assert exact_result.spurious == 1\n    assert exact_result.correct == 0\n    assert exact_result.incorrect == 0\n\n\ndef test_evaluator_min_overlap_with_different_thresholds():\n    \"\"\"Test Evaluator with different overlap thresholds.\"\"\"\n    true_entities = [[{\"label\": \"PER\", \"start\": 0, \"end\": 9}]]  # 10 tokens\n\n    # Test cases with different predicted entities\n    test_cases = [\n        # (pred_entities, threshold, expected_result_type)\n        ([{\"label\": \"PER\", \"start\": 0, \"end\": 4}], 50.0, \"partial\"),  # 50% overlap = 50%\n        ([{\"label\": \"PER\", \"start\": 0, \"end\": 4}], 51.0, \"spurious\"),  # 50% overlap < 51%\n        ([{\"label\": \"PER\", \"start\": 0, \"end\": 6}], 75.0, \"spurious\"),  # 70% overlap < 75%\n        ([{\"label\": \"PER\", \"start\": 0, \"end\": 7}], 75.0, \"partial\"),  # 80% overlap > 75%\n        ([{\"label\": \"PER\", \"start\": 0, \"end\": 9}], 100.0, \"correct\"),  # 100% overlap = exact match\n    ]\n\n    for pred_data, threshold, expected_type in test_cases:\n        pred_entities = [pred_data]\n\n        evaluator = Evaluator(\n            true=true_entities, pred=pred_entities, tags=[\"PER\"], loader=\"dict\", min_overlap_percentage=threshold\n        )\n\n        results = evaluator.evaluate()\n        partial_results = results[\"overall\"][\"partial\"]\n\n        if expected_type == \"correct\":\n            assert partial_results.correct == 1, f\"Failed for {pred_data} with threshold {threshold}%\"\n            
assert partial_results.partial == 0\n            assert partial_results.spurious == 0\n        elif expected_type == \"partial\":\n            assert partial_results.partial == 1, f\"Failed for {pred_data} with threshold {threshold}%\"\n            assert partial_results.correct == 0\n            assert partial_results.spurious == 0\n        elif expected_type == \"spurious\":\n            assert partial_results.spurious == 1, f\"Failed for {pred_data} with threshold {threshold}%\"\n            assert partial_results.correct == 0\n            assert partial_results.partial == 0\n\n\ndef test_evaluator_min_overlap_with_multiple_entities():\n    \"\"\"Test Evaluator with multiple entities and minimum overlap threshold.\"\"\"\n    true_entities = [\n        [\n            {\"label\": \"PER\", \"start\": 0, \"end\": 4},  # 5 tokens\n            {\"label\": \"ORG\", \"start\": 10, \"end\": 14},  # 5 tokens\n            {\"label\": \"LOC\", \"start\": 20, \"end\": 24},  # 5 tokens\n        ]\n    ]\n\n    pred_entities = [\n        [\n            {\"label\": \"PER\", \"start\": 0, \"end\": 1},  # 40% overlap (2/5 tokens)\n            {\"label\": \"ORG\", \"start\": 10, \"end\": 12},  # 60% overlap (3/5 tokens)\n            {\"label\": \"LOC\", \"start\": 20, \"end\": 24},  # 100% overlap (exact match)\n            {\"label\": \"MISC\", \"start\": 30, \"end\": 32},  # No overlap (spurious)\n        ]\n    ]\n\n    # Test with 50% threshold\n    evaluator = Evaluator(\n        true=true_entities,\n        pred=pred_entities,\n        tags=[\"PER\", \"ORG\", \"LOC\", \"MISC\"],\n        loader=\"dict\",\n        min_overlap_percentage=50.0,\n    )\n\n    results = evaluator.evaluate()\n    partial_results = results[\"overall\"][\"partial\"]\n\n    assert partial_results.correct == 1  # LOC exact match\n    assert partial_results.partial == 1  # ORG 60% overlap > 50%\n    assert partial_results.spurious == 2  # PER 40% < 50% and MISC no overlap\n    assert 
partial_results.missed == 1  # PER entity not sufficiently matched\n\n\ndef test_evaluator_min_overlap_backward_compatibility():\n    \"\"\"Test that the new feature maintains backward compatibility.\"\"\"\n    true_entities = [[{\"label\": \"PER\", \"start\": 0, \"end\": 9}]]\n\n    pred_entities = [[{\"label\": \"PER\", \"start\": 9, \"end\": 9}]]  # 10% overlap (1 token out of 10)\n\n    # Without specifying min_overlap_percentage (should default to 1.0)\n    evaluator_default = Evaluator(true=true_entities, pred=pred_entities, tags=[\"PER\"], loader=\"dict\")\n\n    # With explicitly setting to 1.0\n    evaluator_explicit = Evaluator(\n        true=true_entities, pred=pred_entities, tags=[\"PER\"], loader=\"dict\", min_overlap_percentage=1.0\n    )\n\n    results_default = evaluator_default.evaluate()\n    results_explicit = evaluator_explicit.evaluate()\n\n    # Results should be identical\n    for strategy in [\"strict\", \"partial\", \"ent_type\", \"exact\"]:\n        default_result = results_default[\"overall\"][strategy]\n        explicit_result = results_explicit[\"overall\"][strategy]\n\n        assert default_result.correct == explicit_result.correct\n        assert default_result.partial == explicit_result.partial\n        assert default_result.spurious == explicit_result.spurious\n        assert default_result.missed == explicit_result.missed\n"
  },
  {
    "path": "tests/test_loaders.py",
    "content": "import pytest\n\nfrom nervaluate.loaders import ConllLoader, ListLoader, DictLoader\n\n\ndef test_conll_loader():\n    \"\"\"Test CoNLL format loader.\"\"\"\n    true_conll = (\n        \"word\\tO\\nword\\tO\\nword\\tO\\nword\\tO\\nword\\tO\\nword\\tO\\n\\n\"\n        \"word\\tO\\nword\\tO\\nword\\tB-ORG\\nword\\tI-ORG\\nword\\tO\\nword\\tO\\n\\n\"\n        \"word\\tO\\nword\\tO\\nword\\tB-MISC\\nword\\tI-MISC\\nword\\tO\\nword\\tO\\n\\n\"\n        \"word\\tB-MISC\\nword\\tI-MISC\\nword\\tI-MISC\\nword\\tI-MISC\\nword\\tI-MISC\\nword\\tI-MISC\\n\"\n    )\n\n    pred_conll = (\n        \"word\\tO\\nword\\tO\\nword\\tB-PER\\nword\\tI-PER\\nword\\tO\\nword\\tO\\n\\n\"\n        \"word\\tO\\nword\\tO\\nword\\tB-ORG\\nword\\tI-ORG\\nword\\tO\\nword\\tO\\n\\n\"\n        \"word\\tO\\nword\\tO\\nword\\tB-MISC\\nword\\tI-MISC\\nword\\tO\\nword\\tO\\n\\n\"\n        \"word\\tB-MISC\\nword\\tI-MISC\\nword\\tI-MISC\\nword\\tI-MISC\\nword\\tI-MISC\\nword\\tI-MISC\\n\"\n    )\n\n    loader = ConllLoader()\n    true_entities = loader.load(true_conll)\n    pred_entities = loader.load(pred_conll)\n\n    # Test true entities\n    assert len(true_entities) == 4  # Four documents\n    assert len(true_entities[0]) == 0  # First document has no entities (all O tags)\n    assert len(true_entities[1]) == 1  # Second document has 1 entity (ORG)\n    assert len(true_entities[2]) == 1  # Third document has 1 entity (MISC)\n    assert len(true_entities[3]) == 1  # Fourth document has 1 entity (MISC)\n\n    # Check first entity in second document\n    assert true_entities[1][0].label == \"ORG\"\n    assert true_entities[1][0].start == 2\n    assert true_entities[1][0].end == 3\n\n    # Test pred entities\n    assert len(pred_entities) == 4  # Four documents\n    assert len(pred_entities[0]) == 1  # First document has 1 entity (PER)\n    assert len(pred_entities[1]) == 1  # Second document has 1 entity (ORG)\n    assert len(pred_entities[2]) == 1  # Third document has 1 entity 
(MISC)\n    assert len(pred_entities[3]) == 1  # Fourth document has 1 entity (MISC)\n\n    # Check first entity in first document\n    assert pred_entities[0][0].label == \"PER\"\n    assert pred_entities[0][0].start == 2\n    assert pred_entities[0][0].end == 3\n\n    # Test empty document handling\n    empty_doc = \"word\\tO\\nword\\tO\\nword\\tO\\n\\n\"\n    empty_entities = loader.load(empty_doc)\n    assert len(empty_entities) == 1  # One document\n    assert len(empty_entities[0]) == 0  # Empty list for document with only O tags\n\n\ndef test_list_loader():\n    \"\"\"Test list format loader.\"\"\"\n    true_list = [\n        [\"O\", \"O\", \"O\", \"O\", \"O\", \"O\"],\n        [\"O\", \"O\", \"B-ORG\", \"I-ORG\", \"O\", \"O\"],\n        [\"O\", \"O\", \"B-MISC\", \"I-MISC\", \"O\", \"O\"],\n        [\"B-MISC\", \"I-MISC\", \"I-MISC\", \"I-MISC\", \"I-MISC\", \"I-MISC\"],\n    ]\n\n    pred_list = [\n        [\"O\", \"O\", \"B-PER\", \"I-PER\", \"O\", \"O\"],\n        [\"O\", \"O\", \"B-ORG\", \"I-ORG\", \"O\", \"O\"],\n        [\"O\", \"O\", \"B-MISC\", \"I-MISC\", \"O\", \"O\"],\n        [\"B-MISC\", \"I-MISC\", \"I-MISC\", \"I-MISC\", \"I-MISC\", \"I-MISC\"],\n    ]\n\n    loader = ListLoader()\n    true_entities = loader.load(true_list)\n    pred_entities = loader.load(pred_list)\n\n    # Test true entities\n    assert len(true_entities) == 4  # Four documents\n    assert len(true_entities[0]) == 0  # First document has no entities (all O tags)\n    assert len(true_entities[1]) == 1  # Second document has 1 entity (ORG)\n    assert len(true_entities[2]) == 1  # Third document has 1 entity (MISC)\n    assert len(true_entities[3]) == 1  # Fourth document has 1 entity (MISC)\n\n    # Check no entities in the first document\n    assert len(true_entities[0]) == 0\n\n    # Check first entity in second document\n    assert true_entities[1][0].label == \"ORG\"\n    assert true_entities[1][0].start == 2\n    assert true_entities[1][0].end == 3\n\n    # Check only 
entity in the last document\n    assert true_entities[3][0].label == \"MISC\"\n    assert true_entities[3][0].start == 0\n    assert true_entities[3][0].end == 5\n\n    # Test pred entities\n    assert len(pred_entities) == 4  # Four documents\n    assert len(pred_entities[0]) == 1  # First document has 1 entity (PER)\n    assert len(pred_entities[1]) == 1  # Second document has 1 entity (ORG)\n    assert len(pred_entities[2]) == 1  # Third document has 1 entity (MISC)\n    assert len(pred_entities[3]) == 1  # Fourth document has 1 entity (MISC)\n\n    # Check first entity in first document\n    assert pred_entities[0][0].label == \"PER\"\n    assert pred_entities[0][0].start == 2\n    assert pred_entities[0][0].end == 3\n\n    # Test empty document handling\n    empty_doc = [[\"O\", \"O\", \"O\"]]\n    empty_entities = loader.load(empty_doc)\n    assert len(empty_entities) == 1  # One document\n    assert len(empty_entities[0]) == 0  # Empty list for document with only O tags\n\n\ndef test_dict_loader():\n    \"\"\"Test dictionary format loader.\"\"\"\n    true_prod = [\n        [],\n        [{\"label\": \"ORG\", \"start\": 2, \"end\": 3}],\n        [{\"label\": \"MISC\", \"start\": 2, \"end\": 3}],\n        [{\"label\": \"MISC\", \"start\": 0, \"end\": 5}],\n    ]\n\n    pred_prod = [\n        [{\"label\": \"PER\", \"start\": 2, \"end\": 3}],\n        [{\"label\": \"ORG\", \"start\": 2, \"end\": 3}],\n        [{\"label\": \"MISC\", \"start\": 2, \"end\": 3}],\n        [{\"label\": \"MISC\", \"start\": 0, \"end\": 5}],\n    ]\n\n    loader = DictLoader()\n    true_entities = loader.load(true_prod)\n    pred_entities = loader.load(pred_prod)\n\n    # Test true entities\n    assert len(true_entities) == 4  # Four documents\n    assert len(true_entities[0]) == 0  # First document has no entities\n    assert len(true_entities[1]) == 1  # Second document has 1 entity (ORG)\n    assert len(true_entities[2]) == 1  # Third document has 1 entity (MISC)\n    assert 
len(true_entities[3]) == 1  # Fourth document has 1 entity (MISC)\n\n    # Check first entity in second document\n    assert true_entities[1][0].label == \"ORG\"\n    assert true_entities[1][0].start == 2\n    assert true_entities[1][0].end == 3\n\n    # Check only entity in the last document\n    assert true_entities[3][0].label == \"MISC\"\n    assert true_entities[3][0].start == 0\n    assert true_entities[3][0].end == 5\n\n    # Test pred entities\n    assert len(pred_entities) == 4  # Four documents\n    assert len(pred_entities[0]) == 1  # First document has 1 entity (PER)\n    assert len(pred_entities[1]) == 1  # Second document has 1 entity (ORG)\n    assert len(pred_entities[2]) == 1  # Third document has 1 entity (MISC)\n    assert len(pred_entities[3]) == 1  # Fourth document has 1 entity (MISC)\n\n    # Check first entity in first document\n    assert pred_entities[0][0].label == \"PER\"\n    assert pred_entities[0][0].start == 2\n    assert pred_entities[0][0].end == 3\n\n    # Test empty document handling\n    empty_doc = [[]]\n    empty_entities = loader.load(empty_doc)\n    assert len(empty_entities) == 1  # One document\n    assert len(empty_entities[0]) == 0  # Empty list for empty document\n\n\ndef test_loader_with_empty_input():\n    \"\"\"Test loaders with empty input.\"\"\"\n    # Test ConllLoader with empty string\n    conll_loader = ConllLoader()\n    entities = conll_loader.load(\"\")\n    assert len(entities) == 0\n\n    # Test ListLoader with empty list\n    list_loader = ListLoader()\n    entities = list_loader.load([])\n    assert len(entities) == 0\n\n    # Test DictLoader with empty list\n    dict_loader = DictLoader()\n    entities = dict_loader.load([])\n    assert len(entities) == 0\n\n\ndef test_loader_with_invalid_data():\n    \"\"\"Test loaders with invalid data.\"\"\"\n    with pytest.raises(Exception):\n        ConllLoader().load(\"invalid\\tdata\")\n\n    with pytest.raises(Exception):\n        
ListLoader().load([[\"invalid\"]])\n\n    with pytest.raises(Exception):\n        DictLoader().load([[{\"invalid\": \"data\"}]])\n"
  },
  {
    "path": "tests/test_strategies.py",
    "content": "from copy import deepcopy\nimport pytest\nfrom nervaluate.entities import Entity\nfrom nervaluate.strategies import EntityTypeEvaluation, ExactEvaluation, PartialEvaluation, StrictEvaluation\n\n\ndef create_entities_from_bio(bio_tags):\n    \"\"\"Helper function to create entities from BIO tags.\"\"\"\n    entities = []\n    current_entity = None\n\n    for i, tag in enumerate(bio_tags):\n        if tag == \"O\":\n            continue\n\n        if tag.startswith(\"B-\"):\n            if current_entity:\n                entities.append(current_entity)\n            current_entity = Entity(tag[2:], i, i + 1)\n        elif tag.startswith(\"I-\"):\n            if current_entity:\n                current_entity.end = i + 1\n            else:\n                # Handle case where I- tag appears without B-\n                current_entity = Entity(tag[2:], i, i + 1)\n\n    if current_entity:\n        entities.append(current_entity)\n\n    return entities\n\n\n@pytest.fixture\ndef base_sequence():\n    \"\"\"Base sequence: 'The John Smith who works at Google Inc'\"\"\"\n    return [\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"B-ORG\", \"I-ORG\"]\n\n\n@pytest.fixture\ndef base_sequence_nested():\n    \"\"\"\n    Base sequence: 'The Treaty of Westphalia negotiations concluded in 1648.'\n\n    first_level_entity: Treaty of Westphalia negotiations\n    second_level_entity: Treaty of Westphalia\n    third_level_entity: Westphalia\n    other_entity: 1648\n    \"\"\"\n    first_level_entity = Entity(\"EVENT\", 4, 37)\n    second_level_entity = Entity(\"EVENT\", 4, 24)\n    third_level_entity = Entity(\"LOCATION\", 14, 24)\n    other_entity = Entity(\"DATE\", 51, 55)\n\n    return [first_level_entity, second_level_entity, third_level_entity, other_entity]\n\n\nclass TestStrictEvaluation:\n    \"\"\"Test cases for strict evaluation strategy.\"\"\"\n\n    def test_perfect_match(self, base_sequence):\n        \"\"\"Test case: Perfect match of all 
entities.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio(base_sequence)\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        assert result.correct == 2\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_perfect_match_nested(self, base_sequence_nested):\n        \"\"\"Test case: Perfect match of all entities with nested entities.\"\"\"\n        evaluator = StrictEvaluation()\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_perfect_match_nested_reverse_order(self, base_sequence_nested):\n        \"\"\"Test case: Perfect match of all entities in reverse order, with nested entities.\"\"\"\n        evaluator = StrictEvaluation()\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)[::-1]\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n      
  assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_missed_entity(self, base_sequence):\n        \"\"\"Test case: One entity is missed in prediction.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"O\", \"O\"])\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 1\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == [(0, 1)]\n        assert result_indices.spurious_indices == []\n\n    def test_missed_entity_nested(self, base_sequence_nested):\n        \"\"\"Test case: First level entity is missed in prediction.\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)[1:]\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 3\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 1\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2)]\n        assert 
result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == [(0, 0)]\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_label(self, base_sequence):\n        \"\"\"Test case: Entity with wrong label.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"B-LOC\", \"I-LOC\"])\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_label_nested(self, base_sequence_nested):\n        \"\"\"Test case: Nested entity with wrong label.\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        pred[1].label = \"DATE\"\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 3\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_boundary(self, 
base_sequence):\n        \"\"\"Test case: Entity with wrong boundary.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"B-LOC\", \"O\"])\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_boundary_nested(self, base_sequence_nested):\n        \"\"\"Test case: Nested entity with wrong boundary.\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        pred[1].end = 30\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 3\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_extra_entity_nested(self, base_sequence_nested):\n        \"\"\"Test case: Extra (spurious) entity in prediction with nested entities (Scenario II).\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested) + [Entity(\"MISC\", 60, 65)]\n\n    
    evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\", \"MISC\"])\n\n        assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 1\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == [(0, 4)]\n\n    def test_wrong_boundary_and_label_nested(self, base_sequence_nested):\n        \"\"\"Test case: Nested entity with wrong boundary and wrong label (Scenario VI).\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        pred[1] = Entity(\"DATE\", 4, 30)\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 3\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_shifted_boundary(self, base_sequence):\n        \"\"\"Test case: Entity with shifted boundary.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"O\", \"B-LOC\"])\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert 
result.correct == 1\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_extra_entity(self, base_sequence):\n        \"\"\"Test case: Extra entity in prediction.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"B-PER\", \"O\", \"B-LOC\", \"I-LOC\"])\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 1\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == [(0, 2)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == [(0, 1)]\n\n\nclass TestEntityTypeEvaluation:\n    \"\"\"Test cases for entity type evaluation strategy.\"\"\"\n\n    def test_perfect_match(self, base_sequence):\n        \"\"\"Test case: Perfect match of all entities.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio(base_sequence)\n\n        evaluator = EntityTypeEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        assert result.correct == 2\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert 
result_indices.correct_indices == [(0, 0), (0, 1)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_perfect_match_nested(self, base_sequence_nested):\n        \"\"\"Test case: Perfect match of all entities with nested entities.\"\"\"\n        evaluator = EntityTypeEvaluation()\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_perfect_match_nested_reverse_order(self, base_sequence_nested):\n        \"\"\"Test case: Perfect match of all entities in reverse order, with nested entities.\"\"\"\n        evaluator = EntityTypeEvaluation()\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)[::-1]\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert 
result_indices.spurious_indices == []\n\n    def test_missed_entity(self, base_sequence):\n        \"\"\"Test case: One entity is missed in prediction.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"O\", \"O\"])\n\n        evaluator = EntityTypeEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 1\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == [(0, 1)]\n        assert result_indices.spurious_indices == []\n\n    def test_missed_entity_nested(self, base_sequence_nested):\n        \"\"\"Test case: First level entity is missed in prediction.\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)[1:]\n\n        evaluator = EntityTypeEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 3\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 1\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == [(0, 0)]\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_label(self, base_sequence):\n        \"\"\"Test case: Entity with wrong label.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", 
\"I-PER\", \"O\", \"O\", \"O\", \"B-LOC\", \"I-LOC\"])\n\n        evaluator = EntityTypeEvaluation()\n        result, _ = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n\n    def test_wrong_label_nested(self, base_sequence_nested):\n        \"\"\"Test case: Nested entity with wrong label.\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        pred[1].label = \"DATE\"\n\n        evaluator = EntityTypeEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 3\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_boundary(self, base_sequence):\n        \"\"\"Test case: Entity with wrong boundary.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"B-LOC\", \"O\"])\n\n        evaluator = EntityTypeEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert 
result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_boundary_nested(self, base_sequence_nested):\n        \"\"\"Test case: Nested entity with wrong boundary.\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        pred[1].end = 30\n\n        evaluator = EntityTypeEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_extra_entity_nested(self, base_sequence_nested):\n        \"\"\"Test case: Extra (spurious) entity in prediction with nested entities (Scenario II).\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested) + [Entity(\"MISC\", 60, 65)]\n\n        evaluator = EntityTypeEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\", \"MISC\"])\n\n        assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 1\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == [(0, 4)]\n\n    def test_wrong_boundary_and_label_nested(self, 
base_sequence_nested):\n        \"\"\"Test case: Nested entity with wrong boundary and wrong label (Scenario VI).\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        pred[1] = Entity(\"DATE\", 4, 30)\n\n        evaluator = EntityTypeEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 3\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_shifted_boundary(self, base_sequence):\n        \"\"\"Test case: Entity with shifted boundary.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"O\", \"B-LOC\"])\n\n        evaluator = EntityTypeEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_extra_entity(self, base_sequence):\n        \"\"\"Test case: Extra entity in prediction.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", 
\"O\", \"B-PER\", \"O\", \"B-LOC\", \"I-LOC\"])\n\n        evaluator = EntityTypeEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 1\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == [(0, 2)]\n        assert result_indices.spurious_indices == [(0, 1)]\n        assert result_indices.missed_indices == []\n        assert result_indices.partial_indices == []\n\n\nclass TestExactEvaluation:\n    \"\"\"Test cases for exact evaluation strategy.\"\"\"\n\n    def test_perfect_match(self, base_sequence):\n        \"\"\"Test case: Perfect match of all entities.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio(base_sequence)\n\n        evaluator = ExactEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        assert result.correct == 2\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_perfect_match_nested(self, base_sequence_nested):\n        \"\"\"Test case: Perfect match of all entities with nested entities.\"\"\"\n        evaluator = ExactEvaluation()\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 4\n        assert 
result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_perfect_match_nested_reverse_order(self, base_sequence_nested):\n        \"\"\"Test case: Perfect match of all entities in reverse order, with nested entities.\"\"\"\n        evaluator = ExactEvaluation()\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)[::-1]\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_missed_entity(self, base_sequence):\n        \"\"\"Test case: One entity is missed in prediction.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"O\", \"O\"])\n\n        evaluator = ExactEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 1\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert 
result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == [(0, 1)]\n        assert result_indices.spurious_indices == []\n\n    def test_missed_entity_nested(self, base_sequence_nested):\n        \"\"\"Test case: First level entity is missed in prediction.\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)[1:]\n\n        evaluator = ExactEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 3\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 1\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == [(0, 0)]\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_label(self, base_sequence):\n        \"\"\"Test case: Entity with wrong label.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"B-LOC\", \"I-LOC\"])\n\n        evaluator = ExactEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 2\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_label_nested(self, base_sequence_nested):\n 
       \"\"\"Test case: Nested entity with wrong label.\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        pred[1].label = \"DATE\"\n\n        evaluator = ExactEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_boundary(self, base_sequence):\n        \"\"\"Test case: Entity with wrong boundary.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"B-LOC\", \"O\"])\n\n        evaluator = ExactEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_boundary_nested(self, base_sequence_nested):\n        \"\"\"Test case: Nested entity with wrong boundary.\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        pred[1].end = 30\n\n        evaluator = ExactEvaluation()\n        result, result_indices 
= evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 3\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_extra_entity_nested(self, base_sequence_nested):\n        \"\"\"Test case: Extra (spurious) entity in prediction with nested entities (Scenario II).\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested) + [Entity(\"MISC\", 60, 65)]\n\n        evaluator = ExactEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\", \"MISC\"])\n\n        assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 1\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == [(0, 4)]\n\n    def test_wrong_boundary_and_label_nested(self, base_sequence_nested):\n        \"\"\"Test case: Nested entity with wrong boundary and wrong label (Scenario VI).\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        pred[1] = Entity(\"DATE\", 4, 30)\n\n        evaluator = ExactEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 3\n        assert result.incorrect == 1\n      
  assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_shifted_boundary(self, base_sequence):\n        \"\"\"Test case: Entity with shifted boundary.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"O\", \"B-LOC\"])\n\n        evaluator = ExactEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == [(0, 1)]\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_extra_entity(self, base_sequence):\n        \"\"\"Test case: Extra entity in prediction.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"B-PER\", \"O\", \"B-LOC\", \"I-LOC\"])\n\n        evaluator = ExactEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 2\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 1\n        assert result_indices.correct_indices == [(0, 0), (0, 2)]\n        assert result_indices.incorrect_indices 
== []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == [(0, 1)]\n\n\nclass TestPartialEvaluation:\n    \"\"\"Test cases for partial evaluation strategy.\"\"\"\n\n    def test_perfect_match(self, base_sequence):\n        \"\"\"Test case: Perfect match of all entities.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio(base_sequence)\n\n        evaluator = PartialEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        assert result.correct == 2\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_perfect_match_nested(self, base_sequence_nested):\n        \"\"\"Test case: Perfect match of all entities with nested entities.\"\"\"\n        evaluator = PartialEvaluation()\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def 
test_perfect_match_nested_reverse_order(self, base_sequence_nested):\n        \"\"\"Test case: Perfect match of all entities in reverse order, with nested entities.\"\"\"\n        evaluator = PartialEvaluation()\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)[::-1]\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_missed_entity(self, base_sequence):\n        \"\"\"Test case: One entity is missed in prediction.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"O\", \"O\"])\n\n        evaluator = PartialEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 1\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == [(0, 1)]\n        assert result_indices.spurious_indices == []\n\n    def test_missed_entity_nested(self, base_sequence_nested):\n        \"\"\"Test case: First level entity is missed in prediction.\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)[1:]\n\n        
evaluator = PartialEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 3\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 1\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == [(0, 0)]\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_label(self, base_sequence):\n        \"\"\"Test case: Entity with wrong label.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"B-LOC\", \"I-LOC\"])\n\n        evaluator = PartialEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 2\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_label_nested(self, base_sequence_nested):\n        \"\"\"Test case: Nested entity with wrong label.\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        pred[1].label = \"DATE\"\n\n        evaluator = PartialEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n   
     assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_boundary(self, base_sequence):\n        \"\"\"Test case: Entity with wrong boundary.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"B-LOC\", \"O\"])\n\n        evaluator = PartialEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 0\n        assert result.partial == 1\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == [(0, 1)]\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_wrong_boundary_nested(self, base_sequence_nested):\n        \"\"\"Test case: Nested entity with wrong boundary.\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        pred[1].end = 30\n\n        evaluator = PartialEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 3\n        assert result.incorrect == 0\n        assert result.partial == 1\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == 
[(0, 1)]\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_extra_entity_nested(self, base_sequence_nested):\n        \"\"\"Test case: Extra (spurious) entity in prediction with nested entities (Scenario II).\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested) + [Entity(\"MISC\", 60, 65)]\n\n        evaluator = PartialEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\", \"MISC\"])\n\n        assert result.correct == 4\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 1\n        assert result_indices.correct_indices == [(0, 0), (0, 1), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == [(0, 4)]\n\n    def test_wrong_boundary_and_label_nested(self, base_sequence_nested):\n        \"\"\"Test case: Nested entity with wrong boundary and wrong label (Scenario VI).\"\"\"\n        true = base_sequence_nested\n        pred = deepcopy(base_sequence_nested)\n        pred[1] = Entity(\"DATE\", 4, 30)\n\n        evaluator = PartialEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"EVENT\", \"LOCATION\", \"DATE\"])\n\n        assert result.correct == 3\n        assert result.incorrect == 0\n        assert result.partial == 1\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 2), (0, 3)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == [(0, 1)]\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def 
test_shifted_boundary(self, base_sequence):\n        \"\"\"Test case: Entity with shifted boundary.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"O\", \"O\", \"O\", \"B-LOC\"])\n\n        evaluator = PartialEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 1\n        assert result.incorrect == 0\n        assert result.partial == 1\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == [(0, 1)]\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == []\n\n    def test_extra_entity(self, base_sequence):\n        \"\"\"Test case: Extra entity in prediction.\"\"\"\n        true = create_entities_from_bio(base_sequence)\n        pred = create_entities_from_bio([\"O\", \"B-PER\", \"I-PER\", \"O\", \"B-PER\", \"O\", \"B-LOC\", \"I-LOC\"])\n\n        evaluator = PartialEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\", \"LOC\"])\n\n        assert result.correct == 2\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 1\n        assert result_indices.correct_indices == [(0, 0), (0, 2)]\n        assert result_indices.incorrect_indices == []\n        assert result_indices.partial_indices == []\n        assert result_indices.missed_indices == []\n        assert result_indices.spurious_indices == [(0, 1)]\n\n\nclass TestSingleCharacterEntities:\n    \"\"\"Test cases for single-character entities to ensure proper range handling.\"\"\"\n\n    def test_single_token_entities_strict(self):\n        \"\"\"Test case: Single token entities 
using strict evaluation.\"\"\"\n        # Create entities representing single characters/tokens\n        # Entity at position 1 with start=1, end=2 (standard representation)\n        true = [Entity(\"PER\", 1, 2), Entity(\"ORG\", 4, 5)]\n        pred = [Entity(\"PER\", 1, 2), Entity(\"ORG\", 4, 5)]\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        assert result.correct == 2\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1)]\n\n    def test_single_token_entities_same_start_end(self):\n        \"\"\"Test case: Single token entities where start==end (edge case).\"\"\"\n        # Edge case: entities where start and end are the same\n        # This tests that degenerate spans with identical start and end offsets are handled\n        true = [Entity(\"PER\", 1, 1), Entity(\"ORG\", 4, 4)]\n        pred = [Entity(\"PER\", 1, 1), Entity(\"ORG\", 4, 4)]\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        assert result.correct == 2\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1)]\n\n    def test_single_token_entities_partial_evaluation(self):\n        \"\"\"Test case: Single token entities with partial evaluation.\"\"\"\n        true = [Entity(\"PER\", 1, 1), Entity(\"ORG\", 4, 4)]\n        pred = [Entity(\"PER\", 1, 1), Entity(\"ORG\", 4, 4)]\n\n        evaluator = PartialEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        assert result.correct == 2\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 
0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1)]\n\n    def test_single_token_entities_overlap_detection(self):\n        \"\"\"Test case: Single token entities with overlapping positions.\"\"\"\n        # Test overlap detection for single character entities\n        true = [Entity(\"PER\", 1, 1)]  # Single token at position 1\n        pred = [Entity(\"ORG\", 1, 1)]  # Different label, same position\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        # Should be marked as incorrect due to label mismatch but position overlap\n        assert result.correct == 0\n        assert result.incorrect == 1\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.incorrect_indices == [(0, 0)]\n\n    def test_single_token_adjacent_entities(self):\n        \"\"\"Test case: Adjacent single token entities.\"\"\"\n        # Test entities at adjacent positions\n        true = [Entity(\"PER\", 1, 1), Entity(\"ORG\", 2, 2)]\n        pred = [Entity(\"PER\", 1, 1), Entity(\"ORG\", 2, 2)]\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        assert result.correct == 2\n        assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 0\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0), (0, 1)]\n\n    def test_single_token_missed_entity(self):\n        \"\"\"Test case: Single token entity that is missed.\"\"\"\n        true = [Entity(\"PER\", 1, 1), Entity(\"ORG\", 4, 4)]\n        pred = [Entity(\"PER\", 1, 1)]  # Missing the ORG entity\n\n        evaluator = StrictEvaluation()\n        result, result_indices = evaluator.evaluate(true, pred, [\"PER\", \"ORG\"])\n\n        assert result.correct == 1\n   
     assert result.incorrect == 0\n        assert result.partial == 0\n        assert result.missed == 1\n        assert result.spurious == 0\n        assert result_indices.correct_indices == [(0, 0)]\n        assert result_indices.missed_indices == [(0, 1)]\n\n\ndef test_minimum_overlap_percentage_validation():\n    \"\"\"Test that minimum overlap percentage validation works correctly.\"\"\"\n\n    # Valid values should work\n    PartialEvaluation(min_overlap_percentage=1.0)\n    PartialEvaluation(min_overlap_percentage=50.0)\n    PartialEvaluation(min_overlap_percentage=100.0)\n\n    # Invalid values should raise ValueError\n    with pytest.raises(ValueError, match=\"min_overlap_percentage must be between 1.0 and 100.0\"):\n        PartialEvaluation(min_overlap_percentage=0.5)\n\n    with pytest.raises(ValueError, match=\"min_overlap_percentage must be between 1.0 and 100.0\"):\n        PartialEvaluation(min_overlap_percentage=101.0)\n\n    with pytest.raises(ValueError, match=\"min_overlap_percentage must be between 1.0 and 100.0\"):\n        PartialEvaluation(min_overlap_percentage=-5.0)\n\n\ndef test_overlap_percentage_calculation():\n    \"\"\"Test the overlap percentage calculation method.\"\"\"\n    strategy = PartialEvaluation(min_overlap_percentage=50.0)\n\n    true_entity = Entity(label=\"PER\", start=0, end=9)  # 10 tokens (0-9 inclusive)\n\n    test_cases = [\n        # (pred_entity, expected_percentage)\n        (Entity(label=\"PER\", start=0, end=9), 100.0),  # Complete overlap\n        (Entity(label=\"PER\", start=0, end=4), 50.0),  # Half overlap from start\n        (Entity(label=\"PER\", start=5, end=9), 50.0),  # Half overlap from end\n        (Entity(label=\"PER\", start=0, end=0), 10.0),  # Single token overlap at start\n        (Entity(label=\"PER\", start=9, end=9), 10.0),  # Single token overlap at end\n        (Entity(label=\"PER\", start=10, end=15), 0.0),  # No overlap (adjacent)\n        (Entity(label=\"PER\", start=-5, end=2), 30.0),  # 
Partial overlap from left (3 tokens: 0,1,2)\n        (Entity(label=\"PER\", start=7, end=12), 30.0),  # Partial overlap from right (3 tokens: 7,8,9)\n        (Entity(label=\"PER\", start=2, end=7), 60.0),  # Middle overlap (6 tokens: 2,3,4,5,6,7)\n    ]\n\n    for pred_entity, expected_percentage in test_cases:\n        calculated = strategy._calculate_overlap_percentage(pred_entity, true_entity)\n        assert (\n            abs(calculated - expected_percentage) < 0.1\n        ), f\"Expected {expected_percentage}%, got {calculated}% for pred={pred_entity} vs true={true_entity}\"\n\n\ndef test_has_sufficient_overlap():\n    \"\"\"Test the has_sufficient_overlap method with different thresholds.\"\"\"\n\n    true_entity = Entity(label=\"PER\", start=0, end=9)  # 10 tokens\n\n    # Test with 50% threshold\n    strategy_50 = PartialEvaluation(min_overlap_percentage=50.0)\n\n    # Should pass: 50% or more overlap\n    assert strategy_50._has_sufficient_overlap(Entity(label=\"PER\", start=0, end=4), true_entity)  # 50%\n    assert strategy_50._has_sufficient_overlap(Entity(label=\"PER\", start=0, end=6), true_entity)  # 70%\n    assert strategy_50._has_sufficient_overlap(Entity(label=\"PER\", start=0, end=9), true_entity)  # 100%\n\n    # Should fail: less than 50% overlap\n    assert not strategy_50._has_sufficient_overlap(Entity(label=\"PER\", start=0, end=2), true_entity)  # 30%\n    assert not strategy_50._has_sufficient_overlap(Entity(label=\"PER\", start=0, end=0), true_entity)  # 10%\n    assert not strategy_50._has_sufficient_overlap(Entity(label=\"PER\", start=10, end=15), true_entity)  # 0%\n\n    # Test with 75% threshold\n    strategy_75 = PartialEvaluation(min_overlap_percentage=75.0)\n\n    # Should pass: 75% or more overlap\n    assert strategy_75._has_sufficient_overlap(Entity(label=\"PER\", start=0, end=7), true_entity)  # 80%\n    assert strategy_75._has_sufficient_overlap(Entity(label=\"PER\", start=0, end=9), true_entity)  # 100%\n\n    # Should 
fail: less than 75% overlap\n    assert not strategy_75._has_sufficient_overlap(Entity(label=\"PER\", start=0, end=6), true_entity)  # 70%\n    assert not strategy_75._has_sufficient_overlap(Entity(label=\"PER\", start=0, end=4), true_entity)  # 50%\n\n\ndef test_partial_evaluation_with_min_overlap():\n    \"\"\"Test PartialEvaluation strategy with different minimum overlap thresholds.\"\"\"\n\n    true_entities = [Entity(label=\"PER\", start=0, end=9)]  # 10 tokens\n\n    test_cases = [\n        # (pred_entity, min_overlap_threshold, expected_correct, expected_partial, expected_spurious)\n        (Entity(label=\"PER\", start=0, end=4), 50.0, 0, 1, 0),  # 50% overlap -> partial\n        (Entity(label=\"PER\", start=0, end=2), 50.0, 0, 0, 1),  # 30% overlap < 50% -> spurious\n        (Entity(label=\"PER\", start=0, end=9), 50.0, 1, 0, 0),  # 100% overlap exact match -> correct\n        (Entity(label=\"PER\", start=0, end=6), 75.0, 0, 0, 1),  # 70% overlap < 75% -> spurious\n        (Entity(label=\"PER\", start=0, end=7), 75.0, 0, 1, 0),  # 80% overlap > 75% -> partial\n    ]\n\n    for pred_entity, threshold, expected_correct, expected_partial, expected_spurious in test_cases:\n        pred_entities = [pred_entity]\n        strategy = PartialEvaluation(min_overlap_percentage=threshold)\n        result, _ = strategy.evaluate(true_entities, pred_entities, [\"PER\"], 0)\n\n        assert (\n            result.correct == expected_correct\n        ), f\"Expected {expected_correct} correct, got {result.correct} for {pred_entity} with threshold {threshold}%\"\n        assert (\n            result.partial == expected_partial\n        ), f\"Expected {expected_partial} partial, got {result.partial} for {pred_entity} with threshold {threshold}%\"\n        assert (\n            result.spurious == expected_spurious\n        ), f\"Expected {expected_spurious} spurious, got {result.spurious} for {pred_entity} with threshold {threshold}%\"\n\n\ndef 
test_strict_evaluation_with_min_overlap():\n    \"\"\"Test StrictEvaluation strategy with minimum overlap threshold.\"\"\"\n\n    true_entities = [Entity(label=\"PER\", start=0, end=9)]\n\n    # Test case where pred has insufficient overlap -> should be spurious\n    pred_entities = [Entity(label=\"PER\", start=0, end=2)]  # 30% overlap\n    strategy = StrictEvaluation(min_overlap_percentage=50.0)\n    result, _ = strategy.evaluate(true_entities, pred_entities, [\"PER\"], 0)\n\n    assert result.correct == 0\n    assert result.incorrect == 0\n    assert result.spurious == 1  # Insufficient overlap -> spurious\n    assert result.missed == 1  # True entity not matched\n\n    # Test case where pred has sufficient overlap but wrong label -> should be incorrect\n    pred_entities = [Entity(label=\"ORG\", start=0, end=6)]  # 70% overlap, wrong label\n    result, _ = strategy.evaluate(true_entities, pred_entities, [\"PER\", \"ORG\"], 0)\n\n    assert result.correct == 0\n    assert result.incorrect == 1  # Sufficient overlap but wrong label\n    assert result.spurious == 0\n    assert result.missed == 0\n\n\ndef test_entity_type_evaluation_with_min_overlap():\n    \"\"\"Test EntityTypeEvaluation strategy with minimum overlap threshold.\"\"\"\n\n    true_entities = [Entity(label=\"PER\", start=0, end=9)]\n\n    # Test case: sufficient overlap with correct label -> correct\n    pred_entities = [Entity(label=\"PER\", start=0, end=6)]  # 70% overlap, correct label\n    strategy = EntityTypeEvaluation(min_overlap_percentage=50.0)\n    result, _ = strategy.evaluate(true_entities, pred_entities, [\"PER\"], 0)\n\n    assert result.correct == 1\n    assert result.incorrect == 0\n    assert result.spurious == 0\n    assert result.missed == 0\n\n    # Test case: sufficient overlap with wrong label -> incorrect\n    pred_entities = [Entity(label=\"ORG\", start=0, end=6)]  # 70% overlap, wrong label\n    result, _ = strategy.evaluate(true_entities, pred_entities, [\"PER\", \"ORG\"], 
0)\n\n    assert result.correct == 0\n    assert result.incorrect == 1\n    assert result.spurious == 0\n    assert result.missed == 0\n\n    # Test case: insufficient overlap -> spurious\n    pred_entities = [Entity(label=\"PER\", start=0, end=2)]  # 30% overlap < 50%\n    result, _ = strategy.evaluate(true_entities, pred_entities, [\"PER\"], 0)\n\n    assert result.correct == 0\n    assert result.incorrect == 0\n    assert result.spurious == 1\n    assert result.missed == 1\n\n\ndef test_exact_evaluation_with_min_overlap():\n    \"\"\"Test ExactEvaluation strategy with minimum overlap threshold.\"\"\"\n\n    true_entities = [Entity(label=\"PER\", start=0, end=9)]\n\n    # Test case: exact boundaries (different label) -> correct\n    pred_entities = [Entity(label=\"ORG\", start=0, end=9)]  # Exact match, different label\n    strategy = ExactEvaluation(min_overlap_percentage=50.0)\n    result, _ = strategy.evaluate(true_entities, pred_entities, [\"PER\", \"ORG\"], 0)\n\n    assert result.correct == 1\n    assert result.incorrect == 0\n    assert result.spurious == 0\n    assert result.missed == 0\n\n    # Test case: sufficient overlap but not exact -> incorrect\n    pred_entities = [Entity(label=\"ORG\", start=0, end=6)]  # 70% overlap, not exact\n    result, _ = strategy.evaluate(true_entities, pred_entities, [\"PER\", \"ORG\"], 0)\n\n    assert result.correct == 0\n    assert result.incorrect == 1\n    assert result.spurious == 0\n    assert result.missed == 0\n\n    # Test case: insufficient overlap -> spurious\n    pred_entities = [Entity(label=\"ORG\", start=0, end=2)]  # 30% overlap < 50%\n    result, _ = strategy.evaluate(true_entities, pred_entities, [\"PER\", \"ORG\"], 0)\n\n    assert result.correct == 0\n    assert result.incorrect == 0\n    assert result.spurious == 1\n    assert result.missed == 1\n\n\ndef test_edge_cases_overlap_calculation():\n    \"\"\"Test edge cases for overlap calculation.\"\"\"\n\n    strategy = 
PartialEvaluation(min_overlap_percentage=100.0)\n\n    # Test single-token entities\n    true_single = Entity(label=\"ORG\", start=5, end=5)  # Single token\n    pred_single = Entity(label=\"ORG\", start=5, end=5)  # Exact match\n\n    overlap = strategy._calculate_overlap_percentage(pred_single, true_single)\n    assert overlap == 100.0, \"Single token exact match should be 100%\"\n\n    # Test adjacent but non-overlapping entities\n    pred_adjacent = Entity(label=\"ORG\", start=6, end=6)  # Adjacent token\n    overlap = strategy._calculate_overlap_percentage(pred_adjacent, true_single)\n    assert overlap == 0.0, \"Adjacent non-overlapping should be 0%\"\n\n    # Test a multi-token prediction fully covering a single-token true entity\n    pred_overlap = Entity(label=\"ORG\", start=4, end=6)  # Overlaps with true_single at position 5\n    overlap = strategy._calculate_overlap_percentage(pred_overlap, true_single)\n    assert overlap == 100.0, \"Single token overlap should be 100% of true entity\"\n\n\ndef test_multiple_entities_with_min_overlap():\n    \"\"\"Test evaluation with multiple entities and minimum overlap.\"\"\"\n\n    true_entities = [Entity(label=\"PER\", start=0, end=4), Entity(label=\"ORG\", start=10, end=14)]  # Two 5-token entities\n\n    pred_entities = [\n        Entity(label=\"PER\", start=0, end=1),  # 40% overlap with first entity\n        Entity(label=\"ORG\", start=10, end=12),  # 60% overlap with second entity\n        Entity(label=\"LOC\", start=20, end=22),  # No overlap (spurious)\n    ]\n\n    # With 50% threshold\n    strategy = PartialEvaluation(min_overlap_percentage=50.0)\n    result, _ = strategy.evaluate(true_entities, pred_entities, [\"PER\", \"ORG\", \"LOC\"], 0)\n\n    assert result.correct == 0\n    assert result.partial == 1  # Only the ORG entity has sufficient overlap (60% > 50%)\n    assert result.spurious == 2  # PER entity (40% < 50%) and LOC entity (no overlap)\n    assert result.missed == 1  # First true entity (PER) not sufficiently matched\n"
  },
  {
    "path": "tests/test_utils.py",
    "content": "from nervaluate import (\n    collect_named_entities,\n    conll_to_spans,\n    list_to_spans,\n    split_list,\n)\n\n\ndef test_list_to_spans():\n    before = [\n        [\"O\", \"B-LOC\", \"I-LOC\", \"B-LOC\", \"I-LOC\", \"O\"],\n        [\"O\", \"B-GPE\", \"I-GPE\", \"B-GPE\", \"I-GPE\", \"O\"],\n    ]\n\n    expected = [\n        [\n            {\"label\": \"LOC\", \"start\": 1, \"end\": 2},\n            {\"label\": \"LOC\", \"start\": 3, \"end\": 4},\n        ],\n        [\n            {\"label\": \"GPE\", \"start\": 1, \"end\": 2},\n            {\"label\": \"GPE\", \"start\": 3, \"end\": 4},\n        ],\n    ]\n\n    result = list_to_spans(before)\n\n    assert result == expected\n\n\ndef test_list_to_spans_1():\n    before = [\n        [\"O\", \"O\", \"O\", \"O\", \"O\", \"O\"],\n        [\"O\", \"O\", \"B-ORG\", \"I-ORG\", \"O\", \"O\"],\n        [\"O\", \"O\", \"B-MISC\", \"I-MISC\", \"O\", \"O\"],\n    ]\n\n    expected = [\n        [],\n        [{\"label\": \"ORG\", \"start\": 2, \"end\": 3}],\n        [{\"label\": \"MISC\", \"start\": 2, \"end\": 3}],\n    ]\n\n    actual = list_to_spans(before)\n\n    assert actual == expected\n\n\ndef test_conll_to_spans():\n    before = (\n        \",\\tO\\n\"\n        \"Davos\\tB-PER\\n\"\n        \"2018\\tO\\n\"\n        \":\\tO\\n\"\n        \"Soros\\tB-PER\\n\"\n        \"accuses\\tO\\n\"\n        \"Trump\\tB-PER\\n\"\n        \"of\\tO\\n\"\n        \"wanting\\tO\\n\"\n        \"\\n\"\n        \"foo\\tO\\n\"\n    )\n\n    after = [\n        [\n            {\"label\": \"PER\", \"start\": 1, \"end\": 1},\n            {\"label\": \"PER\", \"start\": 4, \"end\": 4},\n            {\"label\": \"PER\", \"start\": 6, \"end\": 6},\n        ],\n        [],\n    ]\n\n    out = conll_to_spans(before)\n\n    assert after == out\n\n\ndef test_conll_to_spans_1():\n    before = (\n        \"word\\tO\\nword\\tO\\nword\\tO\\nword\\tO\\nword\\tO\\nword\\tO\\n\\n\"\n        
\"word\\tO\\nword\\tO\\nword\\tB-ORG\\nword\\tI-ORG\\nword\\tO\\nword\\tO\\n\\n\"\n        \"word\\tO\\nword\\tO\\nword\\tB-MISC\\nword\\tI-MISC\\nword\\tO\\nword\\tO\\n\"\n    )\n\n    expected = [\n        [],\n        [{\"label\": \"ORG\", \"start\": 2, \"end\": 3}],\n        [{\"label\": \"MISC\", \"start\": 2, \"end\": 3}],\n    ]\n\n    actual = conll_to_spans(before)\n\n    assert actual == expected\n\n\ndef test_split_list():\n    before = [\"aa\", \"bb\", \"cc\", \"\", \"dd\", \"ee\", \"ff\"]\n    expected = [[\"aa\", \"bb\", \"cc\"], [\"dd\", \"ee\", \"ff\"]]\n    out = split_list(before)\n\n    assert expected == out\n\n\ndef test_collect_named_entities_same_type_in_sequence():\n    tags = [\"O\", \"B-LOC\", \"I-LOC\", \"B-LOC\", \"I-LOC\", \"O\"]\n    result = collect_named_entities(tags)\n    expected = [\n        {\"label\": \"LOC\", \"start\": 1, \"end\": 2},\n        {\"label\": \"LOC\", \"start\": 3, \"end\": 4},\n    ]\n    assert result == expected\n\n\ndef test_collect_named_entities_sequence_has_only_one_entity():\n    tags = [\"B-LOC\", \"I-LOC\"]\n    result = collect_named_entities(tags)\n    expected = [{\"label\": \"LOC\", \"start\": 0, \"end\": 1}]\n    assert result == expected\n\n\ndef test_collect_named_entities_entity_goes_until_last_token():\n    tags = [\"O\", \"B-LOC\", \"I-LOC\", \"B-LOC\", \"I-LOC\"]\n    result = collect_named_entities(tags)\n    expected = [\n        {\"label\": \"LOC\", \"start\": 1, \"end\": 2},\n        {\"label\": \"LOC\", \"start\": 3, \"end\": 4},\n    ]\n    assert result == expected\n\n\ndef test_collect_named_entities_no_entity():\n    tags = [\"O\", \"O\", \"O\", \"O\", \"O\"]\n    result = collect_named_entities(tags)\n    expected = []\n    assert result == expected\n"
  }
]