[
  {
    "path": ".github/dependabot.yml",
    "content": "version: 2\nupdates:\n  - package-ecosystem: \"uv\"\n    directory: \"/\"\n    schedule:\n      interval: \"monthly\"\n\n  - package-ecosystem: \"github-actions\"\n    # NOTE(robinson) - Workflow files stored in the\n    # default location of `.github/workflows`\n    directory: \"/\"\n    schedule:\n      interval: \"monthly\"\n"
  },
  {
    "path": ".github/workflows/ci.yml",
    "content": "name: CI\n\non:\n  push:\n    branches: [ main ]\n  pull_request:\n    branches: [ main ]\n\npermissions:\n  contents: read\n\njobs:\n  lint:\n    runs-on: opensource-linux-8core\n    strategy:\n      fail-fast: false\n      matrix:\n        python-version: [\"3.11\", \"3.12\", \"3.13\"]\n    steps:\n    - uses: actions/checkout@v4\n    - name: Install uv\n      uses: astral-sh/setup-uv@v5\n      with:\n        enable-cache: true\n    - name: Set up Python\n      uses: actions/setup-python@v5\n      with:\n        python-version: ${{ matrix.python-version }}\n    - name: Install lint dependencies\n      run: make install-lint\n    - name: Lint\n      run: make check\n\n  shellcheck:\n    runs-on: opensource-linux-8core\n    steps:\n      - uses: actions/checkout@v4\n      - name: ShellCheck\n        uses: ludeeus/action-shellcheck@master\n\n  test:\n    runs-on: opensource-linux-8core\n    needs: lint\n    strategy:\n      fail-fast: false\n      matrix:\n        python-version: [\"3.11\", \"3.12\", \"3.13\"]\n    steps:\n    - uses: actions/checkout@v4\n    - name: Install uv\n      uses: astral-sh/setup-uv@v5\n      with:\n        enable-cache: true\n    - name: Set up Python\n      uses: actions/setup-python@v5\n      with:\n        python-version: ${{ matrix.python-version }}\n    - name: Install system dependencies\n      run: |\n        sudo apt-get update\n        sudo apt-get -y install poppler-utils tesseract-ocr\n    - name: Install dependencies\n      run: make install\n    - name: Configure AWS credentials\n      uses: aws-actions/configure-aws-credentials@v4\n      with:\n        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}\n        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}\n        aws-region: us-east-2\n    - name: Test\n      env:\n        UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}\n      run: |\n        aws s3 cp s3://utic-dev-models/ci_test_model/test_ci_model.onnx test_unstructured_inference/models/\n        CI=true make test\n        make check-coverage\n\n  changelog:\n    runs-on: opensource-linux-8core\n    steps:\n    - uses: actions/checkout@v4\n    - if: github.ref != 'refs/heads/main'\n      uses: dorny/paths-filter@v2\n      id: changes\n      with:\n        filters: |\n          src:\n            - 'unstructured_inference/**'\n\n    - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main'\n      uses: dangoslen/changelog-enforcer@v3\n"
  },
  {
    "path": ".github/workflows/claude.yml",
    "content": "name: Claude Code\n\non:\n  issue_comment:\n    types: [created]\n  pull_request_review_comment:\n    types: [created]\n  issues:\n    types: [opened, assigned]\n  pull_request_review:\n    types: [submitted]\n\njobs:\n  claude:\n    if: |\n      (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) ||\n      (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) ||\n      (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) ||\n      (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')))\n    runs-on: ubuntu-latest\n    permissions:\n      contents: read\n      pull-requests: read\n      issues: read\n      id-token: write\n    steps:\n      - name: Checkout repository\n        uses: actions/checkout@v4\n        with:\n          fetch-depth: 1\n\n      - name: Run Claude Code\n        id: claude\n        uses: anthropics/claude-code-action@beta\n        with:\n          anthropic_api_key: ${{ secrets.GH_ANTHROPIC_API_KEY }}\n          allowed_tools: \"Bash(git:*),View,GlobTool,GrepTool,BatchTool\"\n"
  },
  {
    "path": ".github/workflows/create_issue.yml",
    "content": "name: create_jira_issue\n\non:\n  issues:\n    types:\n      - opened\n\njobs:\n  create:\n    runs-on: ubuntu-latest\n    name: Create JIRA Issue\n    steps:\n\n    - name: Login to Jira\n      uses: atlassian/gajira-login@v3\n      env:\n        JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}\n        JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}\n        JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}\n\n    - name: Create Jira issue\n      uses: atlassian/gajira-create@v3\n      with:\n        project: CORE\n        issuetype: Task\n        summary: ${{ github.event.issue.title }}\n        description: |\n          Created from github issue: ${{ github.event.issue.html_url }}\n          ----\n          ${{ github.event.issue.body }}\n        fields: '{ \"labels\": [\"github-issue\"] }'\n\n    - name: Log created issue\n      run: echo \"Issue ${{ steps.create.outputs.issue }} was created\"\n"
  },
  {
    "path": ".github/workflows/release.yml",
    "content": "name: Release\n\non:\n  release:\n    types: [published]\n\npermissions:\n  contents: read\n  id-token: write       # Required for PyPI trusted publishing / attestations\n\nconcurrency:\n  group: release\n  cancel-in-progress: false\n\njobs:\n  release:\n    runs-on: ubuntu-latest\n    steps:\n    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4\n\n    - name: Install uv\n      uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5\n      with:\n        enable-cache: true\n\n    - name: Set up Python\n      uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5\n      with:\n        python-version: \"3.12\"\n\n    - name: Verify tag matches package version\n      run: |\n        PKG_VERSION=$(python -c \"exec(open('unstructured_inference/__version__.py').read()); print(__version__)\")\n        TAG_VERSION=\"${GITHUB_REF_NAME#v}\"\n        if [ \"$PKG_VERSION\" != \"$TAG_VERSION\" ]; then\n          echo \"::error::Tag ($TAG_VERSION) does not match package version ($PKG_VERSION)\"\n          exit 1\n        fi\n\n    - name: Install release dependencies\n      run: uv sync --locked --only-group release --no-install-project\n\n    - name: Build package\n      id: build\n      run: uv build\n\n    - name: Publish to PyPI\n      uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1\n\n    # Best-effort: attempt Azure upload even if PyPI fails, but only if build succeeded.\n    # continue-on-error allows the workflow to pass when Azure secrets are not configured.\n    - name: Publish to Azure Artifacts\n      if: always() && steps.build.outcome == 'success'\n      continue-on-error: true\n      run: |\n        uv run --no-sync twine upload \\\n          --repository-url \"${{ secrets.AZURE_ARTIFACTS_FEED }}\" \\\n          --username \"${{ secrets.AZURE_ARTIFACTS_USERNAME }}\" \\\n          --password \"${{ secrets.AZURE_ARTIFACTS_PAT }}\" \\\n          dist/*\n"
  },
  {
    "path": ".github/workflows/version-bump.yml",
    "content": "name: Version Bump\n\non:\n  pull_request:\n    branches: [main]\n    types: [opened, synchronize, reopened]\n\npermissions:\n  contents: write\n  pull-requests: read\n\njobs:\n  version-bump:\n    if: github.event.pull_request.user.login == 'utic-renovate[bot]'\n    uses: Unstructured-IO/infra/.github/workflows/version-bump.yml@main\n    with:\n      component-paths: '[\".\"]'\n      default-bump: patch\n      update-changelog: true\n      update-lockfile: true\n      renovate-app-id: ${{ vars.RENOVATE_APP_ID }}\n    secrets:\n      token: ${{ secrets.GITHUB_TOKEN }}\n      private-pypi-url: ${{ secrets.PRIVATE_PYPI_INDEX_URL }}\n      renovate-app-private-key: ${{ secrets.RENOVATE_APP_PRIVATE_KEY }}\n"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\nnbs/\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# Pycharm\n.idea/\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# Model artifacts\n.models/*\n!.models/.gitkeep\n\n# Mac stuff\n.DS_Store\n\n# VSCode\n.vscode/\n\nsample-docs/*_images\nexamples/**/output\nfigures\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: \"v5.0.0\"\n    hooks:\n      - id: check-added-large-files\n      - id: check-toml\n      - id: check-yaml\n      - id: check-json\n      - id: check-xml\n      - id: end-of-file-fixer\n        exclude: \\.json$\n        include: \\.py$\n      - id: trailing-whitespace\n      - id: mixed-line-ending\n\n  - repo: https://github.com/astral-sh/ruff-pre-commit\n    rev: \"v0.15.0\"\n    hooks:\n      - id: ruff\n        args: [\"--fix\"]\n      - id: ruff-format\n"
  },
  {
    "path": "CHANGELOG.md",
    "content": "## 1.6.11\n\n### Enhancement\n- Add `table_extraction_method` field to `LayoutElements` and `LayoutElement` to track which algorithm produced a table (grid, tatr, vlm).\n\n## 1.6.10\n\n### Enhancement\n- Add Python 3.13 support.\n\n## 1.6.9\n\n### Enhancement\n- Restore support for Python 3.11 alongside Python 3.12.\n\n## 1.6.8\n\n### Fix\n- Reject PDF pages that would render beyond the configured pixel limit before\n  allocating the page bitmap.\n\n## 1.6.7\n\n### Fix\n- `get_model` now materializes `LazyDict` model configs into a plain dict before\n  unpacking into `initialize(**...)`. Uses `__iter__` + `__getitem__` to avoid\n  depending on `Mapping.keys()`, which has been observed to fail at `**`\n  unpacking with \"argument after ** must be a mapping, not LazyDict\" in some\n  deployment environments.\n\n## 1.6.6\n\n### Enhancement\n- Relax the lower bound of the pandas and numpy dependency\n\n## 1.6.5\n\n### Enhancement\n- Store `pdf_rotation` in `page.image_metadata` so downstream consumers can check page rotation after the page image is freed\n- Add targeted unittest coverage for PDF page rotation handling in `convert_pdf_to_image`\n- Speed up the targeted rotation unittest by isolating the PDF image conversion surface into a lightweight module and mocking the PDFium rendering path for the timing-critical test\n\n## 1.6.4\n\n### Fix\n- Apply PDF `/Rotate` metadata during page rendering - pypdfium2's `page.render()` ignores the flag, producing sideways images for rotated pages\n\n## 1.6.3\n\n### Security\n\n- **security:** fix(deps): upgrade vulnerable transitive dependencies [security]\n\n## 1.6.2\n\n### Enhancement\n- Make `dpi` an explicit parameter on `convert_pdf_to_image` (default 200) instead of reading from config internally, enabling unstructured to use this as the single source of truth for PDF rendering\n\n## 1.6.1\n\n### Enhancement\n- Free intermediate arrays (`origin_img`, `img`, `ort_inputs`, `output`) and PIL pixel buffer at dead points during YoloX `image_processing()` to reduce peak memory during inference\n\n## 1.6.0\n\n### Fix\n- Relax `huggingface-hub` lower bound from `>=1.4.1` to `>=0.22.0` (the `>=1.4.1` was an artifact of the uv migration and broke compatibility with `transformers<5.0`)\n\n## 1.5.5\n\n### Enhancement\n- Lazy page rendering in `convert_pdf_to_image` to reduce peak memory from O(N pages) to O(1 page)\n\n## 1.5.4\n\n### Enhancement\n- Use `np.full()` instead of `np.ones() * scalar` in YoloX preprocessing to avoid a redundant temporary array\n\n## 1.5.3\n\n- Store routing in LayoutElement\n\n## 1.5.2\n\n### Fix\n- Switch to PyPI trusted publishing (OIDC) and remove API token auth\n\n## 1.5.1\n\n### Fix\n- Add `id-token: write` permission to release workflow for PyPI attestations\n\n## 1.5.0\n\n### Enhancement\n- Automate PyPI and Azure Artifacts publishing via GitHub release workflow\n- Replace `--frozen` with `--locked` across Makefile and Dockerfile for stricter lockfile validation\n- Add `release` dependency group with `twine` for Azure Artifacts upload\n- Constrain pillow to >=12.1.1 to address CVE for out-of-bounds write when loading PSD images\n\n## 1.4.0\n\n### Enhancement\n- Switch CI runners to `opensource-linux-8core` for faster builds\n- Add pytest-xdist parallelization (`-n auto`) to `docker-test` target\n- Remove mypy from lint pipeline; ruff covers linting needs sufficiently\n- Add `install-lint` target; CI lint job no longer downloads full project dependencies\n\n## 1.3.0\n\n### Enhancement\n- Migrate project to 
native uv with hatchling build backend\n- Consolidate all configuration into pyproject.toml\n- Replace pip/requirements workflow with uv sync/lock\n- Parallelize test runs with pytest-xdist (`-n auto`)\n\n### Breaking\n- Drop support for Python 3.10 and 3.11; require Python >=3.12, <3.13\n\n## 1.2.0\n\n### Enhancement\n- **Per-model locks for parallel model loading**: Replace single global lock with per-model locks\n  - Allows concurrent loading of different models (detectron2, yolox, etc.)\n  - 10x+ concurrency improvement in multi-model environments\n  - Maintains thread-safe initialization with double-check pattern\n  - Backward compatible - no API changes\n\n## 1.1.9\n\n### Fix\n- **TableTransformer device_map fix**: Remove device_map parameter to prevent meta tensor errors\n  - Device normalization (cuda -> cuda:0) for consistent caching\n  - Load models without device_map, use explicit .to(device, dtype=torch.float32)\n  - Fixes concurrent PDF processing AssertionError\n  - Prevents \"Trying to set a tensor of type Float but got Meta\" errors\n- Use context manager for `pdfium.PdfDocument`\n\n## 1.1.8\n\n- put `pdfium` call behind a thread lock\n\n## 1.1.7\n\n- Update OpenCV-Python to 4.13.0.90 to squash ffmpeg vulnerability CVE-2023-6605\n\n## 1.1.6\n\n- Use inference_config to set default rendering DPI\n\n## 1.1.5\n\n- Render PDF to image using PyPDFium instead of pdf2image, due to much improved performance for certain docs\n\n## 1.1.4\n\n- Constrain urllib3 to urllib3>=2.6.0 to address CVE-2025-66471 and CVE-2025-66418\n\n## 1.1.3\n\n- Constrain fonttools to >=4.60.2 to address CVE-2025-66034\n\n## 1.1.2\n\n* chore(deps): Bump several depedencies to resolve open high CVEs\n* fix: Exclude pip and setuptools pinning based on cursor comment\n* fix: With the newer version of transformers 4.57.1, the type checking became stricter, and mypy correctly flagged that DetrImageProcessor.from_pretrained() expects str | PathLike[Any], not a model object.\n* fix: Update test to explicitly cast numpy array to uint8 for Pillow 12.0.0 compatibility\n\n## 1.1.1\n\n* Add NotImplementedError when trying to single index a TextRegions, reflecting the fact that it won't behave correctly at the moment.\n\n## 1.1.0\n\n* Enhancement: Add `TextSource` to track where the text of an element came from\n* Enhancement: Refactor `__post_init__` of `TextRegions` and `LayoutElement` slightly to automate initialization\n\n## 1.0.10\n\n* Remove merging logic that's no longer used\n\n## 1.0.9\n\n* Make OD model loading thread safe\n\n## 1.0.8\n\n* Enhancement: Optimized `zoom_image` (codeflash)\n* Enhancement: Optimized `cells_to_html` for an 8% speedup in some cases (codeflash)\n* Enhancement: Optimized `outputs_to_objects` for an 88% speedup in some cases (codeflash)\n\n## 1.0.7\n\n* Fix a hardcoded file extension causing confusion in the logs\n\n## 1.0.6\n\n* Add slicing through indexing for vectorized elements\n\n## 1.0.5\n\n* feat: add thread lock to prevent racing condition when instantiating singletons\n* feat: parametrize edge config for `DetrImageProcessor` with env variables\n\n## 1.0.4\n\n* feat: use singleton instead of `global` to store shared variables\n\n## 1.0.3\n\n* setting longest_edge=1333 to the table image processor\n\n## 1.0.2\n\n* adding parameter to table image preprocessor related to the image size\n\n## 1.0.1\n\n* fix: moving the table transformer model to device when loading the model instead of once the model is loaded.\n\n## 1.0.0\n\n* feat: support for Python 3.10+; drop support for 
Python 3.9\n\n## 0.8.11\n\n* feat: remove `donut` model\n\n## 0.8.10\n\n* feat: unpin `numpy` and bump minimum for `onnxruntime` to be compatible with `numpy>=2`\n\n## 0.8.9\n\n* chore: unpin `pdfminer-six` version\n\n## 0.8.8\n* fix: pdfminer-six dependencies\n* feat: `PageLayout.elements` is now a `cached_property` to reduce unecessary memory and cpu costs\n\n## 0.8.7\n\n* fix: add `password` for PDF\n\n## 0.8.6\n\n* feat: add back `source` to `TextRegions` and `LayoutElements` for backward compatibility\n\n## 0.8.5\n\n* fix: remove `pdfplumber` but include `pdfminer-six==20240706` to update `pdfminer`\n\n## 0.8.4\n\n* feat: add `text_as_html` and `table_as_cells` to `LayoutElements` class as new attributes\n* feat: replace the single valueed `source` attribute from `TextRegions` and `LayoutElements` with an array attribute `sources`\n\n## 0.8.3\n\n* fix: removed `layoutelement.from_lp_textblock()` and related tests as it's not used\n* fix: update requirements to drop `layoutparser` lib\n* fix: update `README.md` to remove layoutparser model zoo support note\n\n## 0.8.2\n\n* fix: fix bug when an empty list is passed into `TextRegions.from_list` triggers `IndexError`\n* fix: fix bug when concatenate a list of `LayoutElements` the class id mapping is no properly\n  updated\n\n## 0.8.1\n\n* fix: fix list index out of range error caused by calling LayoutElements.from_list() with empty list\n\n## 0.8.0\n\n* fix: fix missing source after cleaning layout elements\n* **BREAKING** Remove chipper model\n\n## 0.7.41\n\n* fix: fix incorrect type casting with higher versions of `numpy` when substracting a `float` from an `int` array\n* fix: fix a bug where class id 0 becomes class type `None` when calling `LayoutElements.as_list()`\n\n## 0.7.40\n\n* fix: store probabilities with `float` data type instead of `int`\n\n## 0.7.39\n\n* fix: Correctly assign mutable default value to variable in `LayoutElements` class\n\n## 0.7.38\n\n* fix: Correctly assign mutable default value to variable in `TextRegions` class\n\n## 0.7.37\n\n* refactor: remove layout analysis related code\n* enhancement: Hide warning about table transformer weights not being loaded\n* fix(layout): Use TemporaryDirectory instead of NamedTemporaryFile for Windows support\n* refactor: use `numpy` array to store layout elements' information in one single `LayoutElements`\n  object instead of using a list of `LayoutElement`\n\n## 0.7.36\n\nfix: add input parameter validation to `fill_cells()` when converting cells to html\n\n## 0.7.35\n\nFix syntax for generated HTML tables\n\n## 0.7.34\n\n* Reduce excessive logging\n\n## 0.7.33\n\n* BREAKING CHANGE: removes legacy detectron2 model\n* deps: remove layoutparser optional dependencies\n\n## 0.7.32\n\n* refactor: remove all code related to filling inferred elements text from embedded text (pdfminer).\n* bug: set the Chipper max_length variable\n\n## 0.7.31\n\n* refactor: remove all `cid` related code that was originally added to filter out invalid `pdfminer` text\n* enhancement: Wrapped hf_hub_download with a function that checks for local file before checking HF\n\n## 0.7.30\n\n* fix: table transformer doesn't return multiple cells with same coordinates\n*\n## 0.7.29\n\n* fix: table transformer predictions are now removed if confidence is below threshold\n\n\n## 0.7.28\n\n* feat: allow table transformer agent to return table prediction in not parsed format\n\n## 0.7.27\n\n* fix: remove pin from `onnxruntime` dependency.\n\n## 0.7.26\n\n* feat: add a set of new `ElementType`s to extend future 
element types recognition\n* feat: allow registering of new models for inference using `unstructured_inference.models.base.register_new_model` function\n\n## 0.7.25\n\n* fix: replace `Rectangle.is_in()` with `Rectangle.is_almost_subregion_of()` when filling in an inferred element with embedded text\n* bug: check for None in Chipper bounding box reduction\n* chore: removes `install-detectron2` from the `Makefile`\n* fix: convert label_map keys read from os.environment `UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH` to int type\n* feat: removes supergradients references\n\n## 0.7.24\n\n* fix: assign value to `text_as_html` element attribute only if `text` attribute contains HTML tags.\n\n## 0.7.23\n\n* fix: added handling in `UnstructuredTableTransformerModel` for if `recognize` returns an empty\n  list in `run_prediction`.\n\n## 0.7.22\n\n* fix: add logic to handle computation of intersections betwen 2 `Rectangle`s when a `Rectangle` has `None` value in its coordinates\n\n## 0.7.21\n\n* fix: fix a bug where chipper, or any element extraction model based `PageLayout` object, lack `image_metadata` and other attributes that are required for downstream processing; this fix also reduces the memory overhead of using chipper model\n\n## 0.7.20\n\n* chipper-v3: improved table prediction\n\n## 0.7.19\n\n* refactor: remove all OCR related code\n\n## 0.7.18\n\n* refactor: remove all image extraction related code\n\n## 0.7.17\n\n* refactor: remove all `pdfminer` related code\n* enhancement: improved Chipper bounding boxes\n\n## 0.7.16\n\n* bug: Allow supplied ONNX models to use label_map dictionary from json file\n\n## 0.7.15\n\n* enhancement: Enable env variables for model definition\n\n## 0.7.14\n\n* enhancement: Remove Super-Gradients Dependency and Allow General Onnx Models Instead\n\n## 0.7.13\n\n* refactor: add a class `ElementType` for the element type constants and use the constants to replace element type strings\n* enhancement: support extracting elements with types `Picture` and `Figure`\n* fix: update logger in table initalization where the logger info was not showing\n* chore: supress UserWarning about specified model providers\n\n## 0.7.12\n\n* change the default model to yolox, as table output appears to be better and speed is similar to `yolox_quantized`\n\n## 0.7.11\n\n* chore: remove logger info for chipper since its private\n* fix: update broken slack invite link in chipper logger info\n* enhancement: Improve error message when # images extracted doesn't match # page layouts.\n* fix: use automatic mixed precision on GPU for Chipper\n* fix: chipper Table elements now match other layout models' Table element format: html representation is stored in `text_as_html` attribute and `text` attribute stores text without html tags\n\n## 0.7.10\n\n* Handle kwargs explicitly when needed, suppress otherwise\n* fix: Reduce Chipper memory consumption on x86_64 cpus\n* fix: Skips ordering elements coming from Chipper\n* fix: After refactoring to introduce Chipper, annotate() wasn't able to show text with extra info from elements, this is fixed now.\n* feat: add table cell and dataframe output formats to table transformer's `run_prediction` call\n* breaking change: function `unstructured_inference.models.tables.recognize` no longer takes `out_html` parameter and it now only returns table cell data format (lists of dictionaries)\n\n## 0.7.9\n\n* Allow table model to accept optional OCR tokens\n\n## 0.7.8\n\n* Fix: include onnx as base dependency.\n\n## 0.7.7\n\n• Fix a memory leak in 
DonutProcessor when using large images in numpy format\n• Set the right settings for beam search size > 1\n• Fix a bug that in very rare cases made the last element predicted by Chipper to have a bbox = None\n\n## 0.7.6\n\n* fix a bug where invalid zoom factor lead to exceptions; now invalid zoom factors results in no scaling of the image\n\n## 0.7.5\n\n* Improved packaging\n\n## 0.7.4\n\n* Dynamic beam search size has been implemented for Chipper, the decoding process starts with a size = 1 and changes to size = 3 if repetitions appear.\n* Fixed bug when PDFMiner predicts that an image text occupies the full page and removes annotations by Chipper.\n* Added random seed to Chipper text generation to avoid differences between calls to Chipper.\n* Allows user to use super-gradients model if they have a callback predict function, a yaml file with names field corresponding to classes and a path to the model weights\n\n## 0.7.3\n\n* Integration of Chipperv2 and additional Chipper functionality, which includes automatic detection of GPU,\nbounding box prediction and hierarchical representation.\n* Remove control characters from the text of all layout elements\n\n## 0.7.2\n\n* Sort elements extracted by `pdfminer` to get consistent result from `aggregate_by_block()`\n\n## 0.7.1\n\n* Download yolox_quantized from HF\n\n## 0.7.0\n\n* Remove all OCR related code expect the table OCR code\n\n## 0.6.6\n\n* Stop passing ocr_languages parameter into paddle to avoid invalid paddle language code error, this will be fixed until\nwe have the mapping from standard language code to paddle language code.\n## 0.6.5\n\n* Add functionality to keep extracted image elements while merging inferred layout with extracted layout\n* Fix `source` property for elements generated by pdfminer.\n* Add 'OCR-tesseract' and 'OCR-paddle' as sources for elements generated by OCR.\n\n## 0.6.4\n\n* add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task\n* add the new image auto scaling parameters to `config.py`\n\n## 0.6.3\n\n* fix a bug where padded table structure bounding boxes are not shifted back into the original image coordinates correctly\n\n## 0.6.2\n\n* move the confidence threshold for table transformer to config\n\n## 0.6.1\n\n* YoloX_quantized is now the default model. This models detects most diverse types and detect tables better than previous model.\n* Since detection models tend to nest elements inside others(specifically in Tables), an algorithm has been added for reducing this\n  behavior. 
Now all the elements produced by detection models are disjoint and they don't produce overlapping regions, which helps\n  reduce duplicated content.\n* Add `source` property to our elements, so you can know where the information was generated (OCR or detection model)\n\n## 0.6.0\n\n* add a config class to handle parameter configurations for inference tasks; parameters in the config class can be set via environement variables\n* update behavior of `pad_image_with_background_color` so that input `pad` is applied to all sides\n\n## 0.5.31\n\n* Add functionality to extract and save images from the page\n* Add functionality to get only \"true\" embedded images when extracting elements from PDF pages\n* Update the layout visualization script to be able to show only image elements if need\n* add an evaluation metric for table comparison based on token similarity\n* fix paddle unit tests where `make test` fails since paddle doesn't work on M1/M2 chip locally\n\n## 0.5.28\n\n* add env variable `ENTIRE_PAGE_OCR` to specify using paddle or tesseract on entire page OCR\n\n## 0.5.27\n\n* table structure detection now pads the input image by 25 pixels in all 4 directions to improve its recall\n\n## 0.5.26\n\n* support paddle with both cpu and gpu and assumed it is pre-installed\n\n## 0.5.25\n\n* fix a bug where `cells_to_html` doesn't handle cells spanning multiple rows properly\n\n## 0.5.24\n\n* remove `cv2` preprocessing step before OCR step in table transformer\n\n## 0.5.23\n\n* Add functionality to bring back embedded images in PDF\n\n## 0.5.22\n\n* Add object-detection classification probabilities to LayoutElement for all currently implemented object detection models\n\n## 0.5.21\n\n* adds `safe_division` to replae 0 with machine epsilon for `float` to avoid division by 0\n* apply `safe_division` to area overlap calculations in `unstructured_inference/inference/elements.py`\n\n## 0.5.20\n\n* Adds YoloX quantized model\n\n## 0.5.19\n\n* Add functionality to supplement detected layout with elements from the full page OCR\n* Add functionality to annotate any layout(extracted, inferred, OCR) on a page\n\n## 0.5.18\n\n* Fix for incorrect type assignation at ingest test\n\n## 0.5.17\n\n* Use `OMP_THREAD_LIMIT` to improve tesseract performance\n\n## 0.5.16\n\n* Fix to no longer create a directory for storing processed images\n* Hot-load images for annotation\n\n## 0.5.15\n\n* Handle an uncaught TesseractError\n\n## 0.5.14\n\n* Add TIFF test file and TIFF filetype to `test_from_image_file` in `test_layout`\n\n## 0.5.13\n\n* Fix extracted image elements being included in layout merge\n\n## 0.5.12\n\n* Add multipage TIFF extraction support\n* Fix a pdfminer error when using `process_data_with_model`\n\n## 0.5.11\n\n* Add warning when chipper is used with < 300 DPI\n* Use None default for dpi so defaults can be properly handled upstream\n\n## 0.5.10\n\n* Implement full-page OCR\n\n## 0.5.9\n\n* Handle exceptions from Tesseract\n\n## 0.5.8\n\n* Add alternative architecture for detectron2 (but default is unchanged)\n* Updates:\n\n| Library       | From      | To       |\n|---------------|-----------|----------|\n| transformers  | 4.29.2    | 4.30.2   |\n| opencv-python | 4.7.0.72  | 4.8.0.74 |\n| ipython       | 8.12.2    | 8.14.0   |\n\n* Cache named models that have been loaded\n\n## 0.5.7\n\n* hotfix to handle issue storing images in a new dir when the pdf has no file extension\n\n## 0.5.6\n\n* Update the `annotate` and `_get_image_array` methods of `PageLayout` to get the image from the `image_path` 
property if the `image` property is `None`.\n* Add functionality to store pdf images for later use.\n* Add `image_metadata` property to `PageLayout` & set `page.image` to None to reduce memory usage.\n* Update `DocumentLayout.from_file` to open only one image.\n* Update `load_pdf` to return either Image objects or Image paths.\n* Warns users that Chipper is a beta model.\n* Exposed control over dpi when converting PDF to an image.\n* Updated detectron2 version to avoid errors related to deprecated PIL reference\n\n## 0.5.5\n\n* Rename large model to chipper\n* Added functionality to write images to computer storage temporarily instead of keeping them in memory for `pdf2image.convert_from_path`\n* Added functionality to convert a PDF in small chunks of pages at a time for `pdf2image.convert_from_path`\n* Table processing check for the area of the package to fix division by zero bug\n* Added CUDA and TensorRT execution providers for yolox and detectron2onnx model.\n* Warning for onnx version of detectron2 for empty pages suppresed.\n\n## 0.5.4\n\n* Tweak to element ordering to make it more deterministic\n\n## 0.5.3\n\n* Refactor for large model\n\n## 0.5.2\n\n* Combine inferred elements with extracted elements\n* Add ruff to keep code consistent with unstructured\n* Configure fallback for OCR token if paddleocr doesn't work to use tesseract\n\n## 0.5.1\n\n* Add annotation for pages\n* Store page numbers when processing PDFs\n* Hotfix to handle inference of blank pages using ONNX detectron2\n* Revert ordering change to investigate examples of misordering\n\n## 0.5.0\n\n* Preserve image format in PIL.Image.Image when loading\n* Added ONNX version of Detectron2 and make default model\n* Remove API code, we don't serve this as a standalone API any more\n* Update ordering logic to account for multicolumn documents.\n\n## 0.4.4\n\n* Fixed patches not being a package.\n\n## 0.4.3\n\n* Patch pdfminer.six to fix parsing bug\n\n## 0.4.2\n\n* Output of table extraction is now stored in `text_as_html` property rather than `text` property\n\n## 0.4.1\n\n* Added the ability to pass `ocr_languages` to the OCR agent for users who need\n  non-English language packs.\n\n## 0.4.0\n\n* Added logic to partition granular elements (words, characters) by proximity\n* Text extraction is now delegated to text regions rather than being handled centrally\n* Fixed embedded image coordinates being interpreted differently than embedded text coordinates\n* Update to how dependencies are being handled\n* Update detectron2 version\n\n## 0.3.2\n\n* Allow extracting tables from higher level functions\n\n## 0.3.1\n\n* Pin protobuf version to avoid errors\n* Make paddleocr an extra again\n\n## 0.3.0\n\n* Fix for text block detection\n* Add paddleocr dependency to setup for x86_64 machines\n\n## 0.2.14\n\n* Suppressed processing progress bars\n\n## 0.2.13\n\n* Add table processing\n* Change OCR logic to be aware of PDF image elements\n\n## 0.2.12\n\n* Fix for processing RGBA images\n\n## 0.2.11\n\n* Fixed some cases where image elements were not being OCR'd\n\n## 0.2.10\n\n* Removed control characters from tesseract output\n\n## 0.2.9\n\n* Removed multithreading from OCR (DocumentLayout.get_elements_from_layout)\n\n## 0.2.8\n\n* Refactored YoloX inference code to integrate better with framework\n* Improved testing time\n\n## 0.2.7\n\n* Fixed duplicated load_pdf call\n\n## 0.2.6\n\n* Add donut model script for image prediction\n* Add sample receipt and test for donut prediction\n\n## 0.2.5\n\n* Add YoloX model for images and 
PDFs\n* Add generic model interface\n\n## 0.2.4\n\n* Download default model from huggingface\n* Clarify error when trying to open file that doesn't exist as an image\n\n## 0.2.3\n\n* Pins the version of `opencv-python` for linux compatibility\n\n## 0.2.2\n\n* Add capability to process image files\n* Add logic to use OCR when layout text is full of unknown characters\n\n## 0.2.1\n\n* Refactor to facilitate local inference\n* Removes BasicConfig from logger configuration\n* Implement auto model downloading\n\n## 0.2.0\n\n* Initial release of unstructured-inference\n"
  },
  {
    "path": "Dockerfile",
    "content": "# syntax=docker/dockerfile:experimental\nARG PYTHON_VERSION=3.12\nFROM python:${PYTHON_VERSION}-slim AS base\n\n# Set up environment\nENV HOME=/home/\nWORKDIR ${HOME}\nRUN mkdir ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \\\n  && ssh-keyscan -t rsa github.com >> /home/.ssh/known_hosts\n\n# Install uv\nCOPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /usr/local/bin/\n\nFROM base AS deps\n# Copy project files needed for dependency resolution\nCOPY pyproject.toml uv.lock ./\nCOPY unstructured_inference/__version__.py unstructured_inference/__version__.py\n\nRUN uv sync --locked --all-groups --no-install-project\n\n# Ensure venv binaries are on PATH so pytest/etc. are directly accessible\nENV PATH=\"/home/.venv/bin:${PATH}\"\n\nFROM deps AS code\nCOPY unstructured_inference unstructured_inference\nRUN uv sync --locked --all-groups\n\nCMD [\"/bin/bash\"]\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. 
For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. 
Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. (Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "Makefile",
    "content": "PACKAGE_NAME := unstructured_inference\nCURRENT_DIR := $(shell pwd)\n\n\n.PHONY: help\nhelp: Makefile\n\t@sed -n 's/^\\(## \\)\\([a-zA-Z]\\)/\\2/p' $<\n\n\n###########\n# Install #\n###########\n\n## install:                 install all dependencies via uv\n.PHONY: install\ninstall:\n\t@uv sync --locked --all-groups\n\n## install-lint:            install only lint dependencies (no project deps)\n.PHONY: install-lint\ninstall-lint:\n\t@uv sync --locked --only-group lint\n\n## lock:                    update and lock all dependencies\n.PHONY: lock\nlock:\n\t@uv lock --upgrade\n\n#################\n# Test and Lint #\n#################\n\nexport CI ?= false\n\n## test:                    runs all unittests (excluding slow)\n.PHONY: test\ntest:\n\tCI=$(CI) uv run --locked --no-sync pytest -n auto -m \"not slow\" test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing\n\n## test-slow:               runs all unittests (including slow)\n.PHONY: test-slow\ntest-slow:\n\tCI=$(CI) uv run --locked --no-sync pytest -n auto test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing\n\n## check:                   runs all linters and checks\n.PHONY: check\ncheck: check-ruff check-version\n\n## check-ruff:              runs ruff linter\n.PHONY: check-ruff\ncheck-ruff:\n\tuv run --locked --no-sync ruff check .\n\tuv run --locked --no-sync ruff format --check .\n\n## check-scripts:           run shellcheck\n.PHONY: check-scripts\ncheck-scripts:\n\tscripts/shellcheck.sh\n\n## check-version:           run check to ensure version in CHANGELOG.md matches version in package\n.PHONY: check-version\ncheck-version:\n    # Fail if syncing version would produce changes\n\tscripts/version-sync.sh -c \\\n\t\t-s CHANGELOG.md \\\n\t\t-f ${PACKAGE_NAME}/__version__.py semver\n\n## tidy:                    auto-format and fix lint issues\n.PHONY: tidy\ntidy:\n\tuv run --locked --no-sync ruff format .\n\tuv run --locked --no-sync ruff check --fix-only --show-fixes .\n\n## version-sync:            update __version__.py with most recent version from CHANGELOG.md\n.PHONY: version-sync\nversion-sync:\n\tscripts/version-sync.sh \\\n\t\t-s CHANGELOG.md \\\n\t\t-f ${PACKAGE_NAME}/__version__.py semver\n\n## check-coverage:          check test coverage meets threshold\n.PHONY: check-coverage\ncheck-coverage:\n\tuv run --locked --no-sync coverage report --fail-under=90\n\n##########\n# Docker #\n##########\n\nDOCKER_IMAGE ?= unstructured-inference:dev\n\n.PHONY: docker-build\ndocker-build:\n\tDOCKER_IMAGE=${DOCKER_IMAGE} ./scripts/docker-build.sh\n\n.PHONY: docker-test\ndocker-test: docker-build\n\tdocker run --rm \\\n\t-v ${CURRENT_DIR}/test_unstructured_inference:/home/test_unstructured_inference \\\n\t-v ${CURRENT_DIR}/sample-docs:/home/sample-docs \\\n\t$(DOCKER_IMAGE) \\\n\tbash -c \"pytest -n auto $(if $(TEST_NAME),-k $(TEST_NAME),) test_unstructured_inference\"\n"
  },
  {
    "path": "README.md",
    "content": "<h3 align=\"center\">\n  <img\n    src=\"https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/img/unstructured_logo.png\"\n    height=\"200\"\n  >\n\n</h3>\n\n<h3 align=\"center\">\n  <p>Open-Source Pre-Processing Tools for Unstructured Data</p>\n</h3>\n\nThe `unstructured-inference` repo contains hosted model inference code for layout parsing models. \nThese models are invoked via API as part of the partitioning bricks in the `unstructured` package.\n\n**Requires Python >=3.11, <3.14.**\n\n## Installation\n\n### Package\n\n```shell\npip install unstructured-inference\n```\n\n### Detectron2\n\n[Detectron2](https://github.com/facebookresearch/detectron2) is required for using models from the [layoutparser model zoo](#using-models-from-the-layoutparser-model-zoo) \nbut is not automatically installed with this package. \nFor MacOS and Linux, build from source with:\n```shell\npip install 'git+https://github.com/facebookresearch/detectron2.git@57bdb21249d5418c130d54e2ebdc94dda7a4c01a'\n```\nOther install options can be found in the \n[Detectron2 installation guide](https://detectron2.readthedocs.io/en/latest/tutorials/install.html).\n\nWindows is not officially supported by Detectron2, but some users are able to install it anyway. \nSee discussion [here](https://layout-parser.github.io/tutorials/installation#for-windows-users) for \ntips on installing Detectron2 on Windows.\n\n### Development Setup\n\nThis project uses [uv](https://docs.astral.sh/uv/) for dependency management.\n\n```shell\n# Clone and install all dependencies (including dev/test/lint groups)\ngit clone https://github.com/Unstructured-IO/unstructured-inference.git\ncd unstructured-inference\nmake install\n```\n\nRun `make help` for a full list of available targets.\n\n## Getting Started\n\nTo get started with the layout parsing model, use the following commands:\n\n```python\nfrom unstructured_inference.inference.layout import DocumentLayout\n\nlayout = DocumentLayout.from_file(\"sample-docs/loremipsum.pdf\")\n\nprint(layout.pages[0].elements)\n```\n\nOnce the model has detected the layout and OCR'd the document, the text extracted from the first \npage of the sample document will be displayed.\nYou can convert a given element to a `dict` by running the `.to_dict()` method.\n\n## Models\n\nThe inference pipeline operates by finding text elements in a document page using a detection model, then extracting the contents of the elements using direct extraction (if available), OCR, and optionally table inference models.\n\nWe offer several detection models including [Detectron2](https://github.com/facebookresearch/detectron2) and [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX).\n\n### Using a non-default model\n\nWhen doing inference, an alternate model can be used by passing the model object to the ingestion method via the `model` parameter. The `get_model` function can be used to construct one of our out-of-the-box models from a keyword, e.g.:\n```python\nfrom unstructured_inference.models.base import get_model\nfrom unstructured_inference.inference.layout import DocumentLayout\n\nmodel = get_model(\"yolox\")\nlayout = DocumentLayout.from_file(\"sample-docs/layout-parser-paper.pdf\", detection_model=model)\n```\n\n### Using your own model\n\nAny detection model can be used for in the `unstructured_inference` pipeline by wrapping the model in the `UnstructuredObjectDetectionModel` class. 
## Models\n\nThe inference pipeline operates by finding text elements in a document page using a detection model, then extracting the contents of the elements using direct extraction (if available), OCR, and optionally table inference models.\n\nWe offer several detection models including [Detectron2](https://github.com/facebookresearch/detectron2) and [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX).\n\n### Using a non-default model\n\nWhen doing inference, an alternate model can be used by passing the model object to the ingestion method via the `detection_model` parameter. The `get_model` function can be used to construct one of our out-of-the-box models from a keyword, e.g.:\n```python\nfrom unstructured_inference.models.base import get_model\nfrom unstructured_inference.inference.layout import DocumentLayout\n\nmodel = get_model(\"yolox\")\nlayout = DocumentLayout.from_file(\"sample-docs/layout-parser-paper.pdf\", detection_model=model)\n```\n\n### Using your own model\n\nAny detection model can be used in the `unstructured_inference` pipeline by wrapping the model in the `UnstructuredObjectDetectionModel` class. To integrate with the `DocumentLayout` class, a subclass of `UnstructuredObjectDetectionModel` must have a `predict` method that accepts a `PIL.Image.Image` and returns a list of `LayoutElement`s, and an `initialize` method, which loads the model and prepares it for inference.\n
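Below is a minimal sketch of such a wrapper. The `load_my_detector` helper and the detection attributes (`x1`, `label`, ...) are hypothetical stand-ins for your own framework, and the exact import paths and `LayoutElement` construction may differ by version:\n\n```python\nfrom PIL import Image\n\nfrom unstructured_inference.inference.layoutelement import LayoutElement\nfrom unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel\n\n\nclass MyDetectionModel(UnstructuredObjectDetectionModel):\n    def initialize(self, weights_path: str):\n        # Load and store the underlying detector; load_my_detector is a\n        # placeholder for whatever loading routine your framework provides.\n        self.model = load_my_detector(weights_path)\n\n    def predict(self, x: Image.Image) -> list[LayoutElement]:\n        super().predict(x)  # inherited check that initialize() was called\n        detections = self.model(x)  # run your detector on the page image\n        # Translate each raw detection into a LayoutElement\n        return [\n            LayoutElement.from_coords(d.x1, d.y1, d.x2, d.y2, type=d.label)\n            for d in detections\n        ]\n```\n\nAn instance of the wrapper can then be passed to `DocumentLayout.from_file` via the `detection_model` parameter, just as with the out-of-the-box models above.\n\n## Security Policy\n\nSee our [security policy](https://github.com/Unstructured-IO/unstructured-inference/security/policy) for\ninformation on how to report security vulnerabilities.\n\n## Learn more\n\n| Section | Description |\n|-|-|\n| [Unstructured Community Github](https://github.com/Unstructured-IO/community) | Information about Unstructured.io community projects  |\n| [Unstructured Github](https://github.com/Unstructured-IO) | Unstructured.io open source repositories |\n| [Company Website](https://unstructured.io) | Unstructured.io product and company info |\n"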
  },
  {
    "path": "benchmarks/__init__.py",
    "content": ""
  },
  {
    "path": "benchmarks/test_benchmark_yolox.py",
    "content": "\"\"\"Benchmark for YoloX image_processing() memory optimization.\n\nUses a fake ONNX session to isolate the memory behavior of image_processing()\nwithout requiring the real model weights. The fake session allocates a realistic\n35 MiB workspace to simulate ONNX inference memory pressure.\n\"\"\"\n\nimport numpy as np\nfrom PIL import Image as PILImage\n\nfrom unstructured_inference.models.yolox import UnstructuredYoloXModel\n\n\nclass _FakeInput:\n    def __init__(self) -> None:\n        self.name = \"input\"\n\n\nclass _FakeSession:\n    \"\"\"Simulates an ONNX inference session with realistic memory allocation.\"\"\"\n\n    def get_inputs(self):\n        return [_FakeInput()]\n\n    def run(self, _names, _inputs):\n        workspace = np.empty((35 * 1024 * 1024,), dtype=np.uint8)  # 35 MiB  # noqa: F841\n        # input_shape (1024,768), strides [8,16,32] → 128*96 + 64*48 + 32*24 = 16128\n        return [np.random.randn(1, 16128, 16).astype(np.float32)]\n\n\ndef make_model() -> UnstructuredYoloXModel:\n    model = object.__new__(UnstructuredYoloXModel)\n    model.model = _FakeSession()\n    model.model_path = \"yolox_fake\"\n    model.layout_classes = {\n        0: \"Caption\",\n        1: \"Footnote\",\n        2: \"Formula\",\n        3: \"List-item\",\n        4: \"Page-footer\",\n        5: \"Page-header\",\n        6: \"Picture\",\n        7: \"Section-header\",\n        8: \"Table\",\n        9: \"Text\",\n        10: \"Title\",\n    }\n    return model\n\n\n# Letter-size page at 200 DPI — the default render resolution\ndef make_letter_200dpi() -> PILImage.Image:\n    return PILImage.fromarray(np.random.randint(0, 255, (2200, 1700, 3), dtype=np.uint8))\n\n\ndef run_image_processing():\n    model = make_model()\n    img = make_letter_200dpi()\n    return model.image_processing(img)\n\n\ndef test_benchmark_yolox_image_processing(benchmark):\n    benchmark(run_image_processing)\n"
  },
  {
    "path": "examples/ocr/engine.py",
    "content": "import os\nimport re\nimport time\nfrom typing import List, cast\n\nimport cv2\nimport numpy as np\nimport pytesseract\nfrom pytesseract import Output\n\nfrom unstructured_inference.inference import layout\nfrom unstructured_inference.inference.elements import Rectangle, TextRegion\n\n\ndef remove_non_printable(s):\n    dst_str = re.sub(r\"[^\\x20-\\x7E]\", \" \", s)\n    return \" \".join(dst_str.split())\n\n\ndef run_ocr_with_layout_detection(\n    images,\n    detection_model=None,\n    element_extraction_model=None,\n    mode=\"individual_blocks\",\n    output_dir=\"\",\n    drawable=True,\n    printable=True,\n):\n    total_text_extraction_infer_time = 0\n    total_extracted_text = {}\n    for i, image in enumerate(images):\n        page_num = i + 1\n        page_num_str = f\"page{page_num}\"\n\n        page = layout.PageLayout(\n            number=i + 1,\n            image=image,\n            layout=None,\n            detection_model=detection_model,\n            element_extraction_model=element_extraction_model,\n        )\n\n        inferred_layout: List[TextRegion] = cast(List[TextRegion], page.detection_model(page.image))\n\n        cv_img = np.array(image)\n\n        if mode == \"individual_blocks\":\n            # OCR'ing individual blocks (current approach)\n            text_extraction_start_time = time.time()\n\n            elements = page.get_elements_from_layout(inferred_layout)\n\n            text_extraction_infer_time = time.time() - text_extraction_start_time\n\n            total_text_extraction_infer_time += text_extraction_infer_time\n\n            page_text = \"\"\n            for el in elements:\n                page_text += el.text\n            filtered_page_text = remove_non_printable(page_text)\n            total_extracted_text[page_num_str] = filtered_page_text\n        elif mode == \"entire_page\":\n            # OCR'ing entire page (new approach to implement)\n            text_extraction_start_time = time.time()\n\n            ocr_data = pytesseract.image_to_data(image, lang=\"eng\", output_type=Output.DICT)\n            boxes = ocr_data[\"level\"]\n            extracted_text_list = []\n            for k in range(len(boxes)):\n                (x, y, w, h) = (\n                    ocr_data[\"left\"][k],\n                    ocr_data[\"top\"][k],\n                    ocr_data[\"width\"][k],\n                    ocr_data[\"height\"][k],\n                )\n                extracted_text = ocr_data[\"text\"][k]\n                if not extracted_text:\n                    continue\n\n                extracted_region = Rectangle(x1=x, y1=y, x2=x + w, y2=y + h)\n\n                extracted_is_subregion_of_inferred = False\n                for inferred_region in inferred_layout:\n                    extracted_is_subregion_of_inferred = extracted_region.is_almost_subregion_of(\n                        inferred_region.pad(12),\n                        subregion_threshold=0.75,\n                    )\n                    if extracted_is_subregion_of_inferred:\n                        break\n\n                if extracted_is_subregion_of_inferred:\n                    extracted_text_list.append(extracted_text)\n\n                if drawable:\n                    if extracted_is_subregion_of_inferred:\n                        cv2.rectangle(cv_img, (x, y), (x + w, y + h), (0, 255, 0), 2, None)\n                    else:\n                        cv2.rectangle(cv_img, (x, y), (x + w, y + h), (255, 0, 0), 2, None)\n\n            text_extraction_infer_time = 
                extracted_is_subregion_of_inferred = False\n                for inferred_region in inferred_layout:\n                    extracted_is_subregion_of_inferred = extracted_region.is_almost_subregion_of(\n                        inferred_region.pad(12),\n                        subregion_threshold=0.75,\n                    )\n                    if extracted_is_subregion_of_inferred:\n                        break\n\n                if extracted_is_subregion_of_inferred:\n                    extracted_text_list.append(extracted_text)\n\n                if drawable:\n                    if extracted_is_subregion_of_inferred:\n                        cv2.rectangle(cv_img, (x, y), (x + w, y + h), (0, 255, 0), 2)\n                    else:\n                        cv2.rectangle(cv_img, (x, y), (x + w, y + h), (255, 0, 0), 2)\n\n            text_extraction_infer_time = time.time() - text_extraction_start_time\n            total_text_extraction_infer_time += text_extraction_infer_time\n\n            page_text = \" \".join(extracted_text_list)\n            filtered_page_text = remove_non_printable(page_text)\n            total_extracted_text[page_num_str] = filtered_page_text\n        else:\n            raise ValueError(\"Invalid mode\")\n\n        if drawable:\n            for el in inferred_layout:\n                pt1 = [int(el.x1), int(el.y1)]\n                pt2 = [int(el.x2), int(el.y2)]\n                cv2.rectangle(\n                    img=cv_img,\n                    pt1=pt1,\n                    pt2=pt2,\n                    color=(0, 0, 255),\n                    thickness=4,\n                )\n\n            f_path = os.path.join(output_dir, f\"ocr_{mode}_{page_num_str}.jpg\")\n            cv2.imwrite(f_path, cv_img)\n\n        if printable:\n            print(\n                f\"page: {i + 1} - n_layout_elements: {len(inferred_layout)} - \"\n                f\"text_extraction_infer_time: {text_extraction_infer_time}\"\n            )\n\n    return total_text_extraction_infer_time, total_extracted_text\n\n\ndef run_ocr(\n    images,\n    printable=True,\n):\n    total_text_extraction_infer_time = 0\n    total_text = \"\"\n    for i, image in enumerate(images):\n        text_extraction_start_time = time.time()\n\n        page_text = pytesseract.image_to_string(image)\n\n        text_extraction_infer_time = time.time() - text_extraction_start_time\n\n        if printable:\n            print(f\"page: {i + 1} - text_extraction_infer_time: {text_extraction_infer_time}\")\n\n        total_text_extraction_infer_time += text_extraction_infer_time\n        total_text += page_text\n\n    return total_text_extraction_infer_time, total_text\n"
  },
  {
    "path": "examples/ocr/requirements.txt",
    "content": "unstructured[local-inference]\nnltk"
  },
  {
    "path": "examples/ocr/validate_ocr_performance.py",
    "content": "import json\nimport os\nimport time\nfrom datetime import datetime\nfrom difflib import SequenceMatcher\n\nimport nltk\nimport pdf2image\n\nfrom unstructured_inference.inference.layout import (\n    DocumentLayout,\n    create_image_output_dir,\n    process_file_with_model,\n)\n\n# Download the required resources (run this once)\nnltk.download(\"punkt\")\n\n\ndef validate_performance(\n    f_name,\n    validation_mode,\n    is_image_file=False,\n):\n    print(\n        f\">>> Start performance comparison - filename: {f_name}\"\n        f\" - validation_mode: {validation_mode}\"\n        f\" - is_image_file: {is_image_file}\"\n    )\n\n    now_dt = datetime.utcnow()\n    now_str = now_dt.strftime(\"%Y_%m_%d-%H_%M_%S\")\n\n    f_path = os.path.join(example_docs_dir, f_name)\n\n    image_f_paths = []\n    if validation_mode == \"pdf\":\n        pdf_info = pdf2image.pdfinfo_from_path(f_path)\n        n_pages = pdf_info[\"Pages\"]\n    elif validation_mode == \"image\":\n        if is_image_file:\n            image_f_paths.append(f_path)\n        else:\n            image_output_dir = create_image_output_dir(f_path)\n            images = pdf2image.convert_from_path(f_path, output_folder=image_output_dir)\n            image_f_paths = [image.filename for image in images]\n        n_pages = len(image_f_paths)\n    else:\n        n_pages = 0\n\n    processing_result = {}\n    for ocr_mode in [\"individual_blocks\", \"entire_page\"]:\n        start_time = time.time()\n\n        if validation_mode == \"pdf\":\n            layout = process_file_with_model(\n                f_path,\n                model_name=None,\n                ocr_mode=ocr_mode,\n            )\n        elif validation_mode == \"image\":\n            pages = []\n            for image_f_path in image_f_paths:\n                _layout = process_file_with_model(\n                    image_f_path,\n                    model_name=None,\n                    ocr_mode=ocr_mode,\n                    is_image=True,\n                )\n                pages += _layout.pages\n            for i, page in enumerate(pages):\n                page.number = i + 1\n            layout = DocumentLayout.from_pages(pages)\n        else:\n            layout = None\n\n        infer_time = time.time() - start_time\n\n        if layout is None:\n            print(\"Layout is None\")\n            return\n\n        full_text = str(layout)\n        page_text = {}\n        for page in layout.pages:\n            page_text[page.number] = str(page)\n\n        processing_result[ocr_mode] = {\n            \"infer_time\": infer_time,\n            \"full_text\": full_text,\n            \"page_text\": page_text,\n        }\n\n    individual_mode_page_text = processing_result[\"individual_blocks\"][\"page_text\"]\n    entire_mode_page_text = processing_result[\"individual_blocks\"][\"page_text\"]\n    individual_mode_full_text = processing_result[\"individual_blocks\"][\"full_text\"]\n    entire_mode_full_text = processing_result[\"entire_page\"][\"full_text\"]\n\n    compare_result = compare_processed_text(individual_mode_full_text, entire_mode_full_text)\n\n    report = {\n        \"validation_mode\": validation_mode,\n        \"file_info\": {\n            \"filename\": f_name,\n            \"n_pages\": n_pages,\n        },\n        \"processing_time\": {\n            \"individual_blocks\": processing_result[\"individual_blocks\"][\"infer_time\"],\n            \"entire_page\": processing_result[\"entire_page\"][\"infer_time\"],\n        },\n        
\"text_similarity\": compare_result,\n        \"extracted_text\": {\n            \"individual_blocks\": {\n                \"page_text\": individual_mode_page_text,\n                \"full_text\": individual_mode_full_text,\n            },\n            \"entire_page\": {\n                \"page_text\": entire_mode_page_text,\n                \"full_text\": entire_mode_full_text,\n            },\n        },\n    }\n\n    write_report(report, now_str, validation_mode)\n\n    print(\"<<< End performance comparison\", f_name)\n\n\ndef compare_processed_text(individual_mode_full_text, entire_mode_full_text, delimiter=\" \"):\n    # Calculate similarity ratio\n    similarity_ratio = SequenceMatcher(\n        None, individual_mode_full_text, entire_mode_full_text\n    ).ratio()\n\n    print(f\"similarity_ratio: {similarity_ratio}\")\n\n    # Tokenize the text into words\n    word_list_individual = nltk.word_tokenize(individual_mode_full_text)\n    n_word_list_individual = len(word_list_individual)\n    print(\"n_word_list_in_text_individual:\", n_word_list_individual)\n    word_sets_individual = set(word_list_individual)\n    n_word_sets_individual = len(word_sets_individual)\n    print(f\"n_word_sets_in_text_individual: {n_word_sets_individual}\")\n    # print(\"word_sets_merged:\", word_sets_merged)\n\n    word_list_entire = nltk.word_tokenize(entire_mode_full_text)\n    n_word_list_entire = len(word_list_entire)\n    print(\"n_word_list_individual:\", n_word_list_entire)\n    word_sets_entire = set(word_list_entire)\n    n_word_sets_entire = len(word_sets_entire)\n    print(f\"n_word_sets_individual: {n_word_sets_entire}\")\n    # print(\"word_sets_individual:\", word_sets_individual)\n\n    # Find unique elements using difference\n    print(\"diff_elements:\")\n    unique_words_individual = word_sets_individual - word_sets_entire\n    unique_words_entire = word_sets_entire - word_sets_individual\n    print(f\"unique_words_in_text_individual: {unique_words_individual}\\n\")\n    print(f\"unique_words_in_text_entire: {unique_words_entire}\")\n\n    return {\n        \"similarity_ratio\": similarity_ratio,\n        \"individual_blocks\": {\n            \"n_word_list\": n_word_list_individual,\n            \"n_word_sets\": n_word_sets_individual,\n            \"unique_words\": delimiter.join(list(unique_words_individual)),\n        },\n        \"entire_page\": {\n            \"n_word_list\": n_word_list_entire,\n            \"n_word_sets\": n_word_sets_entire,\n            \"unique_words\": delimiter.join(list(unique_words_entire)),\n        },\n    }\n\n\ndef write_report(report, now_str, validation_mode):\n    report_f_name = f\"validate-ocr-{validation_mode}-{now_str}.json\"\n    report_f_path = os.path.join(output_dir, report_f_name)\n    with open(report_f_path, \"w\", encoding=\"utf-8-sig\") as f:\n        json.dump(report, f, indent=4)\n\n\ndef run():\n    test_files = [\n        {\"name\": \"layout-parser-paper-fast.pdf\", \"mode\": \"image\", \"is_image_file\": False},\n        {\"name\": \"loremipsum_multipage.pdf\", \"mode\": \"image\", \"is_image_file\": False},\n        {\"name\": \"2023-Jan-economic-outlook.pdf\", \"mode\": \"image\", \"is_image_file\": False},\n        {\"name\": \"recalibrating-risk-report.pdf\", \"mode\": \"image\", \"is_image_file\": False},\n        {\"name\": \"Silent-Giant.pdf\", \"mode\": \"image\", \"is_image_file\": False},\n    ]\n\n    for test_file in test_files:\n        f_name = test_file[\"name\"]\n        validation_mode = test_file[\"mode\"]\n      
        is_image_file = test_file[\"is_image_file\"]\n\n        validate_performance(f_name, validation_mode, is_image_file)\n\n\nif __name__ == \"__main__\":\n    cur_dir = os.getcwd()\n    base_dir = os.path.join(cur_dir, os.pardir, os.pardir)\n    example_docs_dir = os.path.join(base_dir, \"sample-docs\")\n\n    # folder path to save temporary outputs\n    output_dir = os.path.join(cur_dir, \"output\")\n    os.makedirs(output_dir, exist_ok=True)\n\n    run()\n"
  },
  {
    "path": "logger_config.yaml",
    "content": "version: 1\ndisable_existing_loggers: False\nformatters:\n  default_format:\n    \"()\": uvicorn.logging.DefaultFormatter\n    format: '%(asctime)s %(name)s %(levelname)s %(message)s'\n  access:\n    \"()\": uvicorn.logging.AccessFormatter\n    format: '%(asctime)s %(client_addr)s %(request_line)s - %(status_code)s'\nhandlers:\n  access_handler:\n    formatter: access\n    class: logging.StreamHandler\n    stream: ext://sys.stderr\n  standard_handler:\n    formatter: default_format\n    class: logging.StreamHandler\n    stream: ext://sys.stderr\nloggers:\n  uvicorn.error:\n    level: INFO\n    handlers:\n      - standard_handler\n    propagate: no\n    # disable logging for uvicorn.error by not having a handler\n  uvicorn.access:\n    level: INFO\n    handlers:\n      - access_handler\n    propagate: no\n    # disable logging for uvicorn.access by not having a handler\n  unstructured:\n    level: INFO\n    handlers:\n      - standard_handler\n    propagate: no\n  unstructured_inference:\n    level: DEBUG\n    handlers:\n      - standard_handler\n    propagate: no\n\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[project]\nname = \"unstructured_inference\"\ndescription = \"A library for performing inference using trained models.\"\nrequires-python = \">=3.11, <3.14\"\nauthors = [{name = \"Unstructured Technologies\", email = \"devops@unstructuredai.io\"}]\nclassifiers = [\n    \"Development Status :: 4 - Beta\",\n    \"Intended Audience :: Developers\",\n    \"Intended Audience :: Education\",\n    \"Intended Audience :: Science/Research\",\n    \"License :: OSI Approved :: Apache Software License\",\n    \"Operating System :: OS Independent\",\n    \"Programming Language :: Python :: 3\",\n    \"Programming Language :: Python :: 3.11\",\n    \"Programming Language :: Python :: 3.12\",\n    \"Programming Language :: Python :: 3.13\",\n    \"Topic :: Scientific/Engineering :: Artificial Intelligence\",\n]\nreadme = \"README.md\"\nlicense = \"Apache-2.0\"\nkeywords = [\"NLP\", \"PDF\", \"HTML\", \"CV\", \"XML\", \"parsing\", \"preprocessing\"]\ndynamic = [\"version\"]\ndependencies = [\n    \"huggingface-hub>=0.22.0\",\n    \"numpy>=1.26.0\",\n    \"opencv-python>=4.13.0.90\",\n    \"onnx>=1.20.1\",\n    \"onnxruntime>=1.25.0\",\n    \"matplotlib>=3.10.8\",\n    \"torch>=2.10.0\",\n    \"timm>=1.0.24\",\n    # NOTE(alan): Pinned because this is when the most recent module we import appeared\n    \"transformers>=4.25.1\",\n    # Required by transformers[torch] for model loading with torch\n    \"accelerate>=1.12.0\",\n    \"rapidfuzz>=3.14.3\",\n    \"pandas>=1.5.0\",\n    \"scipy>=1.17.0\",\n    \"pypdfium2>=5.0.0\",\n]\n\n[project.urls]\nHomepage = \"https://github.com/Unstructured-IO/unstructured-inference\"\n\n[tool.hatch.version]\npath = \"unstructured_inference/__version__.py\"\n\n[dependency-groups]\nlint = [\n    \"ruff>=0.15.0\",\n]\ntest = [\n    \"pytest>=9.0.2\",\n    \"pytest-cov>=7.0.0\",\n    \"pytest-mock>=3.15.1\",\n    \"pytest-xdist>=3.5.0\",\n    \"coverage>=7.13.3\",\n    \"httpx>=0.28.1\",\n    \"pdf2image>=1.16.2\",\n]\ndev = [\n    \"jupyter>=1.1.1\",\n    \"ipython>=9.10.0\",\n]\nrelease = [\n    \"twine>=6.2.0\",\n]\n\n[build-system]\nrequires = [\"hatchling\"]\nbuild-backend = \"hatchling.build\"\n\n[tool.uv]\nconstraint-dependencies = [\n    # Security: CVE fix for fonttools\n    \"fonttools>=4.60.2\",\n    # Security: CVE fix for urllib3\n    \"urllib3>=2.6.0\",\n    # Security: CVE fix for Pillow (out-of-bounds write loading PSD images)\n    \"pillow>=12.1.1\",\n]\n\n[tool.hatch.build.targets.wheel]\npackages = [\"/unstructured_inference\"]\n\n[tool.hatch.build.targets.sdist]\npackages = [\"/unstructured_inference\"]\n\n[tool.ruff]\nline-length = 100\n\n[tool.ruff.lint]\nselect = [\n    # pycodestyle\n    \"E\",\n    # Pyflakes\n    \"F\",\n    # flake8-comprehensions\n    \"C4\",\n    # flake8-commas\n    \"COM\",\n    # isort\n    \"I\",\n    # flake8-simplify\n    \"SIM\",\n    # pyupgrade\n    \"UP015\",\n    \"UP018\",\n    \"UP032\",\n    \"UP034\",\n    # pylint refactor\n    \"PLR0402\",\n    # flake8-pytest-style\n    \"PT\",\n]\nignore = [\n    \"COM812\",\n    \"PT011\",\n    \"PT012\",\n]\n\n[tool.ruff.lint.per-file-ignores]\n\"test_*/**\" = [\"D\"]\n\n[tool.pytest.ini_options]\nmarkers = [\n    \"slow: marks tests as slow (deselect with '-m \\\"not slow\\\"')\",\n]\nfilterwarnings = [\n    \"ignore::DeprecationWarning\",\n]\n\n[tool.codeflash]\nbenchmarks-root = \"benchmarks\"\n\n[tool.coverage.report]\nfail_under = 90\n"
  },
  {
    "path": "renovate.json",
    "content": "{\n  \"$schema\": \"https://docs.renovatebot.com/renovate-schema.json\",\n  \"extends\": [\"github>Unstructured-IO/renovate-config:python-uv\"]\n}\n"
  },
  {
    "path": "scripts/docker-build.sh",
    "content": "#!/usr/bin/env bash\n\nset -euo pipefail\nDOCKER_IMAGE=\"${DOCKER_IMAGE:-unstructured-inference:dev}\"\n\nDOCKER_BUILD_CMD=(docker buildx build --load -f Dockerfile \\\n  --build-arg BUILDKIT_INLINE_CACHE=1 \\\n  --progress plain \\\n  -t \"$DOCKER_IMAGE\" .)\n\nDOCKER_BUILDKIT=1 \"${DOCKER_BUILD_CMD[@]}\"\n"
  },
  {
    "path": "scripts/shellcheck.sh",
    "content": "#!/usr/bin/env bash\n\nfind scripts -name \"*.sh\" -exec shellcheck {} +\n\n"
  },
  {
    "path": "scripts/test-unstructured-ingest-helper.sh",
    "content": "#!/usr/bin/env bash\n\n# This is intended to be run from an unstructured checkout, not in this repo\n# The goal here is to see what changes the current branch would introduce to unstructured\n# fixtures\n\nINGEST_COMMANDS=(\n    test_unstructured_ingest/src/azure.sh\n    test_unstructured_ingest/src/biomed-api.sh\n    test_unstructured_ingest/src/biomed-path.sh\n    test_unstructured_ingest/src/box.sh\n    test_unstructured_ingest/src/dropbox.sh\n    test_unstructured_ingest/src/gcs.sh\n    test_unstructured_ingest/src/onedrive.sh\n    test_unstructured_ingest/src/s3.sh\n)\n\nEXIT_STATUSES=()\n\n# Run each command and capture its exit status\nfor INGEST_COMMAND in \"${INGEST_COMMANDS[@]}\"; do\n  $INGEST_COMMAND\n  EXIT_STATUSES+=($?)\ndone\n\n# Check for failures\nfor STATUS in \"${EXIT_STATUSES[@]}\"; do\n  if [[ $STATUS -ne 0 ]]; then\n    echo \"At least one ingest command failed! Scroll up to see which\"\n    exit 1\n  fi\ndone\n\necho \"No diff's resulted from any ingest commands\"\n"
  },
  {
    "path": "scripts/version-sync.sh",
    "content": "#!/usr/bin/env bash\nfunction usage {\n    echo \"Usage: $(basename \"$0\") [-c] -f FILE_TO_CHANGE REPLACEMENT_FORMAT [-f FILE_TO_CHANGE REPLACEMENT_FORMAT ...]\" 2>&1\n    echo 'Synchronize files to latest version in source file'\n    echo '   -s              Specifies source file for version (default is CHANGELOG.md)'\n    echo '   -f              Specifies a file to change and the format for searching and replacing versions'\n    echo '                       FILE_TO_CHANGE is the file to be updated/checked for updates'\n    echo '                       REPLACEMENT_FORMAT is one of (semver, release, api-release)'\n    echo '                           semver indicates to look for a full semver version and replace with the latest full version'\n    echo '                           release indicates to look for a release semver version (x.x.x) and replace with the latest release version'\n    echo '                           api-release indicates to look for a release semver version in the context of an api route and replace with the latest release version'\n    echo '   -c              Compare versions and output proposed changes without changing anything.'\n}\n\nfunction getopts-extra () {\n    declare -i i=1\n    # if the next argument is not an option, then append it to array OPTARG\n    while [[ ${OPTIND} -le $# && ${!OPTIND:0:1} != '-' ]]; do\n        OPTARG[i]=${!OPTIND}\n        ((i += 1))\n        ((OPTIND += 1))\n    done\n}\n\n# Parse input options\ndeclare CHECK=0\ndeclare SOURCE_FILE=\"CHANGELOG.md\"\ndeclare -a FILES_TO_CHECK=()\ndeclare -a REPLACEMENT_FORMATS=()\ndeclare args\ndeclare OPTIND OPTARG opt\nwhile getopts \":hcs:f:\" opt; do\n    case $opt in\n        h)\n            usage\n            exit 0\n            ;;\n        c)\n            CHECK=1\n            ;;\n        s)\n            SOURCE_FILE=\"$OPTARG\"\n            ;;\n        f)\n            getopts-extra \"$@\"\n            args=( \"${OPTARG[@]}\" )\n            # validate length of args, should be 2\n            if [ ${#args[@]} -eq 2 ]; then\n                FILES_TO_CHECK+=( \"${args[0]}\" )\n                REPLACEMENT_FORMATS+=( \"${args[1]}\" )\n            else\n                echo \"Exactly 2 arguments must follow -f option.\" >&2\n                exit 1\n            fi\n            ;;\n        \\?)\n            echo \"Invalid option: -$OPTARG.\" >&2\n            usage\n            exit 1\n            ;;\n    esac\ndone\n\n# Parse REPLACEMENT_FORMATS\nRE_SEMVER_FULL=\"(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\\+([0-9a-zA-Z-]+(\\.[0-9a-zA-Z-]+)*))?\"\nRE_RELEASE=\"(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)\"\nRE_API_RELEASE=\"v(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)\"\n# Pull out semver appearing earliest in SOURCE_FILE.\nLAST_VERSION=$(grep -o -m 1 -E \"${RE_SEMVER_FULL}\" \"$SOURCE_FILE\")\nLAST_RELEASE=$(grep -o -m 1 -E \"${RE_RELEASE}($|[^-+])\" \"$SOURCE_FILE\" | grep -o -m 1 -E \"${RE_RELEASE}\")\nLAST_API_RELEASE=\"v$(grep -o -m 1 -E \"${RE_RELEASE}($|[^-+])$\" \"$SOURCE_FILE\" | grep -o -m 1 -E \"${RE_RELEASE}\")\"\ndeclare -a RE_SEMVERS=()\ndeclare -a UPDATED_VERSIONS=()\nfor i in \"${!REPLACEMENT_FORMATS[@]}\"; do\n    REPLACEMENT_FORMAT=${REPLACEMENT_FORMATS[$i]}\n    case $REPLACEMENT_FORMAT in\n        semver)\n            RE_SEMVERS+=( \"$RE_SEMVER_FULL\" )\n            UPDATED_VERSIONS+=( \"$LAST_VERSION\" )\n            ;;\n        
release)\n            RE_SEMVERS+=( \"$RE_RELEASE\" )\n            UPDATED_VERSIONS+=( \"$LAST_RELEASE\" )\n            ;;\n        api-release)\n            RE_SEMVERS+=( \"$RE_API_RELEASE\" )\n            UPDATED_VERSIONS+=( \"$LAST_API_RELEASE\" )\n            ;;\n        *)\n            echo \"Invalid replacement format: \\\"${REPLACEMENT_FORMAT}\\\". Use semver, release, or api-release\" >&2\n            exit 1\n            ;;\n    esac\ndone\n\nif [ -z \"$LAST_VERSION\" ];\nthen\n    # No match to semver regex in SOURCE_FILE, so no version to go from.\n    printf \"Error: Unable to find latest version from %s.\\n\" \"$SOURCE_FILE\"\n    exit 1\nfi\n\n# Search files in FILES_TO_CHECK and change (or get diffs)\ndeclare FAILED_CHECK=0\n\nfor i in \"${!FILES_TO_CHECK[@]}\"; do\n    FILE_TO_CHANGE=${FILES_TO_CHECK[$i]}\n    RE_SEMVER=${RE_SEMVERS[$i]}\n    UPDATED_VERSION=${UPDATED_VERSIONS[$i]}\n    FILE_VERSION=$(grep -o -m 1 -E \"${RE_SEMVER}\" \"$FILE_TO_CHANGE\")\n    if [ -z \"$FILE_VERSION\" ];\n    then\n        # No match to semver regex in VERSIONFILE, so nothing to replace\n        printf \"Error: No semver version found in file %s.\\n\" \"$FILE_TO_CHANGE\"\n        exit 1\n    else\n        # Replace semver in VERSIONFILE with semver obtained from SOURCE_FILE\n        TMPFILE=$(mktemp /tmp/new_version.XXXXXX)\n        # Check sed version, exit if version < 4.3\n        if ! sed --version > /dev/null 2>&1; then\n            CURRENT_VERSION=1.archaic\n        else\n            CURRENT_VERSION=$(sed --version | head -n1 | cut -d\" \" -f4)\n        fi\n        REQUIRED_VERSION=\"4.3\"\n        if [ \"$(printf '%s\\n' \"$REQUIRED_VERSION\" \"$CURRENT_VERSION\" | sort -V | head -n1)\" != \"$REQUIRED_VERSION\" ]; then\n            echo \"sed version must be >= ${REQUIRED_VERSION}\" && exit 1\n        fi\n        sed -E -r \"s/$RE_SEMVER/$UPDATED_VERSION/\" \"$FILE_TO_CHANGE\" > \"$TMPFILE\"\n        if [ $CHECK == 1 ];\n        then\n            DIFF=$(diff \"$FILE_TO_CHANGE\"  \"$TMPFILE\" )\n            if [ -z \"$DIFF\" ];\n            then\n                printf \"version sync would make no changes to %s.\\n\" \"$FILE_TO_CHANGE\"\n                rm \"$TMPFILE\"\n            else\n                FAILED_CHECK=1\n                printf \"version sync would make the following changes to %s:\\n%s\\n\" \"$FILE_TO_CHANGE\" \"$DIFF\"\n                rm \"$TMPFILE\"\n            fi\n        else\n            cp \"$TMPFILE\" \"$FILE_TO_CHANGE\" \n            rm \"$TMPFILE\"\n        fi\n    fi\ndone\n\n# Exit with code determined by whether changes were needed in a check.\nif [ ${FAILED_CHECK} -ne 0 ]; then\n    exit 1\nelse\n    exit 0\nfi\n"
  },
  {
    "path": "test_unstructured_inference/conftest.py",
    "content": "import numpy as np\nimport pytest\nfrom PIL import Image\n\nfrom unstructured_inference.inference.elements import (\n    EmbeddedTextRegion,\n    Rectangle,\n    TextRegion,\n)\nfrom unstructured_inference.inference.layoutelement import LayoutElement\n\n\n@pytest.fixture\ndef mock_pil_image():\n    return Image.new(\"RGB\", (50, 50))\n\n\n@pytest.fixture\ndef mock_numpy_image():\n    return np.zeros((50, 50, 3), np.uint8)\n\n\n@pytest.fixture\ndef mock_rectangle():\n    return Rectangle(100, 100, 300, 300)\n\n\n@pytest.fixture\ndef mock_text_region():\n    return TextRegion.from_coords(100, 100, 300, 300, text=\"Sample text\")\n\n\n@pytest.fixture\ndef mock_layout_element():\n    return LayoutElement.from_coords(\n        100,\n        100,\n        300,\n        300,\n        text=\"Sample text\",\n        source=None,\n        type=\"Text\",\n    )\n\n\n@pytest.fixture\ndef mock_embedded_text_regions():\n    return [\n        EmbeddedTextRegion.from_coords(\n            x1=453.00277777777774,\n            y1=317.319341111111,\n            x2=711.5338541666665,\n            y2=358.28571222222206,\n            text=\"LayoutParser:\",\n        ),\n        EmbeddedTextRegion.from_coords(\n            x1=726.4778125,\n            y1=317.319341111111,\n            x2=760.3308594444444,\n            y2=357.1698966666667,\n            text=\"A\",\n        ),\n        EmbeddedTextRegion.from_coords(\n            x1=775.2748177777777,\n            y1=317.319341111111,\n            x2=917.3579885555555,\n            y2=357.1698966666667,\n            text=\"Unified\",\n        ),\n        EmbeddedTextRegion.from_coords(\n            x1=932.3019468888888,\n            y1=317.319341111111,\n            x2=1071.8426522222221,\n            y2=357.1698966666667,\n            text=\"Toolkit\",\n        ),\n        EmbeddedTextRegion.from_coords(\n            x1=1086.7866105555556,\n            y1=317.319341111111,\n            x2=1141.2105142777777,\n            y2=357.1698966666667,\n            text=\"for\",\n        ),\n        EmbeddedTextRegion.from_coords(\n            x1=1156.154472611111,\n            y1=317.319341111111,\n            x2=1256.334784222222,\n            y2=357.1698966666667,\n            text=\"Deep\",\n        ),\n        EmbeddedTextRegion.from_coords(\n            x1=437.83888888888885,\n            y1=367.13322999999986,\n            x2=610.0171992222222,\n            y2=406.9837855555556,\n            text=\"Learning\",\n        ),\n        EmbeddedTextRegion.from_coords(\n            x1=624.9611575555555,\n            y1=367.13322999999986,\n            x2=741.6754646666665,\n            y2=406.9837855555556,\n            text=\"Based\",\n        ),\n        EmbeddedTextRegion.from_coords(\n            x1=756.619423,\n            y1=367.13322999999986,\n            x2=958.3867708333332,\n            y2=406.9837855555556,\n            text=\"Document\",\n        ),\n        EmbeddedTextRegion.from_coords(\n            x1=973.3307291666665,\n            y1=367.13322999999986,\n            x2=1092.0535042777776,\n            y2=406.9837855555556,\n            text=\"Image\",\n        ),\n    ]\n\n\n# TODO(alan): Make a better test layout\n@pytest.fixture\ndef mock_layout(mock_embedded_text_regions):\n    return [\n        LayoutElement(text=r.text, type=\"UncategorizedText\", bbox=r.bbox)\n        for r in mock_embedded_text_regions\n    ]\n\n\n@pytest.fixture\ndef example_table_cells():\n    cells = [\n        {\"cell text\": \"Disability Category\", 
\"row_nums\": [0, 1], \"column_nums\": [0]},\n        {\"cell text\": \"Participants\", \"row_nums\": [0, 1], \"column_nums\": [1]},\n        {\"cell text\": \"Ballots Completed\", \"row_nums\": [0, 1], \"column_nums\": [2]},\n        {\"cell text\": \"Ballots Incomplete/Terminated\", \"row_nums\": [0, 1], \"column_nums\": [3]},\n        {\"cell text\": \"Results\", \"row_nums\": [0], \"column_nums\": [4, 5]},\n        {\"cell text\": \"Accuracy\", \"row_nums\": [1], \"column_nums\": [4]},\n        {\"cell text\": \"Time to complete\", \"row_nums\": [1], \"column_nums\": [5]},\n        {\"cell text\": \"Blind\", \"row_nums\": [2], \"column_nums\": [0]},\n        {\"cell text\": \"Low Vision\", \"row_nums\": [3], \"column_nums\": [0]},\n        {\"cell text\": \"Dexterity\", \"row_nums\": [4], \"column_nums\": [0]},\n        {\"cell text\": \"Mobility\", \"row_nums\": [5], \"column_nums\": [0]},\n        {\"cell text\": \"5\", \"row_nums\": [2], \"column_nums\": [1]},\n        {\"cell text\": \"5\", \"row_nums\": [3], \"column_nums\": [1]},\n        {\"cell text\": \"5\", \"row_nums\": [4], \"column_nums\": [1]},\n        {\"cell text\": \"3\", \"row_nums\": [5], \"column_nums\": [1]},\n        {\"cell text\": \"1\", \"row_nums\": [2], \"column_nums\": [2]},\n        {\"cell text\": \"2\", \"row_nums\": [3], \"column_nums\": [2]},\n        {\"cell text\": \"4\", \"row_nums\": [4], \"column_nums\": [2]},\n        {\"cell text\": \"3\", \"row_nums\": [5], \"column_nums\": [2]},\n        {\"cell text\": \"4\", \"row_nums\": [2], \"column_nums\": [3]},\n        {\"cell text\": \"3\", \"row_nums\": [3], \"column_nums\": [3]},\n        {\"cell text\": \"1\", \"row_nums\": [4], \"column_nums\": [3]},\n        {\"cell text\": \"0\", \"row_nums\": [5], \"column_nums\": [3]},\n        {\"cell text\": \"34.5%, n=1\", \"row_nums\": [2], \"column_nums\": [4]},\n        {\"cell text\": \"98.3% n=2 (97.7%, n=3)\", \"row_nums\": [3], \"column_nums\": [4]},\n        {\"cell text\": \"98.3%, n=4\", \"row_nums\": [4], \"column_nums\": [4]},\n        {\"cell text\": \"95.4%, n=3\", \"row_nums\": [5], \"column_nums\": [4]},\n        {\"cell text\": \"1199 sec, n=1\", \"row_nums\": [2], \"column_nums\": [5]},\n        {\"cell text\": \"1716 sec, n=3 (1934 sec, n=2)\", \"row_nums\": [3], \"column_nums\": [5]},\n        {\"cell text\": \"1672.1 sec, n=4\", \"row_nums\": [4], \"column_nums\": [5]},\n        {\"cell text\": \"1416 sec, n=3\", \"row_nums\": [5], \"column_nums\": [5]},\n    ]\n    for i in range(len(cells)):\n        cells[i][\"column header\"] = False\n    return [cells]\n"
  },
  {
    "path": "test_unstructured_inference/inference/test_layout.py",
    "content": "import os\nimport os.path\nimport tempfile\nfrom unittest.mock import MagicMock, mock_open, patch\n\nimport numpy as np\nimport pytest\nfrom PIL import Image\n\nimport unstructured_inference.models.base as models\nfrom unstructured_inference.constants import IsExtracted\nfrom unstructured_inference.inference import elements, layout, layoutelement, pdf_image\nfrom unstructured_inference.inference.elements import (\n    EmbeddedTextRegion,\n    ImageTextRegion,\n)\nfrom unstructured_inference.models.unstructuredmodel import (\n    UnstructuredElementExtractionModel,\n    UnstructuredObjectDetectionModel,\n)\n\nskip_outside_ci = os.getenv(\"CI\", \"\").lower() in {\"\", \"false\", \"f\", \"0\"}\n\n\n@pytest.fixture\ndef mock_image():\n    return Image.new(\"1\", (1, 1))\n\n\n@pytest.fixture\ndef mock_initial_layout():\n    text_block = EmbeddedTextRegion.from_coords(\n        2,\n        4,\n        6,\n        8,\n        text=\"A very repetitive narrative. \" * 10,\n        is_extracted=IsExtracted.TRUE,\n    )\n\n    title_block = EmbeddedTextRegion.from_coords(\n        1,\n        2,\n        3,\n        4,\n        text=\"A Catchy Title\",\n        is_extracted=IsExtracted.TRUE,\n    )\n\n    return [text_block, title_block]\n\n\n@pytest.fixture\ndef mock_final_layout():\n    text_block = layoutelement.LayoutElement.from_coords(\n        2,\n        4,\n        6,\n        8,\n        source=\"Mock\",\n        text=\"A very repetitive narrative. \" * 10,\n        type=\"NarrativeText\",\n    )\n\n    title_block = layoutelement.LayoutElement.from_coords(\n        1,\n        2,\n        3,\n        4,\n        source=\"Mock\",\n        text=\"A Catchy Title\",\n        type=\"Title\",\n    )\n\n    return layoutelement.LayoutElements.from_list([text_block, title_block])\n\n\ndef test_pdf_page_converts_images_to_array(mock_image):\n    def verify_image_array():\n        assert page.image_array is None\n        image_array = page._get_image_array()\n        assert isinstance(image_array, np.ndarray)\n        assert page.image_array.all() == image_array.all()\n\n    # Scenario 1: where self.image exists\n    page = layout.PageLayout(number=0, image=mock_image)\n    verify_image_array()\n\n    # Scenario 2: where self.image is None, but self.image_path exists\n    page.image_array = None\n    page.image = None\n    page.image_path = \"mock_path_to_image\"\n    with patch.object(Image, \"open\", return_value=mock_image):\n        verify_image_array()\n\n\nclass MockLayoutModel:\n    def __init__(self, layout):\n        self.layout_return = layout\n\n    def __call__(self, *args):\n        return self.layout_return\n\n    def initialize(self, *args, **kwargs):\n        pass\n\n    def deduplicate_detected_elements(self, elements, *args, **kwargs):\n        return elements\n\n\ndef test_get_page_elements(monkeypatch, mock_final_layout):\n    image = Image.fromarray(\n        np.random.randint(12, 14, size=(40, 10, 3)).astype(np.uint8), mode=\"RGB\"\n    )\n    page = layout.PageLayout(\n        number=0,\n        image=image,\n        detection_model=MockLayoutModel(mock_final_layout),\n    )\n    elements = page.get_elements_with_detection_model(inplace=False)\n    page.get_elements_with_detection_model(inplace=True)\n    assert elements == page.elements_array\n\n\nclass MockPool:\n    def map(self, f, xs):\n        return [f(x) for x in xs]\n\n    def close(self):\n        pass\n\n    def join(self):\n        pass\n\n\n@pytest.mark.parametrize(\"model_name\", [None, 
\"checkbox\", \"fake\"])\ndef test_process_data_with_model(monkeypatch, mock_final_layout, model_name):\n    monkeypatch.setattr(layout, \"get_model\", lambda x: MockLayoutModel(mock_final_layout))\n    monkeypatch.setattr(\n        layout.DocumentLayout,\n        \"from_file\",\n        lambda *args, **kwargs: layout.DocumentLayout.from_pages([]),\n    )\n\n    def new_isinstance(obj, cls):\n        if type(obj) is MockLayoutModel:\n            return True\n        else:\n            return isinstance(obj, cls)\n\n    with (\n        patch(\"builtins.open\", mock_open(read_data=b\"000000\")),\n        patch(\n            \"unstructured_inference.inference.layout.UnstructuredObjectDetectionModel\",\n            MockLayoutModel,\n        ),\n        open(\"\") as fp,\n    ):\n        assert layout.process_data_with_model(fp, model_name=model_name)\n\n\ndef test_process_data_with_model_raises_on_invalid_model_name():\n    with (\n        patch(\"builtins.open\", mock_open(read_data=b\"000000\")),\n        pytest.raises(\n            models.UnknownModelException,\n        ),\n        open(\"\") as fp,\n    ):\n        layout.process_data_with_model(fp, model_name=\"fake\")\n\n\n@pytest.mark.parametrize(\"model_name\", [None, \"yolox\"])\ndef test_process_file_with_model(monkeypatch, mock_final_layout, model_name):\n    def mock_initialize(self, *args, **kwargs):\n        self.model = MockLayoutModel(mock_final_layout)\n\n    monkeypatch.setattr(\n        layout.DocumentLayout,\n        \"from_file\",\n        lambda *args, **kwargs: layout.DocumentLayout.from_pages([]),\n    )\n    monkeypatch.setattr(models.UnstructuredDetectronONNXModel, \"initialize\", mock_initialize)\n    filename = \"\"\n    assert layout.process_file_with_model(filename, model_name=model_name)\n\n\ndef test_process_file_no_warnings(monkeypatch, mock_final_layout, recwarn):\n    def mock_initialize(self, *args, **kwargs):\n        self.model = MockLayoutModel(mock_final_layout)\n\n    monkeypatch.setattr(\n        layout.DocumentLayout,\n        \"from_file\",\n        lambda *args, **kwargs: layout.DocumentLayout.from_pages([]),\n    )\n    monkeypatch.setattr(models.UnstructuredDetectronONNXModel, \"initialize\", mock_initialize)\n    filename = \"\"\n    layout.process_file_with_model(filename, model_name=None)\n    # There should be no UserWarning, but if there is one it should not have the following message\n    with pytest.raises(AssertionError, match=\"not found in warning list\"):\n        user_warning = recwarn.pop(UserWarning)\n        assert \"not in available provider names\" not in str(user_warning.message)\n\n\ndef test_process_file_with_model_raises_on_invalid_model_name():\n    with pytest.raises(models.UnknownModelException):\n        layout.process_file_with_model(\"\", model_name=\"fake\")\n\n\nclass MockPoints:\n    def tolist(self):\n        return [1, 2, 3, 4]\n\n\nclass MockEmbeddedTextRegion(EmbeddedTextRegion):\n    def __init__(self, type=None, text=None):\n        self.type = type\n        self.text = text\n\n    @property\n    def points(self):\n        return MockPoints()\n\n\nclass MockPageLayout(layout.PageLayout):\n    def __init__(\n        self,\n        number=1,\n        image=None,\n        model=None,\n        detection_model=None,\n    ):\n        self.image = image\n        self.layout = layout\n        self.model = model\n        self.number = number\n        self.detection_model = detection_model\n\n\nclass MockLayout:\n    def __init__(self, *elements):\n        self.elements = 
elements\n\n    def __len__(self):\n        return len(self.elements)\n\n    def sort(self, key, inplace):\n        return self.elements\n\n    def __iter__(self):\n        return iter(self.elements)\n\n    def get_texts(self):\n        return [el.text for el in self.elements]\n\n    def filter_by(self, *args, **kwargs):\n        return MockLayout()\n\n\n@pytest.mark.parametrize(\"element_extraction_model\", [None, \"foo\"])\n@pytest.mark.parametrize(\"filetype\", [\"png\", \"jpg\", \"tiff\"])\ndef test_from_image_file(monkeypatch, mock_final_layout, filetype, element_extraction_model):\n    def mock_get_elements(self, *args, **kwargs):\n        self.elements = [mock_final_layout]\n\n    monkeypatch.setattr(layout.PageLayout, \"get_elements_with_detection_model\", mock_get_elements)\n    monkeypatch.setattr(layout.PageLayout, \"get_elements_using_image_extraction\", mock_get_elements)\n    filename = f\"sample-docs/loremipsum.{filetype}\"\n    image = Image.open(filename)\n    image_metadata = {\n        \"format\": image.format,\n        \"width\": image.width,\n        \"height\": image.height,\n        \"pdf_rotation\": 0,\n    }\n\n    doc = layout.DocumentLayout.from_image_file(\n        filename,\n        element_extraction_model=element_extraction_model,\n    )\n    page = doc.pages[0]\n    assert page.elements[0] == mock_final_layout\n    assert page.image is None\n    assert page.image_path == os.path.abspath(filename)\n    assert page.image_metadata == image_metadata\n\n\ndef test_from_file(monkeypatch, mock_final_layout):\n    def mock_get_elements(self, *args, **kwargs):\n        self.elements = [mock_final_layout]\n\n    monkeypatch.setattr(layout.PageLayout, \"get_elements_with_detection_model\", mock_get_elements)\n\n    with tempfile.TemporaryDirectory() as tmpdir:\n        image_path = os.path.join(tmpdir, \"loremipsum.ppm\")\n        image = Image.open(\"sample-docs/loremipsum.jpg\")\n        image.save(image_path)\n        image_metadata = {\n            \"format\": \"PPM\",\n            \"width\": image.width,\n            \"height\": image.height,\n            \"pdf_rotation\": 0,\n        }\n\n        with patch.object(\n            layout,\n            \"convert_pdf_to_image\",\n            lambda *args, **kwargs: ([image_path]),\n        ):\n            doc = layout.DocumentLayout.from_file(\"fake-file.pdf\")\n            page = doc.pages[0]\n            assert page.elements[0] == mock_final_layout\n            assert page.image_metadata == image_metadata\n            assert page.image is None\n\n\ndef test_from_file_rotated_pdf_stores_rotation_in_metadata(monkeypatch, mock_final_layout):\n    \"\"\"image_metadata includes pdf_rotation for rotated PDF pages.\"\"\"\n\n    def mock_get_elements(self, *args, **kwargs):\n        self.elements = [mock_final_layout]\n\n    monkeypatch.setattr(layout.PageLayout, \"get_elements_with_detection_model\", mock_get_elements)\n\n    doc = layout.DocumentLayout.from_file(\"sample-docs/rotated-page-90.pdf\")\n    page = doc.pages[0]\n    assert page.image_metadata[\"pdf_rotation\"] == 90\n    assert page.image is None\n\n\n@pytest.mark.slow\ndef test_from_file_with_password(monkeypatch, mock_final_layout):\n\n    doc = layout.DocumentLayout.from_file(\"sample-docs/password.pdf\", password=\"password\")\n    assert doc\n\n    monkeypatch.setattr(layout, \"get_model\", lambda x: MockLayoutModel(mock_final_layout))\n    with (\n        patch(\n            \"unstructured_inference.inference.layout.UnstructuredObjectDetectionModel\",\n  
          MockLayoutModel,\n        ),\n        open(\"sample-docs/password.pdf\", mode=\"rb\") as fp,\n    ):\n        doc = layout.process_data_with_model(fp, model_name=\"fake\", password=\"password\")\n        assert doc\n\n\ndef test_from_image_file_raises_with_empty_fn():\n    with pytest.raises(FileNotFoundError):\n        layout.DocumentLayout.from_image_file(\"\")\n\n\ndef test_from_image_file_raises_isadirectoryerror_with_dir():\n    with tempfile.TemporaryDirectory() as tempdir, pytest.raises(IsADirectoryError):\n        layout.DocumentLayout.from_image_file(tempdir)\n\n\ndef test_page_numbers_in_page_objects():\n    with patch(\n        \"unstructured_inference.inference.layout.PageLayout.get_elements_with_detection_model\",\n    ) as mock_get_elements:\n        doc = layout.DocumentLayout.from_file(\"sample-docs/layout-parser-paper.pdf\")\n        mock_get_elements.assert_called()\n        assert [page.number for page in doc.pages] == list(range(1, len(doc.pages) + 1))\n\n\nno_text_region = EmbeddedTextRegion.from_coords(0, 0, 100, 100)\ntext_region = EmbeddedTextRegion.from_coords(0, 0, 100, 100, text=\"test\")\noverlapping_rect = ImageTextRegion.from_coords(50, 50, 150, 150)\nnonoverlapping_rect = ImageTextRegion.from_coords(150, 150, 200, 200)\npopulated_text_region = EmbeddedTextRegion.from_coords(50, 50, 60, 60, text=\"test\")\nunpopulated_text_region = EmbeddedTextRegion.from_coords(50, 50, 60, 60, text=None)\n\n\n@pytest.mark.parametrize(\n    (\"colors\", \"add_details\", \"threshold\"),\n    [(\"red\", False, 0.992), (None, False, 0.992), (\"red\", True, 0.8)],\n)\ndef test_annotate(colors, add_details, threshold):\n    def check_annotated_image():\n        annotated_array = np.array(annotated_image)\n        for coords in [coords1, coords2]:\n            x1, y1, x2, y2 = coords\n            # Make sure the pixels on the edge of the box are red\n            for i, expected in zip(range(3), [255, 0, 0]):\n                assert all(annotated_array[y1, x1:x2, i] == expected)\n                assert all(annotated_array[y2, x1:x2, i] == expected)\n                assert all(annotated_array[y1:y2, x1, i] == expected)\n                assert all(annotated_array[y1:y2, x2, i] == expected)\n            # Make sure almost all the pixels are not changed\n            assert ((annotated_array[:, :, 0] == 1).mean()) > threshold\n            assert ((annotated_array[:, :, 1] == 1).mean()) > threshold\n            assert ((annotated_array[:, :, 2] == 1).mean()) > threshold\n\n    test_image_arr = np.ones((100, 100, 3), dtype=\"uint8\")\n    image = Image.fromarray(test_image_arr)\n    page = layout.PageLayout(number=1, image=image)\n    coords1 = (21, 30, 37, 41)\n    rect1 = elements.TextRegion.from_coords(*coords1)\n    coords2 = (1, 10, 7, 11)\n    rect2 = elements.TextRegion.from_coords(*coords2)\n    page.elements = [rect1, rect2]\n\n    annotated_image = page.annotate(colors=colors, add_details=add_details, sources=None)\n    check_annotated_image()\n\n    # Scenario 1: where self.image exists\n    annotated_image = page.annotate(colors=colors, add_details=add_details)\n    check_annotated_image()\n\n    # Scenario 2: where self.image is None, but self.image_path exists\n    with patch.object(Image, \"open\", return_value=image):\n        page.image = None\n        page.image_path = \"mock_path_to_image\"\n        annotated_image = page.annotate(colors=colors, add_details=add_details)\n        check_annotated_image()\n\n\nclass 
MockDetectionModel(layout.UnstructuredObjectDetectionModel):\n    def initialize(self, *args, **kwargs):\n        pass\n\n    def predict(self, x):\n        return layoutelement.LayoutElements.from_list(\n            [\n                layout.LayoutElement.from_coords(x1=447.0, y1=315.0, x2=1275.7, y2=413.0, text=\"0\"),\n                layout.LayoutElement.from_coords(x1=380.6, y1=473.4, x2=1334.8, y2=533.9, text=\"1\"),\n                layout.LayoutElement.from_coords(x1=578.6, y1=556.8, x2=1109.0, y2=874.4, text=\"2\"),\n                layout.LayoutElement.from_coords(\n                    x1=444.5,\n                    y1=942.3,\n                    x2=1261.1,\n                    y2=1584.1,\n                    text=\"3\",\n                ),\n                layout.LayoutElement.from_coords(\n                    x1=444.8,\n                    y1=1609.4,\n                    x2=1257.2,\n                    y2=1665.2,\n                    text=\"4\",\n                ),\n                layout.LayoutElement.from_coords(\n                    x1=414.0,\n                    y1=1718.8,\n                    x2=635.0,\n                    y2=1755.2,\n                    text=\"5\",\n                ),\n                layout.LayoutElement.from_coords(\n                    x1=372.6,\n                    y1=1786.9,\n                    x2=1333.6,\n                    y2=1848.7,\n                    text=\"6\",\n                ),\n            ],\n        )\n\n\ndef test_layout_order(mock_image):\n    with tempfile.TemporaryDirectory() as tmpdir:\n        mock_image_path = os.path.join(tmpdir, \"mock.jpg\")\n        mock_image.save(mock_image_path)\n        with (\n            patch.object(layout, \"get_model\", lambda: MockDetectionModel()),\n            patch.object(\n                layout,\n                \"convert_pdf_to_image\",\n                lambda *args, **kwargs: ([mock_image_path]),\n            ),\n        ):\n            doc = layout.DocumentLayout.from_file(\"sample-docs/layout-parser-paper.pdf\")\n            page = doc.pages[0]\n    for n, element in enumerate(page.elements):\n        assert element.text == str(n)\n\n\ndef test_page_layout_raises_when_multiple_models_passed(mock_image, mock_initial_layout):\n    with pytest.raises(ValueError):\n        layout.PageLayout(\n            0,\n            mock_image,\n            mock_initial_layout,\n            detection_model=\"something\",\n            element_extraction_model=\"something else\",\n        )\n\n\nclass MockElementExtractionModel:\n    def __call__(self, x):\n        return [1, 2, 3]\n\n\n@pytest.mark.parametrize((\"inplace\", \"expected\"), [(True, None), (False, [1, 2, 3])])\ndef test_get_elements_using_image_extraction(mock_image, inplace, expected):\n    page = layout.PageLayout(\n        1,\n        mock_image,\n        None,\n        element_extraction_model=MockElementExtractionModel(),\n    )\n    assert page.get_elements_using_image_extraction(inplace=inplace) == expected\n\n\ndef test_get_elements_using_image_extraction_raises_with_no_extraction_model(\n    mock_image,\n):\n    page = layout.PageLayout(1, mock_image, None, element_extraction_model=None)\n    with pytest.raises(ValueError):\n        page.get_elements_using_image_extraction()\n\n\ndef test_get_elements_with_detection_model_raises_with_wrong_default_model(monkeypatch):\n    monkeypatch.setattr(layout, \"get_model\", lambda *x: MockLayoutModel(mock_final_layout))\n    page = layout.PageLayout(1, mock_image, None)\n    with 
pytest.raises(NotImplementedError):\n        page.get_elements_with_detection_model()\n\n\n@pytest.mark.parametrize(\n    (\n        \"detection_model\",\n        \"element_extraction_model\",\n        \"detection_model_called\",\n        \"element_extraction_model_called\",\n    ),\n    [(None, \"asdf\", False, True), (\"asdf\", None, True, False)],\n)\ndef test_from_image(\n    mock_image,\n    detection_model,\n    element_extraction_model,\n    detection_model_called,\n    element_extraction_model_called,\n):\n    with (\n        patch.object(\n            layout.PageLayout,\n            \"get_elements_using_image_extraction\",\n        ) as mock_image_extraction,\n        patch.object(\n            layout.PageLayout,\n            \"get_elements_with_detection_model\",\n        ) as mock_detection,\n    ):\n        layout.PageLayout.from_image(\n            mock_image,\n            image_path=None,\n            detection_model=detection_model,\n            element_extraction_model=element_extraction_model,\n        )\n        assert mock_image_extraction.called == element_extraction_model_called\n        assert mock_detection.called == detection_model_called\n\n\nclass MockUnstructuredElementExtractionModel(UnstructuredElementExtractionModel):\n    def initialize(self, *args, **kwargs):\n        return super().initialize(*args, **kwargs)\n\n    def predict(self, x: Image):\n        return super().predict(x)\n\n\nclass MockUnstructuredDetectionModel(UnstructuredObjectDetectionModel):\n    def initialize(self, *args, **kwargs):\n        return super().initialize(*args, **kwargs)\n\n    def predict(self, x: Image):\n        return super().predict(x)\n\n\n@pytest.mark.parametrize(\n    (\"model_type\", \"is_detection_model\"),\n    [\n        (MockUnstructuredElementExtractionModel, False),\n        (MockUnstructuredDetectionModel, True),\n    ],\n)\ndef test_process_file_with_model_routing(monkeypatch, model_type, is_detection_model):\n    model = model_type()\n    monkeypatch.setattr(layout, \"get_model\", lambda *x: model)\n    with patch.object(layout.DocumentLayout, \"from_file\") as mock_from_file:\n        layout.process_file_with_model(\"asdf\", model_name=\"fake\", is_image=False)\n        if is_detection_model:\n            detection_model = model\n            element_extraction_model = None\n        else:\n            detection_model = None\n            element_extraction_model = model\n        mock_from_file.assert_called_once_with(\n            \"asdf\",\n            detection_model=detection_model,\n            element_extraction_model=element_extraction_model,\n            fixed_layouts=None,\n            password=None,\n            pdf_image_dpi=200,\n            pdf_render_max_pixels_per_page=None,\n        )\n\n\n@pytest.mark.parametrize((\"pdf_image_dpi\", \"expected\"), [(200, 2200), (100, 1100)])\ndef test_exposed_pdf_image_dpi(pdf_image_dpi, expected, monkeypatch):\n    with patch.object(layout.PageLayout, \"from_image\") as mock_from_image:\n        layout.DocumentLayout.from_file(\"sample-docs/loremipsum.pdf\", pdf_image_dpi=pdf_image_dpi)\n        assert mock_from_image.call_args[0][0].height == expected\n\n\ndef test_convert_pdf_to_image_no_output_folder():\n    result = layout.convert_pdf_to_image(filename=\"sample-docs/loremipsum.pdf\", dpi=72)\n    assert len(result) == 1\n    assert isinstance(result[0], Image.Image)\n\n\ndef _install_mock_pdfium(monkeypatch, *, width=720, height=720):\n    page = MagicMock()\n    page.get_width.return_value = width\n    
page.get_height.return_value = height\n    page.get_rotation.return_value = 0\n    page.render.return_value.to_pil.return_value = Image.new(\"RGB\", (1, 1))\n    pdf = MagicMock()\n    pdf.__len__.return_value = 1\n    pdf.__getitem__.return_value = page\n    pdfium = MagicMock()\n    pdfium.PdfDocument.return_value = pdf\n    monkeypatch.setattr(pdf_image, \"_get_pdfium_module\", lambda: pdfium)\n    return page\n\n\ndef test_convert_pdf_to_image_rejects_oversized_page_before_render(monkeypatch):\n    page = _install_mock_pdfium(monkeypatch)\n\n    with pytest.raises(pdf_image.PdfRenderTooLargeError, match=\"too many pixels\"):\n        pdf_image.convert_pdf_to_image(\n            filename=\"mock.pdf\",\n            dpi=100,\n            pdf_render_max_pixels_per_page=999_999,\n        )\n\n    page.render.assert_not_called()\n\n\ndef test_convert_pdf_to_image_allows_render_guard_to_be_disabled(monkeypatch):\n    page = _install_mock_pdfium(monkeypatch)\n\n    result = pdf_image.convert_pdf_to_image(\n        filename=\"mock.pdf\",\n        dpi=100,\n        pdf_render_max_pixels_per_page=0,\n    )\n\n    page.render.assert_called_once()\n    assert len(result) == 1\n    assert isinstance(result[0], Image.Image)\n\n\ndef test_page_hotload_preserves_render_max_pixels_per_page(monkeypatch, tmp_path):\n    image_path = tmp_path / \"page_1.png\"\n    Image.new(\"RGB\", (1, 1)).save(image_path)\n    calls = []\n\n    def fake_convert_pdf_to_image(**kwargs):\n        calls.append(kwargs)\n        return [str(image_path)]\n\n    monkeypatch.setattr(layout, \"convert_pdf_to_image\", fake_convert_pdf_to_image)\n    page = layout.PageLayout(\n        number=1,\n        image=Image.new(\"RGB\", (1, 1)),\n        document_filename=\"mock.pdf\",\n        pdf_render_max_pixels_per_page=None,\n    )\n\n    image = page._get_image(\"mock.pdf\", 1, pdf_image_dpi=123)\n\n    assert image.size == (1, 1)\n    assert calls[0][\"dpi\"] == 123\n    assert calls[0][\"pdf_render_max_pixels_per_page\"] is None\n\n\ndef test_convert_pdf_to_image_output_folder_returns_images(tmp_path):\n    result = layout.convert_pdf_to_image(\n        filename=\"sample-docs/loremipsum.pdf\",\n        dpi=72,\n        output_folder=tmp_path,\n        path_only=False,\n    )\n    assert len(result) == 1\n    assert isinstance(result[0], Image.Image)\n    saved = list(tmp_path.glob(\"*.png\"))\n    assert len(saved) == 1\n\n\ndef test_convert_pdf_to_image_path_only(tmp_path):\n    result = layout.convert_pdf_to_image(\n        filename=\"sample-docs/loremipsum.pdf\",\n        dpi=72,\n        output_folder=tmp_path,\n        path_only=True,\n    )\n    assert len(result) == 1\n    assert all(isinstance(p, str) for p in result)\n    for p in result:\n        assert os.path.exists(p)\n        assert p.endswith(\".png\")\n    saved = sorted(tmp_path.glob(\"*.png\"))\n    assert [str(s) for s in saved] == sorted(result)\n\n\ndef test_convert_pdf_to_image_applies_rotation_path_only(tmp_path):\n    \"\"\"Rotation is also applied when saving to disk (path_only mode).\"\"\"\n    result = layout.convert_pdf_to_image(\n        filename=\"sample-docs/rotated-page-90.pdf\",\n        dpi=72,\n        output_folder=tmp_path,\n        path_only=True,\n    )\n    assert len(result) == 1\n    saved = Image.open(result[0])\n    assert saved.height > saved.width, f\"Expected portrait after rotation, got {saved.size}\"\n\n\ndef test_convert_pdf_to_image_no_rotation_on_normal_pdf():\n    \"\"\"Non-rotated PDFs are unchanged.\"\"\"\n    result = 
layout.convert_pdf_to_image(filename=\"sample-docs/loremipsum.pdf\", dpi=72)\n    assert len(result) == 1\n    img = result[0]\n    # loremipsum.pdf is a standard portrait page - should stay portrait\n    assert img.height > img.width, f\"Expected portrait, got {img.size}\"\n\n\ndef test_convert_pdf_to_image_save_not_under_pdfium_lock(tmp_path):\n    \"\"\"Verify that PIL save (disk I/O) is NOT performed while holding _pdfium_lock.\"\"\"\n    original_save = Image.Image.save\n    lock_held_during_save = []\n\n    def spy_save(self, *args, **kwargs):\n        lock_held_during_save.append(layout._pdfium_lock.locked())\n        return original_save(self, *args, **kwargs)\n\n    with patch.object(Image.Image, \"save\", spy_save):\n        layout.convert_pdf_to_image(\n            filename=\"sample-docs/loremipsum.pdf\",\n            dpi=72,\n            output_folder=tmp_path,\n            path_only=True,\n        )\n    assert lock_held_during_save, \"save was never called\"\n    assert not any(lock_held_during_save), \"pil_image.save() was called while _pdfium_lock was held\"\n\n\ndef test_convert_pdf_to_image_concurrent_saves_not_serialized(tmp_path):\n    \"\"\"Two concurrent callers must be able to overlap their disk writes.\n\n    Uses a threading.Barrier to verify both threads are inside save()\n    simultaneously. If saves are serialized under _pdfium_lock, the second\n    thread can never reach save() while the first is there, so the barrier\n    times out and the test fails.\n    \"\"\"\n    import threading\n\n    original_save = Image.Image.save\n    barrier = threading.Barrier(2, timeout=5)\n    overlap_detected = threading.Event()\n\n    def barrier_save(self, *args, **kwargs):\n        try:\n            barrier.wait()\n            overlap_detected.set()\n        except threading.BrokenBarrierError:\n            pass\n        return original_save(self, *args, **kwargs)\n\n    errors: list[str] = []\n\n    def run(folder):\n        try:\n            layout.convert_pdf_to_image(\n                filename=\"sample-docs/loremipsum.pdf\",\n                dpi=72,\n                output_folder=folder,\n                path_only=True,\n            )\n        except Exception as exc:\n            errors.append(str(exc))\n\n    dir_a = tmp_path / \"a\"\n    dir_b = tmp_path / \"b\"\n    dir_a.mkdir()\n    dir_b.mkdir()\n\n    with patch.object(Image.Image, \"save\", barrier_save):\n        t1 = threading.Thread(target=run, args=(dir_a,))\n        t2 = threading.Thread(target=run, args=(dir_b,))\n        t1.start()\n        t2.start()\n        t1.join(timeout=10)\n        t2.join(timeout=10)\n\n    assert not errors, f\"threads raised: {errors}\"\n    assert overlap_detected.is_set(), (\n        \"saves were serialized under _pdfium_lock — threads could not overlap\"\n    )\n    assert list(dir_a.glob(\"*.png\")), \"thread A produced no output\"\n    assert list(dir_b.glob(\"*.png\")), \"thread B produced no output\"\n\n\ndef test_render_can_proceed_while_other_thread_saves(tmp_path):\n    \"\"\"Thread B can acquire _pdfium_lock and render while thread A is in save().\n\n    Blocks thread A inside save() (outside the lock), then starts thread B.\n    If B completes entirely while A is still blocked, the lock was not held\n    during save — rendering and saving can overlap across callers.\n    \"\"\"\n    import threading\n\n    original_save = Image.Image.save\n    a_in_save = threading.Event()\n    b_done = threading.Event()\n\n    dir_a = tmp_path / \"a\"\n    dir_b = tmp_path / \"b\"\n 
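   # gated_save keys off the output path, so each caller needs its own directory.\n 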
   dir_a.mkdir()\n    dir_b.mkdir()\n\n    def gated_save(self, *args, **kwargs):\n        fp = str(args[0]) if args else \"\"\n        if str(dir_a) in fp:\n            a_in_save.set()\n            b_done.wait(timeout=5)\n        return original_save(self, *args, **kwargs)\n\n    errors: list[str] = []\n\n    def run(folder, done_event=None):\n        try:\n            layout.convert_pdf_to_image(\n                filename=\"sample-docs/loremipsum.pdf\",\n                dpi=72,\n                output_folder=folder,\n                path_only=True,\n            )\n        except Exception as exc:\n            errors.append(str(exc))\n        finally:\n            if done_event:\n                done_event.set()\n\n    with patch.object(Image.Image, \"save\", gated_save):\n        t_a = threading.Thread(target=run, args=(dir_a,))\n        t_b = threading.Thread(target=run, args=(dir_b, b_done))\n        t_a.start()\n        a_in_save.wait(timeout=5)\n        # A is now blocked in save (outside lock). B should render + save freely.\n        t_b.start()\n        t_b.join(timeout=10)\n        t_a.join(timeout=10)\n\n    assert not errors, f\"threads raised: {errors}\"\n    assert b_done.is_set(), \"Thread B could not complete while A was saving\"\n    assert list(dir_a.glob(\"*.png\")), \"thread A produced no output\"\n    assert list(dir_b.glob(\"*.png\")), \"thread B produced no output\"\n\n\ndef test_multi_page_concurrent_output_complete(tmp_path):\n    \"\"\"Two threads processing a multi-page PDF both produce correct, complete output.\"\"\"\n    import threading\n\n    errors: list[str] = []\n\n    def run(folder):\n        try:\n            layout.convert_pdf_to_image(\n                filename=\"sample-docs/loremipsum_multipage.pdf\",\n                dpi=72,\n                output_folder=folder,\n                path_only=True,\n            )\n        except Exception as exc:\n            errors.append(str(exc))\n\n    dir_a = tmp_path / \"a\"\n    dir_b = tmp_path / \"b\"\n    dir_a.mkdir()\n    dir_b.mkdir()\n\n    t1 = threading.Thread(target=run, args=(dir_a,))\n    t2 = threading.Thread(target=run, args=(dir_b,))\n    t1.start()\n    t2.start()\n    t1.join(timeout=60)\n    t2.join(timeout=60)\n\n    assert not errors, f\"threads raised: {errors}\"\n    a_files = sorted(dir_a.glob(\"*.png\"))\n    b_files = sorted(dir_b.glob(\"*.png\"))\n    assert len(a_files) == 10, f\"thread A produced {len(a_files)} files, expected 10\"\n    assert len(b_files) == 10, f\"thread B produced {len(b_files)} files, expected 10\"\n    for i in range(1, 11):\n        assert (dir_a / f\"page_{i}.png\").exists(), f\"thread A missing page_{i}.png\"\n        assert (dir_b / f\"page_{i}.png\").exists(), f\"thread B missing page_{i}.png\"\n\n\ndef test_error_in_one_thread_does_not_block_other(tmp_path):\n    \"\"\"If one thread fails mid-processing, the other still completes.\"\"\"\n    import threading\n\n    original_save = Image.Image.save\n\n    dir_a = tmp_path / \"a\"\n    dir_b = tmp_path / \"b\"\n    dir_a.mkdir()\n    dir_b.mkdir()\n\n    def failing_save(self, *args, **kwargs):\n        fp = str(args[0]) if args else \"\"\n        if str(dir_a) in fp:\n            raise OSError(\"simulated disk failure\")\n        return original_save(self, *args, **kwargs)\n\n    a_error: list[Exception] = []\n    b_result: list[str] = []\n    b_error: list[Exception] = []\n\n    def run_a():\n        try:\n            layout.convert_pdf_to_image(\n                filename=\"sample-docs/loremipsum.pdf\",\n    
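            # this caller writes under dir_a, where failing_save raises, so it is expected to fail\n    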
            dpi=72,\n                output_folder=dir_a,\n                path_only=True,\n            )\n        except Exception as exc:\n            a_error.append(exc)\n\n    def run_b():\n        try:\n            result = layout.convert_pdf_to_image(\n                filename=\"sample-docs/loremipsum.pdf\",\n                dpi=72,\n                output_folder=dir_b,\n                path_only=True,\n            )\n            b_result.extend(result)\n        except Exception as exc:\n            b_error.append(exc)\n\n    with patch.object(Image.Image, \"save\", failing_save):\n        t_a = threading.Thread(target=run_a)\n        t_b = threading.Thread(target=run_b)\n        t_a.start()\n        t_b.start()\n        t_a.join(timeout=10)\n        t_b.join(timeout=10)\n\n    assert a_error, \"Thread A should have failed\"\n    assert not b_error, f\"Thread B should have succeeded: {b_error}\"\n    assert b_result, \"Thread B produced no result\"\n    assert list(dir_b.glob(\"*.png\")), \"Thread B produced no output files\"\n\n\n@pytest.mark.parametrize(\n    (\"filename\", \"img_num\", \"should_complete\"),\n    [\n        (\"sample-docs/empty-document.pdf\", 0, True),\n        (\"sample-docs/empty-document.pdf\", 10, False),\n    ],\n)\ndef test_get_image(filename, img_num, should_complete):\n    doc = layout.DocumentLayout.from_file(filename)\n    page = doc.pages[0]\n    try:\n        img = page._get_image(filename, img_num)\n        # transform img to numpy array\n        img = np.array(img)\n        # is a blank image with all pixels white\n        assert img.mean() == 255.0\n    except ValueError:\n        assert not should_complete\n"
  },
  {
    "path": "test_unstructured_inference/inference/test_layout_element.py",
    "content": "from unstructured_inference.constants import IsExtracted, Source\nfrom unstructured_inference.inference.layoutelement import LayoutElement, TextRegion\n\n\ndef test_layout_element_to_dict(mock_layout_element):\n    expected = {\n        \"coordinates\": ((100, 100), (100, 300), (300, 300), (300, 100)),\n        \"text\": \"Sample text\",\n        \"is_extracted\": None,\n        \"type\": \"Text\",\n        \"prob\": None,\n        \"source\": None,\n    }\n\n    assert mock_layout_element.to_dict() == expected\n\n\ndef test_layout_element_from_region(mock_rectangle):\n    expected = LayoutElement.from_coords(100, 100, 300, 300)\n    region = TextRegion(bbox=mock_rectangle)\n\n    assert LayoutElement.from_region(region) == expected\n\n\ndef test_layoutelement_inheritance_works_correctly():\n    \"\"\"Test that LayoutElement properly inherits from TextRegion without conflicts\"\"\"\n    from unstructured_inference.inference.elements import TextRegion\n\n    # Create a TextRegion with both source and text_source\n    region = TextRegion.from_coords(\n        0, 0, 10, 10, text=\"test\", source=Source.YOLOX, is_extracted=IsExtracted.TRUE\n    )\n\n    # Convert to LayoutElement\n    element = LayoutElement.from_region(region)\n\n    # Check that both properties are preserved\n    assert element.source == Source.YOLOX, \"LayoutElement should inherit source from TextRegion\"\n    assert element.is_extracted == IsExtracted.TRUE, (\n        \"LayoutElement should inherit is_extracted from TextRegion\"\n    )\n\n    # Check that to_dict() works correctly\n    d = element.to_dict()\n    assert d[\"source\"] == Source.YOLOX\n    assert d[\"is_extracted\"] == IsExtracted.TRUE\n\n    # Check that we can set source directly on LayoutElement\n    element.source = Source.DETECTRON2_ONNX\n    assert element.source == Source.DETECTRON2_ONNX\n"
  },
  {
    "path": "test_unstructured_inference/inference/test_layout_rotation.py",
    "content": "from __future__ import annotations\n\nimport numpy as np\n\nfrom unstructured_inference.inference import pdf_image\n\n\ndef test_convert_pdf_to_image_applies_rotation():\n    \"\"\"Pages with /Rotate metadata are rendered upright.\"\"\"\n    result = pdf_image.convert_pdf_to_image(filename=\"sample-docs/rotated-page-90.pdf\", dpi=72)\n    assert len(result) == 1\n    img = result[0]\n    # The PDF has /Rotate=90 on a landscape page (width > height in PDF units).\n    # Without rotation fix the rendered image would be landscape; with the fix it's portrait.\n    assert img.height > img.width, f\"Expected portrait after rotation, got {img.size}\"\n\n    # Fixture contract: rotated-page-90.pdf has visible dark text in the upper half when upright.\n    # Use relative dark-pixel counts to reduce sensitivity to minor renderer differences.\n    gray = np.array(img.convert(\"L\"))\n    split = gray.shape[0] // 2\n    top_dark_pixels = int(np.count_nonzero(gray[:split] < 245))\n    bottom_dark_pixels = int(np.count_nonzero(gray[split:] < 245))\n\n    assert top_dark_pixels > 0, \"Expected text pixels in upper half of upright page\"\n    assert top_dark_pixels > max(bottom_dark_pixels * 10, 50), (\n        \"Expected substantially more dark pixels in upper half for upright orientation; \"\n        f\"got top={top_dark_pixels}, bottom={bottom_dark_pixels}\"\n    )\n"
  },
  {
    "path": "test_unstructured_inference/models/test_detectron2onnx.py",
    "content": "import os\nfrom unittest.mock import patch\n\nimport pytest\nfrom PIL import Image\n\nimport unstructured_inference.models.base as models\nimport unstructured_inference.models.detectron2onnx as detectron2\n\n\nclass MockDetectron2ONNXLayoutModel:\n    def __init__(self, *args, **kwargs):\n        self.args = args\n        self.kwargs = kwargs\n\n    def run(self, *args):\n        return ([(1, 2, 3, 4)], [0], [(4, 5)], [0.818])\n\n    def get_inputs(self):\n        class input_thing:\n            name = \"Bernard\"\n\n        return [input_thing()]\n\n\ndef test_load_default_model(monkeypatch):\n    monkeypatch.setattr(models, \"models\", {})\n    with patch.object(\n        detectron2.onnxruntime,\n        \"InferenceSession\",\n        new=MockDetectron2ONNXLayoutModel,\n    ):\n        model = models.get_model(\"detectron2_mask_rcnn\")\n\n    assert isinstance(model.model, MockDetectron2ONNXLayoutModel)\n\n\n@pytest.mark.parametrize((\"model_path\", \"label_map\"), [(\"asdf\", \"diufs\"), (\"dfaw\", \"hfhfhfh\")])\ndef test_load_model(model_path, label_map):\n    with patch.object(detectron2.onnxruntime, \"InferenceSession\", return_value=True):\n        model = detectron2.UnstructuredDetectronONNXModel()\n        model.initialize(model_path=model_path, label_map=label_map)\n        args, _ = detectron2.onnxruntime.InferenceSession.call_args\n        assert args == (model_path,)\n    assert label_map == model.label_map\n\n\ndef test_unstructured_detectron_model():\n    model = detectron2.UnstructuredDetectronONNXModel()\n    model.model = 1\n    with patch.object(detectron2.UnstructuredDetectronONNXModel, \"predict\", return_value=[]):\n        result = model(None)\n    assert isinstance(result, list)\n    assert len(result) == 0\n\n\ndef test_inference():\n    with patch.object(\n        detectron2.onnxruntime,\n        \"InferenceSession\",\n        return_value=MockDetectron2ONNXLayoutModel(),\n    ):\n        model = detectron2.UnstructuredDetectronONNXModel()\n        model.initialize(model_path=\"test_path\", label_map={0: \"test_class\"})\n        assert isinstance(model.model, MockDetectron2ONNXLayoutModel)\n        with open(os.path.join(\"sample-docs\", \"receipt-sample.jpg\"), mode=\"rb\") as fp:\n            image = Image.open(fp)\n            image.load()\n        elements = model(image)\n        assert len(elements) == 1\n        element = elements[0]\n        (x1, y1), _, (x2, y2), _ = element.bbox.coordinates\n        assert hasattr(\n            element,\n            \"prob\",\n        )  # NOTE(pravin) New Assertion to Make Sure element has probabilities\n        assert isinstance(\n            element.prob,\n            float,\n        )  # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float\n        # NOTE(alan): The bbox coordinates get resized, so check their relative proportions\n        assert x2 / x1 == pytest.approx(3.0)  # x1 == 1, x2 == 3 before scaling\n        assert y2 / y1 == pytest.approx(2.0)  # y1 == 2, y2 == 4 before scaling\n        assert element.type == \"test_class\"\n"
  },
  {
    "path": "test_unstructured_inference/models/test_eval.py",
    "content": "import pytest\n\nfrom unstructured_inference.inference.layoutelement import table_cells_to_dataframe\nfrom unstructured_inference.models.eval import compare_contents_as_df, default_tokenizer\n\n\n@pytest.fixture\ndef actual_cells():\n    return [\n        {\n            \"column_nums\": [0],\n            \"row_nums\": [0, 1],\n            \"column header\": True,\n            \"cell text\": \"Disability Category\",\n        },\n        {\n            \"column_nums\": [1],\n            \"row_nums\": [0, 1],\n            \"column header\": True,\n            \"cell text\": \"Participants\",\n        },\n        {\n            \"column_nums\": [2],\n            \"row_nums\": [0, 1],\n            \"column header\": True,\n            \"cell text\": \"Ballots Completed\",\n        },\n        {\n            \"column_nums\": [3],\n            \"row_nums\": [0, 1],\n            \"column header\": True,\n            \"cell text\": \"Ballots Incomplete/Terminated\",\n        },\n        {\"column_nums\": [4, 5], \"row_nums\": [0], \"column header\": True, \"cell text\": \"Results\"},\n        {\"column_nums\": [4], \"row_nums\": [1], \"column header\": False, \"cell text\": \"Accuracy\"},\n        {\n            \"column_nums\": [5],\n            \"row_nums\": [1],\n            \"column header\": False,\n            \"cell text\": \"Time to complete\",\n        },\n        {\"column_nums\": [0], \"row_nums\": [2], \"column header\": False, \"cell text\": \"Blind\"},\n        {\"column_nums\": [0], \"row_nums\": [3], \"column header\": False, \"cell text\": \"Low Vision\"},\n        {\"column_nums\": [0], \"row_nums\": [4], \"column header\": False, \"cell text\": \"Dexterity\"},\n        {\"column_nums\": [0], \"row_nums\": [5], \"column header\": False, \"cell text\": \"Mobility\"},\n        {\"column_nums\": [1], \"row_nums\": [2], \"column header\": False, \"cell text\": \"5\"},\n        {\"column_nums\": [1], \"row_nums\": [3], \"column header\": False, \"cell text\": \"5\"},\n        {\"column_nums\": [1], \"row_nums\": [4], \"column header\": False, \"cell text\": \"5\"},\n        {\"column_nums\": [1], \"row_nums\": [5], \"column header\": False, \"cell text\": \"3\"},\n        {\"column_nums\": [2], \"row_nums\": [2], \"column header\": False, \"cell text\": \"1\"},\n        {\"column_nums\": [2], \"row_nums\": [3], \"column header\": False, \"cell text\": \"2\"},\n        {\"column_nums\": [2], \"row_nums\": [4], \"column header\": False, \"cell text\": \"4\"},\n        {\"column_nums\": [2], \"row_nums\": [5], \"column header\": False, \"cell text\": \"3\"},\n        {\"column_nums\": [3], \"row_nums\": [2], \"column header\": False, \"cell text\": \"4\"},\n        {\"column_nums\": [3], \"row_nums\": [3], \"column header\": False, \"cell text\": \"3\"},\n        {\"column_nums\": [3], \"row_nums\": [4], \"column header\": False, \"cell text\": \"1\"},\n        {\"column_nums\": [3], \"row_nums\": [5], \"column header\": False, \"cell text\": \"0\"},\n        {\"column_nums\": [4], \"row_nums\": [2], \"column header\": False, \"cell text\": \"34.5%, n=1\"},\n        {\n            \"column_nums\": [4],\n            \"row_nums\": [3],\n            \"column header\": False,\n            \"cell text\": \"98.3% n=2 (97.7%, n=3)\",\n        },\n        {\"column_nums\": [4], \"row_nums\": [4], \"column header\": False, \"cell text\": \"98.3%, n=4\"},\n        {\"column_nums\": [4], \"row_nums\": [5], \"column header\": False, \"cell text\": \"95.4%, n=3\"},\n        
{\"column_nums\": [5], \"row_nums\": [2], \"column header\": False, \"cell text\": \"1199 sec, n=1\"},\n        {\n            \"column_nums\": [5],\n            \"row_nums\": [3],\n            \"column header\": False,\n            \"cell text\": \"1716 sec, n=3 (1934 sec, n=2)\",\n        },\n        {\n            \"column_nums\": [5],\n            \"row_nums\": [4],\n            \"column header\": False,\n            \"cell text\": \"1672.1 sec, n=4\",\n        },\n        {\"column_nums\": [5], \"row_nums\": [5], \"column header\": False, \"cell text\": \"1416 sec, n=3\"},\n    ]\n\n\n@pytest.fixture\ndef pred_cells():\n    return [\n        {\"column_nums\": [0], \"row_nums\": [2], \"column header\": False, \"cell text\": \"Blind\"},\n        {\"column_nums\": [0], \"row_nums\": [3], \"column header\": False, \"cell text\": \"Low Vision\"},\n        {\"column_nums\": [0], \"row_nums\": [4], \"column header\": False, \"cell text\": \"Dexterity\"},\n        {\"column_nums\": [0], \"row_nums\": [5], \"column header\": False, \"cell text\": \"Mobility\"},\n        {\"column_nums\": [1], \"row_nums\": [2], \"column header\": False, \"cell text\": \"5\"},\n        {\"column_nums\": [1], \"row_nums\": [3], \"column header\": False, \"cell text\": \"5\"},\n        {\"column_nums\": [1], \"row_nums\": [4], \"column header\": False, \"cell text\": \"5\"},\n        {\"column_nums\": [1], \"row_nums\": [5], \"column header\": False, \"cell text\": \"3\"},\n        {\"column_nums\": [2], \"row_nums\": [2], \"column header\": False, \"cell text\": \"1\"},\n        {\"column_nums\": [2], \"row_nums\": [3], \"column header\": False, \"cell text\": \"2\"},\n        {\"column_nums\": [2], \"row_nums\": [4], \"column header\": False, \"cell text\": \"4\"},\n        {\"column_nums\": [2], \"row_nums\": [5], \"column header\": False, \"cell text\": \"3\"},\n        {\"column_nums\": [3], \"row_nums\": [2], \"column header\": False, \"cell text\": \"4\"},\n        {\"column_nums\": [3], \"row_nums\": [3], \"column header\": False, \"cell text\": \"3\"},\n        {\"column_nums\": [3], \"row_nums\": [4], \"column header\": False, \"cell text\": \"1\"},\n        {\"column_nums\": [3], \"row_nums\": [5], \"column header\": False, \"cell text\": \"0\"},\n        {\"column_nums\": [4], \"row_nums\": [1], \"column header\": False, \"cell text\": \"Accuracy\"},\n        {\"column_nums\": [4], \"row_nums\": [2], \"column header\": False, \"cell text\": \"34.5%, n=1\"},\n        {\n            \"column_nums\": [4],\n            \"row_nums\": [3],\n            \"column header\": False,\n            \"cell text\": \"98.3% n=2 (97.7%, n=3)\",\n        },\n        {\"column_nums\": [4], \"row_nums\": [4], \"column header\": False, \"cell text\": \"98.3%, n=4\"},\n        {\"column_nums\": [4], \"row_nums\": [5], \"column header\": False, \"cell text\": \"95.4%, n=3\"},\n        {\n            \"column_nums\": [5],\n            \"row_nums\": [1],\n            \"column header\": False,\n            \"cell text\": \"Time to complete\",\n        },\n        {\"column_nums\": [5], \"row_nums\": [2], \"column header\": False, \"cell text\": \"1199 sec, n=1\"},\n        {\n            \"column_nums\": [5],\n            \"row_nums\": [3],\n            \"column header\": False,\n            \"cell text\": \"1716 sec, n=3 | (1934 sec, n=2)\",\n        },\n        {\n            \"column_nums\": [5],\n            \"row_nums\": [4],\n            \"column header\": False,\n            \"cell text\": \"1672.1 sec, n=4\",\n        
},\n        {\"column_nums\": [5], \"row_nums\": [5], \"column header\": False, \"cell text\": \"1416 sec, n=3\"},\n        {\n            \"column_nums\": [0],\n            \"row_nums\": [0, 1],\n            \"column header\": True,\n            \"cell text\": \"soa etealeiliay Category\",\n        },\n        {\"column_nums\": [4, 5], \"row_nums\": [0], \"column header\": True, \"cell text\": \"Results\"},\n        {\n            \"column_nums\": [1],\n            \"row_nums\": [0, 1],\n            \"column header\": True,\n            \"cell text\": \"Participants P\",\n        },\n        {\n            \"column_nums\": [2],\n            \"row_nums\": [0, 1],\n            \"column header\": True,\n            \"cell text\": \"pallets Completed\",\n        },\n        {\n            \"column_nums\": [3],\n            \"row_nums\": [0, 1],\n            \"column header\": True,\n            \"cell text\": \"Ballot: incom lete/ Ne Terminated\",\n        },\n    ]\n\n\n@pytest.fixture\ndef actual_df(actual_cells):\n    return table_cells_to_dataframe(actual_cells).fillna(\"\")\n\n\n@pytest.fixture\ndef pred_df(pred_cells):\n    return table_cells_to_dataframe(pred_cells).fillna(\"\")\n\n\n@pytest.mark.parametrize(\n    (\"eval_func\", \"processor\"),\n    [\n        (\"token_ratio\", default_tokenizer),\n        (\"token_ratio\", None),\n        (\"partial_token_ratio\", default_tokenizer),\n        (\"ratio\", None),\n        (\"ratio\", default_tokenizer),\n        (\"partial_ratio\", default_tokenizer),\n    ],\n)\ndef test_compare_content_as_df(actual_df, pred_df, eval_func, processor):\n    results = compare_contents_as_df(actual_df, pred_df, eval_func=eval_func, processor=processor)\n    assert 0 < results.get(f\"by_col_{eval_func}\") < 100\n\n\ndef test_compare_content_as_df_with_invalid_input(actual_df, pred_df):\n    with pytest.raises(ValueError, match=\"eval_func must be one of\"):\n        compare_contents_as_df(actual_df, pred_df, eval_func=\"foo\")\n"
  },
  {
    "path": "test_unstructured_inference/models/test_model.py",
    "content": "import json\nimport threading\nimport time\nfrom typing import Any\nfrom unittest import mock\n\nimport numpy as np\nimport pytest\n\nimport unstructured_inference.models.base as models\nfrom unstructured_inference.inference.layoutelement import LayoutElement, LayoutElements\nfrom unstructured_inference.models.unstructuredmodel import (\n    ModelNotInitializedError,\n    UnstructuredObjectDetectionModel,\n)\n\n\nclass MockModel(UnstructuredObjectDetectionModel):\n    call_count = 0\n\n    def __init__(self):\n        self.initializer = mock.MagicMock()\n        super().__init__()\n\n    def initialize(self, *args, **kwargs):\n        return self.initializer(self, *args, **kwargs)\n\n    def predict(self, x: Any) -> Any:\n        return LayoutElements(element_coords=np.array([]))\n\n\nMOCK_MODEL_TYPES = {\n    \"foo\": {\n        \"input_shape\": (640, 640),\n    },\n}\n\n\ndef test_get_model(monkeypatch):\n    monkeypatch.setattr(models, \"models\", {})\n    with mock.patch.dict(models.model_class_map, {\"yolox\": MockModel}):\n        assert isinstance(models.get_model(\"yolox\"), MockModel)\n\n\ndef test_get_model_threaded(monkeypatch):\n    \"\"\"Test that get_model works correctly when called from multiple threads simultaneously.\"\"\"\n    monkeypatch.setattr(models, \"models\", {})\n\n    # Results and exceptions from threads will be stored here\n    results = []\n    exceptions = []\n\n    def get_model_worker(thread_id):\n        \"\"\"Worker function for each thread.\"\"\"\n        try:\n            model = models.get_model(\"yolox\")\n            results.append((thread_id, model))\n        except Exception as e:\n            exceptions.append((thread_id, e))\n\n    # Create and start multiple threads\n    num_threads = 10\n    threads = []\n\n    with mock.patch.dict(models.model_class_map, {\"yolox\": MockModel}):\n        for i in range(num_threads):\n            thread = threading.Thread(target=get_model_worker, args=(i,))\n            threads.append(thread)\n            thread.start()\n\n        # Wait for all threads to complete\n        for thread in threads:\n            thread.join()\n\n    # Verify no exceptions occurred\n    assert len(exceptions) == 0, f\"Exceptions occurred in threads: {exceptions}\"\n\n    # Verify all threads got results\n    assert len(results) == num_threads, f\"Expected {num_threads} results, got {len(results)}\"\n\n    # Verify all results are MockModel instances\n    for thread_id, model in results:\n        assert isinstance(model, MockModel), (\n            f\"Thread {thread_id} got unexpected model type: {type(model)}\"\n        )\n\n\ndef test_get_model_concurrent_different_models(monkeypatch):\n    \"\"\"Test that different models can load in parallel without serialization.\"\"\"\n    monkeypatch.setattr(models, \"models\", {})\n\n    # Track initialization timing\n    init_events = []\n    init_lock = threading.Lock()\n\n    class SlowMockModel(MockModel):\n        def __init__(self):\n            super().__init__()\n            self.model_name = None\n\n        def initialize(self, *args, **kwargs):\n            with init_lock:\n                init_events.append((self.model_name, \"start\"))\n            time.sleep(0.1)  # Simulate slow loading\n            with init_lock:\n                init_events.append((self.model_name, \"end\"))\n            return super().initialize(*args, **kwargs)\n\n    # Store model names in instances\n    def create_model_with_name(name):\n        def factory():\n            model = 
SlowMockModel()\n            model.model_name = name\n            return model\n\n        return factory\n\n    results = []\n\n    def worker(model_name):\n        models.get_model(model_name)  # Load the model\n        results.append(model_name)\n\n    # Load 2 different models concurrently\n    threads = []\n    mock_config = {\"input_shape\": (640, 640)}\n    with (\n        mock.patch.dict(\n            models.model_class_map,\n            {\n                \"yolox\": create_model_with_name(\"yolox\"),\n                \"detectron2\": create_model_with_name(\"detectron2\"),\n            },\n        ),\n        mock.patch.dict(models.model_config_map, {\"yolox\": mock_config, \"detectron2\": mock_config}),\n    ):\n        for model_name in [\"yolox\", \"detectron2\"]:\n            thread = threading.Thread(target=worker, args=(model_name,))\n            threads.append(thread)\n            thread.start()\n\n        for thread in threads:\n            thread.join()\n\n    # Both models should load successfully\n    assert len(results) == 2\n\n    # Verify parallel execution (both start before either ends)\n    assert len(init_events) == 4, f\"Expected 4 events (2 starts + 2 ends), got {len(init_events)}\"\n\n    # True parallelism means both models start before either finishes\n    # Find when the first model finishes\n    first_end_idx = next(\n        (i for i, (_, event_type) in enumerate(init_events) if event_type == \"end\"), None\n    )\n    assert first_end_idx is not None, \"No 'end' event found\"\n\n    # Count how many models started before the first one finished\n    starts_before_first_end = sum(\n        1 for _, event_type in init_events[:first_end_idx] if event_type == \"start\"\n    )\n    assert starts_before_first_end == 2, (\n        f\"Expected both models to start before either finishes (parallel execution), \"\n        f\"but only {starts_before_first_end} started before first completion. 
\"\n        f\"Events: {init_events}\"\n    )\n\n\ndef test_register_new_model():\n    assert \"foo\" not in models.model_class_map\n    assert \"foo\" not in models.model_config_map\n    models.register_new_model(MOCK_MODEL_TYPES, MockModel)\n    assert \"foo\" in models.model_class_map\n    assert \"foo\" in models.model_config_map\n    model = models.get_model(\"foo\")\n    assert len(model.initializer.mock_calls) == 1\n    assert model.initializer.mock_calls[0][-1] == MOCK_MODEL_TYPES[\"foo\"]\n    assert isinstance(model, MockModel)\n    # unregister the new model by resetting the mappings to their defaults\n    models.model_class_map, models.model_config_map = models.get_default_model_mappings()\n    assert \"foo\" not in models.model_class_map\n    assert \"foo\" not in models.model_config_map\n\n\ndef test_get_model_with_lazydict_config(monkeypatch):\n    \"\"\"get_model must unpack a LazyDict config into initialize() without\n    depending on Mapping.keys() — prevents regression of\n    'argument after ** must be a mapping, not LazyDict' in prod.\n    \"\"\"\n    from unstructured_inference.utils import LazyDict, LazyEvaluateInfo\n\n    monkeypatch.setattr(models, \"models\", {})\n\n    evaluated = []\n\n    def _fake_download(path):\n        evaluated.append(path)\n        return path\n\n    lazy_config = LazyDict(\n        model_path=LazyEvaluateInfo(_fake_download, \"/tmp/weights.onnx\"),\n        input_shape=(640, 640),\n    )\n\n    with (\n        mock.patch.dict(models.model_class_map, {\"lazy_mock\": MockModel}),\n        mock.patch.dict(models.model_config_map, {\"lazy_mock\": lazy_config}),\n    ):\n        model = models.get_model(\"lazy_mock\")\n\n    assert isinstance(model, MockModel)\n    assert evaluated == [\"/tmp/weights.onnx\"]\n    model.initializer.assert_called_once_with(\n        model,\n        model_path=\"/tmp/weights.onnx\",\n        input_shape=(640, 640),\n    )\n\n\ndef test_raises_invalid_model():\n    with pytest.raises(models.UnknownModelException):\n        models.get_model(\"fake_model\")\n\n\ndef test_raises_uninitialized():\n    with pytest.raises(ModelNotInitializedError):\n        models.UnstructuredDetectronONNXModel().predict(None)\n\n\ndef test_model_initializes_once():\n    from unstructured_inference.inference import layout\n\n    with (\n        mock.patch.dict(models.model_class_map, {\"yolox\": MockModel}),\n        mock.patch.object(\n            models,\n            \"models\",\n            {},\n        ),\n    ):\n        doc = layout.DocumentLayout.from_file(\"sample-docs/loremipsum.pdf\")\n        doc.pages[0].detection_model.initializer.assert_called_once()\n\n\ndef test_deduplicate_detected_elements():\n    import numpy as np\n\n    from unstructured_inference.inference.elements import intersections\n    from unstructured_inference.inference.layout import DocumentLayout\n    from unstructured_inference.models.base import get_model\n\n    model = get_model(\"yolox_quantized\")\n    # model.confidence_threshold=0.5\n    file = \"sample-docs/example_table.jpg\"\n    doc = DocumentLayout.from_image_file(\n        file,\n        model,\n    )\n    known_elements = [e.bbox for e in doc.pages[0].elements if e.type != \"UncategorizedText\"]\n    # Compute intersection matrix\n    intersections_mtx = intersections(*known_elements)\n    # Clear the diagonal (an element always intersects itself)\n    np.fill_diagonal(intersections_mtx, False)\n    
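# intersections() returns a boolean matrix whose (i, j) entry is True when boxes i and j overlap.\n    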
# With the diagonal cleared, every entry should now be False: deduplication leaves no two distinct elements overlapping\n    assert not intersections_mtx.any()\n\n\ndef test_enhance_regions():\n    from unstructured_inference.inference.elements import Rectangle\n    from unstructured_inference.models.base import get_model\n\n    elements = [\n        LayoutElement(bbox=Rectangle(0, 0, 1, 1)),\n        LayoutElement(bbox=Rectangle(0.01, 0.01, 1.01, 1.01)),\n        LayoutElement(bbox=Rectangle(0.02, 0.02, 1.02, 1.02)),\n        LayoutElement(bbox=Rectangle(0.03, 0.03, 1.03, 1.03)),\n        LayoutElement(bbox=Rectangle(0.04, 0.04, 1.04, 1.04)),\n        LayoutElement(bbox=Rectangle(0.05, 0.05, 1.05, 1.05)),\n        LayoutElement(bbox=Rectangle(0.06, 0.06, 1.06, 1.06)),\n        LayoutElement(bbox=Rectangle(0.07, 0.07, 1.07, 1.07)),\n        LayoutElement(bbox=Rectangle(0.08, 0.08, 1.08, 1.08)),\n        LayoutElement(bbox=Rectangle(0.09, 0.09, 1.09, 1.09)),\n        LayoutElement(bbox=Rectangle(0.10, 0.10, 1.10, 1.10)),\n    ]\n    model = get_model(\"yolox_tiny\")\n    elements = model.enhance_regions(elements, 0.5)\n    assert len(elements) == 1\n    assert (\n        elements[0].bbox.x1,\n        elements[0].bbox.y1,\n        elements[0].bbox.x2,\n        elements[0].bbox.y2,\n    ) == (\n        0,\n        0,\n        1.10,\n        1.10,\n    )\n\n\ndef test_clean_type():\n    from unstructured_inference.inference.layout import LayoutElement\n    from unstructured_inference.models.base import get_model\n\n    elements = [\n        LayoutElement.from_coords(\n            0.6,\n            0.6,\n            0.65,\n            0.65,\n            type=\"Table\",\n        ),  # One little table nested inside all the others\n        LayoutElement.from_coords(0.5, 0.5, 0.7, 0.7, type=\"Table\"),  # One nested table\n        LayoutElement.from_coords(0, 0, 1, 1, type=\"Table\"),  # Big table\n        LayoutElement.from_coords(0.01, 0.01, 1.01, 1.01),\n        LayoutElement.from_coords(0.02, 0.02, 1.02, 1.02),\n        LayoutElement.from_coords(0.03, 0.03, 1.03, 1.03),\n        LayoutElement.from_coords(0.04, 0.04, 1.04, 1.04),\n        LayoutElement.from_coords(0.05, 0.05, 1.05, 1.05),\n    ]\n    model = get_model(\"yolox_tiny\")\n    elements = model.clean_type(elements, type_to_clean=\"Table\")\n    assert len(elements) == 1\n    assert (\n        elements[0].bbox.x1,\n        elements[0].bbox.y1,\n        elements[0].bbox.x2,\n        elements[0].bbox.y2,\n    ) == (0, 0, 1, 1)\n\n\ndef test_env_variables_override_default_model(monkeypatch):\n    # When an environment variable specifies a different default model and we call get_model with no\n    # args, we should get back the model the env var calls for\n    monkeypatch.setattr(models, \"models\", {})\n    with (\n        mock.patch.dict(\n            models.os.environ,\n            {\"UNSTRUCTURED_DEFAULT_MODEL_NAME\": \"yolox\"},\n        ),\n        mock.patch.dict(models.model_class_map, {\"yolox\": MockModel}),\n    ):\n        model = models.get_model()\n    assert isinstance(model, MockModel)\n\n\ndef test_env_variables_override_initialization_params(monkeypatch):\n    # When initialization params are specified in an environment variable, and we call get_model, we\n    # should see that the model was initialized with those params\n    monkeypatch.setattr(models, \"models\", {})\n    fake_label_map = {\"1\": \"label1\", \"2\": \"label2\"}\n    with (\n        mock.patch.dict(\n            models.os.environ,\n            {\"UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH\": \"fake_json.json\"},\n        ),\n        mock.patch.object(models, 
\"DEFAULT_MODEL\", \"fake\"),\n        mock.patch.dict(\n            models.model_class_map,\n            {\"fake\": mock.MagicMock()},\n        ),\n        mock.patch(\n            \"builtins.open\",\n            mock.mock_open(\n                read_data='{\"model_path\": \"fakepath\", \"label_map\": '\n                + json.dumps(fake_label_map)\n                + \"}\",\n            ),\n        ),\n    ):\n        model = models.get_model()\n    model.initialize.assert_called_once_with(\n        model_path=\"fakepath\",\n        label_map={1: \"label1\", 2: \"label2\"},\n    )\n"
  },
  {
    "path": "test_unstructured_inference/models/test_tables.py",
    "content": "import os\nimport threading\nfrom copy import deepcopy\n\nimport numpy as np\nimport pytest\nimport torch\nfrom PIL import Image\nfrom transformers.models.table_transformer.modeling_table_transformer import (\n    TableTransformerDecoder,\n)\n\nimport unstructured_inference.models.table_postprocess as postprocess\nfrom unstructured_inference.models import tables\nfrom unstructured_inference.models.tables import (\n    apply_thresholds_on_objects,\n    structure_to_cells,\n)\n\nskip_outside_ci = os.getenv(\"CI\", \"\").lower() in {\"\", \"false\", \"f\", \"0\"}\n\n\n@pytest.fixture\ndef table_transformer():\n    tables.load_agent()\n    return tables.tables_agent\n\n\ndef test_load_agent(table_transformer):\n    assert hasattr(table_transformer, \"model\")\n\n\n@pytest.fixture\ndef example_image():\n    return Image.open(\"./sample-docs/table-multi-row-column-cells.png\").convert(\"RGB\")\n\n\n@pytest.fixture\ndef mocked_ocr_tokens():\n    return [\n        {\n            \"bbox\": [51.0, 37.0, 1333.0, 38.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 0,\n            \"text\": \" \",\n        },\n        {\n            \"bbox\": [1064.0, 47.0, 1161.0, 71.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 1,\n            \"text\": \"Results\",\n        },\n        {\n            \"bbox\": [891.0, 113.0, 1333.0, 114.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 2,\n            \"text\": \" \",\n        },\n        {\n            \"bbox\": [51.0, 236.0, 1333.0, 237.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 3,\n            \"text\": \" \",\n        },\n        {\n            \"bbox\": [51.0, 308.0, 1333.0, 309.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 4,\n            \"text\": \" \",\n        },\n        {\n            \"bbox\": [51.0, 450.0, 1333.0, 452.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 5,\n            \"text\": \" \",\n        },\n        {\n            \"bbox\": [51.0, 522.0, 1333.0, 524.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 6,\n            \"text\": \" \",\n        },\n        {\n            \"bbox\": [51.0, 37.0, 53.0, 596.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 7,\n            \"text\": \" \",\n        },\n        {\n            \"bbox\": [90.0, 89.0, 167.0, 93.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 8,\n            \"text\": \"soa\",\n        },\n        {\n            \"bbox\": [684.0, 68.0, 762.0, 91.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 9,\n            \"text\": \"Ballot:\",\n        },\n        {\n            \"bbox\": [69.0, 84.0, 196.0, 140.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 10,\n            \"text\": \"etealeiliay\",\n        },\n        {\n            \"bbox\": [283.0, 109.0, 446.0, 132.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 11,\n            \"text\": \"Participants\",\n        },\n        {\n            \"bbox\": [484.0, 84.0, 576.0, 140.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 12,\n            \"text\": 
\"pallets\",\n        },\n        {\n            \"bbox\": [684.0, 75.0, 776.0, 132.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 13,\n            \"text\": \"incom\",\n        },\n        {\n            \"bbox\": [788.0, 107.0, 853.0, 136.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 14,\n            \"text\": \"lete/\",\n        },\n        {\n            \"bbox\": [68.0, 121.0, 191.0, 162.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 15,\n            \"text\": \"Category\",\n        },\n        {\n            \"bbox\": [371.0, 115.0, 386.0, 137.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 16,\n            \"text\": \"P\",\n        },\n        {\n            \"bbox\": [483.0, 121.0, 632.0, 162.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 17,\n            \"text\": \"Completed\",\n        },\n        {\n            \"bbox\": [756.0, 115.0, 785.0, 154.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 18,\n            \"text\": \"Ne\",\n        },\n        {\n            \"bbox\": [930.0, 125.0, 1054.0, 152.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 19,\n            \"text\": \"Accuracy\",\n        },\n        {\n            \"bbox\": [1159.0, 124.0, 1227.0, 147.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 20,\n            \"text\": \"Time\",\n        },\n        {\n            \"bbox\": [1235.0, 126.0, 1264.0, 147.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 21,\n            \"text\": \"to\",\n        },\n        {\n            \"bbox\": [682.0, 149.0, 841.0, 173.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 22,\n            \"text\": \"Terminated\",\n        },\n        {\n            \"bbox\": [1147.0, 169.0, 1276.0, 198.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 23,\n            \"text\": \"complete\",\n        },\n        {\n            \"bbox\": [70.0, 245.0, 127.0, 266.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 24,\n            \"text\": \"Blind\",\n        },\n        {\n            \"bbox\": [361.0, 247.0, 373.0, 266.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 25,\n            \"text\": \"5\",\n        },\n        {\n            \"bbox\": [562.0, 247.0, 573.0, 266.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 26,\n            \"text\": \"1\",\n        },\n        {\n            \"bbox\": [772.0, 247.0, 786.0, 266.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 27,\n            \"text\": \"4\",\n        },\n        {\n            \"bbox\": [925.0, 246.0, 1005.0, 270.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 28,\n            \"text\": \"34.5%,\",\n        },\n        {\n            \"bbox\": [1017.0, 247.0, 1059.0, 266.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 29,\n            \"text\": \"n=1\",\n        },\n        {\n            \"bbox\": [1129.0, 246.0, 1187.0, 266.0],\n            \"block_num\": 
0,\n            \"line_num\": 0,\n            \"span_num\": 30,\n            \"text\": \"1199\",\n        },\n        {\n            \"bbox\": [1197.0, 251.0, 1241.0, 270.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 31,\n            \"text\": \"sec,\",\n        },\n        {\n            \"bbox\": [1253.0, 247.0, 1295.0, 266.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 32,\n            \"text\": \"n=1\",\n        },\n        {\n            \"bbox\": [70.0, 319.0, 117.0, 338.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 33,\n            \"text\": \"Low\",\n        },\n        {\n            \"bbox\": [125.0, 318.0, 198.0, 338.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 34,\n            \"text\": \"Vision\",\n        },\n        {\n            \"bbox\": [361.0, 319.0, 373.0, 338.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 35,\n            \"text\": \"5\",\n        },\n        {\n            \"bbox\": [561.0, 318.0, 573.0, 338.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 36,\n            \"text\": \"2\",\n        },\n        {\n            \"bbox\": [773.0, 318.0, 785.0, 338.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 37,\n            \"text\": \"3\",\n        },\n        {\n            \"bbox\": [928.0, 318.0, 1002.0, 339.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 38,\n            \"text\": \"98.3%\",\n        },\n        {\n            \"bbox\": [1013.0, 318.0, 1055.0, 338.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 39,\n            \"text\": \"n=2\",\n        },\n        {\n            \"bbox\": [1129.0, 318.0, 1188.0, 338.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 40,\n            \"text\": \"1716\",\n        },\n        {\n            \"bbox\": [1197.0, 323.0, 1242.0, 342.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 41,\n            \"text\": \"sec,\",\n        },\n        {\n            \"bbox\": [1253.0, 318.0, 1295.0, 338.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 42,\n            \"text\": \"n=3\",\n        },\n        {\n            \"bbox\": [916.0, 387.0, 1005.0, 413.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 43,\n            \"text\": \"(97.7%,\",\n        },\n        {\n            \"bbox\": [1016.0, 387.0, 1068.0, 413.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 44,\n            \"text\": \"n=3)\",\n        },\n        {\n            \"bbox\": [1086.0, 383.0, 1099.0, 418.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 45,\n            \"text\": \"|\",\n        },\n        {\n            \"bbox\": [1120.0, 387.0, 1188.0, 413.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 46,\n            \"text\": \"(1934\",\n        },\n        {\n            \"bbox\": [1197.0, 393.0, 1241.0, 412.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 47,\n            \"text\": \"sec,\",\n        },\n        {\n     
       \"bbox\": [1253.0, 387.0, 1305.0, 413.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 48,\n            \"text\": \"n=2)\",\n        },\n        {\n            \"bbox\": [70.0, 456.0, 181.0, 489.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 49,\n            \"text\": \"Dexterity\",\n        },\n        {\n            \"bbox\": [360.0, 461.0, 372.0, 480.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 50,\n            \"text\": \"5\",\n        },\n        {\n            \"bbox\": [560.0, 461.0, 574.0, 480.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 51,\n            \"text\": \"4\",\n        },\n        {\n            \"bbox\": [774.0, 461.0, 785.0, 480.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 52,\n            \"text\": \"1\",\n        },\n        {\n            \"bbox\": [924.0, 460.0, 1005.0, 484.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 53,\n            \"text\": \"98.3%,\",\n        },\n        {\n            \"bbox\": [1017.0, 461.0, 1060.0, 480.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 54,\n            \"text\": \"n=4\",\n        },\n        {\n            \"bbox\": [1118.0, 460.0, 1199.0, 480.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 55,\n            \"text\": \"1672.1\",\n        },\n        {\n            \"bbox\": [1209.0, 465.0, 1253.0, 484.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 56,\n            \"text\": \"sec,\",\n        },\n        {\n            \"bbox\": [1265.0, 461.0, 1308.0, 480.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 57,\n            \"text\": \"n=4\",\n        },\n        {\n            \"bbox\": [70.0, 527.0, 170.0, 561.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 58,\n            \"text\": \"Mobility\",\n        },\n        {\n            \"bbox\": [361.0, 532.0, 373.0, 552.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 59,\n            \"text\": \"3\",\n        },\n        {\n            \"bbox\": [561.0, 532.0, 573.0, 552.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 60,\n            \"text\": \"3\",\n        },\n        {\n            \"bbox\": [773.0, 532.0, 786.0, 552.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 61,\n            \"text\": \"0\",\n        },\n        {\n            \"bbox\": [924.0, 532.0, 1005.0, 556.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 62,\n            \"text\": \"95.4%,\",\n        },\n        {\n            \"bbox\": [1017.0, 532.0, 1059.0, 552.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 63,\n            \"text\": \"n=3\",\n        },\n        {\n            \"bbox\": [1129.0, 532.0, 1188.0, 552.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 64,\n            \"text\": \"1416\",\n        },\n        {\n            \"bbox\": [1197.0, 537.0, 1242.0, 556.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            
\"span_num\": 65,\n            \"text\": \"sec,\",\n        },\n        {\n            \"bbox\": [1253.0, 532.0, 1295.0, 552.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 66,\n            \"text\": \"n=3\",\n        },\n        {\n            \"bbox\": [266.0, 37.0, 267.0, 596.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 67,\n            \"text\": \" \",\n        },\n        {\n            \"bbox\": [466.0, 37.0, 468.0, 596.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 68,\n            \"text\": \" \",\n        },\n        {\n            \"bbox\": [666.0, 37.0, 668.0, 596.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 69,\n            \"text\": \" \",\n        },\n        {\n            \"bbox\": [891.0, 37.0, 893.0, 596.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 70,\n            \"text\": \" \",\n        },\n        {\n            \"bbox\": [1091.0, 113.0, 1093.0, 596.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 71,\n            \"text\": \" \",\n        },\n        {\n            \"bbox\": [51.0, 595.0, 1333.0, 596.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 72,\n            \"text\": \" \",\n        },\n        {\n            \"bbox\": [1331.0, 37.0, 1333.0, 596.0],\n            \"block_num\": 0,\n            \"line_num\": 0,\n            \"span_num\": 73,\n            \"text\": \" \",\n        },\n    ]\n\n\n@pytest.mark.parametrize(\n    \"model_path\",\n    [\n        (\"invalid_table_path\"),\n        (\"incorrect_table_path\"),\n    ],\n)\ndef test_load_table_model_raises_when_not_available(model_path):\n    with pytest.raises(OSError):\n        table_model = tables.UnstructuredTableTransformerModel()\n        table_model.initialize(model=model_path)\n\n\n@pytest.mark.parametrize(\n    (\"bbox1\", \"bbox2\", \"expected_result\"),\n    [\n        ((0, 0, 5, 5), (2, 2, 7, 7), 0.36),\n        ((0, 0, 0, 0), (6, 6, 10, 10), 0),\n    ],\n)\ndef test_iob(bbox1, bbox2, expected_result):\n    result = tables.iob(bbox1, bbox2)\n    assert result == expected_result\n\n\n@pytest.mark.parametrize(\n    \"model_path\",\n    [\n        \"microsoft/table-transformer-structure-recognition\",\n    ],\n)\ndef test_load_donut_model(model_path):\n    table_model = tables.UnstructuredTableTransformerModel()\n    table_model.initialize(model=model_path)\n    assert type(table_model.model.model.decoder) is TableTransformerDecoder\n\n\n@pytest.mark.parametrize(\n    (\"input_test\", \"output_test\"),\n    [\n        (\n            [\n                {\n                    \"label\": \"table column header\",\n                    \"score\": 0.9349299073219299,\n                    \"bbox\": [\n                        47.83147430419922,\n                        116.8877944946289,\n                        2557.79296875,\n                        216.98883056640625,\n                    ],\n                },\n                {\n                    \"label\": \"table column header\",\n                    \"score\": 0.934,\n                    \"bbox\": [\n                        47.83147430419922,\n                        116.8877944946289,\n                        2557.79296875,\n                        216.98883056640625,\n                    ],\n                },\n            ],\n      
      [\n                {\n                    \"label\": \"table column header\",\n                    \"score\": 0.9349299073219299,\n                    \"bbox\": [\n                        47.83147430419922,\n                        116.8877944946289,\n                        2557.79296875,\n                        216.98883056640625,\n                    ],\n                },\n            ],\n        ),\n        ([], []),\n    ],\n)\ndef test_nms(input_test, output_test):\n    output = postprocess.nms(input_test)\n\n    assert output == output_test\n\n\n@pytest.mark.parametrize(\n    (\"supercell1\", \"supercell2\"),\n    [\n        (\n            {\n                \"label\": \"table spanning cell\",\n                \"score\": 0.526617169380188,\n                \"bbox\": [\n                    1446.2801513671875,\n                    1023.817138671875,\n                    2114.3525390625,\n                    1099.20166015625,\n                ],\n                \"projected row header\": False,\n                \"header\": False,\n                \"row_numbers\": [3, 4],\n                \"column_numbers\": [0, 4],\n            },\n            {\n                \"label\": \"table spanning cell\",\n                \"score\": 0.5199193954467773,\n                \"bbox\": [\n                    98.92312622070312,\n                    676.1566772460938,\n                    751.0982666015625,\n                    938.5986938476562,\n                ],\n                \"projected row header\": False,\n                \"header\": False,\n                \"row_numbers\": [3, 4, 6],\n                \"column_numbers\": [0, 4],\n            },\n        ),\n        (\n            {\n                \"label\": \"table spanning cell\",\n                \"score\": 0.526617169380188,\n                \"bbox\": [\n                    1446.2801513671875,\n                    1023.817138671875,\n                    2114.3525390625,\n                    1099.20166015625,\n                ],\n                \"projected row header\": False,\n                \"header\": False,\n                \"row_numbers\": [3, 4],\n                \"column_numbers\": [0, 4],\n            },\n            {\n                \"label\": \"table spanning cell\",\n                \"score\": 0.5199193954467773,\n                \"bbox\": [\n                    98.92312622070312,\n                    676.1566772460938,\n                    751.0982666015625,\n                    938.5986938476562,\n                ],\n                \"projected row header\": False,\n                \"header\": False,\n                \"row_numbers\": [4],\n                \"column_numbers\": [0, 4, 6],\n            },\n        ),\n        (\n            {\n                \"label\": \"table spanning cell\",\n                \"score\": 0.526617169380188,\n                \"bbox\": [\n                    1446.2801513671875,\n                    1023.817138671875,\n                    2114.3525390625,\n                    1099.20166015625,\n                ],\n                \"projected row header\": False,\n                \"header\": False,\n                \"row_numbers\": [3, 4],\n                \"column_numbers\": [1, 4],\n            },\n            {\n                \"label\": \"table spanning cell\",\n                \"score\": 0.5199193954467773,\n                \"bbox\": [\n                    98.92312622070312,\n                    676.1566772460938,\n                    751.0982666015625,\n                    
938.5986938476562,\n                ],\n                \"projected row header\": False,\n                \"header\": False,\n                \"row_numbers\": [4],\n                \"column_numbers\": [0, 4, 6],\n            },\n        ),\n        (\n            {\n                \"label\": \"table spanning cell\",\n                \"score\": 0.526617169380188,\n                \"bbox\": [\n                    1446.2801513671875,\n                    1023.817138671875,\n                    2114.3525390625,\n                    1099.20166015625,\n                ],\n                \"projected row header\": False,\n                \"header\": False,\n                \"row_numbers\": [3, 4],\n                \"column_numbers\": [1, 4],\n            },\n            {\n                \"label\": \"table spanning cell\",\n                \"score\": 0.5199193954467773,\n                \"bbox\": [\n                    98.92312622070312,\n                    676.1566772460938,\n                    751.0982666015625,\n                    938.5986938476562,\n                ],\n                \"projected row header\": False,\n                \"header\": False,\n                \"row_numbers\": [2, 4, 5, 6, 7, 8],\n                \"column_numbers\": [0, 4, 6],\n            },\n        ),\n    ],\n)\ndef test_remove_supercell_overlap(supercell1, supercell2):\n    assert postprocess.remove_supercell_overlap(supercell1, supercell2) is None\n\n\n@pytest.mark.parametrize(\n    (\"supercells\", \"rows\", \"columns\", \"output_test\"),\n    [\n        (\n            [\n                {\n                    \"label\": \"table spanning cell\",\n                    \"score\": 0.9,\n                    \"bbox\": [\n                        98.92312622070312,\n                        143.11549377441406,\n                        2115.197265625,\n                        1238.27587890625,\n                    ],\n                    \"projected row header\": True,\n                    \"header\": True,\n                    \"span\": True,\n                },\n            ],\n            [\n                {\n                    \"label\": \"table row\",\n                    \"score\": 0.9299452900886536,\n                    \"bbox\": [0, 0, 10, 10],\n                    \"column header\": True,\n                    \"header\": True,\n                },\n                {\n                    \"label\": \"table row\",\n                    \"score\": 0.9299452900886536,\n                    \"bbox\": [\n                        98.92312622070312,\n                        143.11549377441406,\n                        2114.3525390625,\n                        193.67681884765625,\n                    ],\n                    \"column header\": True,\n                    \"header\": True,\n                },\n                {\n                    \"label\": \"table row\",\n                    \"score\": 0.9299452900886536,\n                    \"bbox\": [\n                        98.92312622070312,\n                        143.11549377441406,\n                        2114.3525390625,\n                        193.67681884765625,\n                    ],\n                    \"column header\": True,\n                    \"header\": True,\n                },\n            ],\n            [\n                {\n                    \"label\": \"table column\",\n                    \"score\": 0.9996132254600525,\n                    \"bbox\": [\n                        98.92312622070312,\n                        
143.11549377441406,\n                        517.6508178710938,\n                        1616.48779296875,\n                    ],\n                },\n                {\n                    \"label\": \"table column\",\n                    \"score\": 0.9935646653175354,\n                    \"bbox\": [\n                        520.0474853515625,\n                        143.11549377441406,\n                        751.0982666015625,\n                        1616.48779296875,\n                    ],\n                },\n            ],\n            [\n                {\n                    \"label\": \"table spanning cell\",\n                    \"score\": 0.9,\n                    \"bbox\": [\n                        98.92312622070312,\n                        143.11549377441406,\n                        751.0982666015625,\n                        193.67681884765625,\n                    ],\n                    \"projected row header\": True,\n                    \"header\": True,\n                    \"span\": True,\n                    \"row_numbers\": [1, 2],\n                    \"column_numbers\": [0, 1],\n                },\n                {\n                    \"row_numbers\": [0],\n                    \"column_numbers\": [0, 1],\n                    \"score\": 0.9,\n                    \"propagated\": True,\n                    \"bbox\": [\n                        98.92312622070312,\n                        143.11549377441406,\n                        751.0982666015625,\n                        193.67681884765625,\n                    ],\n                },\n            ],\n        ),\n    ],\n)\ndef test_align_supercells(supercells, rows, columns, output_test):\n    assert postprocess.align_supercells(supercells, rows, columns) == output_test\n\n\n@pytest.mark.parametrize((\"rows\", \"bbox\", \"output\"), [([1.0], [0.0], [1.0])])\ndef test_align_rows(rows, bbox, output):\n    assert postprocess.align_rows(rows, bbox) == output\n\n\n@pytest.mark.parametrize(\n    (\"output_format\", \"expectation\"),\n    [\n        (\"html\", \"<tr><td>Blind</td><td>5</td><td>1</td><td>4</td><td>34.5%, n=1</td>\"),\n        (\n            \"cells\",\n            {\n                \"column_nums\": [0],\n                \"row_nums\": [2],\n                \"column header\": False,\n                \"cell text\": \"Blind\",\n            },\n        ),\n        (\"dataframe\", [\"Blind\", \"5\", \"1\", \"4\", \"34.5%, n=1\", \"1199 sec, n=1\"]),\n        (None, \"<tr><td>Blind</td><td>5</td><td>1</td><td>4</td><td>34.5%, n=1</td>\"),\n    ],\n)\ndef test_table_prediction_output_format(\n    output_format,\n    expectation,\n    table_transformer,\n    example_image,\n    mocker,\n    example_table_cells,\n    mocked_ocr_tokens,\n):\n    mocker.patch.object(tables, \"recognize\", return_value=example_table_cells)\n    mocker.patch.object(\n        tables.UnstructuredTableTransformerModel,\n        \"get_structure\",\n        return_value=None,\n    )\n    if output_format:\n        result = table_transformer.run_prediction(\n            example_image,\n            result_format=output_format,\n            ocr_tokens=mocked_ocr_tokens,\n        )\n    else:\n        result = table_transformer.run_prediction(example_image, ocr_tokens=mocked_ocr_tokens)\n\n    if output_format == \"dataframe\":\n        assert expectation in result.values\n    elif output_format == \"cells\":\n        # other outputs like bbox are flaky to test since they depend on OCR and may change\n        # slightly when the OCR 
package changes or even on different machines\n        validation_fields = (\"column_nums\", \"row_nums\", \"column header\", \"cell text\")\n        assert expectation in [{key: cell[key] for key in validation_fields} for cell in result]\n    else:\n        assert expectation in result\n\n\ndef test_table_prediction_output_format_when_wrong_type_then_value_error(\n    table_transformer,\n    example_image,\n    mocker,\n    example_table_cells,\n    mocked_ocr_tokens,\n):\n    mocker.patch.object(tables, \"recognize\", return_value=example_table_cells)\n    mocker.patch.object(\n        tables.UnstructuredTableTransformerModel,\n        \"get_structure\",\n        return_value=None,\n    )\n    with pytest.raises(ValueError):\n        table_transformer.run_prediction(\n            example_image,\n            result_format=\"Wrong format\",\n            ocr_tokens=mocked_ocr_tokens,\n        )\n\n\ndef test_table_prediction_runs_with_empty_recognize(\n    table_transformer,\n    example_image,\n    mocker,\n    mocked_ocr_tokens,\n):\n    mocker.patch.object(tables, \"recognize\", return_value=[])\n    mocker.patch.object(\n        tables.UnstructuredTableTransformerModel,\n        \"get_structure\",\n        return_value=None,\n    )\n    assert table_transformer.run_prediction(example_image, ocr_tokens=mocked_ocr_tokens) == \"\"\n\n\ndef test_table_prediction_with_ocr_tokens(table_transformer, example_image, mocked_ocr_tokens):\n    prediction = table_transformer.predict(example_image, ocr_tokens=mocked_ocr_tokens)\n    assert '<table><thead><tr><th rowspan=\"2\">' in prediction\n    assert \"<tr><td>Blind</td><td>5</td><td>1</td><td>4</td><td>34.5%, n=1</td>\" in prediction\n\n\ndef test_table_prediction_with_no_ocr_tokens(table_transformer, example_image):\n    with pytest.raises(ValueError):\n        table_transformer.predict(example_image)\n\n\n@pytest.mark.parametrize(\n    (\"thresholds\", \"expected_object_number\"),\n    [\n        ({\"0\": 0.5}, 1),\n        ({\"0\": 0.1}, 3),\n        ({\"0\": 0.9}, 0),\n    ],\n)\ndef test_objects_are_filtered_based_on_class_thresholds_when_correct_prediction_and_threshold(\n    thresholds,\n    expected_object_number,\n):\n    objects = [\n        {\"label\": \"0\", \"score\": 0.2},\n        {\"label\": \"0\", \"score\": 0.4},\n        {\"label\": \"0\", \"score\": 0.55},\n    ]\n    assert len(apply_thresholds_on_objects(objects, thresholds)) == expected_object_number\n\n\n@pytest.mark.parametrize(\n    (\"thresholds\", \"expected_object_number\"),\n    [\n        ({\"0\": 0.5, \"1\": 0.1}, 4),\n        ({\"0\": 0.1, \"1\": 0.9}, 3),\n        ({\"0\": 0.9, \"1\": 0.5}, 1),\n    ],\n)\ndef test_objects_are_filtered_based_on_class_thresholds_when_two_classes(\n    thresholds,\n    expected_object_number,\n):\n    objects = [\n        {\"label\": \"0\", \"score\": 0.2},\n        {\"label\": \"0\", \"score\": 0.4},\n        {\"label\": \"0\", \"score\": 0.55},\n        {\"label\": \"1\", \"score\": 0.2},\n        {\"label\": \"1\", \"score\": 0.4},\n        {\"label\": \"1\", \"score\": 0.55},\n    ]\n    assert len(apply_thresholds_on_objects(objects, thresholds)) == expected_object_number\n\n\ndef test_objects_filtering_when_missing_threshold():\n    class_name = \"class_name\"\n    objects = [{\"label\": class_name, \"score\": 0.2}]\n    thresholds = {\"1\": 0.5}\n    with pytest.raises(KeyError, match=class_name):\n        apply_thresholds_on_objects(objects, thresholds)\n\n\ndef test_intersect():\n    a = postprocess.Rect()\n    b = 
postprocess.Rect([1, 2, 3, 4])\n    assert a.intersect(b).get_area() == 4.0\n\n\ndef test_include_rect():\n    a = postprocess.Rect()\n    assert a.include_rect([1, 2, 3, 4]).get_area() == 4.0\n\n\n@pytest.mark.parametrize(\n    (\"spans\", \"join_with_space\", \"expected\"),\n    [\n        (\n            [\n                {\n                    \"flags\": 2**0,\n                    \"text\": \"5\",\n                    \"superscript\": False,\n                    \"span_num\": 0,\n                    \"line_num\": 0,\n                    \"block_num\": 0,\n                },\n            ],\n            True,\n            \"\",\n        ),\n        (\n            [\n                {\n                    \"flags\": 2**0,\n                    \"text\": \"p\",\n                    \"superscript\": False,\n                    \"span_num\": 0,\n                    \"line_num\": 0,\n                    \"block_num\": 0,\n                },\n            ],\n            True,\n            \"p\",\n        ),\n        (\n            [\n                {\n                    \"flags\": 2**0,\n                    \"text\": \"p\",\n                    \"superscript\": False,\n                    \"span_num\": 0,\n                    \"line_num\": 0,\n                    \"block_num\": 0,\n                },\n                {\n                    \"flags\": 2**0,\n                    \"text\": \"p\",\n                    \"superscript\": False,\n                    \"span_num\": 0,\n                    \"line_num\": 0,\n                    \"block_num\": 0,\n                },\n            ],\n            True,\n            \"p p\",\n        ),\n        (\n            [\n                {\n                    \"flags\": 2**0,\n                    \"text\": \"p\",\n                    \"superscript\": False,\n                    \"span_num\": 0,\n                    \"line_num\": 0,\n                    \"block_num\": 0,\n                },\n                {\n                    \"flags\": 2**0,\n                    \"text\": \"p\",\n                    \"superscript\": False,\n                    \"span_num\": 0,\n                    \"line_num\": 0,\n                    \"block_num\": 1,\n                },\n            ],\n            True,\n            \"p p\",\n        ),\n        (\n            [\n                {\n                    \"flags\": 2**0,\n                    \"text\": \"p\",\n                    \"superscript\": False,\n                    \"span_num\": 0,\n                    \"line_num\": 0,\n                    \"block_num\": 0,\n                },\n                {\n                    \"flags\": 2**0,\n                    \"text\": \"p\",\n                    \"superscript\": False,\n                    \"span_num\": 0,\n                    \"line_num\": 0,\n                    \"block_num\": 1,\n                },\n            ],\n            False,\n            \"p p\",\n        ),\n    ],\n)\ndef test_extract_text_from_spans(spans, join_with_space, expected):\n    res = postprocess.extract_text_from_spans(\n        spans,\n        join_with_space=join_with_space,\n        remove_integer_superscripts=True,\n    )\n    assert res == expected\n\n\n@pytest.mark.parametrize(\n    (\"supercells\", \"expected_len\"),\n    [\n        ([{\"header\": \"hi\", \"row_numbers\": [0, 1, 2], \"score\": 0.9}], 1),\n        (\n            [\n                {\n                    \"header\": \"hi\",\n                    \"row_numbers\": [0],\n                    \"column_numbers\": [1, 
2, 3],\n                    \"score\": 0.9,\n                },\n                {\n                    \"header\": \"hi\",\n                    \"row_numbers\": [1],\n                    \"column_numbers\": [1],\n                    \"score\": 0.9,\n                },\n                {\n                    \"header\": \"hi\",\n                    \"row_numbers\": [1],\n                    \"column_numbers\": [2],\n                    \"score\": 0.9,\n                },\n                {\n                    \"header\": \"hi\",\n                    \"row_numbers\": [1],\n                    \"column_numbers\": [3],\n                    \"score\": 0.9,\n                },\n            ],\n            4,\n        ),\n        (\n            [\n                {\n                    \"header\": \"hi\",\n                    \"row_numbers\": [0],\n                    \"column_numbers\": [0],\n                    \"score\": 0.9,\n                },\n                {\n                    \"header\": \"hi\",\n                    \"row_numbers\": [1],\n                    \"column_numbers\": [0],\n                    \"score\": 0.9,\n                },\n                {\n                    \"header\": \"hi\",\n                    \"row_numbers\": [1, 2],\n                    \"column_numbers\": [0],\n                    \"score\": 0.9,\n                },\n                {\n                    \"header\": \"hi\",\n                    \"row_numbers\": [3],\n                    \"column_numbers\": [0],\n                    \"score\": 0.9,\n                },\n            ],\n            3,\n        ),\n    ],\n)\ndef test_header_supercell_tree(supercells, expected_len):\n    postprocess.header_supercell_tree(supercells)\n    assert len(supercells) == expected_len\n\n\n@pytest.mark.parametrize(\"zoom\", [1, 0.1, 5, -1, 0])\ndef test_zoom_image(example_image, zoom):\n    width, height = example_image.size\n    new_image = tables.zoom_image(example_image, zoom)\n    new_w, new_h = new_image.size\n    if zoom <= 0:\n        zoom = 1\n    assert new_w == np.round(width * zoom, 0)\n    assert new_h == np.round(height * zoom, 0)\n\n\n@pytest.mark.parametrize(\n    (\"input_cells\", \"expected_html\"),\n    [\n        # +----------+---------------------+\n        # | row1col1 | row1col2 | row1col3 |\n        # |----------|----------+----------|\n        # | row2col1 | row2col2 | row2col3 |\n        # +----------+----------+----------+\n        pytest.param(\n            [\n                {\n                    \"row_nums\": [0],\n                    \"column_nums\": [0],\n                    \"cell text\": \"row1col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [0],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row1col2\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [0],\n                    \"column_nums\": [2],\n                    \"cell text\": \"row1col3\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [0],\n                    \"cell text\": \"row2col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row2col2\",\n                    
\"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [2],\n                    \"cell text\": \"row2col3\",\n                    \"column header\": False,\n                },\n            ],\n            (\n                \"<table><tbody><tr><td>row1col1</td><td>row1col2</td><td>row1col3</td></tr>\"\n                \"<tr><td>row2col1</td><td>row2col2</td><td>row2col3</td></tr></tbody></table>\"\n            ),\n            id=\"simple table without header\",\n        ),\n        # +----------+---------------------+\n        # |  h1col1  |  h1col2  |  h1col3  |\n        # |----------|----------+----------|\n        # | row1col1 | row1col2 | row1col3 |\n        # |----------|----------+----------|\n        # | row2col1 | row2col2 | row2col3 |\n        # +----------+----------+----------+\n        pytest.param(\n            [\n                {\"row_nums\": [0], \"column_nums\": [0], \"cell text\": \"h1col1\", \"column header\": True},\n                {\"row_nums\": [0], \"column_nums\": [1], \"cell text\": \"h1col2\", \"column header\": True},\n                {\"row_nums\": [0], \"column_nums\": [2], \"cell text\": \"h1col2\", \"column header\": True},\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [0],\n                    \"cell text\": \"row1col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row1col2\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [2],\n                    \"cell text\": \"row1col3\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [0],\n                    \"cell text\": \"row2col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row2col2\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [2],\n                    \"cell text\": \"row2col3\",\n                    \"column header\": False,\n                },\n            ],\n            (\n                \"<table><thead><tr><th>h1col1</th><th>h1col2</th><th>h1col2</th></tr></thead>\"\n                \"<tbody><tr><td>row1col1</td><td>row1col2</td><td>row1col3</td></tr>\"\n                \"<tr><td>row2col1</td><td>row2col2</td><td>row2col3</td></tr></tbody></table>\"\n            ),\n            id=\"simple table with header\",\n        ),\n        # +----------+---------------------+\n        # |  h1col1  |  h1col2  |  h1col3  |\n        # |----------|----------+----------|\n        # | row1col1 | row1col2 | row1col3 |\n        # |----------|----------+----------|\n        # | row2col1 | row2col2 | row2col3 |\n        # +----------+----------+----------+\n        pytest.param(\n            [\n                {\"row_nums\": [0], \"column_nums\": [1], \"cell text\": \"h1col2\", \"column header\": True},\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [0],\n             
       \"cell text\": \"row2col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [0],\n                    \"cell text\": \"row1col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row2col2\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row1col2\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [2],\n                    \"cell text\": \"row2col3\",\n                    \"column header\": False,\n                },\n                {\"row_nums\": [0], \"column_nums\": [0], \"cell text\": \"h1col1\", \"column header\": True},\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [2],\n                    \"cell text\": \"row1col3\",\n                    \"column header\": False,\n                },\n                {\"row_nums\": [0], \"column_nums\": [2], \"cell text\": \"h1col2\", \"column header\": True},\n            ],\n            (\n                \"<table><thead><tr><th>h1col1</th><th>h1col2</th><th>h1col2</th></tr></thead>\"\n                \"<tbody><tr><td>row1col1</td><td>row1col2</td><td>row1col3</td></tr>\"\n                \"<tr><td>row2col1</td><td>row2col2</td><td>row2col3</td></tr></tbody></table>\"\n            ),\n            id=\"simple table with header, mixed elements\",\n        ),\n        # +----------+---------------------+\n        # |    two   |   two columns       |\n        # |          |----------+----------|\n        # |    rows  |sub cell 1|sub cell 2|\n        # +----------+----------+----------+\n        pytest.param(\n            [\n                {\n                    \"row_nums\": [0, 1],\n                    \"column_nums\": [0],\n                    \"cell text\": \"two row\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [0],\n                    \"column_nums\": [1, 2],\n                    \"cell text\": \"two cols\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [1],\n                    \"cell text\": \"sub cell 1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [2],\n                    \"cell text\": \"sub cell 2\",\n                    \"column header\": False,\n                },\n            ],\n            (\n                '<table><tbody><tr><td rowspan=\"2\">two row</td><td colspan=\"2\">two '\n                \"cols</td></tr><tr><td>sub cell 1</td><td>sub cell 2</td></tr>\"\n                \"</tbody></table>\"\n            ),\n            id=\"various spans, no headers\",\n        ),\n        # +----------+---------------------+----------+\n        # |          |       h1col23       |  h1col4  |\n        # | h12col1  |----------+----------+----------|\n        # |          |  h2col2  |       h2col34       |\n        # 
|----------|----------+----------+----------+\n        # |  r3col1  |  r3col2  |                     |\n        # |----------+----------|      r34col34       |\n        # |       r4col12       |                     |\n        # +----------+----------+----------+----------+\n        pytest.param(\n            [\n                {\n                    \"row_nums\": [0, 1],\n                    \"column_nums\": [0],\n                    \"cell text\": \"h12col1\",\n                    \"column header\": True,\n                },\n                {\n                    \"row_nums\": [0],\n                    \"column_nums\": [1, 2],\n                    \"cell text\": \"h1col23\",\n                    \"column header\": True,\n                },\n                {\"row_nums\": [0], \"column_nums\": [3], \"cell text\": \"h1col4\", \"column header\": True},\n                {\"row_nums\": [1], \"column_nums\": [1], \"cell text\": \"h2col2\", \"column header\": True},\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [2, 3],\n                    \"cell text\": \"h2col34\",\n                    \"column header\": True,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [0],\n                    \"cell text\": \"r3col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [1],\n                    \"cell text\": \"r3col2\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2, 3],\n                    \"column_nums\": [2, 3],\n                    \"cell text\": \"r34col34\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [3],\n                    \"column_nums\": [0, 1],\n                    \"cell text\": \"r4col12\",\n                    \"column header\": False,\n                },\n            ],\n            (\n                '<table><thead><tr><th rowspan=\"2\">h12col1</th>'\n                '<th colspan=\"2\">h1col23</th><th>h1col4</th></tr>'\n                '<tr><th>h2col2</th><th colspan=\"2\">h2col34</th></tr></thead><tbody>'\n                '<tr><td>r3col1</td><td>r3col2</td><td colspan=\"2\" rowspan=\"2\">r34col34</td></tr>'\n                '<tr><td colspan=\"2\">r4col12</td></tr></tbody></table>'\n            ),\n            id=\"various spans, with 2 row header\",\n        ),\n    ],\n)\ndef test_cells_to_html(input_cells, expected_html):\n    assert tables.cells_to_html(input_cells) == expected_html\n\n\n@pytest.mark.parametrize(\n    (\"input_cells\", \"expected_cells\"),\n    [\n        pytest.param(\n            [\n                {\"row_nums\": [0], \"column_nums\": [0], \"cell text\": \"h1col1\", \"column header\": True},\n                {\"row_nums\": [0], \"column_nums\": [1], \"cell text\": \"h1col2\", \"column header\": True},\n                {\"row_nums\": [0], \"column_nums\": [2], \"cell text\": \"h1col2\", \"column header\": True},\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [0],\n                    \"cell text\": \"row1col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row1col2\",\n     
               \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [2],\n                    \"cell text\": \"row1col3\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [0],\n                    \"cell text\": \"row2col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row2col2\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [2],\n                    \"cell text\": \"row2col3\",\n                    \"column header\": False,\n                },\n            ],\n            [\n                {\"row_nums\": [0], \"column_nums\": [0], \"cell text\": \"h1col1\", \"column header\": True},\n                {\"row_nums\": [0], \"column_nums\": [1], \"cell text\": \"h1col2\", \"column header\": True},\n                {\"row_nums\": [0], \"column_nums\": [2], \"cell text\": \"h1col2\", \"column header\": True},\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [0],\n                    \"cell text\": \"row1col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row1col2\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [2],\n                    \"cell text\": \"row1col3\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [0],\n                    \"cell text\": \"row2col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row2col2\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [2],\n                    \"cell text\": \"row2col3\",\n                    \"column header\": False,\n                },\n            ],\n            id=\"identical tables, no changes expected\",\n        ),\n        pytest.param(\n            [\n                {\"row_nums\": [0], \"column_nums\": [0], \"cell text\": \"h1col1\", \"column header\": True},\n                {\"row_nums\": [0], \"column_nums\": [2], \"cell text\": \"h1col2\", \"column header\": True},\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [0],\n                    \"cell text\": \"row1col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row1col2\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [0],\n                    \"cell text\": 
\"row2col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row2col2\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [2],\n                    \"cell text\": \"row2col3\",\n                    \"column header\": False,\n                },\n            ],\n            [\n                {\"row_nums\": [0], \"column_nums\": [0], \"cell text\": \"h1col1\", \"column header\": True},\n                {\"row_nums\": [0], \"column_nums\": [1], \"cell text\": \"\", \"column header\": True},\n                {\"row_nums\": [0], \"column_nums\": [2], \"cell text\": \"h1col2\", \"column header\": True},\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [0],\n                    \"cell text\": \"row1col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row1col2\",\n                    \"column header\": False,\n                },\n                {\"row_nums\": [1], \"column_nums\": [2], \"cell text\": \"\", \"column header\": False},\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [0],\n                    \"cell text\": \"row2col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [1],\n                    \"cell text\": \"row2col2\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [2],\n                    \"cell text\": \"row2col3\",\n                    \"column header\": False,\n                },\n            ],\n            id=\"missing column in header and in the middle\",\n        ),\n        pytest.param(\n            [\n                {\n                    \"row_nums\": [0, 1],\n                    \"column_nums\": [0],\n                    \"cell text\": \"h12col1\",\n                    \"column header\": True,\n                },\n                {\n                    \"row_nums\": [0],\n                    \"column_nums\": [1, 2],\n                    \"cell text\": \"h1col23\",\n                    \"column header\": True,\n                },\n                {\"row_nums\": [1], \"column_nums\": [1], \"cell text\": \"h2col2\", \"column header\": True},\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [2, 3],\n                    \"cell text\": \"h2col34\",\n                    \"column header\": True,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [0],\n                    \"cell text\": \"r3col1\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [2, 3],\n                    \"column_nums\": [2, 3],\n                    \"cell text\": \"r34col34\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [3],\n                    \"column_nums\": [0, 1],\n                    
\"cell text\": \"r4col12\",\n                    \"column header\": False,\n                },\n            ],\n            [\n                {\n                    \"row_nums\": [0, 1],\n                    \"column_nums\": [0],\n                    \"cell text\": \"h12col1\",\n                    \"column header\": True,\n                },\n                {\n                    \"row_nums\": [0],\n                    \"column_nums\": [1, 2],\n                    \"cell text\": \"h1col23\",\n                    \"column header\": True,\n                },\n                {\"row_nums\": [0], \"column_nums\": [3], \"cell text\": \"\", \"column header\": True},\n                {\"row_nums\": [1], \"column_nums\": [1], \"cell text\": \"h2col2\", \"column header\": True},\n                {\n                    \"row_nums\": [1],\n                    \"column_nums\": [2, 3],\n                    \"cell text\": \"h2col34\",\n                    \"column header\": True,\n                },\n                {\n                    \"row_nums\": [2],\n                    \"column_nums\": [0],\n                    \"cell text\": \"r3col1\",\n                    \"column header\": False,\n                },\n                {\"row_nums\": [2], \"column_nums\": [1], \"cell text\": \"\", \"column header\": False},\n                {\n                    \"row_nums\": [2, 3],\n                    \"column_nums\": [2, 3],\n                    \"cell text\": \"r34col34\",\n                    \"column header\": False,\n                },\n                {\n                    \"row_nums\": [3],\n                    \"column_nums\": [0, 1],\n                    \"cell text\": \"r4col12\",\n                    \"column header\": False,\n                },\n            ],\n            id=\"missing column in header and in the middle in table with spans\",\n        ),\n    ],\n)\ndef test_fill_cells(input_cells, expected_cells):\n    def sort_cells(cells):\n        return sorted(cells, key=lambda x: (x[\"row_nums\"], x[\"column_nums\"]))\n\n    assert sort_cells(tables.fill_cells(input_cells)) == sort_cells(expected_cells)\n\n\ndef test_padded_results_has_right_dimensions(table_transformer, example_image):\n    str_class_name2idx = tables.get_class_map(\"structure\")\n    # a simpler mapping so we keep all structure in the returned objs below for test\n    str_class_idx2name = dict.fromkeys(str_class_name2idx.values(), \"table cell\")\n    # pad size is no more than 10% of the original image so we can setup test below easier\n    pad = int(min(example_image.size) / 10)\n\n    structure = table_transformer.get_structure(example_image, pad_for_structure_detection=pad)\n    # boxes deteced OUTSIDE of the original image; this shouldn't happen but we want to make sure\n    # the code handles it as expected\n    structure[\"pred_boxes\"][0][0, :2] = 0.5\n    structure[\"pred_boxes\"][0][0, 2:] = 1.0\n    # mock a box we know are safly inside the original image with known positions\n    width, height = example_image.size\n    padded_width = width + pad * 2\n    padded_height = height + pad * 2\n    original = [1, 3, 101, 53]\n    structure[\"pred_boxes\"][0][1, :] = torch.tensor(\n        [\n            (51 + pad) / padded_width,\n            (28 + pad) / padded_height,\n            100 / padded_width,\n            50 / padded_height,\n        ],\n    )\n    objs = tables.outputs_to_objects(structure, example_image.size, str_class_idx2name)\n    np.testing.assert_almost_equal(objs[0][\"bbox\"], [-pad, -pad, 
width + pad, height + pad], 4)\n    np.testing.assert_almost_equal(objs[1][\"bbox\"], original, 4)\n    # a stricter test would be to constrain the actual detected boxes to be within the original\n    # image, but that requires the table transformer to behave in certain ways and would not\n    # actually test the padding math; so here we use the relaxed condition\n    for obj in objs[2:]:\n        x1, y1, x2, y2 = obj[\"bbox\"]\n        assert max(x1, x2) < width + pad\n        assert max(y1, y2) < height + pad\n\n\ndef test_compute_confidence_score_zero_division_error_handling():\n    assert tables.compute_confidence_score([]) == 0\n\n\n@pytest.mark.parametrize(\n    (\"column_span_score\", \"row_span_score\", \"expected_text_to_indexes\"),\n    [\n        (\n            0.9,\n            0.8,\n            (\n                {\n                    \"one three\": {\"row_nums\": [0, 1], \"column_nums\": [0]},\n                    \"two\": {\"row_nums\": [0], \"column_nums\": [1]},\n                    \"four\": {\"row_nums\": [1], \"column_nums\": [1]},\n                }\n            ),\n        ),\n        (\n            0.8,\n            0.9,\n            (\n                {\n                    \"one two\": {\"row_nums\": [0], \"column_nums\": [0, 1]},\n                    \"three\": {\"row_nums\": [1], \"column_nums\": [0]},\n                    \"four\": {\"row_nums\": [1], \"column_nums\": [1]},\n                }\n            ),\n        ),\n    ],\n)\ndef test_subcells_filtering_when_overlapping_spanning_cells(\n    column_span_score,\n    row_span_score,\n    expected_text_to_indexes,\n):\n    \"\"\"\n    # table\n    # +-----------+----------+\n    # |    one    |   two    |\n    # |-----------+----------|\n    # |    three  |   four   |\n    # +-----------+----------+\n\n    spanning cells over the first row and over the first column\n    \"\"\"\n    table_structure = {\n        \"rows\": [\n            {\"bbox\": [0, 0, 10, 20]},\n            {\"bbox\": [10, 0, 20, 20]},\n        ],\n        \"columns\": [\n            {\"bbox\": [0, 0, 20, 10]},\n            {\"bbox\": [0, 10, 20, 20]},\n        ],\n        \"spanning cells\": [\n            {\"bbox\": [0, 0, 20, 10], \"score\": column_span_score},\n            {\"bbox\": [0, 0, 10, 20], \"score\": row_span_score},\n        ],\n    }\n    tokens = [\n        {\n            \"text\": \"one\",\n            \"bbox\": [0, 0, 10, 10],\n        },\n        {\n            \"text\": \"two\",\n            \"bbox\": [0, 10, 10, 20],\n        },\n        {\n            \"text\": \"three\",\n            \"bbox\": [10, 0, 20, 10],\n        },\n        {\"text\": \"four\", \"bbox\": [10, 10, 20, 20]},\n    ]\n    token_args = {\"span_num\": 1, \"line_num\": 1, \"block_num\": 1}\n    for token in tokens:\n        token.update(token_args)\n    for spanning_cell in table_structure[\"spanning cells\"]:\n        spanning_cell[\"projected row header\"] = False\n\n    # table structure is edited inside structure_to_cells, so save a copy for future runs\n    saved_table_structure = deepcopy(table_structure)\n\n    predicted_cells, _ = structure_to_cells(table_structure, tokens=tokens)\n    predicted_text_to_indexes = {\n        cell[\"cell text\"]: {\n            \"row_nums\": cell[\"row_nums\"],\n            \"column_nums\": cell[\"column_nums\"],\n        }\n        for cell in predicted_cells\n    }\n    assert predicted_text_to_indexes == expected_text_to_indexes\n\n    # swap spanning cells to ensure the highest-probability spanning cell is used\n    spans = 
saved_table_structure[\"spanning cells\"]\n    spans[0], spans[1] = spans[1], spans[0]\n    saved_table_structure[\"spanning cells\"] = spans\n\n    predicted_cells_after_reorder, _ = structure_to_cells(saved_table_structure, tokens=tokens)\n    assert predicted_cells_after_reorder == predicted_cells\n\n\ndef test_model_init_is_thread_safe():\n    threads = []\n    tables.tables_agent.model = None\n    for i in range(5):\n        thread = threading.Thread(target=tables.load_agent)\n        threads.append(thread)\n        thread.start()\n\n    for thread in threads:\n        thread.join()\n\n    assert tables.tables_agent.model is not None\n"
  },
  {
    "path": "test_unstructured_inference/models/test_yolox.py",
    "content": "import os\n\nimport pytest\n\nfrom unstructured_inference.inference.layout import process_file_with_model\n\n\n@pytest.mark.slow\ndef test_layout_yolox_local_parsing_image():\n    filename = os.path.join(\"sample-docs\", \"test-image.jpg\")\n    # NOTE(benjamin) keep_output = True create a file for each image in\n    # localstorage for visualization of the result\n    document_layout = process_file_with_model(filename, model_name=\"yolox\", is_image=True)\n    # NOTE(benjamin) The example image should result in one page result\n    assert len(document_layout.pages) == 1\n    # NOTE(benjamin) The example sent to the test contains 13 detections\n    types_known = [\"Text\", \"Section-header\", \"Page-header\"]\n    elements = document_layout.pages[0].elements_array\n    known_regions = [\n        e for e in elements.element_class_ids if elements.element_class_id_map[e] in types_known\n    ]\n    assert len(known_regions) == 13\n    # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities\n    assert hasattr(elements, \"element_probs\")\n    assert isinstance(\n        elements.element_probs[0],\n        float,\n    )  # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float\n\n\n@pytest.mark.slow\ndef test_layout_yolox_local_parsing_pdf():\n    filename = os.path.join(\"sample-docs\", \"loremipsum.pdf\")\n    document_layout = process_file_with_model(filename, model_name=\"yolox\")\n    assert len(document_layout.pages) == 1\n    # NOTE(benjamin) The example sent to the test contains 5 text detections\n    text_elements = [e for e in document_layout.pages[0].elements if e.type == \"Text\"]\n    assert len(text_elements) == 5\n    assert hasattr(\n        document_layout.pages[0].elements[0],\n        \"prob\",\n    )  # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities\n    assert isinstance(\n        document_layout.pages[0].elements[0].prob,\n        float,\n    )  # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float\n\n\n@pytest.mark.slow\ndef test_layout_yolox_local_parsing_empty_pdf():\n    filename = os.path.join(\"sample-docs\", \"empty-document.pdf\")\n    document_layout = process_file_with_model(filename, model_name=\"yolox\")\n    assert len(document_layout.pages) == 1\n    # NOTE(benjamin) The example sent to the test contains 0 detections\n    assert len(document_layout.pages[0].elements) == 0\n\n\n########################\n# ONLY SHORT TESTS BELOW\n########################\n\n\ndef test_layout_yolox_local_parsing_image_soft():\n    filename = os.path.join(\"sample-docs\", \"example_table.jpg\")\n    # NOTE(benjamin) keep_output = True create a file for each image in\n    # localstorage for visualization of the result\n    document_layout = process_file_with_model(filename, model_name=\"yolox_quantized\", is_image=True)\n    # NOTE(benjamin) The example image should result in one page result\n    assert len(document_layout.pages) == 1\n    # NOTE(benjamin) Soft version of the test, run make test-long in order to run with full model\n    assert len(document_layout.pages[0].elements) > 0\n    assert hasattr(\n        document_layout.pages[0].elements[0],\n        \"prob\",\n    )  # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities\n    assert isinstance(\n        document_layout.pages[0].elements[0].prob,\n        float,\n    )  # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float\n\n\ndef test_layout_yolox_local_parsing_pdf_soft():\n    filename = 
os.path.join(\"sample-docs\", \"loremipsum.pdf\")\n    document_layout = process_file_with_model(filename, model_name=\"yolox_tiny\")\n    assert len(document_layout.pages) == 1\n    # NOTE(benjamin) Soft version of the test, run make test-long in order to run with full model\n    assert len(document_layout.pages[0].elements) > 0\n    assert hasattr(\n        document_layout.pages[0].elements[0],\n        \"prob\",\n    )  # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities\n\n\ndef test_layout_yolox_local_parsing_empty_pdf_soft():\n    filename = os.path.join(\"sample-docs\", \"empty-document.pdf\")\n    document_layout = process_file_with_model(filename, model_name=\"yolox_tiny\")\n    assert len(document_layout.pages) == 1\n    # NOTE(benjamin) The example sent to the test contains 0 detections\n    text_elements_page_1 = [el for el in document_layout.pages[0].elements if el.type != \"Image\"]\n    assert len(text_elements_page_1) == 0\n"
  },
  {
    "path": "test_unstructured_inference/test_config.py",
    "content": "def test_default_config():\n    from unstructured_inference.config import inference_config\n\n    assert inference_config.TT_TABLE_CONF == 0.5\n\n\ndef test_env_override(monkeypatch):\n    monkeypatch.setenv(\"TT_TABLE_CONF\", 1)\n    from unstructured_inference.config import inference_config\n\n    assert inference_config.TT_TABLE_CONF == 1\n"
  },
  {
    "path": "test_unstructured_inference/test_elements.py",
    "content": "import os\nfrom random import randint\nfrom unittest.mock import PropertyMock, patch\n\nimport numpy as np\nimport pytest\n\nfrom unstructured_inference.constants import IsExtracted, Source\nfrom unstructured_inference.inference import elements\nfrom unstructured_inference.inference.elements import (\n    Rectangle,\n    TextRegion,\n    TextRegions,\n)\nfrom unstructured_inference.inference.layoutelement import (\n    LayoutElements,\n    clean_layoutelements,\n    clean_layoutelements_for_class,\n    partition_groups_from_regions,\n    separate,\n)\n\nskip_outside_ci = os.getenv(\"CI\", \"\").lower() in {\"\", \"false\", \"f\", \"0\"}\n\n\ndef intersect_brute(rect1, rect2):\n    return any(\n        (rect2.x1 <= x <= rect2.x2) and (rect2.y1 <= y <= rect2.y2)\n        for x in range(rect1.x1, rect1.x2 + 1)\n        for y in range(rect1.y1, rect1.y2 + 1)\n    )\n\n\ndef rand_rect(size=10):\n    x1 = randint(0, 30 - size)\n    y1 = randint(0, 30 - size)\n    return elements.Rectangle(x1, y1, x1 + size, y1 + size)\n\n\n@pytest.fixture\ndef test_layoutelements():\n    coords = np.array(\n        [\n            [0.6, 0.6, 0.65, 0.65],  # One little table nested inside all the others\n            [0.5, 0.5, 0.7, 0.7],  # One nested table\n            [0, 0, 1, 1],  # Big table\n            [0.01, 0.01, 0.09, 0.09],\n            [0.02, 0.02, 1.02, 1.02],\n            [0.03, 0.03, 1.03, 1.03],\n            [0.04, 0.04, 1.04, 1.04],\n            [0.05, 0.05, 1.05, 1.05],\n            [2, 2, 3, 3],  # Big table\n        ],\n    )\n    element_class_ids = np.array([1, 1, 1, 0, 0, 0, 0, 0, 2])\n    class_map = {0: \"type0\", 1: \"type1\", 2: \"type2\"}\n    return LayoutElements(\n        element_coords=coords,\n        element_class_ids=element_class_ids,\n        element_class_id_map=class_map,\n        source=Source.YOLOX,\n    )\n\n\n@pytest.mark.parametrize(\n    (\"rect1\", \"rect2\", \"expected\"),\n    [\n        (Rectangle(0, 0, 1, 1), Rectangle(0, 0, None, None), None),\n        (Rectangle(0, 0, None, None), Rectangle(0, 0, 1, 1), None),\n    ],\n)\ndef test_unhappy_intersection(rect1, rect2, expected):\n    assert rect1.intersection(rect2) == expected\n    assert not rect1.intersects(rect2)\n\n\n@pytest.mark.parametrize(\"second_size\", [10, 20])\ndef test_intersects(second_size):\n    for _ in range(1000):\n        rect1 = rand_rect()\n        rect2 = rand_rect(second_size)\n        assert intersect_brute(rect1, rect2) == rect1.intersects(rect2) == rect2.intersects(rect1)\n        if rect1.intersects(rect2):\n            if rect1.is_in(rect2):\n                assert rect1.intersection(rect2) == rect1 == rect2.intersection(rect1)\n            elif rect2.is_in(rect1):\n                assert rect2.intersection(rect1) == rect2\n            else:\n                x1 = max(rect1.x1, rect2.x1)\n                x2 = min(rect1.x2, rect2.x2)\n                y1 = max(rect1.y1, rect2.y1)\n                y2 = min(rect1.y2, rect2.y2)\n                intersection = elements.Rectangle(x1, y1, x2, y2)\n                assert rect1.intersection(rect2) == intersection == rect2.intersection(rect1)\n        else:\n            assert rect1.intersection(rect2) is None\n            assert rect2.intersection(rect1) is None\n\n\ndef test_intersection_of_lots_of_rects():\n    for _ in range(1000):\n        n_rects = 10\n        rects = [rand_rect(6) for _ in range(n_rects)]\n        intersection_mtx = elements.intersections(*rects)\n        for i in range(n_rects):\n            for j in 
range(n_rects):\n                assert (\n                    intersect_brute(rects[i], rects[j])\n                    == intersection_mtx[i, j]\n                    == intersection_mtx[j, i]\n                )\n\n\ndef test_rectangle_width_height():\n    for _ in range(1000):\n        x1 = randint(0, 50)\n        x2 = randint(x1 + 1, 100)\n        y1 = randint(0, 50)\n        y2 = randint(y1 + 1, 100)\n        rect = elements.Rectangle(x1, y1, x2, y2)\n        assert rect.width == x2 - x1\n        assert rect.height == y2 - y1\n\n\ndef test_minimal_containing_rect():\n    for _ in range(1000):\n        rect1 = rand_rect()\n        rect2 = rand_rect()\n        big_rect = elements.minimal_containing_region(rect1, rect2)\n        for decrease_attr in [\"x1\", \"y1\", \"x2\", \"y2\"]:\n            almost_as_big_rect = rand_rect()\n            mod = 1 if decrease_attr.endswith(\"1\") else -1\n            for attr in [\"x1\", \"y1\", \"x2\", \"y2\"]:\n                if attr == decrease_attr:\n                    setattr(almost_as_big_rect, attr, getattr(big_rect, attr) + mod)\n                else:\n                    setattr(almost_as_big_rect, attr, getattr(big_rect, attr))\n            assert not rect1.is_in(almost_as_big_rect) or not rect2.is_in(almost_as_big_rect)\n\n        assert rect1.is_in(big_rect)\n        assert rect2.is_in(big_rect)\n\n\n@pytest.mark.parametrize(\"coord_type\", [int, float])\ndef test_partition_groups_from_regions(mock_embedded_text_regions, coord_type):\n    words = TextRegions.from_list(mock_embedded_text_regions)\n    words.element_coords = words.element_coords.astype(coord_type)\n    groups = partition_groups_from_regions(words)\n    assert len(groups) == 1\n    text = \"\".join(groups[-1].texts)\n    assert text.startswith(\"Layout\")\n    # test backward compatibility\n    text = \"\".join([str(region) for region in groups[-1].as_list()])\n    assert text.startswith(\"Layout\")\n\n\ndef test_rectangle_padding():\n    rect = Rectangle(x1=0, y1=1, x2=3, y2=4)\n    padded = rect.pad(1)\n    assert (padded.x1, padded.y1, padded.x2, padded.y2) == (-1, 0, 4, 5)\n    assert (rect.x1, rect.y1, rect.x2, rect.y2) == (0, 1, 3, 4)\n\n\ndef test_rectangle_area(monkeypatch):\n    for _ in range(1000):\n        width = randint(0, 20)\n        height = randint(0, 20)\n        with (\n            patch(\n                \"unstructured_inference.inference.elements.Rectangle.height\",\n                new_callable=PropertyMock,\n            ) as mockheight,\n            patch(\n                \"unstructured_inference.inference.elements.Rectangle.width\",\n                new_callable=PropertyMock,\n            ) as mockwidth,\n        ):\n            rect = elements.Rectangle(0, 0, 0, 0)\n            mockheight.return_value = height\n            mockwidth.return_value = width\n            assert rect.area == width * height\n\n\ndef test_rectangle_iou():\n    for _ in range(1000):\n        rect1 = rand_rect()\n        assert rect1.intersection_over_union(rect1) == 1.0\n        rect2 = rand_rect(20)\n        assert rect1.intersection_over_union(rect2) == rect2.intersection_over_union(rect1)\n        if rect1.is_in(rect2):\n            assert rect1.intersection_over_union(rect2) == rect1.area / rect2.area\n        elif rect2.is_in(rect1):\n            assert rect1.intersection_over_union(rect2) == rect2.area / rect1.area\n        else:\n            if rect1.intersection(rect2) is None:\n                assert rect1.intersection_over_union(rect2) == 0.0\n            else:\n     
           intersection = rect1.intersection(rect2).area\n                assert rect1.intersection_over_union(rect2) == intersection / (\n                    rect1.area + rect2.area - intersection\n                )\n\n\ndef test_midpoints():\n    for _ in range(1000):\n        x2 = randint(0, 100)\n        y2 = randint(0, 100)\n        rect1 = elements.Rectangle(0, 0, x2, y2)\n        assert rect1.x_midpoint == x2 / 2.0\n        assert rect1.y_midpoint == y2 / 2.0\n        x_offset = randint(0, 50)\n        y_offset = randint(0, 50)\n        rect2 = elements.Rectangle(x_offset, y_offset, x2 + x_offset, y2 + y_offset)\n        assert rect2.x_midpoint == (x2 / 2.0) + x_offset\n        assert rect2.y_midpoint == (y2 / 2.0) + y_offset\n\n\ndef test_is_disjoint():\n    for _ in range(1000):\n        a = randint(0, 100)\n        b = randint(a + 1, 200)\n        c = randint(b + 1, 300)\n        d = randint(c + 1, 400)\n        e = randint(0, 100)\n        f = randint(e, 200)\n        g = randint(0, 100)\n        h = randint(g, 200)\n        rect1 = elements.Rectangle(a, e, b, f)\n        rect2 = elements.Rectangle(c, g, d, h)\n        assert rect1.is_disjoint(rect2)\n        assert rect2.is_disjoint(rect1)\n        rect3 = elements.Rectangle(e, a, f, b)\n        rect4 = elements.Rectangle(g, c, h, d)\n        assert rect3.is_disjoint(rect4)\n        assert rect4.is_disjoint(rect3)\n\n\n@pytest.mark.parametrize(\n    (\"rect1\", \"rect2\", \"expected\"),\n    [\n        (elements.Rectangle(0, 0, 100, 200), elements.Rectangle(0, 0, 60, 150), 1.0),\n        (elements.Rectangle(0, 0, 100, 100), elements.Rectangle(150, 150, 200, 200), 0.0),\n        (elements.Rectangle(0, 0, 100, 100), elements.Rectangle(50, 50, 150, 150), 0.25),\n        (elements.Rectangle(0, 0, 100, 100), elements.Rectangle(20, 20, 120, 40), 0.8),\n    ],\n)\ndef test_intersection_over_min(\n    rect1: elements.Rectangle,\n    rect2: elements.Rectangle,\n    expected: float,\n):\n    assert (\n        rect1.intersection_over_minimum(rect2) == rect2.intersection_over_minimum(rect1) == expected\n    )\n\n\ndef test_grow_region_to_match_region():\n    from unstructured_inference.inference.elements import (\n        Rectangle,\n        grow_region_to_match_region,\n    )\n\n    a = Rectangle(1, 1, 2, 2)\n    b = Rectangle(1, 1, 5, 5)\n    grow_region_to_match_region(a, b)\n    assert a == Rectangle(1, 1, 5, 5)\n\n\n@pytest.mark.parametrize(\n    (\"rect1\", \"rect2\", \"expected\"),\n    [\n        (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(3, 3, 5.1, 5.1), True),\n        (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(3, 3, 5.2, 5.2), True),\n        (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(7, 7, 10, 10), False),\n    ],\n)\ndef test_is_almost_subregion_of(rect1, rect2, expected):\n    assert expected == rect2.is_almost_subregion_of(rect1)\n\n\n@pytest.mark.parametrize(\n    (\"rect1\", \"rect2\"),\n    [\n        (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(3, 3, 6, 6)),\n        (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(6, 6, 8, 8)),\n        (elements.Rectangle(3, 3, 7, 7), elements.Rectangle(2, 2, 4, 4)),\n        (elements.Rectangle(2, 2, 4, 11), elements.Rectangle(3, 3, 7, 10)),\n        (elements.Rectangle(2, 2, 4, 4), elements.Rectangle(3, 3, 7, 10)),\n        (elements.Rectangle(2, 2, 4, 4), elements.Rectangle(2.5, 2.5, 3.5, 4.5)),\n        (elements.Rectangle(2, 2, 4, 4), elements.Rectangle(3, 1, 4, 3.5)),\n        (elements.Rectangle(2, 2, 4, 4), elements.Rectangle(3, 1, 4.5, 
3.5)),\n    ],\n)\ndef test_separate(rect1, rect2):\n    separate(rect1, rect2)\n\n    # assert not rect1.intersects(rect2) #TODO: fix this test\n\n\ndef test_clean_layoutelements(test_layoutelements):\n    elements = clean_layoutelements(test_layoutelements).as_list()\n    assert len(elements) == 2\n    assert (\n        elements[0].bbox.x1,\n        elements[0].bbox.y1,\n        elements[0].bbox.x2,\n        elements[0].bbox.y2,\n    ) == (0, 0, 1, 1)\n    assert (\n        elements[1].bbox.x1,\n        elements[1].bbox.y1,\n        elements[1].bbox.x2,\n        elements[1].bbox.y2,\n    ) == (2, 2, 3, 3)\n    assert elements[0].source == elements[1].source == Source.YOLOX\n\n\n@pytest.mark.parametrize(\n    (\"coords\", \"class_ids\", \"expected_coords\", \"expected_ids\"),\n    [\n        ([[0, 0, 1, 1], [0, 0, 1, 1]], [0, 1], [[0, 0, 1, 1]], [0]),  # one box\n        (\n            [[0, 0, 1, 1], [0, 0, 1, 1], [1, 1, 2, 2]],\n            [0, 1, 0],\n            [[0, 0, 1, 1], [1, 1, 2, 2]],\n            [0, 0],\n        ),\n        (\n            [[0, 0, 1.4, 1.4], [0, 0, 1, 1], [0.4, 0, 1.4, 1], [1.2, 0, 1.4, 1]],\n            [0, 1, 1, 1],\n            [[0, 0, 1.4, 1.4]],\n            [0],\n        ),\n    ],\n)\ndef test_clean_layoutelements_cases(\n    coords,\n    class_ids,\n    expected_coords,\n    expected_ids,\n):\n    coords = np.array(coords)\n    element_class_ids = np.array(class_ids)\n    elements = LayoutElements(element_coords=coords, element_class_ids=element_class_ids)\n\n    elements = clean_layoutelements(elements)\n    np.testing.assert_array_equal(elements.element_coords, expected_coords)\n    np.testing.assert_array_equal(elements.element_class_ids, expected_ids)\n\n\n@pytest.mark.parametrize(\n    (\"coords\", \"class_ids\", \"class_to_filter\", \"expected_coords\", \"expected_ids\"),\n    [\n        ([[0, 0, 1, 1], [0, 0, 1, 1]], [0, 1], 1, [[0, 0, 1, 1]], [1]),  # one box\n        (\n            [[0, 0, 1, 1], [0, 0, 1, 1], [1, 1, 2, 2]],  # the two identical boxes collapse into one\n            [0, 1, 0],\n            1,\n            [[0, 0, 1, 1], [1, 1, 2, 2]],\n            [1, 0],\n        ),\n        (\n            # a -> b, b -> c, but a not -> c\n            [[0, 0, 1.4, 1.4], [0, 0, 1, 1], [0.4, 0, 1.4, 1], [1.2, 0, 1.4, 1]],\n            [0, 1, 1, 1],\n            1,\n            [[0, 0, 1, 1], [1.2, 0, 1.4, 1], [0, 0, 1.4, 1.4]],\n            [1, 1, 0],\n        ),\n        (\n            # like the case above but a different filtering element type changes the results\n            [[0, 0, 1.4, 1.4], [0, 0, 1, 1], [0.4, 0, 1.4, 1], [1.2, 0, 1.4, 1]],\n            [0, 1, 1, 1],\n            0,\n            [[0, 0, 1.4, 1.4]],\n            [0],\n        ),\n    ],\n)\ndef test_clean_layoutelements_for_class(\n    coords,\n    class_ids,\n    class_to_filter,\n    expected_coords,\n    expected_ids,\n):\n    coords = np.array(coords)\n    element_class_ids = np.array(class_ids)\n    elements = LayoutElements(element_coords=coords, element_class_ids=element_class_ids)\n\n    elements = clean_layoutelements_for_class(elements, element_class=class_to_filter)\n    np.testing.assert_array_equal(elements.element_coords, expected_coords)\n    np.testing.assert_array_equal(elements.element_class_ids, expected_ids)\n\n\ndef test_layoutelements_to_list_and_back(test_layoutelements):\n    back = LayoutElements.from_list(test_layoutelements.as_list())\n    np.testing.assert_array_equal(test_layoutelements.element_coords, back.element_coords)\n
    np.testing.assert_array_equal(test_layoutelements.texts, back.texts)\n    assert all(np.isnan(back.element_probs))\n    assert [\n        test_layoutelements.element_class_id_map[idx]\n        for idx in test_layoutelements.element_class_ids\n    ] == [back.element_class_id_map[idx] for idx in back.element_class_ids]\n\n\ndef test_layoutelements_from_list_no_elements():\n    back = LayoutElements.from_list(elements=[])\n    assert back.sources.size == 0\n    assert back.source is None\n    assert back.element_coords.size == 0\n\n\ndef test_textregions_from_list_no_elements():\n    back = TextRegions.from_list(regions=[])\n    assert back.is_extracted_array.size == 0\n    assert back.is_extracted is None\n    assert back.element_coords.size == 0\n\n\ndef test_layoutelements_concatenate():\n    layout1 = LayoutElements(\n        element_coords=np.array([[0, 0, 1, 1], [1, 1, 2, 2]]),\n        texts=np.array([\"a\", \"two\"]),\n        source=Source.YOLOX,\n        element_class_ids=np.array([0, 1]),\n        element_class_id_map={0: \"type0\", 1: \"type1\"},\n    )\n    layout2 = LayoutElements(\n        element_coords=np.array([[10, 10, 2, 2], [20, 20, 1, 1]]),\n        texts=np.array([\"three\", \"4\"]),\n        sources=np.array([Source.DETECTRON2_ONNX, Source.DETECTRON2_ONNX]),\n        element_class_ids=np.array([0, 1]),\n        element_class_id_map={0: \"type1\", 1: \"type2\"},\n    )\n    joint = LayoutElements.concatenate([layout1, layout2])\n    assert joint.texts.tolist() == [\"a\", \"two\", \"three\", \"4\"]\n    assert [s.value for s in joint.sources.tolist()] == [\n        \"yolox\",\n        \"yolox\",\n        \"detectron2_onnx\",\n        \"detectron2_onnx\",\n    ]\n    assert joint.element_class_ids.tolist() == [0, 1, 1, 2]\n    assert joint.element_class_id_map == {0: \"type0\", 1: \"type1\", 2: \"type2\"}\n\n\n@pytest.mark.parametrize(\n    \"test_elements\",\n    [\n        TextRegions(\n            element_coords=np.array(\n                [\n                    [0.0, 0.0, 1.0, 1.0],\n                    [1.0, 0.0, 1.5, 1.0],\n                    [2.0, 0.0, 2.5, 1.0],\n                    [3.0, 0.0, 4.0, 1.0],\n                    [4.0, 0.0, 5.0, 1.0],\n                ]\n            ),\n            texts=np.array([\"0\", \"1\", \"2\", \"3\", \"4\"]),\n            is_extracted_array=np.array([IsExtracted.TRUE] * 5),\n            is_extracted=IsExtracted.TRUE,\n        ),\n        LayoutElements(\n            element_coords=np.array(\n                [\n                    [0.0, 0.0, 1.0, 1.0],\n                    [1.0, 0.0, 1.5, 1.0],\n                    [2.0, 0.0, 2.5, 1.0],\n                    [3.0, 0.0, 4.0, 1.0],\n                    [4.0, 0.0, 5.0, 1.0],\n                ]\n            ),\n            texts=np.array([\"0\", \"1\", \"2\", \"3\", \"4\"]),\n            sources=np.array([Source.YOLOX] * 5),\n            source=Source.YOLOX,\n            is_extracted_array=np.array([IsExtracted.TRUE] * 5),\n            is_extracted=IsExtracted.TRUE,\n            element_probs=np.array([0.0, 0.1, 0.2, 0.3, 0.4]),\n        ),\n    ],\n)\ndef test_textregions_support_numpy_slicing(test_elements):\n    np.testing.assert_equal(test_elements[1:4].texts, np.array([\"1\", \"2\", \"3\"]))\n    np.testing.assert_equal(test_elements[0::2].texts, np.array([\"0\", \"2\", \"4\"]))\n    np.testing.assert_equal(test_elements[[1, 2, 4]].texts, np.array([\"1\", \"2\", \"4\"]))\n    np.testing.assert_equal(test_elements[np.array([1, 2, 4])].texts, np.array([\"1\", \"2\", \"4\"]))\n
    np.testing.assert_equal(\n        test_elements[np.array([True, False, False, True, False])].texts, np.array([\"0\", \"3\"])\n    )\n    if isinstance(test_elements, LayoutElements):\n        np.testing.assert_almost_equal(test_elements[1:4].element_probs, np.array([0.1, 0.2, 0.3]))\n\n\ndef test_textregions_from_list_collects_sources():\n    \"\"\"Test that TextRegions.from_list() collects each region's source into the sources array\"\"\"\n    from unstructured_inference.inference.elements import TextRegion\n\n    regions = [\n        TextRegion.from_coords(\n            0, 0, 10, 10, text=\"first\", source=Source.YOLOX, is_extracted=IsExtracted.TRUE\n        ),\n        TextRegion.from_coords(\n            10,\n            10,\n            20,\n            20,\n            text=\"second\",\n            source=Source.DETECTRON2_ONNX,\n            is_extracted=IsExtracted.TRUE,\n        ),\n    ]\n\n    text_regions = TextRegions.from_list(regions)\n\n    # from_list() must collect each region's source into the sources array\n    assert text_regions.sources.size > 0, \"sources array should not be empty\"\n    assert text_regions.sources[0] == Source.YOLOX\n    assert text_regions.sources[1] == Source.DETECTRON2_ONNX\n\n\ndef test_textregions_has_sources_field():\n    \"\"\"Test that TextRegions has a sources field\"\"\"\n    text_regions = TextRegions(element_coords=np.array([[0, 0, 10, 10]]))\n\n    # TextRegions must expose both the per-element sources array and the scalar source\n    assert hasattr(text_regions, \"sources\"), \"TextRegions should have a sources field\"\n    assert hasattr(text_regions, \"source\"), \"TextRegions should have a source field\"\n\n\ndef test_textregions_iter_elements_preserves_source():\n    \"\"\"Test that TextRegions.iter_elements() preserves source property\"\"\"\n    from unstructured_inference.inference.elements import TextRegion\n\n    regions = [\n        TextRegion.from_coords(\n            0, 0, 10, 10, text=\"first\", source=Source.YOLOX, is_extracted=IsExtracted.TRUE\n        ),\n    ]\n    text_regions = TextRegions.from_list(regions)\n\n    elements = list(text_regions.iter_elements())\n\n    # iter_elements() must pass each region's source through to TextRegion.from_coords()\n    assert elements[0].source == Source.YOLOX, \"iter_elements() should preserve source\"\n\n\ndef test_textregions_slice_preserves_sources():\n    \"\"\"Test that TextRegions slicing preserves sources array\"\"\"\n    from unstructured_inference.inference.elements import TextRegion\n\n    regions = [\n        TextRegion.from_coords(\n            0, 0, 10, 10, text=\"first\", source=Source.YOLOX, is_extracted=IsExtracted.TRUE\n        ),\n        TextRegion.from_coords(\n            10,\n            10,\n            20,\n            20,\n            text=\"second\",\n            source=Source.DETECTRON2_ONNX,\n            is_extracted=IsExtracted.TRUE,\n        ),\n    ]\n    text_regions = TextRegions.from_list(regions)\n\n    sliced = text_regions[0:1]\n\n    # slicing must carry the sources array over to the sliced TextRegions\n    assert sliced.sources.size > 0, \"Sliced TextRegions should have sources\"\n    assert sliced.sources[0] == Source.YOLOX\n    assert sliced.is_extracted_array[0] is IsExtracted.TRUE\n\n\ndef test_textregions_post_init_handles_sources():\n    \"\"\"Test that TextRegions.__post_init__() handles sources array initialization\"\"\"\n    # Create with source but no sources array\n    text_regions = TextRegions(\n        element_coords=np.array([[0, 0, 10, 10], [10, 10, 20, 20]]), source=Source.YOLOX\n    )\n\n    # __post_init__() must broadcast the scalar source into the sources array\n    assert text_regions.sources.size > 0, \"sources should be initialized from source\"\n    assert text_regions.sources[0] == Source.YOLOX\n    assert text_regions.sources[1] == Source.YOLOX\n\n\ndef test_textregions_from_coords_accepts_source():\n    \"\"\"Test that TextRegion.from_coords() accepts source parameter\"\"\"\n    # from_coords() must accept and store the source parameter\n    region = TextRegion.from_coords(\n        0, 0, 10, 10, text=\"test\", source=Source.YOLOX, is_extracted=IsExtracted.TRUE\n    )\n\n    assert region.source == Source.YOLOX\n    assert region.is_extracted\n\n\n@pytest.mark.skip(reason=\"Not implemented\")\ndef test_textregions_allows_for_single_element_access_and_returns_textregion_with_correct_values():\n    \"\"\"Test that TextRegions allows for single element access and returns a TextRegion with the\n    correct values\"\"\"\n\n    regions = [\n        TextRegion.from_coords(\n            0, 0, 10, 10, text=\"first\", source=Source.YOLOX, is_extracted=IsExtracted.TRUE\n        ),\n        TextRegion.from_coords(\n            0,\n            0,\n            20,\n            20,\n            text=\"second\",\n            source=Source.DETECTRON2_ONNX,\n            is_extracted=IsExtracted.PARTIAL,\n        ),\n    ]\n    text_regions = TextRegions.from_list(regions)\n    for i, region in enumerate(regions):\n        sliced = text_regions[i]\n        assert isinstance(sliced, TextRegion)\n        assert sliced.text == region.text\n        assert sliced.source == region.source\n        assert sliced.is_extracted is region.is_extracted\n"
  },
  {
    "path": "test_unstructured_inference/test_logger.py",
    "content": "import logging\n\nimport pytest\n\nfrom unstructured_inference import logger\n\n\n@pytest.mark.parametrize(\"level\", range(50))\ndef test_translate_log_level(level):\n    level_name = logging.getLevelName(level)\n    if level_name in [\"WARNING\", \"INFO\", \"DEBUG\", \"NOTSET\", \"WARN\"]:\n        expected = 4\n    elif level_name in [\"ERROR\", \"CRITICAL\"]:\n        expected = 3\n    else:\n        expected = 0\n    assert logger.translate_log_level(level) == expected\n"
  },
  {
    "path": "test_unstructured_inference/test_math.py",
    "content": "import numpy as np\nimport pytest\n\nfrom unstructured_inference.math import FLOAT_EPSILON, safe_division\n\n\n@pytest.mark.parametrize(\n    (\"a\", \"b\", \"expected\"),\n    [(0, 0, 0), (0, 1, 0), (1, 0, np.round(1 / FLOAT_EPSILON, 1)), (2, 3, 0.7)],\n)\ndef test_safe_division(a, b, expected):\n    assert np.round(safe_division(a, b), 1) == expected\n"
  },
  {
    "path": "test_unstructured_inference/test_utils.py",
    "content": "import numpy as np\nimport pytest\n\nfrom unstructured_inference.inference.layout import DocumentLayout\nfrom unstructured_inference.utils import (\n    LazyDict,\n    LazyEvaluateInfo,\n    pad_image_with_background_color,\n    strip_tags,\n)\n\n\n# Mocking the DocumentLayout and Page classes\nclass MockPageLayout:\n    def annotate(self, annotation_data):\n        return \"mock_image\"\n\n\nclass MockDocumentLayout(DocumentLayout):\n    @property\n    def pages(self):\n        return [MockPageLayout(), MockPageLayout()]\n\n\ndef test_dict_same():\n    d = {\"a\": 1, \"b\": 2, \"c\": 3}\n    ld = LazyDict(**d)\n    assert all(kd == kld for kd, kld in zip(d, ld))\n    assert all(d[k] == ld[k] for k in d)\n    assert len(ld) == len(d)\n\n\ndef test_lazy_evaluate():\n    called = 0\n\n    def func(x):\n        nonlocal called\n        called += 1\n        return x\n\n    lei = LazyEvaluateInfo(func, 3)\n    assert called == 0\n    ld = LazyDict(a=lei)\n    assert called == 0\n    assert ld[\"a\"] == 3\n    assert called == 1\n\n\n@pytest.mark.parametrize((\"cache\", \"expected\"), [(True, 1), (False, 2)])\ndef test_caches(cache, expected):\n    called = 0\n\n    def func(x):\n        nonlocal called\n        called += 1\n        return x\n\n    lei = LazyEvaluateInfo(func, 3)\n    assert called == 0\n    ld = LazyDict(cache=cache, a=lei)\n    assert called == 0\n    assert ld[\"a\"] == 3\n    assert ld[\"a\"] == 3\n    assert called == expected\n\n\ndef test_pad_image_with_background_color(mock_pil_image):\n    pad = 10\n    width, height = mock_pil_image.size\n    padded = pad_image_with_background_color(mock_pil_image, pad, \"black\")\n    assert padded.size == (width + 2 * pad, height + 2 * pad)\n    np.testing.assert_array_almost_equal(\n        np.array(padded.crop((pad, pad, width + pad, height + pad))),\n        np.array(mock_pil_image),\n    )\n    assert padded.getpixel((1, 1)) == (0, 0, 0)\n\n\ndef test_pad_image_with_invalid_input(mock_pil_image):\n    with pytest.raises(ValueError, match=\"Can not pad an image with negative space!\"):\n        pad_image_with_background_color(mock_pil_image, -1)\n\n\n@pytest.mark.parametrize(\n    (\"html\", \"text\"),\n    [\n        (\"<table>Table</table>\", \"Table\"),\n        # test escaped character\n        (\"<table>y&ltx, x&gtz</table>\", \"y<x, x>z\"),\n        # test tag with parameters\n        (\"<table format=foo>Table\", \"Table\"),\n    ],\n)\ndef test_strip_tags(html, text):\n    assert strip_tags(html) == text\n"
  },
  {
    "path": "test_unstructured_inference/test_visualization.py",
    "content": "from unittest.mock import MagicMock, patch\n\nimport numpy as np\nimport pytest\nfrom PIL import Image\n\nfrom unstructured_inference.inference.elements import TextRegion\nfrom unstructured_inference.visualize import draw_bbox, show_plot\n\n\ndef test_draw_bbox():\n    test_image_arr = np.ones((100, 100, 3), dtype=\"uint8\")\n    image = Image.fromarray(test_image_arr)\n    x1, y1, x2, y2 = (1, 10, 7, 11)\n    rect = TextRegion.from_coords(x1, y1, x2, y2)\n    annotated_image = draw_bbox(image=image, element=rect, details=False)\n    annotated_array = np.array(annotated_image)\n    # Make sure the pixels on the edge of the box are red\n    for i, expected in zip(range(3), [255, 0, 0]):\n        assert all(annotated_array[y1, x1:x2, i] == expected)\n        assert all(annotated_array[y2, x1:x2, i] == expected)\n        assert all(annotated_array[y1:y2, x1, i] == expected)\n        assert all(annotated_array[y1:y2, x2, i] == expected)\n    # Make sure almost all the pixels are not changed\n    assert ((annotated_array[:, :, 0] == 1).mean()) > 0.995\n    assert ((annotated_array[:, :, 1] == 1).mean()) > 0.995\n    assert ((annotated_array[:, :, 2] == 1).mean()) > 0.995\n\n\ndef test_show_plot_with_pil_image(mock_pil_image):\n    mock_fig = MagicMock()\n    mock_ax = MagicMock()\n\n    with (\n        patch(\n            \"matplotlib.pyplot.subplots\",\n            return_value=(mock_fig, mock_ax),\n        ) as mock_subplots,\n        patch(\"matplotlib.pyplot.show\") as mock_show,\n        patch.object(\n            mock_ax,\n            \"imshow\",\n        ) as mock_imshow,\n    ):\n        show_plot(mock_pil_image, desired_width=100)\n\n    mock_subplots.assert_called()\n    mock_imshow.assert_called_with(mock_pil_image)\n    mock_show.assert_called()\n\n\ndef test_show_plot_with_numpy_image(mock_numpy_image):\n    mock_fig = MagicMock()\n    mock_ax = MagicMock()\n\n    with (\n        patch(\n            \"matplotlib.pyplot.subplots\",\n            return_value=(mock_fig, mock_ax),\n        ) as mock_subplots,\n        patch(\"matplotlib.pyplot.show\") as mock_show,\n        patch.object(\n            mock_ax,\n            \"imshow\",\n        ) as mock_imshow,\n    ):\n        show_plot(mock_numpy_image)\n\n    mock_subplots.assert_called()\n    mock_imshow.assert_called_with(mock_numpy_image)\n    mock_show.assert_called()\n\n\ndef test_show_plot_with_unsupported_image_type():\n    with pytest.raises(ValueError) as exec_info:\n        show_plot(\"unsupported_image_type\")\n\n    assert \"Unsupported Image Type\" in str(exec_info.value)\n"
  },
  {
    "path": "unstructured_inference/__init__.py",
    "content": ""
  },
  {
    "path": "unstructured_inference/__version__.py",
    "content": "__version__ = \"1.6.11\"  # pragma: no cover\n"
  },
  {
    "path": "unstructured_inference/config.py",
    "content": "\"\"\"\nThis module contains variables that are permitted to be tweaked via the system environment. For\nexample, model parameters that change the output of an inference call. Constants do NOT belong in\nthis module. Constants are values that are usually names for common options (e.g., color names) or\nsettings that should not be altered without making a code change (e.g., definition of 1Gb of memory\nin bytes). Constants should go into `./constants.py`\n\"\"\"\n\nimport os\nfrom dataclasses import dataclass\n\n\n@dataclass\nclass InferenceConfig:\n    \"\"\"class for configuring inference parameters\"\"\"\n\n    def _get_string(self, var: str, default_value: str = \"\") -> str:\n        \"\"\"attempt to get the value of var from the os environment; if not present return the\n        default_value\"\"\"\n        return os.environ.get(var, default_value)\n\n    def _get_int(self, var: str, default_value: int) -> int:\n        if value := self._get_string(var):\n            return int(value)\n        return default_value\n\n    def _get_float(self, var: str, default_value: float) -> float:\n        if value := self._get_string(var):\n            return float(value)\n        return default_value\n\n    @property\n    def TABLE_IMAGE_BACKGROUND_PAD(self) -> int:\n        \"\"\"number of pixels to pad around a table image with a white background color\n\n        The padding adds NO image data around an identified table bounding box; it simply adds white\n        background around the image\n        \"\"\"\n        return self._get_int(\"TABLE_IMAGE_BACKGROUND_PAD\", 20)\n\n    @property\n    def TT_TABLE_CONF(self) -> float:\n        \"\"\"confidence threshold for table identified by table transformer\"\"\"\n        return self._get_float(\"TT_TABLE_CONF\", 0.5)\n\n    @property\n    def TABLE_COLUMN_CONF(self) -> float:\n        \"\"\"confidence threshold for column identified by table transformer\"\"\"\n        return self._get_float(\"TABLE_COLUMN_CONF\", 0.5)\n\n    @property\n    def TABLE_ROW_CONF(self) -> float:\n        \"\"\"confidence threshold for row identified by table transformer\"\"\"\n        return self._get_float(\"TABLE_ROW_CONF\", 0.5)\n\n    @property\n    def TABLE_COLUMN_HEADER_CONF(self) -> float:\n        \"\"\"confidence threshold for column header identified by table transformer\"\"\"\n        return self._get_float(\"TABLE_COLUMN_HEADER_CONF\", 0.5)\n\n    @property\n    def TABLE_PROJECTED_ROW_HEADER_CONF(self) -> float:\n        \"\"\"confidence threshold for projected row header identified by table transformer\"\"\"\n        return self._get_float(\"TABLE_PROJECTED_ROW_HEADER_CONF\", 0.5)\n\n    @property\n    def TABLE_SPANNING_CELL_CONF(self) -> float:\n        \"\"\"confidence threshold for table spanning cells identified by table transformer\"\"\"\n        return self._get_float(\"TABLE_SPANNING_CELL_CONF\", 0.5)\n\n    @property\n    def TABLE_IOB_THRESHOLD(self) -> float:\n        \"\"\"minimum intersection over box area ratio for a box to be considered part of a larger box\n        it intersects\"\"\"\n        return self._get_float(\"TABLE_IOB_THRESHOLD\", 0.5)\n\n    @property\n    def LAYOUT_SAME_REGION_THRESHOLD(self) -> float:\n        \"\"\"threshold for two layouts' bounding boxes to be considered as the same region\n\n        When the intersection area over union area of the two is larger than this threshold, the two\n        boxes are considered the same region\n        \"\"\"\n        return self._get_float(\"LAYOUT_SAME_REGION_THRESHOLD\", 0.75)\n\n    @property\n    def LAYOUT_SUBREGION_THRESHOLD(self) -> float:\n        \"\"\"threshold for one bounding box to be considered as a sub-region of another bounding box\n\n        When the intersection region area divided by self area is larger than this threshold, self\n        is considered a subregion of the other\n        \"\"\"\n        return self._get_float(\"LAYOUT_SUBREGION_THRESHOLD\", 0.75)\n\n    @property\n    def ELEMENTS_H_PADDING_COEF(self) -> float:\n        \"\"\"When extending the boundaries of a PDF object for the purpose of determining which other\n        elements should be considered in the same text region, we use a relative distance based on\n        some fraction of the block height (typically character height). This is the fraction used\n        for the horizontal extension applied to the left and right sides.\n        \"\"\"\n        return self._get_float(\"ELEMENTS_H_PADDING_COEF\", 0.4)\n\n    @property\n    def ELEMENTS_V_PADDING_COEF(self) -> float:\n        \"\"\"Same as ELEMENTS_H_PADDING_COEF but the vertical extension.\"\"\"\n        return self._get_float(\"ELEMENTS_V_PADDING_COEF\", 0.3)\n\n    @property\n    def IMG_PROCESSOR_LONGEST_EDGE(self) -> int:\n        \"\"\"configuration for DetrImageProcessor to scale images\"\"\"\n        return self._get_int(\"IMG_PROCESSOR_LONGEST_EDGE\", 1333)\n\n    @property\n    def IMG_PROCESSOR_SHORTEST_EDGE(self) -> int:\n        \"\"\"configuration for DetrImageProcessor to scale images\"\"\"\n        return self._get_int(\"IMG_PROCESSOR_SHORTEST_EDGE\", 800)\n\n    @property\n    def PDF_RENDER_MAX_PIXELS_PER_PAGE(self) -> int:\n        \"\"\"maximum number of pixels (width * height) a single PDF page may render to\n\n        Pages whose rendered bitmap would exceed this value are rejected before allocation.\n        Set to 0 to disable the guard.\n        \"\"\"\n        return self._get_int(\"PDF_RENDER_MAX_PIXELS_PER_PAGE\", 1_000_000_000)\n\n\ninference_config = InferenceConfig()\n
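\n# Usage sketch (illustrative; not executed on import). Each property reads its\n# environment variable on every access, so an override takes effect immediately:\n#\n#     import os\n#     os.environ[\"TT_TABLE_CONF\"] = \"0.7\"\n#     assert inference_config.TT_TABLE_CONF == 0.7\n"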
  },
  {
    "path": "unstructured_inference/constants.py",
    "content": "from enum import Enum\n\n\nclass Source(Enum):\n    YOLOX = \"yolox\"\n    DETECTRON2_ONNX = \"detectron2_onnx\"\n    DETECTRON2_LP = \"detectron2_lp\"\n\n\nclass IsExtracted(Enum):\n    TRUE = \"true\"\n    FALSE = \"false\"\n    PARTIAL = \"partial\"\n\n\nclass ElementType:\n    PARAGRAPH = \"Paragraph\"\n    IMAGE = \"Image\"\n    PARAGRAPH_IN_IMAGE = \"ParagraphInImage\"\n    FIGURE = \"Figure\"\n    PICTURE = \"Picture\"\n    TABLE = \"Table\"\n    PARAGRAPH_IN_TABLE = \"ParagraphInTable\"\n    LIST = \"List\"\n    FORM = \"Form\"\n    PARAGRAPH_IN_FORM = \"ParagraphInForm\"\n    CHECK_BOX_CHECKED = \"CheckBoxChecked\"\n    CHECK_BOX_UNCHECKED = \"CheckBoxUnchecked\"\n    RADIO_BUTTON_CHECKED = \"RadioButtonChecked\"\n    RADIO_BUTTON_UNCHECKED = \"RadioButtonUnchecked\"\n    LIST_ITEM = \"List-item\"\n    FORMULA = \"Formula\"\n    CAPTION = \"Caption\"\n    PAGE_HEADER = \"Page-header\"\n    SECTION_HEADER = \"Section-header\"\n    PAGE_FOOTER = \"Page-footer\"\n    FOOTNOTE = \"Footnote\"\n    TITLE = \"Title\"\n    TEXT = \"Text\"\n    UNCATEGORIZED_TEXT = \"UncategorizedText\"\n    PAGE_BREAK = \"PageBreak\"\n    CODE_SNIPPET = \"CodeSnippet\"\n    PAGE_NUMBER = \"PageNumber\"\n    OTHER = \"Other\"\n\n\nFULL_PAGE_REGION_THRESHOLD = 0.99\n\n# this field is defined by pytesseract/unstructured.pytesseract\nTESSERACT_TEXT_HEIGHT = \"height\"\n\nPDF_POINTS_PER_INCH = 72\n"
  },
  {
    "path": "unstructured_inference/inference/__init__.py",
    "content": ""
  },
  {
    "path": "unstructured_inference/inference/elements.py",
    "content": "from __future__ import annotations\n\nfrom copy import deepcopy\nfrom dataclasses import dataclass, field\nfrom functools import cached_property\nfrom typing import Optional, Union\n\nimport numpy as np\n\nfrom unstructured_inference.constants import IsExtracted, Source\nfrom unstructured_inference.math import safe_division\n\n\n@dataclass\nclass Rectangle:\n    x1: Union[int, float]\n    y1: Union[int, float]\n    x2: Union[int, float]\n    y2: Union[int, float]\n\n    def pad(self, padding: Union[int, float]):\n        \"\"\"Increases (or decreases, if padding is negative) the size of the rectangle by extending\n        the boundary outward (resp. inward).\"\"\"\n        out_object = self.hpad(padding).vpad(padding)\n        return out_object\n\n    def hpad(self, padding: Union[int, float]):\n        \"\"\"Increases (or decreases, if padding is negative) the size of the rectangle by extending\n        the left and right sides of the boundary outward (resp. inward).\"\"\"\n        out_object = deepcopy(self)\n        out_object.x1 -= padding\n        out_object.x2 += padding\n        return out_object\n\n    def vpad(self, padding: Union[int, float]):\n        \"\"\"Increases (or decreases, if padding is negative) the size of the rectangle by extending\n        the top and bottom of the boundary outward (resp. inward).\"\"\"\n        out_object = deepcopy(self)\n        out_object.y1 -= padding\n        out_object.y2 += padding\n        return out_object\n\n    @property\n    def width(self) -> Union[int, float]:\n        \"\"\"Width of rectangle\"\"\"\n        return self.x2 - self.x1\n\n    @property\n    def height(self) -> Union[int, float]:\n        \"\"\"Height of rectangle\"\"\"\n        return self.y2 - self.y1\n\n    @property\n    def x_midpoint(self) -> Union[int, float]:\n        \"\"\"Finds the horizontal midpoint of the object.\"\"\"\n        return (self.x2 + self.x1) / 2\n\n    @property\n    def y_midpoint(self) -> Union[int, float]:\n        \"\"\"Finds the vertical midpoint of the object.\"\"\"\n        return (self.y2 + self.y1) / 2\n\n    def is_disjoint(self, other: Rectangle) -> bool:\n        \"\"\"Checks whether this rectangle is disjoint from another rectangle.\"\"\"\n        return not self.intersects(other)\n\n    def intersects(self, other: Rectangle) -> bool:\n        \"\"\"Checks whether this rectangle intersects another rectangle.\"\"\"\n        if self._has_none() or other._has_none():\n            return False\n        return intersections(self, other)[0, 1]\n\n    def is_in(self, other: Rectangle, error_margin: Optional[Union[int, float]] = None) -> bool:\n        \"\"\"Checks whether this rectangle is contained within another rectangle.\"\"\"\n        padded_other = other.pad(error_margin) if error_margin is not None else other\n        return all(\n            [\n                (self.x1 >= padded_other.x1),\n                (self.x2 <= padded_other.x2),\n                (self.y1 >= padded_other.y1),\n                (self.y2 <= padded_other.y2),\n            ],\n        )\n\n    def _has_none(self) -> bool:\n        \"\"\"Returns True when any of the coordinates is None.\"\"\"\n        return any((self.x1 is None, self.x2 is None, self.y1 is None, self.y2 is None))\n\n    @property\n    def coordinates(self):\n        \"\"\"Gets coordinates of the rectangle\"\"\"\n        return ((self.x1, self.y1), (self.x1, self.y2), (self.x2, self.y2), (self.x2, self.y1))\n\n    def intersection(self, other: Rectangle) -> Optional[Rectangle]:\n        
\"\"\"Gives the rectangle that is the intersection of two rectangles, or None if the\n        rectangles are disjoint.\"\"\"\n        if self._has_none() or other._has_none():\n            return None\n        x1 = max(self.x1, other.x1)\n        x2 = min(self.x2, other.x2)\n        y1 = max(self.y1, other.y1)\n        y2 = min(self.y2, other.y2)\n        if x1 > x2 or y1 > y2:\n            return None\n        return Rectangle(x1, y1, x2, y2)\n\n    @property\n    def area(self) -> float:\n        \"\"\"Gives the area of the rectangle.\"\"\"\n        return self.width * self.height\n\n    def intersection_over_union(self, other: Rectangle) -> float:\n        \"\"\"Gives the intersection-over-union of two rectangles. This tends to be a good metric of\n        how similar the regions are. Returns 0 for disjoint rectangles, 1 for two identical\n        rectangles -- area of intersection / area of union.\"\"\"\n        intersection = self.intersection(other)\n        intersection_area = 0.0 if intersection is None else intersection.area\n        union_area = self.area + other.area - intersection_area\n        return safe_division(intersection_area, union_area)\n\n    def intersection_over_minimum(self, other: Rectangle) -> float:\n        \"\"\"Gives the area-of-intersection over the minimum of the areas of the rectangles. Useful\n        for identifying when one rectangle is almost-a-subset of the other. Returns 0 for disjoint\n        rectangles, 1 when either is a subset of the other.\"\"\"\n        intersection = self.intersection(other)\n        intersection_area = 0.0 if intersection is None else intersection.area\n        min_area = min(self.area, other.area)\n        return safe_division(intersection_area, min_area)\n\n    def is_almost_subregion_of(self, other: Rectangle, subregion_threshold: float = 0.75) -> bool:\n        \"\"\"Returns whether this region is almost a subregion of other. 
This is determined by\n        comparing the intersection area over self area to some threshold, and checking whether self\n        is the smaller rectangle.\"\"\"\n        intersection = self.intersection(other)\n        intersection_area = 0.0 if intersection is None else intersection.area\n        return (subregion_threshold < safe_division(intersection_area, self.area)) and (\n            self.area <= other.area\n        )\n\n\ndef minimal_containing_region(*regions: Rectangle) -> Rectangle:\n    \"\"\"Returns the smallest rectangular region that contains all regions passed\"\"\"\n    x1 = min(region.x1 for region in regions)\n    y1 = min(region.y1 for region in regions)\n    x2 = max(region.x2 for region in regions)\n    y2 = max(region.y2 for region in regions)\n\n    return Rectangle(x1, y1, x2, y2)\n\n\ndef intersections(*rects: Rectangle):\n    \"\"\"Returns a square boolean matrix of intersections of an arbitrary number of rectangles, i.e.\n    the ijth entry of the matrix is True if and only if the ith Rectangle and jth Rectangle\n    intersect.\"\"\"\n    # NOTE(alan): Rewrite using line scan\n    coords = np.array([[r.x1, r.y1, r.x2, r.y2] for r in rects])\n    return coords_intersections(coords)\n\n\ndef coords_intersections(coords: np.ndarray) -> np.ndarray:\n    \"\"\"Returns a square boolean matrix of intersections of given stack of coords, i.e.\n    the ijth entry of the matrix is True if and only if the ith coords and jth coords\n    intersect.\"\"\"\n    x1s, y1s, x2s, y2s = coords[:, 0], coords[:, 1], coords[:, 2], coords[:, 3]\n\n    # Use broadcasting to get comparison matrices.\n    # For Rectangles r1 and r2, any of the following conditions makes the rectangles disjoint:\n    # r1.x1 > r2.x2\n    # r1.y1 > r2.y2\n    # r2.x1 > r1.x2\n    # r2.y1 > r1.y2\n    # Then we take the complement (~) of the disjointness matrix to get the intersection matrix.\n    intersections = ~(\n        (x1s[None] > x2s[..., None])\n        | (y1s[None] > y2s[..., None])\n        | (x1s[None] > x2s[..., None]).T\n        | (y1s[None] > y2s[..., None]).T\n    )\n\n    return intersections\n\n\n@dataclass\nclass TextRegion:\n    bbox: Rectangle\n    text: Optional[str] = None\n    source: Optional[Source] = None\n    is_extracted: Optional[IsExtracted] = None\n\n    def __str__(self) -> str:\n        return str(self.text)\n\n    @classmethod\n    def from_coords(\n        cls,\n        x1: Union[int, float],\n        y1: Union[int, float],\n        x2: Union[int, float],\n        y2: Union[int, float],\n        text: Optional[str] = None,\n        source: Optional[Source] = None,\n        is_extracted: Optional[IsExtracted] = None,\n        **kwargs,\n    ) -> TextRegion:\n        \"\"\"Constructs a region from coordinates.\"\"\"\n        bbox = Rectangle(x1, y1, x2, y2)\n\n        return cls(text=text, source=source, is_extracted=is_extracted, bbox=bbox, **kwargs)\n\n\n@dataclass\nclass TextRegions:\n    element_coords: np.ndarray\n    texts: np.ndarray = field(default_factory=lambda: np.array([]))\n    sources: np.ndarray = field(default_factory=lambda: np.array([]))\n    source: Source | None = None\n    is_extracted_array: np.ndarray = field(default_factory=lambda: np.array([]))\n    is_extracted: IsExtracted | None = None\n    _optional_array_attributes: list[str] = field(\n        init=False, default_factory=lambda: [\"texts\", \"sources\", \"is_extracted_array\"]\n    )\n    _scalar_to_array_mappings: dict[str, str] = field(\n        init=False,\n        default_factory=lambda: 
{\n            \"source\": \"sources\",\n            \"is_extracted\": \"is_extracted_array\",\n        },\n    )\n\n    def __post_init__(self):\n        element_size = self.element_coords.shape[0]\n        for scalar, array in self._scalar_to_array_mappings.items():\n            if (\n                getattr(self, scalar) is not None\n                and getattr(self, array).size == 0\n                and element_size\n            ):\n                setattr(self, array, np.array([getattr(self, scalar)] * element_size))\n            elif getattr(self, scalar) is None and getattr(self, array).size > 0:\n                setattr(self, scalar, getattr(self, array)[0])\n        for attr in self._optional_array_attributes:\n            if getattr(self, attr).size == 0 and element_size:\n                setattr(self, attr, np.array([None] * element_size))\n\n        # we convert to float so data type is more consistent (e.g., None will be np.nan)\n        self.element_coords = self.element_coords.astype(float)\n\n    def __getitem__(self, indices) -> TextRegions:\n        return self.slice(indices)\n\n    def slice(self, indices) -> TextRegions:\n        \"\"\"slice text regions based on indices\"\"\"\n        # NOTE(alan): I would expect if I try to access a single element, it should return a\n        # TextRegion, not a TextRegions. Currently, you get an error when trying to access a single\n        # element.\n        if self.element_coords[indices].ndim == 1:\n            # We've indexed a single element. For now this isn't implemented.\n            raise NotImplementedError(\"Slicing a single element is not implemented\")\n        return TextRegions(\n            element_coords=self.element_coords[indices],\n            texts=self.texts[indices],\n            sources=self.sources[indices],\n            is_extracted_array=self.is_extracted_array[indices],\n        )\n\n    def iter_elements(self):\n        \"\"\"iter text regions as one TextRegion per iteration; this returns a generator and has less\n        memory impact than the as_list method\"\"\"\n        for (x1, y1, x2, y2), text, source, is_extracted in zip(\n            self.element_coords,\n            self.texts,\n            self.sources,\n            self.is_extracted_array,\n        ):\n            yield TextRegion.from_coords(x1, y1, x2, y2, text, source, is_extracted)\n\n    def as_list(self):\n        \"\"\"return a list of LayoutElement for backward compatibility\"\"\"\n        return list(self.iter_elements())\n\n    @classmethod\n    def from_list(cls, regions: list):\n        \"\"\"create TextRegions from a list of TextRegion objects; the objects must have the same\n        is_extracted\"\"\"\n        coords, texts, sources, is_extracted_array = [], [], [], []\n        for region in regions:\n            coords.append((region.bbox.x1, region.bbox.y1, region.bbox.x2, region.bbox.y2))\n            texts.append(region.text)\n            sources.append(region.source)\n            is_extracted_array.append(region.is_extracted)\n        return cls(\n            element_coords=np.array(coords),\n            texts=np.array(texts),\n            sources=np.array(sources),\n            is_extracted_array=np.array(is_extracted_array),\n        )\n\n    def __len__(self):\n        return self.element_coords.shape[0]\n\n    @property\n    def x1(self):\n        \"\"\"left coordinate\"\"\"\n        return self.element_coords[:, 0]\n\n    @property\n    def y1(self):\n        \"\"\"top coordinate\"\"\"\n        return 
self.element_coords[:, 1]\n\n    @property\n    def x2(self):\n        \"\"\"right coordinate\"\"\"\n        return self.element_coords[:, 2]\n\n    @property\n    def y2(self):\n        \"\"\"bottom coordinate\"\"\"\n        return self.element_coords[:, 3]\n\n    @cached_property\n    def areas(self) -> np.ndarray:\n        \"\"\"areas of each region; computed only when needed\"\"\"\n        return (self.x2 - self.x1) * (self.y2 - self.y1)\n\n\nclass EmbeddedTextRegion(TextRegion):\n    pass\n\n\nclass ImageTextRegion(TextRegion):\n    pass\n\n\ndef region_bounding_boxes_are_almost_the_same(\n    region1: Rectangle,\n    region2: Rectangle,\n    same_region_threshold: float = 0.75,\n) -> bool:\n    \"\"\"Returns whether bounding boxes are almost the same. This is determined by checking if the\n    intersection over union is above some threshold.\"\"\"\n    return region1.intersection_over_union(region2) > same_region_threshold\n\n\ndef grow_region_to_match_region(region_to_grow: Rectangle, region_to_match: Rectangle):\n    \"\"\"Grows a region to the minimum size necessary to contain both regions.\"\"\"\n    (new_x1, new_y1), _, (new_x2, new_y2), _ = minimal_containing_region(\n        region_to_grow,\n        region_to_match,\n    ).coordinates\n    region_to_grow.x1, region_to_grow.y1, region_to_grow.x2, region_to_grow.y2 = (\n        new_x1,\n        new_y1,\n        new_x2,\n        new_y2,\n    )\n
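\n\n# Worked example for intersections()/coords_intersections() (illustrative sketch;\n# comments only, not executed): given\n#     a = Rectangle(0, 0, 2, 2)\n#     b = Rectangle(1, 1, 3, 3)\n#     c = Rectangle(5, 5, 6, 6)\n# a and b overlap, c is disjoint from both, and every rectangle intersects itself,\n# so intersections(a, b, c) returns the boolean matrix\n#     [[ True,  True, False],\n#      [ True,  True, False],\n#      [False, False,  True]]\n"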
  },
  {
    "path": "unstructured_inference/inference/layout.py",
    "content": "from __future__ import annotations\n\nimport os\nimport tempfile\nfrom functools import cached_property\nfrom pathlib import PurePath\nfrom typing import Any, BinaryIO, Collection, List, Optional, Union, cast\n\nimport numpy as np\nfrom PIL import Image, ImageSequence\n\nfrom unstructured_inference.inference import pdf_image as pdf_image_utils\nfrom unstructured_inference.inference.elements import (\n    TextRegion,\n)\nfrom unstructured_inference.inference.layoutelement import LayoutElement, LayoutElements\nfrom unstructured_inference.logger import logger\nfrom unstructured_inference.models.base import get_model\nfrom unstructured_inference.models.unstructuredmodel import (\n    UnstructuredElementExtractionModel,\n    UnstructuredObjectDetectionModel,\n)\nfrom unstructured_inference.visualize import draw_bbox\n\nconvert_pdf_to_image = pdf_image_utils.convert_pdf_to_image\n_pdfium_lock = pdf_image_utils._pdfium_lock\n\n\nclass DocumentLayout:\n    \"\"\"Class for handling documents that are saved as .pdf files. For .pdf files, a\n    document image analysis (DIA) model detects the layout of the page prior to extracting\n    elements.\"\"\"\n\n    def __init__(self, pages=None):\n        self._pages = pages\n\n    def __str__(self) -> str:\n        return \"\\n\\n\".join([str(page) for page in self.pages])\n\n    @property\n    def pages(self) -> List[PageLayout]:\n        \"\"\"Gets all pages in sequential order.\"\"\"\n        return self._pages\n\n    @classmethod\n    def from_pages(cls, pages: List[PageLayout]) -> DocumentLayout:\n        \"\"\"Generates a new instance of the class from a list of `PageLayout`s\"\"\"\n        doc_layout = cls()\n        doc_layout._pages = pages\n        return doc_layout\n\n    @classmethod\n    def from_file(\n        cls,\n        filename: str,\n        fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,\n        pdf_image_dpi: int = 200,\n        pdf_render_max_pixels_per_page: Optional[int] = None,\n        password: Optional[str] = None,\n        **kwargs,\n    ) -> DocumentLayout:\n        \"\"\"Creates a DocumentLayout from a pdf file.\"\"\"\n        logger.info(f\"Reading PDF for file: {filename} ...\")\n\n        with tempfile.TemporaryDirectory() as temp_dir:\n            _image_paths = convert_pdf_to_image(\n                filename=filename,\n                dpi=pdf_image_dpi,\n                output_folder=temp_dir,\n                path_only=True,\n                password=password,\n                pdf_render_max_pixels_per_page=pdf_render_max_pixels_per_page,\n            )\n            image_paths = cast(List[str], _image_paths)\n            number_of_pages = len(image_paths)\n            pages: List[PageLayout] = []\n            if fixed_layouts is None:\n                fixed_layouts = [None for _ in range(0, number_of_pages)]\n            for i, (image_path, fixed_layout) in enumerate(zip(image_paths, fixed_layouts)):\n                # NOTE(robinson) - In the future, maybe we detect the page number and default\n                # to the index if it is not detected\n                with Image.open(image_path) as image:\n                    page = PageLayout.from_image(\n                        image,\n                        number=i + 1,\n                        document_filename=filename,\n                        fixed_layout=fixed_layout,\n                        pdf_render_max_pixels_per_page=pdf_render_max_pixels_per_page,\n                        **kwargs,\n                    )\n   
                 pages.append(page)\n            return cls.from_pages(pages)\n\n    @classmethod\n    def from_image_file(\n        cls,\n        filename: str,\n        detection_model: Optional[UnstructuredObjectDetectionModel] = None,\n        element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,\n        fixed_layout: Optional[List[TextRegion]] = None,\n        **kwargs,\n    ) -> DocumentLayout:\n        \"\"\"Creates a DocumentLayout from an image file.\"\"\"\n        logger.info(f\"Reading image file: {filename} ...\")\n        try:\n            image = Image.open(filename)\n            format = image.format\n            images: list[Image.Image] = []\n            for i, im in enumerate(ImageSequence.Iterator(image)):\n                im = im.convert(\"RGB\")\n                im.format = format\n                images.append(im)\n        except Exception as e:\n            if os.path.isdir(filename) or os.path.isfile(filename):\n                raise e\n            else:\n                raise FileNotFoundError(f'File \"{filename}\" not found!') from e\n        pages = []\n        for i, image in enumerate(images):  # type: ignore\n            page = PageLayout.from_image(\n                image,\n                image_path=filename,\n                number=i,\n                detection_model=detection_model,\n                element_extraction_model=element_extraction_model,\n                fixed_layout=fixed_layout,\n                **kwargs,\n            )\n            pages.append(page)\n        return cls.from_pages(pages)\n\n\nclass PageLayout:\n    \"\"\"Class for an individual PDF page.\"\"\"\n\n    def __init__(\n        self,\n        number: int,\n        image: Image.Image,\n        image_metadata: Optional[dict] = None,\n        image_path: Optional[Union[str, PurePath]] = None,  # TODO: Deprecate\n        document_filename: Optional[Union[str, PurePath]] = None,\n        detection_model: Optional[UnstructuredObjectDetectionModel] = None,\n        element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,\n        pdf_render_max_pixels_per_page: Optional[int] = None,\n        password: Optional[str] = None,\n    ):\n        if detection_model is not None and element_extraction_model is not None:\n            raise ValueError(\"Only one of detection_model and extraction_model should be passed.\")\n        self.image: Optional[Image.Image] = image\n        if image_metadata is None:\n            image_metadata = {}\n        self.image_metadata = image_metadata\n        self.image_path = image_path\n        self.image_array: Union[np.ndarray[Any, Any], None] = None\n        self.document_filename = document_filename\n        self.number = number\n        self.detection_model = detection_model\n        self.element_extraction_model = element_extraction_model\n        self.pdf_render_max_pixels_per_page = pdf_render_max_pixels_per_page\n        self.elements_array: LayoutElements | None = None\n        self.password = password\n        # NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has\n        # locations now and if we need to support LayoutElements without bounding boxes we can make\n        # the bbox property optional\n\n    def __str__(self) -> str:\n        return \"\\n\\n\".join([str(element) for element in self.elements])\n\n    @cached_property\n    def elements(self) -> Collection[LayoutElement]:\n        \"\"\"return a list of layout elements from the array data structure; 
intended for backward\n        compatibility\"\"\"\n        if self.elements_array is None:\n            return []\n        return self.elements_array.as_list()\n\n    def get_elements_using_image_extraction(\n        self,\n        inplace=True,\n    ) -> Optional[list[LayoutElement]]:\n        \"\"\"Uses end-to-end text element extraction model to extract the elements on the page.\"\"\"\n        if self.element_extraction_model is None:\n            raise ValueError(\n                \"Cannot get elements using image extraction, no image extraction model defined\",\n            )\n        assert self.image is not None\n        elements = self.element_extraction_model(self.image)\n        if inplace:\n            self.elements = elements\n            return None\n        return elements\n\n    def get_elements_with_detection_model(\n        self,\n        inplace: bool = True,\n    ) -> Optional[LayoutElements]:\n        \"\"\"Uses specified model to detect the elements on the page.\"\"\"\n        if self.detection_model is None:\n            model = get_model()\n            if isinstance(model, UnstructuredObjectDetectionModel):\n                self.detection_model = model\n            else:\n                raise NotImplementedError(\"Default model should be a detection model\")\n\n        # NOTE(mrobinson) - We'll want make this model inference step some kind of\n        # remote call in the future.\n        assert self.image is not None\n        inferred_layout: LayoutElements = self.detection_model(self.image)\n        routing = inferred_layout.routing\n        routing_score = inferred_layout.routing_score\n        inferred_layout = self.detection_model.deduplicate_detected_elements(\n            inferred_layout,\n        )\n        inferred_layout.routing = routing\n        inferred_layout.routing_score = routing_score\n\n        if inplace:\n            self.elements_array = inferred_layout\n            return None\n\n        return inferred_layout\n\n    def _get_image_array(self) -> Union[np.ndarray[Any, Any], None]:\n        \"\"\"Converts the raw image into a numpy array.\"\"\"\n        if self.image_array is None:\n            if self.image:\n                self.image_array = np.array(self.image)\n            else:\n                image = Image.open(self.image_path)  # type: ignore\n                self.image_array = np.array(image)\n        return self.image_array\n\n    def annotate(\n        self,\n        colors: Optional[Union[List[str], str]] = None,\n        image_dpi: int = 200,\n        annotation_data: Optional[dict[str, dict]] = None,\n        add_details: bool = False,\n        sources: Optional[List[str]] = None,\n    ) -> Image.Image:\n        \"\"\"Annotates the elements on the page image.\n        if add_details is True, and the elements contain type and source attributes, then\n        the type and source will be added to the image.\n        sources is a list of sources to annotate. If sources is [\"all\"], then all sources will be\n        annotated. 
Current sources allowed are \"yolox\",\"detectron2_onnx\" and \"detectron2_lp\" \"\"\"\n        if colors is None:\n            colors = [\"red\" for _ in self.elements]\n        if isinstance(colors, str):\n            colors = [colors]\n        # If there aren't enough colors, just cycle through the colors a few times\n        if len(colors) < len(self.elements):\n            n_copies = (len(self.elements) // len(colors)) + 1\n            colors = colors * n_copies\n\n        # Hotload image if it hasn't been loaded yet\n        if self.image:\n            img = self.image.copy()\n        elif self.image_path:\n            img = Image.open(self.image_path)\n        else:\n            img = self._get_image(self.document_filename, self.number, image_dpi)\n\n        if annotation_data is None:\n            for el, color in zip(self.elements, colors):\n                if sources is None or el.source in sources:\n                    img = draw_bbox(img, el, color=color, details=add_details)\n        else:\n            for attribute, style in annotation_data.items():\n                if hasattr(self, attribute) and getattr(self, attribute):\n                    color = style[\"color\"]\n                    width = style[\"width\"]\n                    for region in getattr(self, attribute):\n                        required_source = getattr(region, \"source\", None)\n                        if (sources is None) or (required_source in sources):\n                            img = draw_bbox(\n                                img,\n                                region,\n                                color=color,\n                                width=width,\n                                details=add_details,\n                            )\n\n        return img\n\n    def _get_image(self, filename, page_number, pdf_image_dpi: int = 200) -> Image.Image:\n        \"\"\"Hotloads a page image from a pdf file.\"\"\"\n\n        with tempfile.TemporaryDirectory() as temp_dir:\n            _image_paths = convert_pdf_to_image(\n                filename=filename,\n                dpi=pdf_image_dpi,\n                output_folder=temp_dir,\n                path_only=True,\n                pdf_render_max_pixels_per_page=self.pdf_render_max_pixels_per_page,\n            )\n            image_paths = cast(List[str], _image_paths)\n            if page_number > len(image_paths):\n                raise ValueError(\n                    f\"Page number {page_number} is greater than the number of pages in the PDF.\",\n                )\n\n            with Image.open(image_paths[page_number - 1]) as image:\n                return image.copy()\n\n    @classmethod\n    def from_image(\n        cls,\n        image: Image.Image,\n        image_path: Optional[Union[str, PurePath]] = None,\n        document_filename: Optional[Union[str, PurePath]] = None,\n        number: int = 1,\n        detection_model: Optional[UnstructuredObjectDetectionModel] = None,\n        element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,\n        fixed_layout: Optional[List[TextRegion]] = None,\n        pdf_render_max_pixels_per_page: Optional[int] = None,\n    ):\n        \"\"\"Creates a PageLayout from an already-loaded PIL Image.\"\"\"\n\n        page = cls(\n            number=number,\n            image=image,\n            detection_model=detection_model,\n            element_extraction_model=element_extraction_model,\n            pdf_render_max_pixels_per_page=pdf_render_max_pixels_per_page,\n        )\n        
# FIXME (yao): refactor the other methods so they all return elements like the third route\n        if page.element_extraction_model is not None:\n            page.get_elements_using_image_extraction()\n        elif fixed_layout is None:\n            page.get_elements_with_detection_model()\n        else:\n            page.elements = []\n\n        page.image_metadata = {\n            \"format\": page.image.format if page.image else None,\n            \"width\": page.image.width if page.image else None,\n            \"height\": page.image.height if page.image else None,\n            \"pdf_rotation\": int(page.image.info.get(\"pdf_rotation\", 0)) if page.image else 0,\n        }\n        page.image_path = os.path.abspath(image_path) if image_path else None\n        page.document_filename = os.path.abspath(document_filename) if document_filename else None\n\n        # Clear the image to save memory\n        page.image = None\n\n        return page\n\n\ndef process_data_with_model(\n    data: BinaryIO,\n    model_name: Optional[str],\n    password: Optional[str] = None,\n    **kwargs: Any,\n) -> DocumentLayout:\n    \"\"\"Process PDF or image as file-like object `data` into a `DocumentLayout`.\n\n    Uses the model identified by `model_name`.\n    \"\"\"\n    # Note: We use a temp dir, not a temp file,\n    # because the latter fails on Windows\n    # https://github.com/Unstructured-IO/unstructured-inference/pull/376\n    with tempfile.TemporaryDirectory() as tmp_dir_path:\n        file_path = os.path.join(tmp_dir_path, \"document\")\n        with open(file_path, \"wb\") as f:\n            f.write(data.read())\n            f.flush()\n        layout = process_file_with_model(\n            file_path,\n            model_name,\n            password=password,\n            **kwargs,\n        )\n\n    return layout\n\n\ndef process_file_with_model(\n    filename: str,\n    model_name: Optional[str],\n    is_image: bool = False,\n    fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,\n    pdf_image_dpi: int = 200,\n    pdf_render_max_pixels_per_page: Optional[int] = None,\n    password: Optional[str] = None,\n    **kwargs: Any,\n) -> DocumentLayout:\n    \"\"\"Processes a pdf or image file named `filename` into a DocumentLayout using the\n    model identified by `model_name`.\"\"\"\n\n    model = get_model(model_name, **kwargs)\n    if isinstance(model, UnstructuredObjectDetectionModel):\n        detection_model = model\n        element_extraction_model = None\n    elif isinstance(model, UnstructuredElementExtractionModel):\n        detection_model = None\n        element_extraction_model = model\n    else:\n        raise ValueError(f\"Unsupported model type: {type(model)}\")\n    layout = (\n        DocumentLayout.from_image_file(\n            filename,\n            detection_model=detection_model,\n            element_extraction_model=element_extraction_model,\n            **kwargs,\n        )\n        if is_image\n        else DocumentLayout.from_file(\n            filename,\n            detection_model=detection_model,\n            element_extraction_model=element_extraction_model,\n            fixed_layouts=fixed_layouts,\n            pdf_image_dpi=pdf_image_dpi,\n            pdf_render_max_pixels_per_page=pdf_render_max_pixels_per_page,\n            password=password,\n            **kwargs,\n        )\n    )\n    return layout\n
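\n\n# Usage sketch (illustrative only; assumes model weights are available locally and\n# \"sample.pdf\" is a hypothetical input path; model_name=None selects the default model):\n#\n#     layout = process_file_with_model(\"sample.pdf\", model_name=None)\n#     for page in layout.pages:\n#         for element in page.elements:\n#             print(element)\n"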
  },
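  {
    "path": "examples/sketch_process_file_with_model.py",
    "content": "\"\"\"Illustrative usage sketch added by the editor; not shipped with the library.\n\nRuns the high-level entry point from layout.py end to end. Assumes a local file\nnamed `sample.pdf`, network access to download the default model on first use,\nand that `DocumentLayout` exposes its pages via a `pages` attribute holding\n`LayoutElement` objects in `page.elements`.\"\"\"\n\nfrom unstructured_inference.inference.layout import process_file_with_model\n\n# model_name=None falls back to UNSTRUCTURED_DEFAULT_MODEL_NAME or \"yolox\"\nlayout = process_file_with_model(\"sample.pdf\", model_name=None)\nfor page in layout.pages:\n    for element in page.elements:\n        print(element.to_dict())\n"
  },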
  {
    "path": "unstructured_inference/inference/layoutelement.py",
    "content": "from __future__ import annotations\n\nfrom dataclasses import dataclass, field\nfrom typing import Any, Iterable, List, Optional, Union\n\nimport numpy as np\nfrom pandas import DataFrame\nfrom scipy.sparse.csgraph import connected_components\n\nfrom unstructured_inference.config import inference_config\nfrom unstructured_inference.constants import IsExtracted, Source\nfrom unstructured_inference.inference.elements import (\n    Rectangle,\n    TextRegion,\n    TextRegions,\n    coords_intersections,\n)\n\nEPSILON_AREA = 1e-7\n\n\n@dataclass\nclass LayoutElements(TextRegions):\n    element_probs: np.ndarray = field(default_factory=lambda: np.array([]))\n    element_class_ids: np.ndarray = field(default_factory=lambda: np.array([]))\n    element_class_id_map: dict[int, str] = field(default_factory=dict)\n    text_as_html: np.ndarray = field(default_factory=lambda: np.array([]))\n    table_as_cells: np.ndarray = field(default_factory=lambda: np.array([]))\n    table_extraction_method: np.ndarray = field(default_factory=lambda: np.array([]))\n    routing: str | None = None\n    routing_score: float | None = None\n    _optional_array_attributes: list[str] = field(\n        init=False,\n        default_factory=lambda: [\n            \"texts\",\n            \"sources\",\n            \"is_extracted_array\",\n            \"element_probs\",\n            \"element_class_ids\",\n            \"text_as_html\",\n            \"table_as_cells\",\n            \"table_extraction_method\",\n        ],\n    )\n    _scalar_to_array_mappings: dict[str, str] = field(\n        init=False,\n        default_factory=lambda: {\n            \"source\": \"sources\",\n            \"is_extracted\": \"is_extracted_array\",\n        },\n    )\n\n    def __post_init__(self):\n        super().__post_init__()\n        self.element_probs = self.element_probs.astype(float)\n\n    def __eq__(self, other: object) -> bool:\n        if not isinstance(other, LayoutElements):\n            return NotImplemented\n\n        mask = ~np.isnan(self.element_probs)\n        other_mask = ~np.isnan(other.element_probs)\n        return (\n            np.array_equal(self.element_coords, other.element_coords)\n            and np.array_equal(self.texts, other.texts)\n            and np.array_equal(mask, other_mask)\n            and np.array_equal(self.element_probs[mask], other.element_probs[mask])\n            and (\n                [self.element_class_id_map[idx] for idx in self.element_class_ids]\n                == [other.element_class_id_map[idx] for idx in other.element_class_ids]\n            )\n            and np.array_equal(self.sources[mask], other.sources[mask])\n            and np.array_equal(self.is_extracted_array[mask], other.is_extracted_array[mask])\n            and np.array_equal(self.text_as_html[mask], other.text_as_html[mask])\n            and np.array_equal(self.table_as_cells[mask], other.table_as_cells[mask])\n            and np.array_equal(\n                self.table_extraction_method[mask], other.table_extraction_method[mask]\n            )\n        )\n\n    def __getitem__(self, indices):\n        return self.slice(indices)\n\n    def slice(self, indices) -> LayoutElements:\n        \"\"\"slice and return only selected indices\"\"\"\n        return LayoutElements(\n            element_coords=self.element_coords[indices],\n            texts=self.texts[indices],\n            is_extracted_array=self.is_extracted_array[indices],\n            sources=self.sources[indices],\n            
element_probs=self.element_probs[indices],\n            element_class_ids=self.element_class_ids[indices],\n            element_class_id_map=self.element_class_id_map,\n            text_as_html=self.text_as_html[indices],\n            table_as_cells=self.table_as_cells[indices],\n            table_extraction_method=self.table_extraction_method[indices],\n        )\n\n    @classmethod\n    def concatenate(cls, groups: Iterable[LayoutElements]) -> LayoutElements:\n        \"\"\"concatenate a sequence of LayoutElements in order as one LayoutElements\"\"\"\n        coords, texts, probs, class_ids, sources, is_extracted_array = [], [], [], [], [], []\n        text_as_html, table_as_cells, table_extraction_method = [], [], []\n        class_id_reverse_map: dict[str, int] = {}\n        for group in groups:\n            coords.append(group.element_coords)\n            texts.append(group.texts)\n            probs.append(group.element_probs)\n            sources.append(group.sources)\n            is_extracted_array.append(group.is_extracted_array)\n            text_as_html.append(group.text_as_html)\n            table_as_cells.append(group.table_as_cells)\n            table_extraction_method.append(group.table_extraction_method)\n\n            idx = group.element_class_ids.copy()\n            if group.element_class_id_map:\n                for class_id, class_name in group.element_class_id_map.items():\n                    if class_name in class_id_reverse_map:\n                        idx[group.element_class_ids == class_id] = class_id_reverse_map[class_name]\n                        continue\n                    new_id = len(class_id_reverse_map)\n                    class_id_reverse_map[class_name] = new_id\n                    idx[group.element_class_ids == class_id] = new_id\n            class_ids.append(idx)\n\n        return cls(\n            element_coords=np.concatenate(coords),\n            texts=np.concatenate(texts),\n            element_probs=np.concatenate(probs),\n            element_class_ids=np.concatenate(class_ids),\n            element_class_id_map={v: k for k, v in class_id_reverse_map.items()},\n            sources=np.concatenate(sources),\n            is_extracted_array=np.concatenate(is_extracted_array),\n            text_as_html=np.concatenate(text_as_html),\n            table_as_cells=np.concatenate(table_as_cells),\n            table_extraction_method=np.concatenate(table_extraction_method),\n        )\n\n    def iter_elements(self):\n        \"\"\"iter elements as one LayoutElement per iteration; this returns a generator and has less\n        memory impact than the as_list method\"\"\"\n        for (\n            (x1, y1, x2, y2),\n            text,\n            prob,\n            class_id,\n            source,\n            is_extracted,\n            text_as_html,\n            table_as_cells,\n            table_extraction_method,\n        ) in zip(\n            self.element_coords,\n            self.texts,\n            self.element_probs,\n            self.element_class_ids,\n            self.sources,\n            self.is_extracted_array,\n            self.text_as_html,\n            self.table_as_cells,\n            self.table_extraction_method,\n        ):\n            yield LayoutElement.from_coords(\n                x1,\n                y1,\n                x2,\n                y2,\n                text=text,\n                type=(\n                    self.element_class_id_map[class_id]\n                    if class_id is not None and self.element_class_id_map\n      
else None\n                ),\n                prob=None if np.isnan(prob) else prob,\n                source=source,\n                is_extracted=is_extracted,\n                text_as_html=text_as_html,\n                table_as_cells=table_as_cells,\n                table_extraction_method=table_extraction_method,\n            )\n\n    @classmethod\n    def from_list(cls, elements: list):\n        \"\"\"create LayoutElements from a list of LayoutElement objects; the objects must have the\n        same source\"\"\"\n        len_ele = len(elements)\n        coords = np.empty((len_ele, 4), dtype=float)\n        # texts and probs can be None, so build lists first and convert to arrays afterward\n        # to avoid None values being coerced to nan\n        (\n            texts,\n            text_as_html,\n            table_as_cells,\n            table_extraction_method,\n            sources,\n            is_extracted_array,\n            class_probs,\n        ) = (\n            [],\n            [],\n            [],\n            [],\n            [],\n            [],\n            [],\n        )\n        class_types = np.empty((len_ele,), dtype=\"object\")\n\n        for i, element in enumerate(elements):\n            coords[i] = [element.bbox.x1, element.bbox.y1, element.bbox.x2, element.bbox.y2]\n            texts.append(element.text)\n            sources.append(element.source)\n            is_extracted_array.append(element.is_extracted)\n            text_as_html.append(element.text_as_html)\n            table_as_cells.append(element.table_as_cells)\n            table_extraction_method.append(getattr(element, \"table_extraction_method\", None))\n            class_probs.append(element.prob)\n            class_types[i] = element.type or \"None\"\n\n        unique_ids, class_ids = np.unique(class_types, return_inverse=True)\n        unique_ids[unique_ids == \"None\"] = None\n\n        return cls(\n            element_coords=coords,\n            texts=np.array(texts),\n            element_probs=np.array(class_probs),\n            element_class_ids=class_ids,\n            element_class_id_map=dict(zip(range(len(unique_ids)), unique_ids)),\n            sources=np.array(sources),\n            is_extracted_array=np.array(is_extracted_array),\n            text_as_html=np.array(text_as_html),\n            table_as_cells=np.array(table_as_cells),\n            table_extraction_method=np.array(table_extraction_method),\n        )\n\n\n@dataclass\nclass LayoutElement(TextRegion):\n    type: Optional[str] = None\n    prob: Optional[float] = None\n    image_path: Optional[str] = None\n    parent: Optional[LayoutElement] = None\n    text_as_html: Optional[str] = None\n    table_as_cells: Optional[str] = None\n    table_extraction_method: Optional[str] = None\n\n    def to_dict(self) -> dict:\n        \"\"\"Converts the class instance to dictionary form.\"\"\"\n        out_dict = {\n            \"coordinates\": None if self.bbox is None else self.bbox.coordinates,\n            \"text\": self.text,\n            \"type\": self.type,\n            \"prob\": self.prob,\n            \"source\": self.source,\n            \"is_extracted\": self.is_extracted,\n        }\n        return out_dict\n\n    @classmethod\n    def from_region(cls, region: TextRegion):\n        \"\"\"Create LayoutElement from superclass.\"\"\"\n        text = region.text if hasattr(region, \"text\") else None\n        type = region.type if hasattr(region, \"type\") else None\n        prob = region.prob if hasattr(region, \"prob\") else None\n        source = region.source if hasattr(region, \"source\") else None\n        is_extracted = region.is_extracted if hasattr(region, \"is_extracted\") else None\n        return cls(\n            bbox=region.bbox,\n            text=text,\n            source=source,\n            is_extracted=is_extracted,\n            type=type,\n            prob=prob,\n        )\n\n    @classmethod\n    def from_coords(\n        cls,\n        x1: Union[int, float],\n        y1: Union[int, float],\n        x2: Union[int, float],\n        y2: Union[int, float],\n        text: Optional[str] = None,\n        source: Optional[Source] = None,\n        is_extracted: Optional[IsExtracted] = None,\n        type: Optional[str] = None,\n        prob: Optional[float] = None,\n        text_as_html: Optional[str] = None,\n        table_as_cells: Optional[str] = None,\n        table_extraction_method: Optional[str] = None,\n        **kwargs,\n    ) -> LayoutElement:\n        \"\"\"Constructs a LayoutElement from coordinates.\"\"\"\n        bbox = Rectangle(x1, y1, x2, y2)\n        return cls(\n            text=text,\n            is_extracted=is_extracted,\n            type=type,\n            prob=prob,\n            source=source,\n            text_as_html=text_as_html,\n            table_as_cells=table_as_cells,\n            table_extraction_method=table_extraction_method,\n            bbox=bbox,\n            **kwargs,\n        )\n\n\ndef separate(region_a: Rectangle, region_b: Rectangle):\n    \"\"\"Shrink the smaller of two intersecting rectangles so it no longer overlaps the\n    larger one\"\"\"\n\n    def reduce(keep: Rectangle, reduce: Rectangle):\n        # Assumes the two rectangles intersect\n\n        # Other is down\n        if reduce.y2 > keep.y2 and reduce.x1 < keep.x2:\n            # other is down-right\n            if reduce.x2 > keep.x2 and reduce.y2 > keep.y2:\n                reduce.x1 = keep.x2 * 1.01\n                reduce.y1 = keep.y2 * 1.01\n                return\n            # other is down-left\n            if reduce.x1 < keep.x1 and reduce.y1 < keep.y2:\n                reduce.y1 = keep.y2\n                return\n            # other is centered\n            reduce.y1 = keep.y2\n        else:  # other is up\n            # other is up-right\n            if reduce.x2 > keep.x2 and reduce.y1 < keep.y1:\n                reduce.y2 = keep.y1\n                return\n            # other is up-left\n            if reduce.x1 < keep.x1 and reduce.y1 < keep.y1:\n                reduce.y2 = keep.y1\n                return\n            # other is centered\n            reduce.y2 = keep.y1\n\n    if not region_a.intersects(region_b):\n        return\n    if region_a.area > region_b.area:\n        reduce(keep=region_a, reduce=region_b)\n    else:\n        reduce(keep=region_b, reduce=region_a)\n\n\ndef table_cells_to_dataframe(\n    cells: List[dict],\n    nrows: int = 1,\n    ncols: int = 1,\n    header=None,\n) -> DataFrame:\n    \"\"\"convert table-transformer's cells data into a pandas dataframe\"\"\"\n    arr = np.empty((nrows, ncols), dtype=object)\n    for cell in cells:\n        rows = cell[\"row_nums\"]\n        cols = cell[\"column_nums\"]\n        if rows[0] >= nrows or cols[0] >= ncols:\n            new_arr = np.empty((max(rows[0] + 1, nrows), max(cols[0] + 1, ncols)), dtype=object)\n            new_arr[:nrows, :ncols] = arr\n            arr = new_arr\n            nrows, ncols = arr.shape\n        arr[rows[0], cols[0]] = cell[\"cell text\"]\n\n    return DataFrame(arr, columns=header)\n\n\ndef partition_groups_from_regions(regions: TextRegions) -> List[TextRegions]:\n    \"\"\"Partitions regions into groups based on proximity. Returns a list of TextRegions\n    objects, one per group\"\"\"\n    if len(regions) == 0:\n        return []\n    padded_coords = regions.element_coords.copy().astype(float)\n    v_pad = (regions.y2 - regions.y1) * inference_config.ELEMENTS_V_PADDING_COEF\n    h_pad = (regions.x2 - regions.x1) * inference_config.ELEMENTS_H_PADDING_COEF\n    padded_coords[:, 0] -= h_pad\n    padded_coords[:, 1] -= v_pad\n    padded_coords[:, 2] += h_pad\n    padded_coords[:, 3] += v_pad\n\n    intersection_mtx = coords_intersections(padded_coords)\n\n    group_count, group_nums = connected_components(intersection_mtx)\n    groups: List[TextRegions] = []\n    for group in range(group_count):\n        groups.append(regions.slice(np.where(group_nums == group)[0]))\n\n    return groups\n\n\ndef intersection_areas_between_coords(\n    coords1: np.ndarray,\n    coords2: np.ndarray,\n    threshold: float = 0.5,\n):\n    \"\"\"compute pairwise intersection areas between two groups of bounding boxes\"\"\"\n    x11, y11, x12, y12 = np.split(coords1, 4, axis=1)\n    x21, y21, x22, y22 = np.split(coords2, 4, axis=1)\n\n    xa = np.maximum(x11, np.transpose(x21))\n    ya = np.maximum(y11, np.transpose(y21))\n    xb = np.minimum(x12, np.transpose(x22))\n    yb = np.minimum(y12, np.transpose(y22))\n\n    return np.maximum((xb - xa), 0) * np.maximum((yb - ya), 0)\n\n\ndef clean_layoutelements(elements: LayoutElements, subregion_threshold: float = 0.5):\n    \"\"\"Remove elements that are (almost) contained within other, larger elements; the\n    retained elements are returned sorted by y1\"\"\"\n    if len(elements) < 2:\n        return elements\n\n    # Sort elements from biggest to smallest\n    sorted_by_area = np.argsort(-elements.areas)\n    sorted_coords = elements.element_coords[sorted_by_area]\n\n    # First check whether any element contains another\n    self_intersection = intersection_areas_between_coords(sorted_coords, sorted_coords)\n    areas = elements.areas[sorted_by_area]\n    # scan from the largest to the smallest region to find whether it contains other regions\n    is_almost_subregion_of = (\n        self_intersection / np.maximum(areas, EPSILON_AREA) > subregion_threshold\n    ) & (areas <= areas.T)\n\n    n_candidates = len(elements)\n    mask = np.ones_like(areas, dtype=bool)\n    current_candidate = 0\n    while n_candidates > 1:\n        plus_one = current_candidate + 1\n        remove = (\n            np.where(is_almost_subregion_of[current_candidate, plus_one:])[0]\n            + current_candidate\n            + 1\n        )\n\n        if not remove.sum():\n            break\n\n        mask[remove] = 0\n        n_candidates -= len(remove) + 1\n        remaining_candidates = np.where(mask[plus_one:])[0]\n\n        if not len(remaining_candidates):\n            break\n\n        current_candidate = remaining_candidates[0] + plus_one\n\n    final_coords = sorted_coords[mask]\n    sorted_by_y1 = np.argsort(final_coords[:, 1])\n\n    final_attrs: dict[str, Any] = {\n        \"element_class_id_map\": elements.element_class_id_map,\n    }\n    for attr in (\n        \"element_class_ids\",\n        \"element_probs\",\n        \"texts\",\n        \"sources\",\n        \"is_extracted_array\",\n        \"text_as_html\",\n        \"table_as_cells\",\n        \"table_extraction_method\",\n    ):\n        if (original_attr := getattr(elements, attr)) is None:\n            continue\n        final_attrs[attr] = original_attr[sorted_by_area][mask][sorted_by_y1]\n    final_elements = LayoutElements(element_coords=final_coords[sorted_by_y1], **final_attrs)\n    return final_elements\n\n\ndef clean_layoutelements_for_class(\n    elements: LayoutElements,\n    element_class: int,\n    subregion_threshold: float = 0.5,\n):\n    \"\"\"Deduplicate elements of the specified class by containment, then drop any other\n    element that falls almost entirely inside a retained element of that class\"\"\"\n    # Sort elements from biggest to smallest\n    sorted_by_area = np.argsort(-elements.areas)\n    sorted_coords = elements.element_coords[sorted_by_area]\n\n    target_indices = elements.element_class_ids[sorted_by_area] == element_class\n\n    # skip trivial result\n    len_target = target_indices.sum()\n    if len_target == 0 or len_target == len(elements):\n        return elements\n\n    target_coords = sorted_coords[target_indices]\n    other_coords = sorted_coords[~target_indices]\n\n    # First check whether targets contain each other\n    target_self_intersection = intersection_areas_between_coords(target_coords, target_coords)\n    target_areas = elements.areas[sorted_by_area][target_indices]\n    # scan from the largest to the smallest region to find whether it contains other regions\n    is_almost_subregion_of = (\n        target_self_intersection / np.maximum(target_areas, EPSILON_AREA) > subregion_threshold\n    ) & (target_areas <= target_areas.T)\n\n    n_candidates = len_target\n    mask = np.ones_like(target_areas, dtype=bool)\n    current_candidate = 0\n    while n_candidates > 1:\n        plus_one = current_candidate + 1\n        remove = (\n            np.where(is_almost_subregion_of[current_candidate, plus_one:])[0]\n            + current_candidate\n            + 1\n        )\n\n        if not remove.sum():\n            break\n\n        mask[remove] = 0\n        n_candidates -= len(remove) + 1\n        remaining_candidates = np.where(mask[plus_one:])[0]\n\n        if not len(remaining_candidates):\n            break\n\n        current_candidate = remaining_candidates[0] + plus_one\n\n    target_coords_to_keep = target_coords[mask]\n\n    other_to_target_intersection = intersection_areas_between_coords(\n        other_coords,\n        target_coords_to_keep,\n    )\n    # drop other elements that fall almost entirely inside a retained target element\n    other_areas = elements.areas[sorted_by_area][~target_indices]\n    other_is_almost_subregion_of_target = (\n        other_to_target_intersection / np.maximum(other_areas, EPSILON_AREA) > subregion_threshold\n    ) & (other_areas.reshape((-1, 1)) <= target_areas[mask].T)\n\n    other_mask = ~other_is_almost_subregion_of_target.sum(axis=1).astype(bool)\n\n    final_coords = np.vstack([target_coords[mask], other_coords[other_mask]])\n    final_attrs: dict[str, Any] = {\"element_class_id_map\": elements.element_class_id_map}\n    for attr in (\n        \"element_class_ids\",\n        \"element_probs\",\n        \"texts\",\n        \"sources\",\n        \"is_extracted_array\",\n        \"text_as_html\",\n        \"table_as_cells\",\n        \"table_extraction_method\",\n    ):\n        if (original_attr := getattr(elements, attr)) is None:\n            continue\n        sorted_attr = original_attr[sorted_by_area]\n        final_attrs[attr] = np.concatenate(\n            (sorted_attr[target_indices][mask], sorted_attr[~target_indices][other_mask]),\n        )\n    final_elements = LayoutElements(element_coords=final_coords, **final_attrs)\n    return final_elements\n"
  },
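  {
    "path": "examples/sketch_layoutelements.py",
    "content": "\"\"\"Illustrative sketch added by the editor; not shipped with the library.\n\nDemonstrates the vectorized LayoutElements container from layoutelement.py:\nbuilding it from LayoutElement objects, slicing it, and concatenating slices\nback together. Uses only APIs shown in that module.\"\"\"\n\nfrom unstructured_inference.inference.layoutelement import (\n    LayoutElement,\n    LayoutElements,\n)\n\nelements = LayoutElements.from_list(\n    [\n        LayoutElement.from_coords(0, 0, 10, 10, text=\"Title\", type=\"Title\", prob=0.9),\n        LayoutElement.from_coords(0, 20, 10, 30, text=\"Body\", type=\"Text\", prob=0.8),\n    ],\n)\n\nfirst = elements.slice([0])  # a one-element LayoutElements\nmerged = LayoutElements.concatenate([elements, first])\n\n# iter_elements() lazily converts rows back into LayoutElement objects\nfor element in merged.iter_elements():\n    print(element.to_dict())\n"
  },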
  {
    "path": "unstructured_inference/inference/pdf_image.py",
    "content": "from __future__ import annotations\n\nimport math\nimport os\nfrom functools import lru_cache\nfrom pathlib import Path, PurePath\nfrom threading import Lock\nfrom typing import BinaryIO, Optional, Union\n\nfrom PIL import Image\nfrom PIL.PngImagePlugin import PngInfo\n\nfrom unstructured_inference.config import inference_config\nfrom unstructured_inference.constants import PDF_POINTS_PER_INCH\n\n_pdfium_lock = Lock()\n\n\nclass PdfRenderTooLargeError(ValueError):\n    pass\n\n\ndef _check_pdf_render_max_pixels(page, page_number: int, scale: float, maximum: int) -> None:\n    if maximum <= 0:\n        return\n\n    rendered_width = math.ceil(page.get_width() * scale)\n    rendered_height = math.ceil(page.get_height() * scale)\n    rendered_pixels = rendered_width * rendered_height\n\n    if rendered_pixels > maximum:\n        raise PdfRenderTooLargeError(\n            \"PDF page would render to too many pixels for safe processing: \"\n            f\"page={page_number}, pixels={rendered_pixels}, maximum={maximum}. \"\n            \"Try splitting the PDF, reducing the page dimensions, or using a lower render DPI.\",\n        )\n\n\n@lru_cache(maxsize=1)\ndef _get_pdfium_module():\n    import pypdfium2 as pdfium\n\n    return pdfium\n\n\ndef convert_pdf_to_image(\n    filename: Optional[str] = None,\n    file: Optional[Union[bytes, BinaryIO]] = None,\n    dpi: int = 200,\n    output_folder: Optional[Union[str, PurePath]] = None,\n    path_only: bool = False,\n    first_page: Optional[int] = None,\n    last_page: Optional[int] = None,\n    password: Optional[str] = None,\n    pdf_render_max_pixels_per_page: Optional[int] = None,\n) -> Union[list[Image.Image], list[str]]:\n    \"\"\"Render PDF pages to PIL images or saved PNGs using pypdfium2.\n\n    This is the single source of truth for PDF→image rendering across unstructured\n    and unstructured-inference. 
Callers should pass their own DPI value explicitly.\n    \"\"\"\n    if path_only and not output_folder:\n        raise ValueError(\"output_folder must be specified if path_only is true\")\n    if filename is None and file is None:\n        raise ValueError(\"Either filename or file must be provided\")\n    if output_folder:\n        assert Path(output_folder).exists()\n        assert Path(output_folder).is_dir()\n\n    scale = dpi / PDF_POINTS_PER_INCH\n    if pdf_render_max_pixels_per_page is None:\n        pdf_render_max_pixels_per_page = inference_config.PDF_RENDER_MAX_PIXELS_PER_PAGE\n    pdfium = _get_pdfium_module()\n\n    with _pdfium_lock:\n        pdf = pdfium.PdfDocument(filename or file, password=password)\n        n_pages = len(pdf)\n\n    try:\n        images: dict[int, Image.Image] = {}\n        filenames: list[str] = []\n        for i in range(n_pages):\n            page_num = i + 1\n            if first_page is not None and page_num < first_page:\n                continue\n            if last_page is not None and page_num > last_page:\n                break\n\n            with _pdfium_lock:\n                page = pdf[i]\n                try:\n                    _check_pdf_render_max_pixels(\n                        page=page,\n                        page_number=page_num,\n                        scale=scale,\n                        maximum=pdf_render_max_pixels_per_page,\n                    )\n                    bitmap = page.render(\n                        scale=scale,\n                        no_smoothtext=False,\n                        no_smoothimage=False,\n                        no_smoothpath=False,\n                        optimize_mode=\"print\",\n                    )\n                    try:\n                        pil_image = bitmap.to_pil()\n                    finally:\n                        bitmap.close()\n\n                    rotation = page.get_rotation()\n                    if rotation:\n                        pil_image = pil_image.rotate(rotation, expand=True)\n                    pil_image.info[\"pdf_rotation\"] = rotation\n\n                finally:\n                    page.close()\n\n            if output_folder:\n                fn: str = os.path.join(str(output_folder), f\"page_{page_num}.png\")\n\n                png_meta = PngInfo()\n                png_meta.add_text(\"pdf_rotation\", str(rotation))\n                pil_image.save(\n                    fn,\n                    format=\"PNG\",\n                    compress_level=1,\n                    optimize=False,\n                    pnginfo=png_meta,\n                )\n                filenames.append(fn)\n                if not path_only:\n                    images[page_num] = pil_image\n            else:\n                images[page_num] = pil_image\n    finally:\n        with _pdfium_lock:\n            pdf.close()\n\n    if path_only:\n        return filenames\n    return list(images.values())\n"
  },
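  {
    "path": "examples/sketch_convert_pdf_to_image.py",
    "content": "\"\"\"Illustrative sketch added by the editor; not shipped with the library.\n\nRenders only the first page of a PDF with convert_pdf_to_image. Assumes a\nlocal file named `sample.pdf` and that pypdfium2 is installed.\"\"\"\n\nfrom unstructured_inference.inference.pdf_image import convert_pdf_to_image\n\nimages = convert_pdf_to_image(\"sample.pdf\", dpi=200, first_page=1, last_page=1)\npage_one = images[0]\n# Any rotation applied during rendering is recorded in the image's info dict\nprint(page_one.size, page_one.info.get(\"pdf_rotation\"))\n"
  },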
  {
    "path": "unstructured_inference/logger.py",
    "content": "import logging\n\n\ndef translate_log_level(level: int) -> int:\n    \"\"\"Translate Python debugg level to ONNX runtime error level\n    since blank pages error are shown at level 3 that should be the\n    exception, and 4 the normal behavior\"\"\"\n    level_name = logging.getLevelName(level)\n    onnx_level = 0\n    if level_name in [\"NOTSET\", \"DEBUG\", \"INFO\", \"WARNING\"]:\n        onnx_level = 4\n    elif level_name in [\"ERROR\", \"CRITICAL\"]:\n        onnx_level = 3\n\n    return onnx_level\n\n\nlogger = logging.getLogger(\"unstructured_inference\")\n\nlogger_onnx = logging.getLogger(\"unstructured_inference_onnxruntime\")\nlogger_onnx.setLevel(translate_log_level(logger.getEffectiveLevel()))\n"
  },
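  {
    "path": "examples/sketch_logger_levels.py",
    "content": "\"\"\"Illustrative sketch added by the editor; not shipped with the library.\n\nShows the logging-level translation logger.py applies when configuring\nonnxruntime verbosity.\"\"\"\n\nimport logging\n\nfrom unstructured_inference.logger import translate_log_level\n\n# DEBUG/INFO/WARNING map to ONNX severity 4 (Fatal only);\n# ERROR/CRITICAL map to severity 3 (Error).\nassert translate_log_level(logging.DEBUG) == 4\nassert translate_log_level(logging.ERROR) == 3\n"
  },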
  {
    "path": "unstructured_inference/math.py",
    "content": "\"\"\"a lightweight module that provides helpers to common math operations\"\"\"\n\nimport numpy as np\n\nFLOAT_EPSILON = np.finfo(float).eps\n\n\ndef safe_division(a, b) -> float:\n    \"\"\"a safer division to avoid division by zero when b == 0\n\n    returns a/b or a/FLOAT_EPSILON (should be around 2.2E-16) when b == 0\n\n    Parameters:\n    - a (int/float): a in a/b\n    - b (int/float): b in a/b\n\n    Returns:\n    float: a/b or a/FLOAT_EPSILON (should be around 2.2E-16) when b == 0\n    \"\"\"\n    return a / max(b, FLOAT_EPSILON)\n"
  },
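  {
    "path": "examples/sketch_safe_division.py",
    "content": "\"\"\"Illustrative sketch added by the editor; not shipped with the library.\n\nsafe_division substitutes machine epsilon for a zero denominator instead of\nraising ZeroDivisionError.\"\"\"\n\nfrom unstructured_inference.math import safe_division\n\nprint(safe_division(1.0, 2.0))  # 0.5\nprint(safe_division(1.0, 0.0))  # very large but finite, about 1 / 2.2e-16\n"
  },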
  {
    "path": "unstructured_inference/models/__init__.py",
    "content": ""
  },
  {
    "path": "unstructured_inference/models/base.py",
    "content": "from __future__ import annotations\n\nimport json\nimport os\nimport threading\nfrom typing import Dict, Optional, Tuple, Type\n\nfrom unstructured_inference.models.detectron2onnx import (\n    MODEL_TYPES as DETECTRON2_ONNX_MODEL_TYPES,\n)\nfrom unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel\nfrom unstructured_inference.models.unstructuredmodel import UnstructuredModel\nfrom unstructured_inference.models.yolox import MODEL_TYPES as YOLOX_MODEL_TYPES\nfrom unstructured_inference.models.yolox import UnstructuredYoloXModel\nfrom unstructured_inference.utils import LazyDict\n\nDEFAULT_MODEL = \"yolox\"\n\n\nclass Models(object):\n    \"\"\"Singleton container for loaded models.\n\n    Thread Safety:\n    - Singleton initialization protected by _lock (double-check pattern)\n    - Dict operations (__contains__, __getitem__, __setitem__) rely on CPython's GIL\n      for atomicity. Individual dict operations are atomic in CPython.\n    - Per-model locks in get_model() prevent concurrent initialization of same model\n    - This implementation is CPython-specific and may need changes for Python 3.13+\n      free-threaded mode or alternative Python implementations without GIL\n    \"\"\"\n\n    _instance = None\n    _lock = threading.Lock()\n\n    def __new__(cls):\n        \"\"\"return an instance if one already exists otherwise create an instance\"\"\"\n        if cls._instance is None:\n            with cls._lock:\n                if cls._instance is None:\n                    cls._instance = super(Models, cls).__new__(cls)\n                    cls.models: Dict[str, UnstructuredModel] = {}\n        return cls._instance\n\n    def __contains__(self, key):\n        \"\"\"Check if model exists. Atomic operation under CPython GIL.\"\"\"\n        return key in self.models\n\n    def __getitem__(self, key: str):\n        \"\"\"Get model by name. Atomic operation under CPython GIL.\"\"\"\n        return self.models.__getitem__(key)\n\n    def __setitem__(self, key: str, value: UnstructuredModel):\n        \"\"\"Store model. Atomic operation under CPython GIL.\"\"\"\n        self.models[key] = value\n\n\nmodels: Models = Models()\n\n# Per-model locks for parallel loading of different models\n# Current implementation: Unbounded dictionary grows with unique model names\n# Memory impact: ~200 bytes per lock. 
Acceptable for <100 models (~20KB).\n# For >1000 models: Consider lock striping (fixed 128 locks, ~25KB, 0.8% collision rate)\n# Note: WeakValueDictionary is NOT suitable - locks would be GC'd immediately\n_models_locks: Dict[str, threading.Lock] = {}\n_models_locks_lock = threading.Lock()\n\n\ndef get_default_model_mappings() -> Tuple[\n    Dict[str, Type[UnstructuredModel]],\n    Dict[str, dict | LazyDict],\n]:\n    \"\"\"default model mappings for models that are in `unstructured_inference` repo\"\"\"\n    return {\n        **dict.fromkeys(DETECTRON2_ONNX_MODEL_TYPES, UnstructuredDetectronONNXModel),\n        **dict.fromkeys(YOLOX_MODEL_TYPES, UnstructuredYoloXModel),\n    }, {**DETECTRON2_ONNX_MODEL_TYPES, **YOLOX_MODEL_TYPES}\n\n\nmodel_class_map, model_config_map = get_default_model_mappings()\n\n\ndef register_new_model(model_config: dict, model_class: UnstructuredModel):\n    \"\"\"Register this model in model_config_map and model_class_map.\n\n    Both maps are updated with the new model's information.\n    \"\"\"\n    model_config_map.update(model_config)\n    model_class_map.update(dict.fromkeys(model_config, model_class))\n\n\ndef get_model(model_name: Optional[str] = None) -> UnstructuredModel:\n    \"\"\"Gets the model object by model name.\n\n    Thread-safe with per-model locks to allow parallel loading of different models\n    while preventing duplicate initialization of the same model.\n\n    Thread-safety maintained:\n    - _models_locks_lock protects lock dictionary operations\n    - Per-model locks protect model initialization\n    - Double-check pattern prevents duplicate loads\n    \"\"\"\n    if model_name is None:\n        default_name_from_env = os.environ.get(\"UNSTRUCTURED_DEFAULT_MODEL_NAME\")\n        model_name = default_name_from_env if default_name_from_env is not None else DEFAULT_MODEL\n\n    # Fast path: model already loaded\n    if model_name in models:\n        return models[model_name]\n\n    # Get or create lock for this specific model\n    with _models_locks_lock:\n        if model_name not in _models_locks:\n            _models_locks[model_name] = threading.Lock()\n\n    model_lock = _models_locks[model_name]\n\n    # Double-check pattern with per-model lock\n    with model_lock:\n        if model_name in models:\n            return models[model_name]\n\n        initialize_param_json = os.environ.get(\n            \"UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH\"\n        )\n        if initialize_param_json is not None:\n            with open(initialize_param_json) as fp:\n                initialize_params = json.load(fp)\n                label_map_int_keys = {\n                    int(key): value for key, value in initialize_params[\"label_map\"].items()\n                }\n                initialize_params[\"label_map\"] = label_map_int_keys\n        else:\n            if model_name in model_config_map:\n                initialize_params = model_config_map[model_name]\n            else:\n                raise UnknownModelException(f\"Unknown model type: {model_name}\")\n\n        model: UnstructuredModel = model_class_map[model_name]()\n\n        # Normalize to a plain dict via __iter__ + __getitem__. `**` unpacking\n        # calls `.keys()` on the mapping, which LazyDict inherits from\n        # collections.abc.Mapping — but we've seen environments where that\n        # inherited method isn't found at call time, surfacing as\n        # \"argument after ** must be a mapping, not LazyDict\".\n        initialize_params = {k: initialize_params[k] for k in initialize_params}\n        model.initialize(**initialize_params)\n        models[model_name] = model\n    return model\n\n\nclass UnknownModelException(Exception):\n    \"\"\"A model was requested with an unrecognized identifier.\"\"\"\n\n    pass\n"
  },
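  {
    "path": "examples/sketch_get_model.py",
    "content": "\"\"\"Illustrative sketch added by the editor; not shipped with the library.\n\nShows that get_model caches models in the Models singleton, so repeated\nlookups return the same object. Assumes network access to download the\nmodel weights on first use.\"\"\"\n\nfrom unstructured_inference.models.base import get_model\n\nmodel = get_model(\"yolox\")\nassert get_model(\"yolox\") is model  # served from the singleton cache\n"
  },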
  {
    "path": "unstructured_inference/models/detectron2onnx.py",
    "content": "import os\nfrom typing import Dict, Final, List, Optional, Union, cast\n\nimport cv2\nimport numpy as np\nimport onnxruntime\nfrom huggingface_hub.constants import HUGGINGFACE_HUB_CACHE\nfrom onnxruntime.capi import _pybind_state as C\nfrom onnxruntime.quantization import QuantType, quantize_dynamic\nfrom PIL import Image\n\nfrom unstructured_inference.constants import Source\nfrom unstructured_inference.inference.layoutelement import LayoutElement\nfrom unstructured_inference.logger import logger, logger_onnx\nfrom unstructured_inference.models.unstructuredmodel import (\n    UnstructuredObjectDetectionModel,\n)\nfrom unstructured_inference.utils import (\n    LazyDict,\n    LazyEvaluateInfo,\n    download_if_needed_and_get_local_path,\n)\n\nonnxruntime.set_default_logger_severity(logger_onnx.getEffectiveLevel())\n\nDEFAULT_LABEL_MAP: Final[Dict[int, str]] = {\n    0: \"Text\",\n    1: \"Title\",\n    2: \"List\",\n    3: \"Table\",\n    4: \"Figure\",\n}\n\n\n# NOTE(alan): Entries are implemented as LazyDicts so that models aren't downloaded until they are\n# needed.\nMODEL_TYPES: Dict[str, Union[LazyDict, dict]] = {\n    \"detectron2_onnx\": LazyDict(\n        model_path=LazyEvaluateInfo(\n            download_if_needed_and_get_local_path,\n            \"unstructuredio/detectron2_faster_rcnn_R_50_FPN_3x\",\n            \"model.onnx\",\n        ),\n        label_map=DEFAULT_LABEL_MAP,\n        confidence_threshold=0.8,\n    ),\n    \"detectron2_quantized\": {\n        \"model_path\": os.path.join(\n            HUGGINGFACE_HUB_CACHE,\n            \"detectron2_quantized\",\n            \"detectrin2_quantized.onnx\",\n        ),\n        \"label_map\": DEFAULT_LABEL_MAP,\n        \"confidence_threshold\": 0.8,\n    },\n    \"detectron2_mask_rcnn\": LazyDict(\n        model_path=LazyEvaluateInfo(\n            download_if_needed_and_get_local_path,\n            \"unstructuredio/detectron2_mask_rcnn_X_101_32x8d_FPN_3x\",\n            \"model.onnx\",\n        ),\n        label_map=DEFAULT_LABEL_MAP,\n        confidence_threshold=0.8,\n    ),\n}\n\n\nclass UnstructuredDetectronONNXModel(UnstructuredObjectDetectionModel):\n    \"\"\"Unstructured model wrapper for detectron2 ONNX model.\"\"\"\n\n    # The model was trained and exported with this shape\n    required_w = 800\n    required_h = 1035\n\n    def predict(self, image: Image.Image) -> List[LayoutElement]:\n        \"\"\"Makes a prediction using detectron2 model.\"\"\"\n        super().predict(image)\n\n        prepared_input = self.preprocess(image)\n        try:\n            result = self.model.run(None, prepared_input)\n            bboxes = result[0]\n            labels = result[1]\n            # Previous model detectron2_onnx stored confidence scores at index 2,\n            # bigger model stores it at index 3\n            confidence_scores = result[2] if \"R_50\" in self.model_path else result[3]\n        except onnxruntime.capi.onnxruntime_pybind11_state.RuntimeException:\n            logger_onnx.debug(\n                \"Ignoring runtime error from onnx (likely due to encountering blank page).\",\n            )\n            return []\n        input_w, input_h = image.size\n        regions = self.postprocess(bboxes, labels, confidence_scores, input_w, input_h)\n\n        return regions\n\n    def initialize(\n        self,\n        model_path: str,\n        label_map: Dict[int, str],\n        confidence_threshold: Optional[float] = None,\n    ):\n        \"\"\"Loads the detectron2 model using the specified 
parameters\"\"\"\n        if not os.path.exists(model_path) and \"detectron2_quantized\" in model_path:\n            logger.info(\"Quantized model don't currently exists, quantizing now...\")\n            os.mkdir(\"\".join(os.path.split(model_path)[:-1]))\n            source_path = MODEL_TYPES[\"detectron2_onnx\"][\"model_path\"]\n            quantize_dynamic(source_path, model_path, weight_type=QuantType.QUInt8)\n\n        available_providers = C.get_available_providers()\n        ordered_providers = [\n            \"TensorrtExecutionProvider\",\n            \"CUDAExecutionProvider\",\n            \"CPUExecutionProvider\",\n        ]\n        providers = [provider for provider in ordered_providers if provider in available_providers]\n\n        self.model = onnxruntime.InferenceSession(\n            model_path,\n            providers=providers,\n        )\n        self.model_path = model_path\n        self.label_map = label_map\n        if confidence_threshold is None:\n            confidence_threshold = 0.5\n        self.confidence_threshold = confidence_threshold\n\n    def preprocess(self, image: Image.Image) -> Dict[str, np.ndarray]:\n        \"\"\"Process input image into required format for ingestion into the Detectron2 ONNX binary.\n        This involves resizing to a fixed shape and converting to a specific numpy format.\n        \"\"\"\n        # TODO (benjamin): check other shapes for inference\n        img = np.array(image)\n        # TODO (benjamin): We should use models.get_model() but currenly returns Detectron model\n        session = self.model\n        # onnx input expected\n        # [3,1035,800]\n        img = cv2.resize(\n            img,\n            (self.required_w, self.required_h),\n            interpolation=cv2.INTER_LINEAR,\n        ).astype(np.float32)\n        img = img.transpose(2, 0, 1)\n        ort_inputs = {session.get_inputs()[0].name: img}\n        return ort_inputs\n\n    def postprocess(\n        self,\n        bboxes: np.ndarray,\n        labels: np.ndarray,\n        confidence_scores: np.ndarray,\n        input_w: float,\n        input_h: float,\n    ) -> List[LayoutElement]:\n        \"\"\"Process output into Unstructured class. Bounding box coordinates are converted to\n        original image resolution.\"\"\"\n        regions = []\n        width_conversion = input_w / self.required_w\n        height_conversion = input_h / self.required_h\n        for (x1, y1, x2, y2), label, conf in zip(bboxes, labels, confidence_scores):\n            detected_class = self.label_map[int(label)]\n            if conf >= self.confidence_threshold:\n                region = LayoutElement.from_coords(\n                    x1 * width_conversion,\n                    y1 * height_conversion,\n                    x2 * width_conversion,\n                    y2 * height_conversion,\n                    text=None,\n                    type=detected_class,\n                    prob=conf,\n                    source=Source.DETECTRON2_ONNX,\n                )\n\n                regions.append(region)\n\n        regions.sort(key=lambda element: element.bbox.y1)\n        return cast(List[LayoutElement], regions)\n"
  },
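  {
    "path": "examples/sketch_detectron2_onnx.py",
    "content": "\"\"\"Illustrative sketch added by the editor; not shipped with the library.\n\nLoads the detectron2 ONNX model through the shared registry and runs it on a\nblank page. Assumes network access to download the weights on first use; a\nblank page typically yields no detections.\"\"\"\n\nfrom PIL import Image\n\nfrom unstructured_inference.models.base import get_model\n\nmodel = get_model(\"detectron2_onnx\")\nblank_page = Image.new(\"RGB\", (800, 1035), color=\"white\")\nprint(model.predict(blank_page))  # expected: [] (or near-empty) for a blank page\n"
  },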
  {
    "path": "unstructured_inference/models/eval.py",
    "content": "from functools import partial\nfrom typing import Callable, Dict, List, Optional\n\nimport pandas as pd\nfrom rapidfuzz import fuzz\n\nEVAL_FUNCTIONS = {\n    \"token_ratio\": fuzz.token_ratio,\n    \"ratio\": fuzz.ratio,\n    \"partial_token_ratio\": fuzz.partial_token_ratio,\n    \"partial_ratio\": fuzz.partial_ratio,\n}\n\n\ndef _join_df_content(df, tab_token=\"\\t\", row_break_token=\"\\n\") -> str:\n    \"\"\"joining dataframe's table content as one long string\"\"\"\n    return row_break_token.join([tab_token.join(row) for row in df.values])\n\n\ndef default_tokenizer(text: str) -> List[str]:\n    \"\"\"a simple tokenizer that splits text by white space\"\"\"\n    return text.split()\n\n\ndef compare_contents_as_df(\n    actual_df: pd.DataFrame,\n    pred_df: pd.DataFrame,\n    eval_func: str = \"token_ratio\",\n    processor: Optional[Callable] = None,\n    tab_token: str = \"\\t\",\n    row_break_token: str = \"\\n\",\n) -> Dict[str, float]:\n    r\"\"\"ravel the table as string then use text distance to compare the prediction against true\n    table\n\n    Parameters\n    ----------\n    actual_df: pd.DataFrame\n        actual table as pandas dataframe\n\n    pred_df: pd.DataFrame\n        predicted table as pandas dataframe\n\n    eval_func: str, default tp \"token_ratio\"\n        the eval_func should be one of \"token_ratio\", \"ratio\", \"partial_token_ratio\",\n        \"partial_ratio\". Those are functions provided by rapidfuzz to evaluate text distances\n        using either tokens or characters. In general token is better than characters for evaluating\n        tables.\n\n    processor: Callable, default to None\n        processor to tokenize the text; by default None means no processing (using characters). For\n        tokens eval functions we recommend using the `default_tokenizer` or some other functions to\n        break down the text into words\n\n    tab_token: str, default to \"\\t\"\n        the string to join cells together\n\n    row_break_token: str, default to \"\\n\"\n        the string to join rows together\n\n    Returns\n    -------\n    Dict[str, int]\n        mapping of by column and by row scores to the scores as float numbers\n    \"\"\"\n    func = EVAL_FUNCTIONS.get(eval_func)\n    if func is None:\n        raise ValueError(\n            'eval_func must be one of \"token_ratio\", \"ratio\", \"partial_token_ratio\", '\n            f'\"partial_ratio\" but got {eval_func}',\n        )\n    join_func = partial(_join_df_content, tab_token=tab_token, row_break_token=row_break_token)\n    return {\n        f\"by_col_{eval_func}\": func(\n            join_func(actual_df),\n            join_func(pred_df),\n            processor=processor,\n        ),\n        f\"by_row_{eval_func}\": func(\n            join_func(actual_df.T),\n            join_func(pred_df.T),\n            processor=processor,\n        ),\n    }\n"
  },
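  {
    "path": "examples/sketch_eval_tables.py",
    "content": "\"\"\"Illustrative sketch added by the editor; not shipped with the library.\n\nScores a predicted table against a reference table with the rapidfuzz-based\nhelpers from eval.py, using the tokenizer that module recommends.\"\"\"\n\nimport pandas as pd\n\nfrom unstructured_inference.models.eval import compare_contents_as_df, default_tokenizer\n\nactual = pd.DataFrame([[\"name\", \"qty\"], [\"apples\", \"2\"]])\npredicted = pd.DataFrame([[\"name\", \"qty\"], [\"apples\", \"3\"]])\n\nscores = compare_contents_as_df(\n    actual,\n    predicted,\n    eval_func=\"token_ratio\",\n    processor=default_tokenizer,\n)\nprint(scores)  # {'by_col_token_ratio': ..., 'by_row_token_ratio': ...}\n"
  },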
  {
    "path": "unstructured_inference/models/table_postprocess.py",
    "content": "# https://github.com/microsoft/table-transformer/blob/main/src/postprocess.py\n\"\"\"\nCopyright (C) 2021 Microsoft Corporation\n\"\"\"\n\nfrom collections import defaultdict\n\n\nclass Rect:\n    def __init__(self, bbox=None):\n        if bbox is None:\n            self.x_min = 0\n            self.y_min = 0\n            self.x_max = 0\n            self.y_max = 0\n        else:\n            self.x_min = bbox[0]\n            self.y_min = bbox[1]\n            self.x_max = bbox[2]\n            self.y_max = bbox[3]\n\n    def get_area(self):\n        \"\"\"Calculates the area of the rectangle\"\"\"\n        area = (self.x_max - self.x_min) * (self.y_max - self.y_min)\n        return area if area > 0 else 0.0\n\n    def intersect(self, other):\n        \"\"\"Calculates the intersection with another rectangle\"\"\"\n        if self.get_area() == 0:\n            self.x_min = other.x_min\n            self.y_min = other.y_min\n            self.x_max = other.x_max\n            self.y_max = other.y_max\n        else:\n            self.x_min = max(self.x_min, other.x_min)\n            self.y_min = max(self.y_min, other.y_min)\n            self.x_max = min(self.x_max, other.x_max)\n            self.y_max = min(self.y_max, other.y_max)\n\n            if self.x_min > self.x_max or self.y_min > self.y_max or self.get_area() == 0:\n                self.x_min = 0\n                self.y_min = 0\n                self.x_max = 0\n                self.y_max = 0\n\n        return self\n\n    def include_rect(self, bbox):\n        \"\"\"Calculates a rectangle that includes both rectangles\"\"\"\n        other = Rect(bbox)\n\n        if self.get_area() == 0:\n            self.x_min = other.x_min\n            self.y_min = other.y_min\n            self.x_max = other.x_max\n            self.y_max = other.y_max\n            return self\n\n        self.x_min = min(self.x_min, other.x_min)\n        self.y_min = min(self.y_min, other.y_min)\n        self.x_max = max(self.x_max, other.x_max)\n        self.y_max = max(self.y_max, other.y_max)\n\n        # if self.get_area() == 0:\n        #     self.x_min = other.x_min\n        #     self.y_min = other.y_min\n        #     self.x_max = other.x_max\n        #     self.y_max = other.y_max\n\n        return self\n\n    def get_bbox(self):\n        \"\"\"Returns the coordinates that define the rectangle\"\"\"\n        return [self.x_min, self.y_min, self.x_max, self.y_max]\n\n\ndef apply_threshold(objects, threshold):\n    \"\"\"\n    Filter out objects below a certain score.\n    \"\"\"\n    return [obj for obj in objects if obj[\"score\"] >= threshold]\n\n\ndef refine_rows(rows, tokens, score_threshold):\n    \"\"\"\n    Apply operations to the detected rows, such as\n    thresholding, NMS, and alignment.\n    \"\"\"\n\n    if len(tokens) > 0:\n        rows = nms_by_containment(rows, tokens, overlap_threshold=0.5)\n        remove_objects_without_content(tokens, rows)\n    else:\n        rows = nms(rows, match_criteria=\"object2_overlap\", match_threshold=0.5, keep_higher=True)\n    if len(rows) > 1:\n        rows = sort_objects_top_to_bottom(rows)\n\n    return rows\n\n\ndef refine_columns(columns, tokens, score_threshold):\n    \"\"\"\n    Apply operations to the detected columns, such as\n    thresholding, NMS, and alignment.\n    \"\"\"\n\n    if len(tokens) > 0:\n        columns = nms_by_containment(columns, tokens, overlap_threshold=0.5)\n        remove_objects_without_content(tokens, columns)\n    else:\n        columns = nms(\n            columns,\n    
        match_criteria=\"object2_overlap\",\n            match_threshold=0.25,\n            keep_higher=True,\n        )\n    if len(columns) > 1:\n        columns = sort_objects_left_to_right(columns)\n\n    return columns\n\n\ndef nms_by_containment(container_objects, package_objects, overlap_threshold=0.5):\n    \"\"\"\n    Non-maxima suppression (NMS) of objects based on shared containment of other objects.\n    \"\"\"\n    container_objects = sort_objects_by_score(container_objects)\n    num_objects = len(container_objects)\n    suppression = [False for obj in container_objects]\n\n    packages_by_container, _, _ = slot_into_containers(\n        container_objects,\n        package_objects,\n        overlap_threshold=overlap_threshold,\n        forced_assignment=False,\n    )\n\n    for object2_num in range(1, num_objects):\n        object2_packages = set(packages_by_container[object2_num])\n        if len(object2_packages) == 0:\n            suppression[object2_num] = True\n        for object1_num in range(object2_num):\n            if not suppression[object1_num]:\n                object1_packages = set(packages_by_container[object1_num])\n                if len(object2_packages.intersection(object1_packages)) > 0:\n                    suppression[object2_num] = True\n\n    final_objects = [obj for idx, obj in enumerate(container_objects) if not suppression[idx]]\n    return final_objects\n\n\ndef slot_into_containers(\n    container_objects,\n    package_objects,\n    overlap_threshold=0.5,\n    forced_assignment=False,\n):\n    \"\"\"\n    Slot a collection of objects into the container they occupy most (the container which holds the\n    largest fraction of the object).\n    \"\"\"\n    best_match_scores = []\n\n    container_assignments = [[] for container in container_objects]\n    package_assignments = [[] for package in package_objects]\n\n    if len(container_objects) == 0 or len(package_objects) == 0:\n        return container_assignments, package_assignments, best_match_scores\n\n    match_scores = defaultdict(dict)\n    for package_num, package in enumerate(package_objects):\n        match_scores = []\n        package_rect = Rect(package[\"bbox\"])\n        package_area = package_rect.get_area()\n        for container_num, container in enumerate(container_objects):\n            container_rect = Rect(container[\"bbox\"])\n            intersect_area = container_rect.intersect(Rect(package[\"bbox\"])).get_area()\n\n            if package_area > 0:\n                overlap_fraction = intersect_area / package_area\n\n                match_scores.append(\n                    {\n                        \"container\": container,\n                        \"container_num\": container_num,\n                        \"score\": overlap_fraction,\n                    },\n                )\n\n        if len(match_scores) > 0:\n            sorted_match_scores = sort_objects_by_score(match_scores)\n\n            best_match_score = sorted_match_scores[0]\n            best_match_scores.append(best_match_score[\"score\"])\n            if forced_assignment or best_match_score[\"score\"] >= overlap_threshold:\n                container_assignments[best_match_score[\"container_num\"]].append(package_num)\n                package_assignments[package_num].append(best_match_score[\"container_num\"])\n\n    return container_assignments, package_assignments, best_match_scores\n\n\ndef sort_objects_by_score(objects, reverse=True):\n    \"\"\"\n    Put any set of objects in order from high score to low 
score.\n    \"\"\"\n    return sorted(objects, key=lambda k: k[\"score\"], reverse=reverse)\n\n\ndef remove_objects_without_content(page_spans, objects):\n    \"\"\"\n    Remove any objects (these can be rows, columns, supercells, etc.) that don't\n    have any text associated with them.\n    \"\"\"\n    for obj in objects[:]:\n        object_text, _ = extract_text_inside_bbox(page_spans, obj[\"bbox\"])\n        if len(object_text.strip()) == 0:\n            objects.remove(obj)\n\n\ndef extract_text_inside_bbox(spans, bbox):\n    \"\"\"\n    Extract the text inside a bounding box.\n    \"\"\"\n    bbox_spans = get_bbox_span_subset(spans, bbox)\n    bbox_text = extract_text_from_spans(bbox_spans, remove_integer_superscripts=True)\n\n    return bbox_text, bbox_spans\n\n\ndef get_bbox_span_subset(spans, bbox, threshold=0.5):\n    \"\"\"\n    Reduce the set of spans to those that fall within a bounding box.\n\n    threshold: the fraction of the span that must overlap with the bbox.\n    \"\"\"\n    span_subset = []\n    for span in spans:\n        if overlaps(span[\"bbox\"], bbox, threshold):\n            span_subset.append(span)\n    return span_subset\n\n\ndef overlaps(bbox1, bbox2, threshold=0.5):\n    \"\"\"\n    Test if more than \"threshold\" fraction of bbox1 overlaps with bbox2.\n    \"\"\"\n    rect1 = Rect(list(bbox1))\n    area1 = rect1.get_area()\n    if area1 == 0:\n        return False\n    return rect1.intersect(Rect(list(bbox2))).get_area() / area1 >= threshold\n\n\ndef extract_text_from_spans(spans, join_with_space=True, remove_integer_superscripts=True):\n    \"\"\"\n    Convert a collection of page tokens/words/spans into a single text string.\n    \"\"\"\n\n    join_char = \" \" if join_with_space else \"\"\n    spans_copy = spans[:]\n\n    if remove_integer_superscripts:\n        for span in spans:\n            if \"flags\" not in span:\n                continue\n            flags = span[\"flags\"]\n            if flags & 2**0:  # superscript flag\n                if span[\"text\"].strip().isdigit():\n                    spans_copy.remove(span)\n                else:\n                    span[\"superscript\"] = True\n\n    if len(spans_copy) == 0:\n        return \"\"\n\n    spans_copy.sort(key=lambda span: span[\"span_num\"])\n    spans_copy.sort(key=lambda span: span[\"line_num\"])\n    spans_copy.sort(key=lambda span: span[\"block_num\"])\n\n    # Force the span at the end of every line within a block to have exactly one space\n    # unless the line ends with a space or ends with a non-space followed by a hyphen\n    line_texts = []\n    line_span_texts = [spans_copy[0][\"text\"]]\n    for span1, span2 in zip(spans_copy[:-1], spans_copy[1:]):\n        if span1[\"block_num\"] != span2[\"block_num\"] or span1[\"line_num\"] != span2[\"line_num\"]:\n            line_text = join_char.join(line_span_texts).strip()\n            if (\n                len(line_text) > 0\n                and line_text[-1] != \" \"\n                and not (len(line_text) > 1 and line_text[-1] == \"-\" and line_text[-2] != \" \")\n                and not join_with_space\n            ):\n                line_text += \" \"\n            line_texts.append(line_text)\n            line_span_texts = [span2[\"text\"]]\n        else:\n            line_span_texts.append(span2[\"text\"])\n    line_text = join_char.join(line_span_texts)\n    line_texts.append(line_text)\n\n    return join_char.join(line_texts).strip()\n\n\ndef sort_objects_left_to_right(objs):\n    \"\"\"\n    Put the objects in order from 
left to right.\n    \"\"\"\n    return sorted(objs, key=lambda k: k[\"bbox\"][0] + k[\"bbox\"][2])\n\n\ndef sort_objects_top_to_bottom(objs):\n    \"\"\"\n    Put the objects in order from top to bottom.\n    \"\"\"\n    return sorted(objs, key=lambda k: k[\"bbox\"][1] + k[\"bbox\"][3])\n\n\ndef align_columns(columns, bbox):\n    \"\"\"\n    For every column, align the top and bottom boundaries to the final\n    table bounding box.\n    \"\"\"\n    try:\n        for column in columns:\n            column[\"bbox\"][1] = bbox[1]\n            column[\"bbox\"][3] = bbox[3]\n    except Exception as err:\n        print(f\"Could not align columns: {err}\")\n        pass\n\n    return columns\n\n\ndef align_rows(rows, bbox):\n    \"\"\"\n    For every row, align the left and right boundaries to the final\n    table bounding box.\n    \"\"\"\n    try:\n        for row in rows:\n            row[\"bbox\"][0] = bbox[0]\n            row[\"bbox\"][2] = bbox[2]\n    except Exception as err:\n        print(f\"Could not align rows: {err}\")\n        pass\n\n    return rows\n\n\ndef nms(objects, match_criteria=\"object2_overlap\", match_threshold=0.05, keep_higher=True):\n    \"\"\"\n    A customizable version of non-maxima suppression (NMS).\n\n    Default behavior: If a lower-confidence object overlaps more than 5% of its area\n    with a higher-confidence object, remove the lower-confidence object.\n\n    objects: set of dicts; each object dict must have a 'bbox' and a 'score' field\n    match_criteria: how to measure how much two objects \"overlap\"\n    match_threshold: the cutoff for determining that overlap requires suppression of one object\n    keep_higher: if True, keep the object with the higher metric; otherwise, keep the lower\n    \"\"\"\n    if len(objects) == 0:\n        return []\n\n    objects = sort_objects_by_score(objects, reverse=keep_higher)\n\n    num_objects = len(objects)\n    suppression = [False for obj in objects]\n\n    for object2_num in range(1, num_objects):\n        object2_rect = Rect(objects[object2_num][\"bbox\"])\n        object2_area = object2_rect.get_area()\n        for object1_num in range(object2_num):\n            if not suppression[object1_num]:\n                object1_rect = Rect(objects[object1_num][\"bbox\"])\n                object1_area = object1_rect.get_area()\n                intersect_area = object1_rect.intersect(object2_rect).get_area()\n                try:\n                    if match_criteria == \"object1_overlap\":\n                        metric = intersect_area / object1_area\n                    elif match_criteria == \"object2_overlap\":\n                        metric = intersect_area / object2_area\n                    elif match_criteria == \"iou\":\n                        metric = intersect_area / (object1_area + object2_area - intersect_area)\n                    if metric >= match_threshold:\n                        suppression[object2_num] = True\n                        break\n                except ZeroDivisionError:\n                    # Intended to recover from divide-by-zero\n                    pass\n\n    return [obj for idx, obj in enumerate(objects) if not suppression[idx]]\n\n\ndef align_supercells(supercells, rows, columns):\n    \"\"\"\n    For each supercell, align it to the rows it intersects 50% of the height of,\n    and the columns it intersects 50% of the width of.\n    Eliminate supercells for which there are no rows and columns it intersects 50% with.\n    \"\"\"\n    aligned_supercells = []\n\n    for supercell 
in supercells:\n        supercell[\"header\"] = False\n        row_bbox_rect = None\n        col_bbox_rect = None\n        intersecting_header_rows = set()\n        intersecting_data_rows = set()\n        for row_num, row in enumerate(rows):\n            row_height = row[\"bbox\"][3] - row[\"bbox\"][1]\n            supercell_height = supercell[\"bbox\"][3] - supercell[\"bbox\"][1]\n            min_row_overlap = max(row[\"bbox\"][1], supercell[\"bbox\"][1])\n            max_row_overlap = min(row[\"bbox\"][3], supercell[\"bbox\"][3])\n            overlap_height = max_row_overlap - min_row_overlap\n            if \"span\" in supercell:\n                overlap_fraction = max(\n                    overlap_height / row_height,\n                    overlap_height / supercell_height,\n                )\n            else:\n                overlap_fraction = overlap_height / row_height\n            if overlap_fraction >= 0.5:\n                if \"header\" in row and row[\"header\"]:\n                    intersecting_header_rows.add(row_num)\n                else:\n                    intersecting_data_rows.add(row_num)\n\n        # Supercell cannot span across the header boundary; eliminate whichever\n        # group of rows is the smallest\n        supercell[\"header\"] = False\n        if len(intersecting_data_rows) > 0 and len(intersecting_header_rows) > 0:\n            if len(intersecting_data_rows) > len(intersecting_header_rows):\n                intersecting_header_rows = set()\n            else:\n                intersecting_data_rows = set()\n        if len(intersecting_header_rows) > 0:\n            supercell[\"header\"] = True\n        elif \"span\" in supercell:\n            continue  # Require span supercell to be in the header\n        intersecting_rows = intersecting_data_rows.union(intersecting_header_rows)\n        # Determine vertical span of aligned supercell\n        for row_num in intersecting_rows:\n            if row_bbox_rect is None:\n                row_bbox_rect = Rect(rows[row_num][\"bbox\"])\n            else:\n                row_bbox_rect = row_bbox_rect.include_rect(rows[row_num][\"bbox\"])\n        if row_bbox_rect is None:\n            continue\n\n        intersecting_cols = []\n        for col_num, col in enumerate(columns):\n            col_width = col[\"bbox\"][2] - col[\"bbox\"][0]\n            supercell_width = supercell[\"bbox\"][2] - supercell[\"bbox\"][0]\n            min_col_overlap = max(col[\"bbox\"][0], supercell[\"bbox\"][0])\n            max_col_overlap = min(col[\"bbox\"][2], supercell[\"bbox\"][2])\n            overlap_width = max_col_overlap - min_col_overlap\n            if \"span\" in supercell:\n                overlap_fraction = max(overlap_width / col_width, overlap_width / supercell_width)\n                # Multiply by 2 effectively lowers the threshold to 0.25\n                if supercell[\"header\"]:\n                    overlap_fraction = overlap_fraction * 2\n            else:\n                overlap_fraction = overlap_width / col_width\n            if overlap_fraction >= 0.5:\n                intersecting_cols.append(col_num)\n                if col_bbox_rect is None:\n                    col_bbox_rect = Rect(col[\"bbox\"])\n                else:\n                    col_bbox_rect = col_bbox_rect.include_rect(col[\"bbox\"])\n        if col_bbox_rect is None:\n            continue\n\n        supercell_bbox = row_bbox_rect.intersect(col_bbox_rect).get_bbox()\n        supercell[\"bbox\"] = supercell_bbox\n\n        # Only a true 
supercell if it joins across multiple rows or columns\n        if (\n            len(intersecting_rows) > 0\n            and len(intersecting_cols) > 0\n            and (len(intersecting_rows) > 1 or len(intersecting_cols) > 1)\n        ):\n            supercell[\"row_numbers\"] = list(intersecting_rows)\n            supercell[\"column_numbers\"] = intersecting_cols\n            aligned_supercells.append(supercell)\n\n            # A span supercell in the header means there must be supercells above it in the header\n            if \"span\" in supercell and supercell[\"header\"] and len(supercell[\"column_numbers\"]) > 1:\n                for row_num in range(0, min(supercell[\"row_numbers\"])):\n                    new_supercell = {\n                        \"row_numbers\": [row_num],\n                        \"column_numbers\": supercell[\"column_numbers\"],\n                        \"score\": supercell[\"score\"],\n                        \"propagated\": True,\n                    }\n                    new_supercell_columns = [columns[idx] for idx in supercell[\"column_numbers\"]]\n                    new_supercell_rows = [rows[idx] for idx in supercell[\"row_numbers\"]]\n                    bbox = [\n                        min([column[\"bbox\"][0] for column in new_supercell_columns]),\n                        min([row[\"bbox\"][1] for row in new_supercell_rows]),\n                        max([column[\"bbox\"][2] for column in new_supercell_columns]),\n                        max([row[\"bbox\"][3] for row in new_supercell_rows]),\n                    ]\n                    new_supercell[\"bbox\"] = bbox\n                    aligned_supercells.append(new_supercell)\n\n    return aligned_supercells\n\n\ndef nms_supercells(supercells):\n    \"\"\"\n    A NMS scheme for supercells that first attempts to shrink supercells to\n    resolve overlap.\n    If two supercells overlap the same (sub)cell, shrink the lower confidence\n    supercell to resolve the overlap. If shrunk supercell is empty, remove it.\n    \"\"\"\n\n    supercells = sort_objects_by_score(supercells)\n    num_supercells = len(supercells)\n    suppression = [False for supercell in supercells]\n\n    for supercell2_num in range(1, num_supercells):\n        supercell2 = supercells[supercell2_num]\n        for supercell1_num in range(supercell2_num):\n            supercell1 = supercells[supercell1_num]\n            remove_supercell_overlap(supercell1, supercell2)\n        if (\n            (len(supercell2[\"row_numbers\"]) < 2 and len(supercell2[\"column_numbers\"]) < 2)\n            or len(supercell2[\"row_numbers\"]) == 0\n            or len(supercell2[\"column_numbers\"]) == 0\n        ):\n            suppression[supercell2_num] = True\n\n    return [obj for idx, obj in enumerate(supercells) if not suppression[idx]]\n\n\ndef header_supercell_tree(supercells):\n    \"\"\"\n    Make sure no supercell in the header is below more than one supercell in any row above it.\n    The cells in the header form a tree, but a supercell with more than one supercell in a row\n    above it means that some cell has more than one parent, which is not allowed. 
Eliminate\n    any supercell that would cause this to be violated.\n    \"\"\"\n    header_supercells = [\n        supercell for supercell in supercells if \"header\" in supercell and supercell[\"header\"]\n    ]\n    header_supercells = sort_objects_by_score(header_supercells)\n\n    for header_supercell in header_supercells[:]:\n        ancestors_by_row = defaultdict(int)\n        min_row = min(header_supercell[\"row_numbers\"])\n        for header_supercell2 in header_supercells:\n            max_row2 = max(header_supercell2[\"row_numbers\"])\n            if max_row2 < min_row and set(header_supercell[\"column_numbers\"]).issubset(\n                set(header_supercell2[\"column_numbers\"]),\n            ):\n                for row2 in header_supercell2[\"row_numbers\"]:\n                    ancestors_by_row[row2] += 1\n        for row in range(0, min_row):\n            if ancestors_by_row[row] != 1:\n                supercells.remove(header_supercell)\n                break\n\n\ndef remove_supercell_overlap(supercell1, supercell2):\n    \"\"\"\n    This function resolves overlap between supercells (supercells must be\n    disjoint) by iteratively shrinking supercells by the fewest grid cells\n    necessary to resolve the overlap.\n    Example:\n    If two supercells overlap at grid cell (R, C), and supercell #1 is less\n    confident than supercell #2, we eliminate either row R from supercell #1\n    or column C from supercell #1 by comparing the number of columns in row R\n    versus the number of rows in column C. If the number of columns in row R\n    is less than the number of rows in column C, we eliminate row R from\n    supercell #1. This resolves the overlap by removing fewer grid cells from\n    supercell #1 than if we eliminated column C from it.\n    \"\"\"\n    common_rows = set(supercell1[\"row_numbers\"]).intersection(set(supercell2[\"row_numbers\"]))\n    common_columns = set(supercell1[\"column_numbers\"]).intersection(\n        set(supercell2[\"column_numbers\"]),\n    )\n\n    # While the supercells have overlapping grid cells, continue shrinking the less-confident\n    # supercell one row or one column at a time\n    while len(common_rows) > 0 and len(common_columns) > 0:\n        # Try to shrink the supercell as little as possible to remove the overlap;\n        # if the supercell has fewer rows than columns, remove an overlapping column,\n        # because this removes fewer grid cells from the supercell;\n        # otherwise remove an overlapping row\n        if len(supercell2[\"row_numbers\"]) < len(supercell2[\"column_numbers\"]):\n            min_column = min(supercell2[\"column_numbers\"])\n            max_column = max(supercell2[\"column_numbers\"])\n            if max_column in common_columns:\n                common_columns.remove(max_column)\n                supercell2[\"column_numbers\"].remove(max_column)\n            elif min_column in common_columns:\n                common_columns.remove(min_column)\n                supercell2[\"column_numbers\"].remove(min_column)\n            else:\n                supercell2[\"column_numbers\"] = []\n                common_columns = set()\n        else:\n            min_row = min(supercell2[\"row_numbers\"])\n            max_row = max(supercell2[\"row_numbers\"])\n            if max_row in common_rows:\n                common_rows.remove(max_row)\n                supercell2[\"row_numbers\"].remove(max_row)\n            elif min_row in common_rows:\n                common_rows.remove(min_row)\n                
supercell2[\"row_numbers\"].remove(min_row)\n            else:\n                supercell2[\"row_numbers\"] = []\n                common_rows = set()\n"
  },
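  {
    "path": "examples/table_postprocess_demo.py",
    "content": "# Hypothetical usage sketch (this example file and its path are not part of the\n# original repo). It shows, under those assumptions, how the dict-based `nms` and\n# `extract_text_from_spans` helpers in\n# unstructured_inference/models/table_postprocess.py can be exercised on toy data.\nfrom unstructured_inference.models import table_postprocess as postprocess\n\n# Two overlapping candidate boxes. With the default \"object2_overlap\" criterion,\n# the lower-scoring box overlaps the higher-scoring one by far more than 5% of\n# its own area, so it is suppressed.\nobjects = [\n    {\"bbox\": [0, 0, 100, 50], \"score\": 0.9},\n    {\"bbox\": [10, 0, 110, 50], \"score\": 0.4},\n]\nkept = postprocess.nms(objects, match_criteria=\"object2_overlap\", match_threshold=0.05)\nassert len(kept) == 1 and kept[0][\"score\"] == 0.9\n\n# Spans are sorted by block/line/span number and joined line by line.\nspans = [\n    {\"text\": \"Total\", \"block_num\": 0, \"line_num\": 0, \"span_num\": 0},\n    {\"text\": \"42\", \"block_num\": 0, \"line_num\": 0, \"span_num\": 1},\n]\nprint(postprocess.extract_text_from_spans(spans, remove_integer_superscripts=False))\n# -> \"Total 42\"\n"
  },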
  {
    "path": "unstructured_inference/models/tables.py",
    "content": "# https://github.com/microsoft/table-transformer/blob/main/src/inference.py\n# https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Table%20Transformer/Using_Table_Transformer_for_table_detection_and_table_structure_recognition.ipynb\nimport threading\nimport xml.etree.ElementTree as ET\nfrom collections import defaultdict\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Union\n\nimport cv2\nimport numpy as np\nimport torch\nfrom PIL import Image as PILImage\nfrom transformers import DetrImageProcessor, TableTransformerForObjectDetection, logging\nfrom transformers.models.table_transformer.modeling_table_transformer import (\n    TableTransformerObjectDetectionOutput,\n)\n\nfrom unstructured_inference.config import inference_config\nfrom unstructured_inference.inference.layoutelement import table_cells_to_dataframe\nfrom unstructured_inference.logger import logger\nfrom unstructured_inference.models.table_postprocess import Rect\nfrom unstructured_inference.models.unstructuredmodel import UnstructuredModel\nfrom unstructured_inference.utils import pad_image_with_background_color\n\nfrom . import table_postprocess as postprocess\n\nDEFAULT_MODEL = \"microsoft/table-transformer-structure-recognition\"\n\n\nclass UnstructuredTableTransformerModel(UnstructuredModel):\n    \"\"\"Unstructured model wrapper for table-transformer.\"\"\"\n\n    _instance = None\n    _lock = threading.Lock()\n\n    def __new__(cls):\n        \"\"\"return an instance if one already exists otherwise create an instance\"\"\"\n        if cls._instance is None:\n            with cls._lock:\n                if cls._instance is None:\n                    cls._instance = super(UnstructuredTableTransformerModel, cls).__new__(cls)\n        return cls._instance\n\n    def predict(\n        self,\n        x: PILImage.Image,\n        ocr_tokens: Optional[List[Dict]] = None,\n        result_format: str = \"html\",\n    ):\n        \"\"\"Predict table structure deferring to run_prediction with ocr tokens\n\n        Note:\n        `ocr_tokens` is a list of dictionaries representing OCR tokens,\n        where each dictionary has the following format:\n        {\n            \"bbox\": [int, int, int, int],  # Bounding box coordinates of the token\n            \"block_num\": int,  # Block number\n            \"line_num\": int,   # Line number\n            \"span_num\": int,   # Span number\n            \"text\": str,  # Text content of the token\n        }\n        The bounding box coordinates should match the table structure.\n        FIXME: refactor token data into a dataclass so we have clear expectations of the fields\n        \"\"\"\n        super().predict(x)\n        return self.run_prediction(x, ocr_tokens=ocr_tokens, result_format=result_format)\n\n    def initialize(\n        self,\n        model: Union[str, Path],\n        device: Optional[str] = \"cuda\" if torch.cuda.is_available() else \"cpu\",\n    ):\n        \"\"\"Loads the table transformer model using the specified parameters.\n\n        Device placement strategy:\n        - Normalize device names (cuda -> cuda:0) for consistent caching\n        - Load models WITHOUT device_map to avoid meta tensor errors\n        - Use explicit .to(device, dtype=torch.float32) for proper placement\n        \"\"\"\n        # Device normalization for consistent caching\n        if device is None:\n            device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n        if device.startswith(\"cuda\") 
and \":\" not in device:\n            if torch.cuda.is_available():\n                device = f\"cuda:{torch.cuda.current_device()}\"\n            else:\n                logger.warning(\"CUDA device requested but not available, falling back to CPU\")\n                device = \"cpu\"\n\n        self.device = device\n\n        # Load feature extractor WITHOUT device_map\n        self.feature_extractor = DetrImageProcessor.from_pretrained(model)\n        # value not set in the configuration and needed for newer models\n        # https://huggingface.co/microsoft/table-transformer-structure-recognition-v1.1-all/discussions/1\n        self.feature_extractor.size[\"shortest_edge\"] = inference_config.IMG_PROCESSOR_SHORTEST_EDGE\n        self.feature_extractor.size[\"longest_edge\"] = inference_config.IMG_PROCESSOR_LONGEST_EDGE\n\n        try:\n            logger.info(f\"Loading table structure model to {self.device}...\")\n            cached_current_verbosity = logging.get_verbosity()\n            logging.set_verbosity_error()\n\n            # Load model WITHOUT device_map (prevents meta tensor errors)\n            self.model = TableTransformerForObjectDetection.from_pretrained(model)\n\n            # Explicit device placement with dtype\n            # NOTE: While nn.Module.to() modifies in-place, capturing return value is\n            # recommended best practice per PyTorch docs for consistency and clarity\n            self.model = self.model.to(self.device, dtype=torch.float32)\n\n            logging.set_verbosity(cached_current_verbosity)\n            self.model.eval()\n            logger.info(f\"Table model successfully loaded to {self.device}\")\n\n        except EnvironmentError:\n            logger.critical(\"Failed to initialize the model.\")\n            logger.critical(\"Ensure that the model name or path is correct\")\n            raise ImportError(\n                \"Review the parameters used to initialize an UnstructuredTableTransformerModel\",\n            )\n\n    def get_structure(\n        self,\n        x: PILImage.Image,\n        pad_for_structure_detection: int = inference_config.TABLE_IMAGE_BACKGROUND_PAD,\n    ) -> TableTransformerObjectDetectionOutput:\n        \"\"\"get the table structure as a dictionary containing different types of elements as\n        key-value pairs; check table-transformer documentation for more information\"\"\"\n        with torch.no_grad():\n            encoding = self.feature_extractor(\n                pad_image_with_background_color(x, pad_for_structure_detection),\n                return_tensors=\"pt\",\n            ).to(self.device)\n            outputs_structure = self.model(**encoding)\n            outputs_structure[\"pad_for_structure_detection\"] = pad_for_structure_detection\n            return outputs_structure\n\n    def run_prediction(\n        self,\n        x: PILImage.Image,\n        pad_for_structure_detection: int = inference_config.TABLE_IMAGE_BACKGROUND_PAD,\n        ocr_tokens: Optional[List[Dict]] = None,\n        result_format: Optional[str] = \"html\",\n    ):\n        \"\"\"Predict table structure\"\"\"\n        outputs_structure = self.get_structure(x, pad_for_structure_detection)\n        if ocr_tokens is None:\n            raise ValueError(\"Cannot predict table structure with no OCR tokens\")\n\n        recognized_table = recognize(outputs_structure, x, tokens=ocr_tokens)\n        if len(recognized_table) > 0:\n            prediction = recognized_table[0]\n        # NOTE(robinson) - This means that the table was not recognized\n     
   else:\n            return \"\"\n\n        if result_format == \"html\":\n            # Convert cells to HTML\n            prediction = cells_to_html(prediction) or \"\"\n        elif result_format == \"dataframe\":\n            prediction = table_cells_to_dataframe(prediction)\n        elif result_format == \"cells\":\n            pass  # the recognized cells are already in the requested format\n        else:\n            raise ValueError(\n                f\"result_format {result_format} is not a valid format. \"\n                'Valid formats are: \"html\", \"dataframe\", \"cells\"',\n            )\n\n        return prediction\n\n\ntables_agent: UnstructuredTableTransformerModel = UnstructuredTableTransformerModel()\n\n\ndef load_agent():\n    \"\"\"Loads the Table agent.\"\"\"\n\n    if getattr(tables_agent, \"model\", None) is None:\n        with tables_agent._lock:\n            if getattr(tables_agent, \"model\", None) is None:\n                logger.info(\"Loading the Table agent ...\")\n                tables_agent.initialize(DEFAULT_MODEL)\n\n\ndef get_class_map(data_type: str):\n    \"\"\"Defines class map dictionaries\"\"\"\n    if data_type == \"structure\":\n        class_map = {\n            \"table\": 0,\n            \"table column\": 1,\n            \"table row\": 2,\n            \"table column header\": 3,\n            \"table projected row header\": 4,\n            \"table spanning cell\": 5,\n            \"no object\": 6,\n        }\n    elif data_type == \"detection\":\n        class_map = {\"table\": 0, \"table rotated\": 1, \"no object\": 2}\n    else:\n        raise ValueError(f\"data_type must be 'structure' or 'detection', got {data_type}\")\n    return class_map\n\n\nstructure_class_thresholds = {\n    \"table\": inference_config.TT_TABLE_CONF,\n    \"table column\": inference_config.TABLE_COLUMN_CONF,\n    \"table row\": inference_config.TABLE_ROW_CONF,\n    \"table column header\": inference_config.TABLE_COLUMN_HEADER_CONF,\n    \"table projected row header\": inference_config.TABLE_PROJECTED_ROW_HEADER_CONF,\n    \"table spanning cell\": inference_config.TABLE_SPANNING_CELL_CONF,\n    # FIXME (yao) this parameter doesn't seem to be used at all in inference. 
Can we remove it?\n    \"no object\": 10,\n}\n\n\ndef recognize(outputs: TableTransformerObjectDetectionOutput, img: PILImage.Image, tokens: list):\n    \"\"\"Recognize table elements.\"\"\"\n    str_class_name2idx = get_class_map(\"structure\")\n    str_class_idx2name = {v: k for k, v in str_class_name2idx.items()}\n    class_thresholds = structure_class_thresholds\n\n    # Post-process detected objects, assign class labels\n    objects = outputs_to_objects(outputs, img.size, str_class_idx2name)\n    high_confidence_objects = apply_thresholds_on_objects(objects, class_thresholds)\n    # Further process the detected objects so they correspond to a consistent table\n    tables_structure = objects_to_structures(high_confidence_objects, tokens, class_thresholds)\n    # Enumerate all table cells: grid cells and spanning cells\n    return [structure_to_cells(structure, tokens)[0] for structure in tables_structure]\n\n\ndef outputs_to_objects(\n    outputs: TableTransformerObjectDetectionOutput,\n    img_size: Tuple[int, int],\n    class_idx2name: Mapping[int, str],\n):\n    \"\"\"Convert raw model outputs into labeled objects with scores and bounding boxes.\"\"\"\n    m = outputs[\"logits\"].softmax(-1).max(-1)\n    pred_labels = m.indices.detach().cpu().numpy()[0]\n    pred_scores = m.values.detach().cpu().numpy()[0]\n    pred_bboxes = outputs[\"pred_boxes\"].detach().cpu()[0]\n\n    pad = outputs.get(\"pad_for_structure_detection\", 0)\n    scale_size = (img_size[0] + pad * 2, img_size[1] + pad * 2)\n    rescaled = rescale_bboxes(pred_bboxes, scale_size)\n    # unshift the padding; padding effectively shifted the bounding boxes of structures in the\n    # original image by half of the total pad\n    if pad != 0:\n        rescaled = rescaled - pad\n    pred_bboxes = rescaled.tolist()\n\n    objects = []\n    for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes):\n        class_label = class_idx2name[int(label)]\n        if class_label != \"no object\":\n            objects.append(\n                {\n                    \"label\": class_label,\n                    \"score\": float(score),\n                    \"bbox\": bbox,\n                },\n            )\n\n    return objects\n\n\ndef apply_thresholds_on_objects(\n    objects: Sequence[Mapping[str, Any]],\n    thresholds: Mapping[str, float],\n) -> Sequence[Mapping[str, Any]]:\n    \"\"\"\n    Filters out predicted objects whose confidence scores are below the thresholds\n\n    Args:\n        objects: Sequence of mappings for example:\n        [\n            {\n                \"label\": \"table row\",\n                \"score\": 0.55,\n                \"bbox\": [...],\n            },\n            ...,\n        ]\n        thresholds: Mapping from labels to thresholds\n\n    Returns:\n        Filtered list of objects\n\n    \"\"\"\n    objects = [obj for obj in objects if obj[\"score\"] >= thresholds[obj[\"label\"]]]\n    return objects\n\n\n# for output bounding box post-processing\ndef box_cxcywh_to_xyxy(x):\n    \"\"\"Convert rectangle format from center-x, center-y, width, height to\n    x-min, y-min, x-max, y-max.\"\"\"\n    x_c, y_c, w, h = x.unbind(-1)\n    b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]\n    return torch.stack(b, dim=1)\n\n\ndef rescale_bboxes(out_bbox, size):\n    \"\"\"Rescale relative bounding boxes to the absolute size given by `size`.\"\"\"\n    img_w, img_h = size\n    b = box_cxcywh_to_xyxy(out_bbox)\n    b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32, device=out_bbox.device)\n    return b\n\n\ndef 
iob(bbox1, bbox2):\n    \"\"\"\n    Compute the intersection area over box area, for bbox1.\n    \"\"\"\n    intersection = Rect(bbox1).intersect(Rect(bbox2))\n\n    bbox1_area = Rect(bbox1).get_area()\n    if bbox1_area > 0:\n        return intersection.get_area() / bbox1_area\n\n    return 0\n\n\ndef objects_to_structures(objects, tokens, class_thresholds):\n    \"\"\"\n    Process the bounding boxes produced by the table structure recognition model into\n    a *consistent* set of table structures (rows, columns, spanning cells, headers).\n    This entails resolving conflicts/overlaps, and ensuring the boxes meet certain alignment\n    conditions (for example: rows should all have the same width, etc.).\n    \"\"\"\n\n    tables = [obj for obj in objects if obj[\"label\"] == \"table\"]\n    table_structures = []\n\n    for table in tables:\n        table_objects = [\n            obj\n            for obj in objects\n            if iob(obj[\"bbox\"], table[\"bbox\"]) >= inference_config.TABLE_IOB_THRESHOLD\n        ]\n        table_tokens = [\n            token\n            for token in tokens\n            if iob(token[\"bbox\"], table[\"bbox\"]) >= inference_config.TABLE_IOB_THRESHOLD\n        ]\n\n        structure = {}\n\n        columns = [obj for obj in table_objects if obj[\"label\"] == \"table column\"]\n        rows = [obj for obj in table_objects if obj[\"label\"] == \"table row\"]\n        column_headers = [obj for obj in table_objects if obj[\"label\"] == \"table column header\"]\n        spanning_cells = [obj for obj in table_objects if obj[\"label\"] == \"table spanning cell\"]\n        for obj in spanning_cells:\n            obj[\"projected row header\"] = False\n        projected_row_headers = [\n            obj for obj in table_objects if obj[\"label\"] == \"table projected row header\"\n        ]\n        for obj in projected_row_headers:\n            obj[\"projected row header\"] = True\n        spanning_cells += projected_row_headers\n        for obj in rows:\n            obj[\"column header\"] = False\n            for header_obj in column_headers:\n                if iob(obj[\"bbox\"], header_obj[\"bbox\"]) >= inference_config.TABLE_IOB_THRESHOLD:\n                    obj[\"column header\"] = True\n\n        # Refine table structures\n        rows = postprocess.refine_rows(rows, table_tokens, class_thresholds[\"table row\"])\n        columns = postprocess.refine_columns(\n            columns,\n            table_tokens,\n            class_thresholds[\"table column\"],\n        )\n\n        # Shrink table bbox to just the total height of the rows\n        # and the total width of the columns\n        row_rect = Rect()\n        for obj in rows:\n            row_rect.include_rect(obj[\"bbox\"])\n        column_rect = Rect()\n        for obj in columns:\n            column_rect.include_rect(obj[\"bbox\"])\n        table[\"row_column_bbox\"] = [\n            column_rect.x_min,\n            row_rect.y_min,\n            column_rect.x_max,\n            row_rect.y_max,\n        ]\n        table[\"bbox\"] = table[\"row_column_bbox\"]\n\n        # Process the rows and columns into a complete segmented table\n        columns = postprocess.align_columns(columns, table[\"row_column_bbox\"])\n        rows = postprocess.align_rows(rows, table[\"row_column_bbox\"])\n\n        structure[\"rows\"] = rows\n        structure[\"columns\"] = columns\n        structure[\"column headers\"] = column_headers\n        structure[\"spanning cells\"] = spanning_cells\n\n        if len(rows) > 0 and 
len(columns) > 1:\n            structure = refine_table_structure(structure, class_thresholds)\n\n        table_structures.append(structure)\n\n    return table_structures\n\n\ndef refine_table_structure(table_structure, class_thresholds):\n    \"\"\"\n    Apply operations to the detected table structure objects such as\n    thresholding, NMS, and alignment.\n    \"\"\"\n    rows = table_structure[\"rows\"]\n    columns = table_structure[\"columns\"]\n\n    # Process the headers\n    column_headers = table_structure[\"column headers\"]\n    column_headers = postprocess.apply_threshold(\n        column_headers,\n        class_thresholds[\"table column header\"],\n    )\n    column_headers = postprocess.nms(column_headers)\n    column_headers = align_headers(column_headers, rows)\n\n    # Process spanning cells\n    spanning_cells = [\n        elem for elem in table_structure[\"spanning cells\"] if not elem[\"projected row header\"]\n    ]\n    projected_row_headers = [\n        elem for elem in table_structure[\"spanning cells\"] if elem[\"projected row header\"]\n    ]\n    spanning_cells = postprocess.apply_threshold(\n        spanning_cells,\n        class_thresholds[\"table spanning cell\"],\n    )\n    projected_row_headers = postprocess.apply_threshold(\n        projected_row_headers,\n        class_thresholds[\"table projected row header\"],\n    )\n    spanning_cells += projected_row_headers\n    # Align before NMS for spanning cells because alignment brings them into agreement\n    # with rows and columns first; if spanning cells still overlap after this operation,\n    # the threshold for NMS can basically be lowered to just above 0\n    spanning_cells = postprocess.align_supercells(spanning_cells, rows, columns)\n    spanning_cells = postprocess.nms_supercells(spanning_cells)\n\n    postprocess.header_supercell_tree(spanning_cells)\n\n    table_structure[\"columns\"] = columns\n    table_structure[\"rows\"] = rows\n    table_structure[\"spanning cells\"] = spanning_cells\n    table_structure[\"column headers\"] = column_headers\n\n    return table_structure\n\n\ndef align_headers(headers, rows):\n    \"\"\"\n    Adjust the header boundary to be the convex hull of the rows it intersects\n    at least 50% of the height of.\n\n    For now, we are not supporting tables with multiple headers, so we need to\n    eliminate anything besides the top-most header.\n    \"\"\"\n\n    aligned_headers = []\n\n    for row in rows:\n        row[\"column header\"] = False\n\n    header_row_nums = []\n    for header in headers:\n        for row_num, row in enumerate(rows):\n            row_height = row[\"bbox\"][3] - row[\"bbox\"][1]\n            min_row_overlap = max(row[\"bbox\"][1], header[\"bbox\"][1])\n            max_row_overlap = min(row[\"bbox\"][3], header[\"bbox\"][3])\n            overlap_height = max_row_overlap - min_row_overlap\n            if overlap_height / row_height >= 0.5:\n                header_row_nums.append(row_num)\n\n    if len(header_row_nums) == 0:\n        return aligned_headers\n\n    header_rect = Rect()\n    if header_row_nums[0] > 0:\n        header_row_nums = list(range(header_row_nums[0] + 1)) + header_row_nums\n\n    last_row_num = -1\n    for row_num in header_row_nums:\n        if row_num == last_row_num + 1:\n            row = rows[row_num]\n            row[\"column header\"] = True\n            header_rect = header_rect.include_rect(row[\"bbox\"])\n            last_row_num = row_num\n        else:\n            # Break as soon as a non-header row is 
encountered.\n            # This ignores any subsequent rows in the table labeled as a header.\n            # Having more than 1 header is not supported currently.\n            break\n\n    header = {\"bbox\": header_rect.get_bbox()}\n    aligned_headers.append(header)\n\n    return aligned_headers\n\n\ndef compute_confidence_score(cell_match_scores):\n    \"\"\"\n    Compute a confidence score based on how well the page tokens\n    slot into the cells reported by the model.\n    \"\"\"\n    try:\n        mean_match_score = sum(cell_match_scores) / len(cell_match_scores)\n        min_match_score = min(cell_match_scores)\n        confidence_score = (mean_match_score + min_match_score) / 2\n    except ZeroDivisionError:\n        confidence_score = 0\n    return confidence_score\n\n\ndef structure_to_cells(table_structure, tokens):\n    \"\"\"\n    Assuming the row, column, spanning cell, and header bounding boxes have\n    been refined into a set of consistent table structures, process these\n    table structures into table cells. This is a universal representation\n    format for the table, which can later be exported to Pandas or CSV formats.\n    Classify the cells as header cells or data cells\n    based on whether they intersect with the header bounding box.\n    \"\"\"\n    columns = table_structure[\"columns\"]\n    rows = table_structure[\"rows\"]\n    spanning_cells = table_structure[\"spanning cells\"]\n    spanning_cells = sorted(spanning_cells, reverse=True, key=lambda cell: cell[\"score\"])\n\n    cells = []\n    subcells = []\n    # Identify complete cells and subcells\n    for column_num, column in enumerate(columns):\n        for row_num, row in enumerate(rows):\n            column_rect = Rect(list(column[\"bbox\"]))\n            row_rect = Rect(list(row[\"bbox\"]))\n            cell_rect = row_rect.intersect(column_rect)\n            header = \"column header\" in row and row[\"column header\"]\n            cell = {\n                \"bbox\": cell_rect.get_bbox(),\n                \"column_nums\": [column_num],\n                \"row_nums\": [row_num],\n                \"column header\": header,\n            }\n\n            cell[\"subcell\"] = False\n            for spanning_cell in spanning_cells:\n                spanning_cell_rect = Rect(list(spanning_cell[\"bbox\"]))\n                if (\n                    spanning_cell_rect.intersect(cell_rect).get_area() / cell_rect.get_area()\n                ) > inference_config.TABLE_IOB_THRESHOLD:\n                    cell[\"subcell\"] = True\n                    cell[\"is_merged\"] = False\n                    break\n\n            if cell[\"subcell\"]:\n                subcells.append(cell)\n            else:\n                # cell text = extract_text_inside_bbox(table_spans, cell['bbox'])\n                # cell['cell text'] = cell text\n                cell[\"projected row header\"] = False\n                cells.append(cell)\n\n    for spanning_cell in spanning_cells:\n        spanning_cell_rect = Rect(list(spanning_cell[\"bbox\"]))\n        cell_columns = set()\n        cell_rows = set()\n        cell_rect = None\n        header = True\n        for subcell in subcells:\n            subcell_rect = Rect(list(subcell[\"bbox\"]))\n            subcell_rect_area = subcell_rect.get_area()\n            if (\n                subcell_rect.intersect(spanning_cell_rect).get_area() / subcell_rect_area\n            ) > inference_config.TABLE_IOB_THRESHOLD and subcell[\"is_merged\"] is False:\n                if cell_rect is None:\n     
               cell_rect = Rect(list(subcell[\"bbox\"]))\n                else:\n                    cell_rect.include_rect(list(subcell[\"bbox\"]))\n                cell_rows = cell_rows.union(set(subcell[\"row_nums\"]))\n                cell_columns = cell_columns.union(set(subcell[\"column_nums\"]))\n                # By convention here, all subcells must be classified\n                # as header cells for a spanning cell to be classified as a header cell;\n                # otherwise, this could lead to a non-rectangular header region\n                header = header and \"column header\" in subcell and subcell[\"column header\"]\n                subcell[\"is_merged\"] = True\n\n        if len(cell_rows) > 0 and len(cell_columns) > 0:\n            cell = {\n                \"bbox\": cell_rect.get_bbox(),\n                \"column_nums\": list(cell_columns),\n                \"row_nums\": list(cell_rows),\n                \"column header\": header,\n                \"projected row header\": spanning_cell[\"projected row header\"],\n            }\n            cells.append(cell)\n\n    _, _, cell_match_scores = postprocess.slot_into_containers(cells, tokens)\n    confidence_score = compute_confidence_score(cell_match_scores)\n\n    # Dilate rows and columns before final extraction\n    # dilated_columns = fill_column_gaps(columns, table_bbox)\n    dilated_columns = columns\n    # dilated_rows = fill_row_gaps(rows, table_bbox)\n    dilated_rows = rows\n    for cell in cells:\n        column_rect = Rect()\n        for column_num in cell[\"column_nums\"]:\n            column_rect.include_rect(list(dilated_columns[column_num][\"bbox\"]))\n        row_rect = Rect()\n        for row_num in cell[\"row_nums\"]:\n            row_rect.include_rect(list(dilated_rows[row_num][\"bbox\"]))\n        cell_rect = column_rect.intersect(row_rect)\n        cell[\"bbox\"] = cell_rect.get_bbox()\n\n    span_nums_by_cell, _, _ = postprocess.slot_into_containers(\n        cells,\n        tokens,\n        overlap_threshold=0.001,\n        forced_assignment=False,\n    )\n\n    for cell, cell_span_nums in zip(cells, span_nums_by_cell):\n        cell_spans = [tokens[num] for num in cell_span_nums]\n        # TODO: Refine how text is extracted; should be character-based, not span-based;\n        # but need to associate\n        cell[\"cell text\"] = postprocess.extract_text_from_spans(\n            cell_spans,\n            remove_integer_superscripts=False,\n        )\n        cell[\"spans\"] = cell_spans\n\n    # Adjust the row, column, and cell bounding boxes to reflect the extracted text\n    num_rows = len(rows)\n    rows = postprocess.sort_objects_top_to_bottom(rows)\n    num_columns = len(columns)\n    columns = postprocess.sort_objects_left_to_right(columns)\n    min_y_values_by_row = defaultdict(list)\n    max_y_values_by_row = defaultdict(list)\n    min_x_values_by_column = defaultdict(list)\n    max_x_values_by_column = defaultdict(list)\n    for cell in cells:\n        min_row = min(cell[\"row_nums\"])\n        max_row = max(cell[\"row_nums\"])\n        min_column = min(cell[\"column_nums\"])\n        max_column = max(cell[\"column_nums\"])\n        for span in cell[\"spans\"]:\n            min_x_values_by_column[min_column].append(span[\"bbox\"][0])\n            min_y_values_by_row[min_row].append(span[\"bbox\"][1])\n            max_x_values_by_column[max_column].append(span[\"bbox\"][2])\n            max_y_values_by_row[max_row].append(span[\"bbox\"][3])\n    for row_num, row in enumerate(rows):\n      
  if len(min_x_values_by_column[0]) > 0:\n            row[\"bbox\"][0] = min(min_x_values_by_column[0])\n        if len(min_y_values_by_row[row_num]) > 0:\n            row[\"bbox\"][1] = min(min_y_values_by_row[row_num])\n        if len(max_x_values_by_column[num_columns - 1]) > 0:\n            row[\"bbox\"][2] = max(max_x_values_by_column[num_columns - 1])\n        if len(max_y_values_by_row[row_num]) > 0:\n            row[\"bbox\"][3] = max(max_y_values_by_row[row_num])\n    for column_num, column in enumerate(columns):\n        if len(min_x_values_by_column[column_num]) > 0:\n            column[\"bbox\"][0] = min(min_x_values_by_column[column_num])\n        if len(min_y_values_by_row[0]) > 0:\n            column[\"bbox\"][1] = min(min_y_values_by_row[0])\n        if len(max_x_values_by_column[column_num]) > 0:\n            column[\"bbox\"][2] = max(max_x_values_by_column[column_num])\n        if len(max_y_values_by_row[num_rows - 1]) > 0:\n            column[\"bbox\"][3] = max(max_y_values_by_row[num_rows - 1])\n    for cell in cells:\n        row_rect = None\n        column_rect = None\n        for row_num in cell[\"row_nums\"]:\n            if row_rect is None:\n                row_rect = Rect(list(rows[row_num][\"bbox\"]))\n            else:\n                row_rect.include_rect(list(rows[row_num][\"bbox\"]))\n        for column_num in cell[\"column_nums\"]:\n            if column_rect is None:\n                column_rect = Rect(list(columns[column_num][\"bbox\"]))\n            else:\n                column_rect.include_rect(list(columns[column_num][\"bbox\"]))\n        cell_rect = row_rect.intersect(column_rect)\n        if cell_rect.get_area() > 0:\n            cell[\"bbox\"] = cell_rect.get_bbox()\n\n    return cells, confidence_score\n\n\ndef fill_cells(cells: List[dict]) -> List[dict]:\n    \"\"\"fills the missing cells in the table by adding cells with empty text\n    wherever the model detected no cell.\n\n    A cell contains the following keys relevant to the html conversion:\n    row_nums: List[int]\n        the row numbers this cell belongs to; a cell spanning multiple rows has more than\n        one number\n    column_nums: List[int]\n        the column numbers this cell belongs to; a cell spanning multiple columns has more\n        than one number\n    cell text: str\n        the text in this cell\n    column header: bool\n        whether this cell is a column header\n\n    \"\"\"\n    if not cells:\n        return []\n\n    # Find max row and col indices\n    max_row = max(row for cell in cells for row in cell[\"row_nums\"])\n    max_col = max(col for cell in cells for col in cell[\"column_nums\"])\n    filled = set()\n    for cell in cells:\n        for row in cell[\"row_nums\"]:\n            for col in cell[\"column_nums\"]:\n                filled.add((row, col))\n    header_rows = set()\n    for cell in cells:\n        if cell[\"column header\"]:\n            header_rows.update(cell[\"row_nums\"])\n\n    # Compose output list directly for speed\n    new_cells = cells.copy()\n    for row in range(max_row + 1):\n        for col in range(max_col + 1):\n            if (row, col) not in filled:\n                new_cells.append(\n                    {\n                        \"row_nums\": [row],\n                        \"column_nums\": [col],\n                        \"cell text\": \"\",\n                        \"column header\": row in header_rows,\n                    }\n                )\n    return 
new_cells\n\n\ndef cells_to_html(cells: List[dict]) -> str:\n    \"\"\"Convert table structure to html format.\n\n    Args:\n        cells: List of dictionaries representing table cells, where each dictionary has the\n            following format:\n            {\n                \"row_nums\": List[int],\n                \"column_nums\": List[int],\n                \"cell text\": str,\n                \"column header\": bool,\n            }\n    Returns:\n        str: HTML table string\n    \"\"\"\n    # Pre-sort with tuple key, as per original\n    cells_filled = fill_cells(cells)\n    cells_sorted = sorted(cells_filled, key=lambda k: (min(k[\"row_nums\"]), min(k[\"column_nums\"])))\n\n    table = ET.Element(\"table\")\n    current_row = -1\n\n    # Check if any column header exists\n    table_has_header = any(cell[\"column header\"] for cell in cells_sorted)\n    table_header = ET.SubElement(table, \"thead\") if table_has_header else None\n    table_body = ET.SubElement(table, \"tbody\")\n\n    row = None\n    for cell in cells_sorted:\n        this_row = min(cell[\"row_nums\"])\n        attrib = {}\n        colspan = len(cell[\"column_nums\"])\n        if colspan > 1:\n            attrib[\"colspan\"] = str(colspan)\n        rowspan = len(cell[\"row_nums\"])\n        if rowspan > 1:\n            attrib[\"rowspan\"] = str(rowspan)\n        if this_row > current_row:\n            current_row = this_row\n            if cell[\"column header\"]:\n                table_subelement = table_header\n                cell_tag = \"th\"\n            else:\n                table_subelement = table_body\n                cell_tag = \"td\"\n            row = ET.SubElement(table_subelement, \"tr\")  # type: ignore\n        if row is not None:\n            tcell = ET.SubElement(row, cell_tag, attrib=attrib)\n            tcell.text = cell[\"cell text\"]\n\n    return str(ET.tostring(table, encoding=\"unicode\", short_empty_elements=False))\n\n\ndef zoom_image(image: PILImage.Image, zoom: float) -> PILImage.Image:\n    \"\"\"scale an image based on the zoom factor using cv2; the scaled image is post processed by\n    dilation then erosion to improve edge sharpness for OCR tasks\"\"\"\n    if zoom <= 0:\n        # no zoom but still does dilation and erosion\n        zoom = 1\n    new_image = cv2.resize(\n        cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR),\n        None,\n        fx=zoom,\n        fy=zoom,\n        interpolation=cv2.INTER_CUBIC,\n    )\n    kernel = np.ones((1, 1), np.uint8)\n    new_image = cv2.dilate(new_image, kernel, iterations=1, dst=new_image)\n    new_image = cv2.erode(new_image, kernel, iterations=1, dst=new_image)\n\n    return PILImage.fromarray(new_image)\n"
  },
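  {
    "path": "examples/cells_to_html_demo.py",
    "content": "# Hypothetical usage sketch (this example file and its path are not part of the\n# original repo). It demonstrates, under those assumptions, the cell-dict format\n# consumed by `cells_to_html` in unstructured_inference/models/tables.py: a header\n# cell spanning two columns plus one data cell, with the missing grid position\n# filled in as an empty cell by `fill_cells`.\nfrom unstructured_inference.models.tables import cells_to_html\n\ncells = [\n    # Header cell spanning both columns of row 0 -> rendered as <th colspan=\"2\">.\n    {\"row_nums\": [0], \"column_nums\": [0, 1], \"cell text\": \"Header\", \"column header\": True},\n    # Single data cell; position (row 1, col 1) is absent and becomes an empty <td>.\n    {\"row_nums\": [1], \"column_nums\": [0], \"cell text\": \"value\", \"column header\": False},\n]\nprint(cells_to_html(cells))\n# -> <table><thead><tr><th colspan=\"2\">Header</th></tr></thead>\n#    <tbody><tr><td>value</td><td></td></tr></tbody></table>  (emitted on one line)\n"
  },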
  {
    "path": "unstructured_inference/models/unstructuredmodel.py",
    "content": "from __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom typing import Any, List, cast\n\nimport numpy as np\nfrom PIL.Image import Image\n\nfrom unstructured_inference.constants import ElementType\nfrom unstructured_inference.inference.elements import (\n    grow_region_to_match_region,\n    intersections,\n)\nfrom unstructured_inference.inference.layoutelement import (\n    LayoutElement,\n    LayoutElements,\n    clean_layoutelements,\n    partition_groups_from_regions,\n    separate,\n)\n\n\nclass UnstructuredModel(ABC):\n    \"\"\"Wrapper class for the various models used by unstructured.\"\"\"\n\n    def __init__(self):\n        \"\"\"model should support inference of some sort, either by calling or by some method.\n        UnstructuredModel doesn't provide any training interface, it's assumed the model is\n        already trained.\n        \"\"\"\n        self.model = None\n\n    @abstractmethod\n    def predict(self, x: Any) -> Any:\n        \"\"\"Do inference using the wrapped model.\"\"\"\n        if self.model is None:\n            raise ModelNotInitializedError(\n                \"Model has not been initialized. Please call the initialize method with the \"\n                \"appropriate arguments for loading the model.\",\n            )\n        pass  # pragma: no cover\n\n    def __call__(self, x: Any) -> Any:\n        \"\"\"Inference using function call interface.\"\"\"\n        return self.predict(x)\n\n    @abstractmethod\n    def initialize(self, *args, **kwargs):\n        \"\"\"Load the model for inference.\"\"\"\n        pass  # pragma: no cover\n\n\nclass UnstructuredObjectDetectionModel(UnstructuredModel):\n    \"\"\"Wrapper class for object detection models used by unstructured.\"\"\"\n\n    @abstractmethod\n    def predict(self, x: Image) -> LayoutElements | list[LayoutElement]:\n        \"\"\"Do inference using the wrapped model.\"\"\"\n        super().predict(x)\n        return []\n\n    def __call__(self, x: Image) -> LayoutElements:\n        \"\"\"Inference using function call interface.\"\"\"\n        return super().__call__(x)\n\n    @staticmethod\n    def enhance_regions(\n        elements: List[LayoutElement],\n        iom_to_merge: float = 0.3,\n    ) -> List[LayoutElement]:\n        \"\"\"This function traverses all the elements and either deletes nested elements,\n        or merges or splits them depending on the iom score for both regions\"\"\"\n        rects = [el.bbox for el in elements]\n        intersections_mtx = intersections(*rects)\n\n        for i, row in enumerate(intersections_mtx):\n            first = elements[i]\n            if first:\n                # We get only the elements which intersected\n                indices_to_check = np.where(row)[0]\n                # Delete the first element, since it will always intersect with itself\n                indices_to_check = indices_to_check[indices_to_check != i]\n                if len(indices_to_check) == 0:\n                    continue\n                if len(indices_to_check) > 1:  # sort by iom\n                    iom_to_check = [\n                        (j, first.bbox.intersection_over_minimum(elements[j].bbox))\n                        for j in indices_to_check\n                        if elements[j] is not None\n                    ]\n                    iom_to_check.sort(\n                        key=lambda x: x[1],\n                        reverse=True,\n                    )  # sort elements by iom, so we first check the greatest\n             
       indices_to_check = [x[0] for x in iom_to_check if x[0] != i]  # type:ignore\n                for j in indices_to_check:\n                    if elements[j] is None or elements[i] is None:\n                        continue\n                    second = elements[j]\n                    intersection = first.bbox.intersection(\n                        second.bbox,\n                    )  # we know they intersect, but we need the region\n                    first_inside_second = first.bbox.is_in(second.bbox)\n                    second_inside_first = second.bbox.is_in(first.bbox)\n\n                    if first_inside_second and not second_inside_first:\n                        elements[i] = None  # type:ignore\n                    elif second_inside_first and not first_inside_second:\n                        # delete second element\n                        elements[j] = None  # type:ignore\n                    elif intersection:\n                        iom = first.bbox.intersection_over_minimum(second.bbox)\n                        if iom < iom_to_merge:  # small\n                            separate(first.bbox, second.bbox)\n                            # The separated rectangle could become too small;\n                            # should such regions be deleted?\n                        else:  # big\n                            # merge\n                            if first.bbox.area > second.bbox.area:\n                                grow_region_to_match_region(first.bbox, second.bbox)\n                                elements[j] = None  # type:ignore\n                            else:\n                                grow_region_to_match_region(second.bbox, first.bbox)\n                                elements[i] = None  # type:ignore\n\n        elements = [e for e in elements if e is not None]\n        return elements\n\n    @staticmethod\n    def clean_type(\n        elements: list[LayoutElement],\n        type_to_clean=ElementType.TABLE,\n    ) -> List[LayoutElement]:\n        \"\"\"After this function runs, the returned list will not contain any element\n        nested inside an element of the specified type\"\"\"\n        target_elements = [e for e in elements if e.type == type_to_clean]\n        other_elements = [e for e in elements if e.type != type_to_clean]\n        if len(target_elements) == 0 or len(other_elements) == 0:\n            return elements\n\n        # Sort elements from biggest to smallest\n        target_elements.sort(key=lambda e: e.bbox.area, reverse=True)\n        other_elements.sort(key=lambda e: e.bbox.area, reverse=True)\n\n        # First check if targets contain each other\n        for element in target_elements:  # Just handles containment or little overlap\n            contains = [\n                e\n                for e in target_elements\n                if e.bbox.is_almost_subregion_of(element.bbox) and e != element\n            ]\n            for contained in contains:\n                target_elements.remove(contained)\n        # Then check if remaining elements intersect with targets\n        other_elements = filter(\n            lambda e: (\n                not any(e.bbox.is_almost_subregion_of(target.bbox) for target in target_elements)\n            ),\n            other_elements,\n        )  # type:ignore\n\n        final_elements = list(other_elements)\n        final_elements.extend(target_elements)\n        # Note(benjamin): could use bisect.insort if < operator is added to LayoutElement\n        final_elements.sort(key=lambda e: e.bbox.y1)\n        return final_elements\n\n  
  def deduplicate_detected_elements(\n        self,\n        elements: LayoutElements,\n        min_text_size: int = 15,\n    ) -> LayoutElements:\n        \"\"\"Deletes overlapping elements in a list of elements.\"\"\"\n\n        if len(elements) <= 1:\n            return elements\n\n        cleaned_elements = []\n        # TODO: Delete nested elements with low or None probability\n        # TODO: Keep most confident\n        # TODO: Better to grow horizontally than vertically?\n        groups = cast(list[LayoutElements], partition_groups_from_regions(elements))\n        for group in groups:\n            cleaned_elements.append(clean_layoutelements(group))\n        return LayoutElements.concatenate(cleaned_elements)\n\n\nclass UnstructuredElementExtractionModel(UnstructuredModel):\n    \"\"\"Wrapper class for object extraction models used by unstructured.\"\"\"\n\n    @abstractmethod\n    def predict(self, x: Image) -> List[LayoutElement]:\n        \"\"\"Do inference using the wrapped model.\"\"\"\n        super().predict(x)\n        return []  # pragma: no cover\n\n    def __call__(self, x: Image) -> List[LayoutElement]:\n        \"\"\"Inference using function call interface.\"\"\"\n        return super().__call__(x)\n\n\nclass ModelNotInitializedError(Exception):\n    pass\n"
  },
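  {
    "path": "examples/unstructuredmodel_contract_demo.py",
    "content": "# Hypothetical usage sketch (this example file and its path are not part of the\n# original repo). It illustrates the contract enforced by UnstructuredModel in\n# unstructured_inference/models/unstructuredmodel.py: calling a wrapper before\n# initialize() raises ModelNotInitializedError. DummyModel is an illustrative\n# assumption, not a real model in the library.\nfrom unstructured_inference.models.unstructuredmodel import (\n    ModelNotInitializedError,\n    UnstructuredModel,\n)\n\n\nclass DummyModel(UnstructuredModel):\n    def initialize(self, *args, **kwargs):\n        \"\"\"Pretend to load weights; real subclasses set self.model here.\"\"\"\n        self.model = object()\n\n    def predict(self, x):\n        \"\"\"super().predict enforces the initialization check before inference.\"\"\"\n        super().predict(x)\n        return \"prediction\"\n\n\nmodel = DummyModel()\ntry:\n    model(\"input\")  # __call__ routes through predict\nexcept ModelNotInitializedError:\n    print(\"initialize() must be called before inference\")\nmodel.initialize()\nprint(model(\"input\"))  # -> prediction\n"
  },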
  {
    "path": "unstructured_inference/models/yolox.py",
    "content": "# Copyright (c) Megvii, Inc. and its affiliates.\n# Unstructured modified the original source code found at:\n# https://github.com/Megvii-BaseDetection/YOLOX/blob/237e943ac64aa32eb32f875faa93ebb18512d41d/yolox/data/data_augment.py\n# https://github.com/Megvii-BaseDetection/YOLOX/blob/ac379df3c97d1835ebd319afad0c031c36d03f36/yolox/utils/demo_utils.py\n\nimport cv2\nimport numpy as np\nimport onnxruntime\nfrom onnxruntime.capi import _pybind_state as C\nfrom PIL import Image as PILImage\n\nfrom unstructured_inference.constants import ElementType, Source\nfrom unstructured_inference.inference.layoutelement import LayoutElements\nfrom unstructured_inference.models.unstructuredmodel import (\n    UnstructuredObjectDetectionModel,\n)\nfrom unstructured_inference.utils import (\n    LazyDict,\n    LazyEvaluateInfo,\n    download_if_needed_and_get_local_path,\n)\n\nYOLOX_LABEL_MAP = {\n    0: ElementType.CAPTION,\n    1: ElementType.FOOTNOTE,\n    2: ElementType.FORMULA,\n    3: ElementType.LIST_ITEM,\n    4: ElementType.PAGE_FOOTER,\n    5: ElementType.PAGE_HEADER,\n    6: ElementType.PICTURE,\n    7: ElementType.SECTION_HEADER,\n    8: ElementType.TABLE,\n    9: ElementType.TEXT,\n    10: ElementType.TITLE,\n}\n\nMODEL_TYPES = {\n    \"yolox\": LazyDict(\n        model_path=LazyEvaluateInfo(\n            download_if_needed_and_get_local_path,\n            \"unstructuredio/yolo_x_layout\",\n            \"yolox_l0.05.onnx\",\n        ),\n        label_map=YOLOX_LABEL_MAP,\n    ),\n    \"yolox_tiny\": LazyDict(\n        model_path=LazyEvaluateInfo(\n            download_if_needed_and_get_local_path,\n            \"unstructuredio/yolo_x_layout\",\n            \"yolox_tiny.onnx\",\n        ),\n        label_map=YOLOX_LABEL_MAP,\n    ),\n    \"yolox_quantized\": LazyDict(\n        model_path=LazyEvaluateInfo(\n            download_if_needed_and_get_local_path,\n            \"unstructuredio/yolo_x_layout\",\n            \"yolox_l0.05_quantized.onnx\",\n        ),\n        label_map=YOLOX_LABEL_MAP,\n    ),\n}\n\n\nclass UnstructuredYoloXModel(UnstructuredObjectDetectionModel):\n    def predict(self, x: PILImage.Image):\n        \"\"\"Predict using YoloX model.\"\"\"\n        super().predict(x)\n        return self.image_processing(x)\n\n    def initialize(self, model_path: str, label_map: dict):\n        \"\"\"Start inference session for YoloX model.\"\"\"\n        self.model_path = model_path\n\n        available_providers = C.get_available_providers()\n        ordered_providers = [\n            \"TensorrtExecutionProvider\",\n            \"CUDAExecutionProvider\",\n            \"CPUExecutionProvider\",\n        ]\n        providers = [provider for provider in ordered_providers if provider in available_providers]\n\n        self.model = onnxruntime.InferenceSession(\n            model_path,\n            providers=providers,\n        )\n\n        self.layout_classes = label_map\n\n    def image_processing(\n        self,\n        image: PILImage.Image,\n    ) -> LayoutElements:\n        \"\"\"Run YoloX layout detection on an image and return the detections as\n        LayoutElements.\n\n        Parameters\n        ----------\n        image\n            PIL image to process; it is converted to a numpy array and closed\n            after conversion.\n        \"\"\"\n        # The model was trained and exported with this 
shape\n        # TODO (benjamin): check other shapes for inference\n        input_shape = (1024, 768)\n        origin_img = np.array(image)\n        image.close()\n        img, ratio = preprocess(origin_img, input_shape)\n        del origin_img  # Free full-size image array before ONNX inference\n        session = self.model\n\n        ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]}\n        output = session.run(None, ort_inputs)\n        del img, ort_inputs  # Free preprocessed inputs after inference\n        # TODO(benjamin): check for p6\n        predictions = demo_postprocess(output[0], input_shape, p6=False)[0]\n        del output\n\n        boxes = predictions[:, :4]\n        scores = predictions[:, 4:5] * predictions[:, 5:]\n\n        boxes_xyxy = np.ones_like(boxes)\n        boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.0\n        boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.0\n        boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.0\n        boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0\n        boxes_xyxy /= ratio\n\n        # Note (Benjamin): Distinct models (quantized and original) require distinct\n        # threshold levels\n        if \"quantized\" in self.model_path:\n            dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.0, score_thr=0.07)\n        else:\n            dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.1, score_thr=0.25)\n\n        order = np.argsort(dets[:, 1])\n        sorted_dets = dets[order]\n\n        return LayoutElements(\n            element_coords=sorted_dets[:, :4].astype(float),\n            element_probs=sorted_dets[:, 4].astype(float),\n            element_class_ids=sorted_dets[:, 5].astype(int),\n            element_class_id_map=self.layout_classes,\n            sources=np.array([Source.YOLOX] * sorted_dets.shape[0]),\n        )\n\n\n# Note: the preprocess function was named preproc in the original source\n\n\ndef preprocess(img, input_size, swap=(2, 0, 1)):\n    \"\"\"Preprocess image data before YoloX inference.\"\"\"\n    if len(img.shape) == 3:\n        padded_img = np.full((input_size[0], input_size[1], 3), 114, dtype=np.uint8)\n    else:\n        padded_img = np.full(input_size, 114, dtype=np.uint8)\n\n    r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])\n    resized_img = cv2.resize(\n        img,\n        (int(img.shape[1] * r), int(img.shape[0] * r)),\n        interpolation=cv2.INTER_LINEAR,\n    ).astype(np.uint8)\n    padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img\n\n    padded_img = padded_img.transpose(swap)\n    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)\n    return padded_img, r\n\n\ndef demo_postprocess(outputs, img_size, p6=False):\n    \"\"\"Postprocessing for YoloX model.\"\"\"\n    grids = []\n    expanded_strides = []\n\n    strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]\n\n    hsizes = [img_size[0] // stride for stride in strides]\n    wsizes = [img_size[1] // stride for stride in strides]\n\n    for hsize, wsize, stride in zip(hsizes, wsizes, strides):\n        xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))\n        grid = np.stack((xv, yv), 2).reshape(1, -1, 2)\n        grids.append(grid)\n        shape = grid.shape[:2]\n        expanded_strides.append(np.full((*shape, 1), stride))\n\n    grids = np.concatenate(grids, 1)\n    expanded_strides = np.concatenate(expanded_strides, 1)\n    outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides\n    outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * 
expanded_strides\n\n    return outputs\n\n\ndef multiclass_nms(boxes, scores, nms_thr, score_thr, class_agnostic=True):\n    \"\"\"Multiclass NMS implemented in Numpy\"\"\"\n    # TODO(benjamin): check for non-class agnostic\n    # if class_agnostic:\n    nms_method = multiclass_nms_class_agnostic\n    # else:\n    #    nms_method = multiclass_nms_class_aware\n    return nms_method(boxes, scores, nms_thr, score_thr)\n\n\ndef multiclass_nms_class_agnostic(boxes, scores, nms_thr, score_thr):\n    \"\"\"Multiclass NMS implemented in Numpy. Class-agnostic version.\"\"\"\n    cls_inds = scores.argmax(1)\n    cls_scores = scores[np.arange(len(cls_inds)), cls_inds]\n\n    valid_score_mask = cls_scores > score_thr\n    valid_scores = cls_scores[valid_score_mask]\n    valid_boxes = boxes[valid_score_mask]\n    valid_cls_inds = cls_inds[valid_score_mask]\n    keep = nms(valid_boxes, valid_scores, nms_thr)\n    dets = np.concatenate(\n        [valid_boxes[keep], valid_scores[keep, None], valid_cls_inds[keep, None]],\n        1,\n    )\n    return dets\n\n\ndef nms(boxes, scores, nms_thr):\n    \"\"\"Single class NMS implemented in Numpy.\"\"\"\n    x1 = boxes[:, 0]\n    y1 = boxes[:, 1]\n    x2 = boxes[:, 2]\n    y2 = boxes[:, 3]\n\n    areas = (x2 - x1 + 1) * (y2 - y1 + 1)\n    order = scores.argsort()[::-1]\n\n    keep = []\n    while order.size > 0:\n        i = order[0]\n        keep.append(i)\n        xx1 = np.maximum(x1[i], x1[order[1:]])\n        yy1 = np.maximum(y1[i], y1[order[1:]])\n        xx2 = np.minimum(x2[i], x2[order[1:]])\n        yy2 = np.minimum(y2[i], y2[order[1:]])\n\n        w = np.maximum(0.0, xx2 - xx1 + 1)\n        h = np.maximum(0.0, yy2 - yy1 + 1)\n        inter = w * h\n        ovr = inter / (areas[i] + areas[order[1:]] - inter)\n\n        inds = np.where(ovr <= nms_thr)[0]\n        order = order[inds + 1]\n\n    return keep\n"
  },
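  {
    "path": "examples/yolox_postprocess_sketch.py",
    "content": "\"\"\"Illustrative sketch (not part of the original repo): runs the YoloX pre- and\npost-processing helpers on synthetic data to show the shapes flowing through\n`preprocess` -> `demo_postprocess` -> `multiclass_nms`. The import below assumes the\nmodule path `unstructured_inference.models.yolox`, and the raw network output is\nfaked with random numbers, so the resulting boxes are meaningless.\n\"\"\"\nimport numpy as np\n\nfrom unstructured_inference.models.yolox import (  # assumed module path\n    demo_postprocess,\n    multiclass_nms,\n    preprocess,\n)\n\ninput_shape = (1024, 768)\nnum_classes = 11  # hypothetical class count\n\n# Synthetic gray page image in HWC uint8 layout.\norigin_img = np.full((1500, 1000, 3), 200, dtype=np.uint8)\nimg, ratio = preprocess(origin_img, input_shape)\nprint(img.shape, ratio)  # (3, 1024, 768) and the resize ratio\n\n# Fake raw output: one row per anchor cell across strides 8/16/32.\nnum_anchors = sum((input_shape[0] // s) * (input_shape[1] // s) for s in (8, 16, 32))\nrng = np.random.default_rng(0)\noutput = rng.random((1, num_anchors, 5 + num_classes), dtype=np.float32)\n\npredictions = demo_postprocess(output, input_shape, p6=False)[0]\nboxes = predictions[:, :4]\nscores = predictions[:, 4:5] * predictions[:, 5:]\n\n# Convert (cx, cy, w, h) to (x1, y1, x2, y2) and undo the resize, as in inference.\nboxes_xyxy = np.ones_like(boxes)\nboxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.0\nboxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.0\nboxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.0\nboxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0\nboxes_xyxy /= ratio\n\ndets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.1, score_thr=0.25)\nprint(dets.shape)  # (n_detections, 6): x1, y1, x2, y2, score, class id\n"
  },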
  {
    "path": "unstructured_inference/utils.py",
    "content": "import os\nfrom collections.abc import Mapping\nfrom html.parser import HTMLParser\nfrom io import StringIO\nfrom typing import Any, Callable, Hashable, Iterable, Iterator, Union\n\nfrom huggingface_hub import hf_hub_download\nfrom PIL import Image\n\nfrom unstructured_inference.inference.layoutelement import LayoutElement\n\n\nclass LazyEvaluateInfo:\n    \"\"\"Class that stores the information needed to lazily evaluate a function with given arguments.\n    The object stores the information needed for evaluation as a function and its arguments.\n    \"\"\"\n\n    def __init__(self, evaluate: Callable, *args, **kwargs):\n        self.evaluate = evaluate\n        self.info = (args, kwargs)\n\n\nclass LazyDict(Mapping):\n    \"\"\"Class that wraps a dict and only evaluates keys of the dict when the key is accessed. Keys\n    that should be evaluated lazily should use LazyEvaluateInfo objects as values. By default when\n    a value is computed from a LazyEvaluateInfo object, it is converted to the raw value in the\n    internal dict, so subsequent accessing of the key will produce the same value. Set cache=False\n    to avoid storing the raw value.\n    \"\"\"\n\n    def __init__(self, *args, cache=True, **kwargs):\n        self.cache = cache\n        self._raw_dict = dict(*args, **kwargs)\n\n    def __getitem__(self, key: Hashable) -> Union[LazyEvaluateInfo, Any]:\n        value = self._raw_dict.__getitem__(key)\n        if isinstance(value, LazyEvaluateInfo):\n            evaluate = value.evaluate\n            args, kwargs = value.info\n            value = evaluate(*args, **kwargs)\n            if self.cache:\n                self._raw_dict[key] = value\n        return value\n\n    def __iter__(self) -> Iterator:\n        return iter(self._raw_dict)\n\n    def __len__(self) -> int:\n        return len(self._raw_dict)\n\n\ndef tag(elements: Iterable[LayoutElement]):\n    \"\"\"Asign an numeric id to the elements in the list.\n    Useful for debugging\"\"\"\n    colors = [\"red\", \"blue\", \"green\", \"magenta\", \"brown\"]\n    for i, e in enumerate(elements):\n        e.text = f\"-{i}-:{e.text}\"\n        # currently not a property\n        e.id = i  # type:ignore\n        e.color = colors[i % len(colors)]  # type:ignore\n\n\ndef pad_image_with_background_color(\n    image: Image.Image,\n    pad: int = 10,\n    background_color: str = \"white\",\n) -> Image.Image:\n    \"\"\"pads an input image with the same background color around it by pad on all 4 sides\n\n    The original image is kept intact and a new image is returned with padding added.\n    \"\"\"\n    width, height = image.size\n    if pad < 0:\n        raise ValueError(\n            \"Can not pad an image with negative space! 
Please use a positive value for `pad`.\",\n        )\n    new = Image.new(image.mode, (width + pad * 2, height + pad * 2), background_color)\n    new.paste(image, (pad, pad))\n    return new\n\n\nclass MLStripper(HTMLParser):\n    \"\"\"simple markup language stripper that helps strip tags from a string\"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self.reset()\n        self.strict = True\n        self.convert_charrefs = True\n        self.text = StringIO()\n\n    def handle_data(self, d):\n        \"\"\"process input data\"\"\"\n        self.text.write(d)\n\n    def get_data(self):\n        \"\"\"performs stripping by getting the value of the accumulated text\"\"\"\n        return self.text.getvalue()\n\n\ndef strip_tags(html: str) -> str:\n    \"\"\"strips html tags from the input string and returns the string without tags\"\"\"\n    s = MLStripper()\n    s.feed(html)\n    return s.get_data()\n\n\ndef download_if_needed_and_get_local_path(path_or_repo: str, filename: str, **kwargs) -> str:\n    \"\"\"Returns path to local file if it exists, otherwise treats it as a huggingface repo and\n    attempts to download.\"\"\"\n    full_path = os.path.join(path_or_repo, filename)\n    if os.path.exists(full_path):\n        return full_path\n    else:\n        return hf_hub_download(path_or_repo, filename, **kwargs)\n"
  },
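  {
    "path": "examples/utils_usage_sketch.py",
    "content": "\"\"\"Illustrative sketch (not part of the original repo): shows how `LazyDict`\ndefers evaluation of `LazyEvaluateInfo` values until first access and caches the\nresult, and how `strip_tags` removes markup. Only the example values below are\ninvented; the imported helpers come from `unstructured_inference/utils.py`.\n\"\"\"\nfrom unstructured_inference.utils import LazyDict, LazyEvaluateInfo, strip_tags\n\ncalls = []\n\n\ndef expensive(name):\n    \"\"\"Stand-in for a slow model load; records each real invocation.\"\"\"\n    calls.append(name)\n    return f\"loaded-{name}\"\n\n\nmodels = LazyDict(\n    yolox=LazyEvaluateInfo(expensive, \"yolox\"),\n    detectron=LazyEvaluateInfo(expensive, \"detectron\"),\n)\n\nassert calls == []  # nothing evaluated yet\nassert models[\"yolox\"] == \"loaded-yolox\"\nassert models[\"yolox\"] == \"loaded-yolox\"  # cached: evaluate ran only once\nassert calls == [\"yolox\"]  # the detectron entry was never evaluated\n\nassert strip_tags(\"<p>Hello <b>world</b></p>\") == \"Hello world\"\n"
  },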
  {
    "path": "unstructured_inference/visualize.py",
    "content": "# Copyright (c) Megvii Inc. All rights reserved.\n# Unstructured modified the original source code found at\n# https://github.com/Megvii-BaseDetection/YOLOX/blob/ac379df3c97d1835ebd319afad0c031c36d03f36/yolox/utils/visualize.py\nimport typing\nfrom typing import Optional, Union\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom PIL import ImageFont\nfrom PIL.Image import Image\nfrom PIL.ImageDraw import ImageDraw\n\nfrom unstructured_inference.inference.elements import TextRegion\n\n\n@typing.no_type_check\ndef draw_bbox(\n    image: Image,\n    element: TextRegion,\n    color: str = \"red\",\n    width=1,\n    details: bool = False,\n) -> Image:\n    \"\"\"Draws bounding box in image\"\"\"\n    try:\n        img = image.copy()\n        draw = ImageDraw(img)\n        topleft, _, bottomright, _ = element.bbox.coordinates\n        c = getattr(element, \"color\", color)\n        if details:\n            source = getattr(element, \"source\", \"Unknown\")\n            type = getattr(element, \"type\", \"\")\n            kbd = ImageFont.truetype(\"Keyboard.ttf\", 20)\n            draw.text(topleft, text=f\"{type} {source}\", fill=c, font=kbd)\n        draw.rectangle((topleft, bottomright), outline=c, width=width)\n    except OSError:\n        print(\"Failed to find font file. Skipping details.\")\n        img = draw_bbox(image, element, color, width)\n    except Exception as e:\n        print(f\"Failed to draw bounding box: {e}\")\n    return img\n\n\ndef show_plot(\n    image: Union[Image, np.ndarray],\n    desired_width: Optional[int] = None,\n):\n    \"\"\"\n    Display an image using matplotlib with an optional desired width while maintaining the aspect\n     ratio.\n\n    Parameters:\n    - image (Union[Image, np.ndarray]): An image in PIL Image format or a numpy ndarray format.\n    - desired_width (Optional[int]): Desired width for the display size of the image.\n        If provided, the height is calculated based on the original aspect ratio.\n        If not provided, the image will be displayed with its original dimensions.\n\n    Raises:\n    - ValueError: If the provided image type is neither PIL Image nor numpy ndarray.\n\n    Returns:\n    - None: The function displays the image using matplotlib but does not return any value.\n    \"\"\"\n    if isinstance(image, Image):\n        image_width, image_height = image.size\n    elif isinstance(image, np.ndarray):\n        image_height, image_width, _ = image.shape\n    else:\n        raise ValueError(\"Unsupported Image Type\")\n\n    if desired_width:\n        # Calculate the desired height based on the original aspect ratio\n        aspect_ratio = image_width / image_height\n        desired_height = desired_width / aspect_ratio\n\n        # Create a figure with the desired size and aspect ratio\n        fig, ax = plt.subplots(figsize=(desired_width, desired_height))\n    else:\n        # Create figure and axes\n        fig, ax = plt.subplots()\n    # Display the image\n    ax.imshow(image)\n    plt.show()\n\n\n_COLORS = np.array(\n    [\n        [0.000, 0.447, 0.741],\n        [0.850, 0.325, 0.098],\n        [0.929, 0.694, 0.125],\n        [0.494, 0.184, 0.556],\n        [0.466, 0.674, 0.188],\n        [0.301, 0.745, 0.933],\n        [0.635, 0.078, 0.184],\n        [0.300, 0.300, 0.300],\n        [0.600, 0.600, 0.600],\n        [1.000, 0.000, 0.000],\n        [1.000, 0.500, 0.000],\n        [0.749, 0.749, 0.000],\n        [0.000, 1.000, 0.000],\n        [0.000, 0.000, 1.000],\n        [0.667, 0.000, 1.000],\n  
      [0.333, 0.333, 0.000],\n        [0.333, 0.667, 0.000],\n        [0.333, 1.000, 0.000],\n        [0.667, 0.333, 0.000],\n        [0.667, 0.667, 0.000],\n        [0.667, 1.000, 0.000],\n        [1.000, 0.333, 0.000],\n        [1.000, 0.667, 0.000],\n        [1.000, 1.000, 0.000],\n        [0.000, 0.333, 0.500],\n        [0.000, 0.667, 0.500],\n        [0.000, 1.000, 0.500],\n        [0.333, 0.000, 0.500],\n        [0.333, 0.333, 0.500],\n        [0.333, 0.667, 0.500],\n        [0.333, 1.000, 0.500],\n        [0.667, 0.000, 0.500],\n        [0.667, 0.333, 0.500],\n        [0.667, 0.667, 0.500],\n        [0.667, 1.000, 0.500],\n        [1.000, 0.000, 0.500],\n        [1.000, 0.333, 0.500],\n        [1.000, 0.667, 0.500],\n        [1.000, 1.000, 0.500],\n        [0.000, 0.333, 1.000],\n        [0.000, 0.667, 1.000],\n        [0.000, 1.000, 1.000],\n        [0.333, 0.000, 1.000],\n        [0.333, 0.333, 1.000],\n        [0.333, 0.667, 1.000],\n        [0.333, 1.000, 1.000],\n        [0.667, 0.000, 1.000],\n        [0.667, 0.333, 1.000],\n        [0.667, 0.667, 1.000],\n        [0.667, 1.000, 1.000],\n        [1.000, 0.000, 1.000],\n        [1.000, 0.333, 1.000],\n        [1.000, 0.667, 1.000],\n        [0.333, 0.000, 0.000],\n        [0.500, 0.000, 0.000],\n        [0.667, 0.000, 0.000],\n        [0.833, 0.000, 0.000],\n        [1.000, 0.000, 0.000],\n        [0.000, 0.167, 0.000],\n        [0.000, 0.333, 0.000],\n        [0.000, 0.500, 0.000],\n        [0.000, 0.667, 0.000],\n        [0.000, 0.833, 0.000],\n        [0.000, 1.000, 0.000],\n        [0.000, 0.000, 0.167],\n        [0.000, 0.000, 0.333],\n        [0.000, 0.000, 0.500],\n        [0.000, 0.000, 0.667],\n        [0.000, 0.000, 0.833],\n        [0.000, 0.000, 1.000],\n        [0.000, 0.000, 0.000],\n        [0.143, 0.143, 0.143],\n        [0.286, 0.286, 0.286],\n        [0.429, 0.429, 0.429],\n        [0.571, 0.571, 0.571],\n        [0.714, 0.714, 0.714],\n        [0.857, 0.857, 0.857],\n        [0.000, 0.447, 0.741],\n        [0.314, 0.717, 0.741],\n        [0.50, 0.5, 0],\n    ],\n).astype(np.float32)\n"
  }
]