Repository: ocrmypdf/OCRmyPDF
Branch: main
Commit: ef76625abb80
Files: 236
Total size: 1.6 MB

Directory structure:
gitextract_k_dq053s/

├── .docker/
│   ├── Dockerfile
│   └── Dockerfile.alpine
├── .dockerignore
├── .git_archival.txt
├── .gitattributes
├── .github/
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE/
│   │   ├── 1-bug-report-general.yml
│   │   ├── 2-problem-with-specific-file.yml
│   │   ├── 3-app.yml
│   │   └── 4-feature-request.yml
│   ├── dependabot.yml
│   └── workflows/
│       ├── build.yml
│       ├── release.yml
│       └── triage.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── LICENSE
├── LICENSES/
│   ├── AGPL-3.0-or-later.txt
│   ├── Apache-2.0.txt
│   ├── CC-BY-SA-1.0.txt
│   ├── CC-BY-SA-2.0.txt
│   ├── CC-BY-SA-2.5.txt
│   ├── CC-BY-SA-3.0.txt
│   ├── CC-BY-SA-4.0.txt
│   ├── GFDL-1.2-or-later.txt
│   ├── MIT.txt
│   ├── MPL-2.0.txt
│   └── Zlib.txt
├── README.md
├── README_ZH.md
├── REUSE.toml
├── bin/
│   └── bump_version.py
├── docs/
│   ├── advanced.md
│   ├── api.md
│   ├── apiref.md
│   ├── batch.md
│   ├── cloud.md
│   ├── conf.py
│   ├── contributing.md
│   ├── cookbook.md
│   ├── design_notes.md
│   ├── docker.md
│   ├── errors.md
│   ├── index.md
│   ├── installation.md
│   ├── introduction.md
│   ├── jbig2.md
│   ├── languages.md
│   ├── maintainers.md
│   ├── optimizer.md
│   ├── pdfsecurity.md
│   ├── performance.md
│   ├── plugins.md
│   └── releasenotes/
│       ├── index.md
│       ├── version02.md
│       ├── version03.md
│       ├── version04.md
│       ├── version05.md
│       ├── version06.md
│       ├── version07.md
│       ├── version08.md
│       ├── version09.md
│       ├── version10.md
│       ├── version11.md
│       ├── version12.md
│       ├── version13.md
│       ├── version14.md
│       ├── version15.md
│       ├── version16.md
│       └── version17.md
├── misc/
│   ├── _webservice.py
│   ├── batch.py
│   ├── bisect_pdf.py
│   ├── completion/
│   │   ├── ocrmypdf.bash
│   │   └── ocrmypdf.fish
│   ├── docker-compose.example.yml
│   ├── example_plugin.py
│   ├── flatpak/
│   │   └── io.ocrmypdf.ocrmypdf.metainfo.xml
│   ├── ocrmypdf_compare.py
│   ├── pdf_compare.py
│   ├── pdf_text_diff.py
│   ├── screencast/
│   │   ├── README.md
│   │   └── demo.cast
│   ├── synology.py
│   ├── watcher.py
│   └── webservice.py
├── pyproject.toml
├── scripts/
│   └── generate_glyphless_font.py
├── snapcraft.yaml
├── src/
│   └── ocrmypdf/
│       ├── RELEASE.md
│       ├── __init__.py
│       ├── __main__.py
│       ├── _annots.py
│       ├── _concurrent.py
│       ├── _defaults.py
│       ├── _exec/
│       │   ├── __init__.py
│       │   ├── ghostscript.py
│       │   ├── jbig2enc.py
│       │   ├── pngquant.py
│       │   ├── tesseract.py
│       │   ├── unpaper.py
│       │   └── verapdf.py
│       ├── _graft.py
│       ├── _jobcontext.py
│       ├── _logging.py
│       ├── _metadata.py
│       ├── _options.py
│       ├── _pipeline.py
│       ├── _pipelines/
│       │   ├── __init__.py
│       │   ├── _common.py
│       │   ├── hocr_to_ocr_pdf.py
│       │   ├── ocr.py
│       │   └── pdf_to_hocr.py
│       ├── _plugin_manager.py
│       ├── _plugin_registry.py
│       ├── _progressbar.py
│       ├── _validation.py
│       ├── _validation_coordinator.py
│       ├── _version.py
│       ├── api.py
│       ├── builtin_plugins/
│       │   ├── __init__.py
│       │   ├── concurrency.py
│       │   ├── default_filters.py
│       │   ├── ghostscript.py
│       │   ├── null_ocr.py
│       │   ├── optimize.py
│       │   ├── pypdfium.py
│       │   └── tesseract_ocr.py
│       ├── cli.py
│       ├── data/
│       │   ├── __init__.py
│       │   └── sRGB.icc
│       ├── exceptions.py
│       ├── extra_plugins/
│       │   ├── __init__.py
│       │   └── semfree.py
│       ├── font/
│       │   ├── __init__.py
│       │   ├── font_manager.py
│       │   ├── font_provider.py
│       │   ├── multi_font_manager.py
│       │   └── system_font_provider.py
│       ├── fpdf_renderer/
│       │   ├── __init__.py
│       │   └── renderer.py
│       ├── helpers.py
│       ├── hocrtransform/
│       │   ├── __init__.py
│       │   ├── __main__.py
│       │   └── hocr_parser.py
│       ├── imageops.py
│       ├── languages.py
│       ├── models/
│       │   ├── __init__.py
│       │   └── ocr_element.py
│       ├── optimize.py
│       ├── pdfa.py
│       ├── pdfinfo/
│       │   ├── __init__.py
│       │   ├── _contentstream.py
│       │   ├── _image.py
│       │   ├── _types.py
│       │   ├── _worker.py
│       │   ├── info.py
│       │   └── layout.py
│       ├── pluginspec.py
│       ├── py.typed
│       ├── quality.py
│       └── subprocess/
│           ├── __init__.py
│           └── _windows.py
└── tests/
    ├── __init__.py
    ├── cache/
    │   └── manifest.jsonl
    ├── conftest.py
    ├── plugins/
    │   ├── gs_feature_elision.py
    │   ├── gs_pdfa_failure.py
    │   ├── gs_raster_failure.py
    │   ├── gs_raster_soft_error.py
    │   ├── gs_render_failure.py
    │   ├── gs_render_soft_error.py
    │   ├── tesseract_badutf8.py
    │   ├── tesseract_big_image_error.py
    │   ├── tesseract_cache.py
    │   ├── tesseract_crash.py
    │   ├── tesseract_debug_rotate.py
    │   ├── tesseract_noop.py
    │   └── tesseract_simulate_oom_killer.py
    ├── resources/
    │   ├── README.rst
    │   ├── arabic.hocr
    │   ├── cjk.hocr
    │   ├── devanagari.hocr
    │   ├── hello_world_scripts.hocr
    │   ├── latin.hocr
    │   ├── linn.txt
    │   ├── multilingual.hocr
    │   └── tagged.odt
    ├── test_acroform.py
    ├── test_annots.py
    ├── test_api.py
    ├── test_check_pdf.py
    ├── test_completion.py
    ├── test_concurrency.py
    ├── test_fpdf_renderer.py
    ├── test_ghostscript.py
    ├── test_graft.py
    ├── test_helpers.py
    ├── test_hocr_parser.py
    ├── test_hocrtransform.py
    ├── test_image_input.py
    ├── test_imageops.py
    ├── test_json_serialization.py
    ├── test_logging.py
    ├── test_main.py
    ├── test_metadata.py
    ├── test_multi_font_manager.py
    ├── test_multilingual_direct.py
    ├── test_null_ocr_engine.py
    ├── test_ocr_element.py
    ├── test_ocr_engine_interface.py
    ├── test_ocr_engine_selection.py
    ├── test_optimize.py
    ├── test_page_boxes.py
    ├── test_page_numbers.py
    ├── test_pdf_renderer.py
    ├── test_pdfa.py
    ├── test_pdfinfo.py
    ├── test_pipeline.py
    ├── test_pipeline_generate_ocr.py
    ├── test_preprocessing.py
    ├── test_quality.py
    ├── test_rasterizer.py
    ├── test_rotation.py
    ├── test_semfree.py
    ├── test_soft_error.py
    ├── test_stdio.py
    ├── test_system_font_provider.py
    ├── test_tagged.py
    ├── test_tesseract.py
    ├── test_unpaper.py
    ├── test_userunit.py
    ├── test_validation.py
    ├── test_verapdf.py
    └── test_watcher.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .docker/Dockerfile
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

FROM ubuntu:25.04 AS base

ENV LANG=C.UTF-8
ENV TZ=UTC
RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections

RUN apt-get update && apt-get install -y --no-install-recommends \
  python3 \
  python-is-python3

FROM base AS builder

# Note: we need leptonica here to build jbig2enc
RUN apt-get update && apt-get install -y --no-install-recommends \
  build-essential autoconf automake libtool \
  libleptonica-dev \
  zlib1g-dev \
  libffi-dev \
  ca-certificates \
  curl \
  git \
  libcairo2-dev \
  pkg-config

# Compile and install jbig2enc
# Needs libleptonica-dev, zlib1g-dev
RUN \
  mkdir jbig2 \
  && curl -L https://github.com/agl/jbig2enc/archive/c0141bf.tar.gz | \
  tar xz -C jbig2 --strip-components=1 \
  && cd jbig2 \
  && ./autogen.sh && ./configure && make && make install \
  && cd .. \
  && rm -rf jbig2


WORKDIR /app

# Copy uv from ghcr
COPY --from=ghcr.io/astral-sh/uv:0.9.8 /uv /uvx /bin/

ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy

# Install the project's dependencies using the lockfile and settings
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync --frozen --no-install-project --no-dev

# Then, add the rest of the project source code and install it
# Installing separately from its dependencies allows optimal layer caching
COPY . /app
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --frozen \
        --extra webservice --extra watcher --no-dev \
        --no-install-package pyarrow

FROM base

RUN apt-get update && apt-get install -y software-properties-common

RUN add-apt-repository -y ppa:alex-p/tesseract-ocr5

RUN apt-get update && apt-get install -y --no-install-recommends \
  ghostscript \
  fonts-droid-fallback \
  fonts-noto-core \
  fonts-noto-cjk \
  jbig2dec \
  pngquant \
  tesseract-ocr \
  tesseract-ocr-chi-sim \
  tesseract-ocr-deu \
  tesseract-ocr-eng \
  tesseract-ocr-fra \
  tesseract-ocr-por \
  tesseract-ocr-spa \
  unpaper \
  && rm -rf /var/lib/apt/lists/*

WORKDIR /app

COPY --from=builder /usr/local/lib/ /usr/local/lib/
COPY --from=builder /usr/local/bin/ /usr/local/bin/

COPY --from=builder --chown=app:app /app /app

RUN rm -rf /app/.git && \
ln -s /app/misc/webservice.py /app/webservice.py && \
ln -s /app/misc/watcher.py /app/watcher.py

ENV PATH="/app/.venv/bin:${PATH}"

ENTRYPOINT ["/app/.venv/bin/ocrmypdf"]


================================================
FILE: .docker/Dockerfile.alpine
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

FROM alpine:3.23 AS base

ENV LANG=C.UTF-8
ENV TZ=UTC

RUN apk add --no-cache \
    python3 \
    zlib

FROM base AS builder

# Yes it really is python3-dev, and py3-package
RUN apk add --no-cache \
    ca-certificates \
    git \
    python3-dev \
    py3-pyarrow \
    curl

WORKDIR /app

COPY --from=ghcr.io/astral-sh/uv:0.9.8 /uv /uvx /bin/

ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy

RUN uv venv --system-site-packages .venv

# Install the project's dependencies using the lockfile and settings
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync --frozen --no-install-project --no-dev

# Then, add the rest of the project source code and install it
# Installing separately from its dependencies allows optimal layer caching
COPY . /app
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --frozen \
        --extra webservice --extra watcher --no-dev \
        --no-install-package pyarrow

FROM base

RUN apk add --no-cache \
    ghostscript \
    jbig2dec \
    jbig2enc \
    pngquant \
    tesseract-ocr \
    tesseract-ocr-data-chi_sim \
    tesseract-ocr-data-deu \
    tesseract-ocr-data-eng \
    tesseract-ocr-data-fra \
    tesseract-ocr-data-osd \
    tesseract-ocr-data-por \
    tesseract-ocr-data-spa \
    font-noto \
    ttf-droid \
    unpaper \
    && rm -rf /var/cache/apk/*

WORKDIR /app

COPY --from=builder --chown=app:app /app /app

RUN rm -rf /app/.git && \
    ln -s /app/misc/webservice.py /app/webservice.py && \
    ln -s /app/misc/watcher.py /app/watcher.py

ENV PATH="/app/.venv/bin:${PATH}"

ENTRYPOINT ["/app/.venv/bin/ocrmypdf"]


================================================
FILE: .dockerignore
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

# dotfiles
.*
!.coveragerc
!.dockerignore
!.git_archival.txt
!.gitattributes
!.gitignore
!.pre-commit-config.yaml
!.readthedocs.yaml

# Dev scratch
*.ipynb
**/*.pyc
/*.pdf
/*.qdf
/*.png
/scratch.py
IDEAS
log/
tests/resources/private/
tmp/
venv*/
/debug_tests.py
*.traineddata
/private

# Package building
*.egg-info/
build/
dist/
wheelhouse/
pip-wheel-metadata/

# Code coverage
htmlcov/

# Docker specific
bin/
docs/
include/
lib/

# Docker include .git/
!.git/


================================================
FILE: .git_archival.txt
================================================
node: $Format:%H$
node-date: $Format:%cI$
describe-name: $Format:%(describe:tags=true)$
ref-names: $Format:%D$


================================================
FILE: .gitattributes
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

# Always use Unix convention for new lines
* text eol=lf

# These files are binary and should be left untouched
# (binary is a macro for -text -diff)
*.jar	binary
*.pdf	binary
*.PDF	binary
*.png	binary
*.jpg	binary
*.bin   binary
*.afdesign  binary
*.ttf   binary

.git_archival.txt  export-subst


================================================
FILE: .github/FUNDING.yml
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

# These are supported funding model platforms

github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: james-barlow
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']


================================================
FILE: .github/ISSUE_TEMPLATE/1-bug-report-general.yml
================================================
name: Installation, packaging, dependencies
description: Installation, packages, dependencies, "nothing works", test suite failures...
title: "[Bug]: "
labels: ["triage"]
assignees:
  - jbarlow83
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!

        If your issue involves using OCRmyPDF on specific file(s) and not getting
        good results, this is the *wrong* issue template. Please use the recommended
        template to ensure we have enough information to help.
  - type: textarea
    id: what-happened
    attributes:
      label: What were you trying to do?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: dropdown
    id: packaging-system
    attributes:
      label: Where are you installing/running from?
      multiple: true
      options:
        - PyPI (pip, poetry, pipx, etc.)
        - Linux package manager (apt, dnf, etc.)
        - Windows package manager (chocolatey, etc.)
        - Homebrew
        - Docker container
        - Ubuntu snap
        - Conda
        - source build
    validations:
      required: true
  - type: input
    id: version
    attributes:
      label: OCRmyPDF version
      description: Paste "ocrmypdf --version" here
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you working on?
      multiple: true
      options:
        - Linux
        - Windows
        - macOS
        - BSD
  - type: input
    id: os_version
    attributes:
      label: Operating system details and version
  - type: checkboxes
    attributes:
      label: Simple sanity checks
      description: Select all that apply
      options:
        - label: Operating system is currently supported by its vendor (not end of life)
        - label: Python version is compatible with OCRmyPDF
        - label: This issue is not about a specific input file
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: plain text


================================================
FILE: .github/ISSUE_TEMPLATE/2-problem-with-specific-file.yml
================================================
name: Problem with specific file
description: Something went wrong while trying to OCR a specific file
title: "[Bug]: "
labels: ["triage"]
assignees:
  - jbarlow83
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to describe this issue with a particular file.
  - type: textarea
    id: what-happened
    attributes:
      label: Describe the bug
      description: A clear and concise description of what the bug is.
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: reproduce
    attributes:
      label: Steps to reproduce
      description: Please include steps to reproduce.
      value: |
        1. Run ocrmypdf -v1 ...arguments... input.pdf output.pdf
        2. Open output.pdf
        3. ...
      render: plain text
  - type: textarea
    id: files
    attributes:
      label: Files
      description: |
        Please attach the input and output files, or any screenshots that may be helpful.

        If you cannot provide a test file, we probably won't be able to help with the issue.
        PDF is a complex file format, and there may be technical details in the PDF that are
        causing the issue. There's really no substitute for a test file.

        We understand files may contain personal or sensitive information. Here are some options:
        - Try reproducing the issue with a file from the OCRmyPDF test suite. (See tests/resources)
        - Try to create another file in the same way as your private file.
        - Encrypt the file to OCRmyPDF's private GPG key, and then zip the GPG file.
        - Use ``qpdf --json yourfile.pdf`` to produce a JSON representation of your file that
          omits personal information.
      placeholder: |
        Drag and drop files here.
  - type: dropdown
    id: packaging-system
    attributes:
      label: How did you download and install the software?
      multiple: true
      options:
        - PyPI (pip, poetry, pipx, etc.)
        - Linux package manager (apt, dnf, etc.)
        - Windows package manager (chocolatey, etc.)
        - Homebrew
        - Docker container
        - Ubuntu snap
        - Conda
        - source build
  - type: input
    id: version
    attributes:
      label: OCRmyPDF version
      description: Paste "ocrmypdf --version" here
      placeholder: ocrmypdf --version
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      placeholder: Run OCRmyPDF with verbosity `-v1` to get more detailed logging output.
      render: plain text


================================================
FILE: .github/ISSUE_TEMPLATE/3-app.yml
================================================
name: Problem with third party app that uses OCRmyPDF
description: |
  For PDF generation issues with third party software such as Paperless-ngx that
  uses OCRmyPDF to perform OCR or generate PDFs.
title: "[3rdparty]: "
labels: ["triage"]
assignees:
  - jbarlow83
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to describe this issue with a particular file
        and third party app.

        If you are comfortable using OCRmyPDF, please try to install OCRmyPDF,
        run it on your file, and see if it works. It's easier for everyone
        if you can confirm that the issue occurs with OCRmyPDF and not with
        the third party app.
  - type: checkboxes
    attributes:
      label: Simple sanity checks
      description: Select all that apply
      options:
        - label: This is an issue with an app that uses OCRmyPDF for OCR
        - label: I am using a recent version of the third party app
        - label: I will include a file that reproduces the issue
  - type: input
    id: thirdparty-app-name-version
    attributes:
      label: Third party app name and version
      description: e.g. Paperless-ngx 2.9.0
  - type: textarea
    id: what-happened
    attributes:
      label: Describe the bug
      description: A clear and concise description of what the bug is.
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: reproduce
    attributes:
      label: Steps to reproduce
      description: Please include steps to reproduce.
      value: |
        1. Import attached file into Paperless-ngx
        2. Trigger OCR
        3. Check log file
        4. ...
      render: plain text
  - type: textarea
    id: files
    attributes:
      label: Files
      description: |
        Please attach the input and output files, or any screenshots that may be helpful.

        If you cannot provide a test file, we probably won't be able to help with the issue.
        PDF is a complex file format, and there may be technical details in the PDF that are
        causing the issue. There's really no substitute for a test file.

        We understand files may contain personal or sensitive information. Here are some options:
        - Try reproducing the issue with a file from the test suite. (See tests/resources)
        - Try to create another file in the same way as your private file.
        - Encrypt the file to OCRmyPDF's private GPG key, and then zip the GPG file.
        - Use ``qpdf --json yourfile.pdf`` to produce a JSON representation of your file that
          omits personal information.
      placeholder: |
        Drag and drop files here.
  - type: input
    id: version
    attributes:
      label: OCRmyPDF version
      description: Paste "ocrmypdf --version" here
      placeholder: ocrmypdf --version
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      placeholder: Run OCRmyPDF with verbosity `-v1` to get more detailed logging output.
      render: plain text


================================================
FILE: .github/ISSUE_TEMPLATE/4-feature-request.yml
================================================
name: Feature request
description: Suggest an idea for this project
title: "[Feature]: "
labels: ["enhancement", "triage"]
assignees:
  - jbarlow83
body:
  - type: textarea
    id: feature
    attributes:
      label: Describe the proposed feature
      description: A clear and concise description of the desired feature.


================================================
FILE: .github/dependabot.yml
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates

version: 2
updates:
  - package-ecosystem: "github-actions" # See documentation for possible values
    directory: "/" # Location of package manifests
    schedule:
      interval: "weekly"


================================================
FILE: .github/workflows/build.yml
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
name: Test and deploy

on:
  push:
    branches:
      - main
      - ci
      - release/*
      - feature/*
    paths-ignore:
      - README*
  pull_request:

jobs:
  test_linux:
    name: Test ${{ matrix.os }} with Python ${{ matrix.python }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-22.04, ubuntu-24.04]
        python: ["3.11", "3.12", "3.13", "3.14"]
        include:
          - os: ubuntu-22.04
            tesseract_ppa: "ppa"
            python: "3.11"

    env:
      OS: ${{ matrix.os }}
      PYTHON: ${{ matrix.python }}

    steps:
      - uses: actions/checkout@v6

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          version: "0.9.x"

      - name: "Set up Python"
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python }}

      - name: Install Tesseract from PPA
        if: matrix.tesseract_ppa == 'ppa'
        run: |
          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5

      - name: Install common packages
        run: |
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
            curl \
            fonts-noto-core \
            fonts-noto-cjk \
            ghostscript \
            jbig2dec \
            img2pdf \
            libexempi8 \
            libffi-dev \
            libsm6 libxext6 libxrender-dev \
            pngquant \
            poppler-utils \
            tesseract-ocr \
            tesseract-ocr-deu \
            tesseract-ocr-eng \
            tesseract-ocr-osd \
            unpaper \
            zlib1g

      - name: Install Python packages
        run: |
          uv sync --group test

      - name: Report versions
        run: |
          tesseract --version
          gs --version
          pngquant --version
          unpaper --version
          uv run --no-dev img2pdf --version

      - name: Test
        run: |
          uv run --no-dev pytest --cov-report xml --cov=ocrmypdf --cov=tests/ -n0 tests/

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        with:
          files: ./coverage.xml
          env_vars: OS,PYTHON

  test_macos:
    name: Test macOS
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [macos-latest]
        python: ["3.11", "3.12", "3.13", "3.14"]

    env:
      OS: ${{ matrix.os }}
      PYTHON: ${{ matrix.python }}

    steps:
      - uses: actions/checkout@v6

      - name: Install Homebrew deps
        continue-on-error: true
        run: |
          brew update
          brew install \
            exempi \
            ghostscript \
            jbig2enc \
            openjpeg \
            pngquant \
            poppler \
            tesseract \
            verapdf

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          version: "0.9.x"

      - name: "Set up Python"
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python }}

      - name: Install Python packages
        run: |
          uv sync --group test

      - name: Report versions
        run: |
          tesseract --version
          gs --version
          pngquant --version
          uv run --no-dev img2pdf --version

      - name: Test
        run: |
          uv run --no-dev pytest --cov-report xml --cov=ocrmypdf --cov=tests/ -n0 tests/

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        with:
          files: ./coverage.xml
          env_vars: OS,PYTHON

  test_windows:
    name: Test Windows
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [windows-latest]
        python: ["3.11", "3.12", "3.13", "3.14"]

    env:
      OS: ${{ matrix.os }}
      PYTHON: ${{ matrix.python }}

    steps:
      - uses: actions/checkout@v6

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          version: "0.9.x"

      - name: "Set up Python"
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python }}

      - name: Install system packages
        run: |
          choco install --yes --no-progress tesseract
          choco install --yes --no-progress --ignore-checksums ghostscript --version 9.56.1
          choco install --yes --no-progress poppler --version=25.11.0

      - name: Install Python packages
        run: |
          uv sync --group test

      - name: Test
        run: |
          uv run --no-dev pytest --cov-report xml --cov=ocrmypdf --cov=tests/ -n0 tests/

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        with:
          files: ./coverage.xml
          env_vars: OS,PYTHON

  wheel_sdist_linux:
    name: Build sdist and wheels
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          version: "0.9.x"

      - name: Make wheels and sdist
        run: |
          uv build --sdist --wheel

      - uses: actions/upload-artifact@v6
        with:
          name: artifact
          path: |
            ./dist/*.whl
            ./dist/*.tar.gz

  stage_release:
    name: Stage release artifacts
    needs: [wheel_sdist_linux, test_linux, test_macos, test_windows]
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v6

      - uses: actions/download-artifact@v7
        with:
          name: artifact
          path: dist

      - name: Read version from source
        id: version
        run: |
          VERSION=$(python3 -c "exec(open('src/ocrmypdf/_version.py').read()); print(__version__)")
          echo "version=$VERSION" >> $GITHUB_OUTPUT

      - name: Create or update draft release
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          TAG="v${{ steps.version.outputs.version }}"

          # Delete existing draft release if it exists (ignore errors)
          gh release delete "$TAG" --yes 2>/dev/null || true

          # Create new draft release with all artifacts
          gh release create "$TAG" \
            --draft \
            --title "$TAG" \
            --notes "Draft release - will be updated when tag is pushed" \
            dist/*

  docker_ubuntu:
    name: Build Ubuntu-based Docker image
    needs: [wheel_sdist_linux, test_linux, test_macos, test_windows]
    runs-on: ubuntu-latest
    if: github.event_name != 'pull_request'
    steps:
      - name: Set image tag to release or branch
        run: echo "DOCKER_IMAGE_TAG=${GITHUB_REF##*/}" >> $GITHUB_ENV

      - name: If main, set to latest
        run: echo 'DOCKER_IMAGE_TAG=latest' >> $GITHUB_ENV
        if: env.DOCKER_IMAGE_TAG == 'main'

      - name: Set Docker Hub repository to username
        run: echo "DOCKER_REPOSITORY=jbarlow83" >> $GITHUB_ENV

      - name: Set image name
        run: echo "DOCKER_IMAGE_NAME=ocrmypdf" >> $GITHUB_ENV

      - uses: actions/checkout@v6

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: jbarlow83
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3

      - name: Print image tag
        run: echo "Building image ${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}"

      - name: Build
        run: |
          docker buildx build \
            --push \
            --platform linux/arm64/v8,linux/amd64  \
            --tag "${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}" \
            --tag "${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}-ubuntu:${DOCKER_IMAGE_TAG}" \
            --file .docker/Dockerfile .

  # Build the Alpine-based image from .docker/Dockerfile.alpine for amd64 and
  # arm64 and push it to Docker Hub as <repo>/ocrmypdf-alpine. Runs only after
  # the wheel/sdist build and all platform test jobs succeed, and is skipped
  # for pull_request events.
  docker_alpine:
    name: Build Alpine-based Docker images
    needs: [wheel_sdist_linux, test_linux, test_macos, test_windows]
    runs-on: ubuntu-latest
    if: github.event_name != 'pull_request'
    steps:
      # ${GITHUB_REF##*/} keeps only the text after the last '/', e.g.
      # refs/heads/main -> main, refs/tags/v1.2.3 -> v1.2.3.
      - name: Set image tag to release or branch
        run: echo "DOCKER_IMAGE_TAG=${GITHUB_REF##*/}" >> $GITHUB_ENV

      # Later writes to $GITHUB_ENV override earlier ones, so builds from the
      # main branch are published as :latest instead of :main.
      - name: If main, set to latest
        run: echo 'DOCKER_IMAGE_TAG=latest' >> $GITHUB_ENV
        if: env.DOCKER_IMAGE_TAG == 'main'

      - name: Set Docker Hub repository to username
        run: echo "DOCKER_REPOSITORY=jbarlow83" >> $GITHUB_ENV

      - name: Set image name
        run: echo "DOCKER_IMAGE_NAME=ocrmypdf-alpine" >> $GITHUB_ENV

      - uses: actions/checkout@v6

      # Login is required because the build step below uses --push.
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: jbarlow83
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      # Register QEMU emulators so the linux/arm64 half of the build can run
      # on the amd64 runner; mirrors the docker_ubuntu job, which builds the
      # same pair of platforms.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3

      - name: Print image tag
        run: echo "Building image ${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}"

      - name: Build
        run: |
          docker buildx build \
            --push \
            --platform linux/amd64,linux/arm64  \
            --tag "${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}" \
            --file .docker/Dockerfile.alpine .


================================================
FILE: .github/workflows/release.yml
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

name: Publish Release

on:
  push:
    tags:
      - "v*"

jobs:
  # Publish a tagged release: download the wheels and sdist attached to the
  # draft GitHub release for this tag, upload them to PyPI, sign them with
  # Sigstore, then turn the draft into a published release carrying the
  # matching section of the release notes and the signature files.
  publish:
    name: Publish release
    runs-on: ubuntu-latest
    environment:
      name: release
      url: https://pypi.org/p/ocrmypdf
    permissions:
      # contents: write is needed to edit the release and upload assets.
      contents: write
      # id-token: write lets the job request an OIDC token; presumably used
      # for PyPI trusted publishing (no password is configured below) and
      # for Sigstore signing - confirm.
      id-token: write
    steps:
      # The checkout provides docs/releasenotes/ for the notes extraction step.
      - uses: actions/checkout@v6

      - name: Download artifacts from draft release
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          mkdir -p dist
          gh release download "$GITHUB_REF_NAME" --dir dist --pattern '*.whl'
          gh release download "$GITHUB_REF_NAME" --dir dist --pattern '*.tar.gz'

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

      # Signing runs after the PyPI upload so that the generated
      # *.sigstore.json files are not in dist/ when it is published to PyPI.
      - name: Sign the dists with Sigstore
        uses: sigstore/gh-action-sigstore-python@v3.2.0
        with:
          inputs: |
            ./dist/*.tar.gz
            ./dist/*.whl

      # Writes release_notes.md containing the "## vX.Y.Z" section of
      # docs/releasenotes/versionNN.md (NN = zero-padded major version).
      # The file is left empty when the notes file or the section is missing.
      - name: Extract release notes
        run: |
          VERSION="${GITHUB_REF_NAME#v}"
          MAJOR="${VERSION%%.*}"
          MAJOR_PADDED=$(printf "%02d" "$MAJOR")
          RELEASE_FILE="docs/releasenotes/version${MAJOR_PADDED}.md"
          # Hand the values to Python through the environment rather than
          # splicing them into the source text, so unusual characters in a
          # tag name cannot alter the script.
          export VERSION RELEASE_FILE

          # The quoted delimiter disables shell expansion inside the heredoc.
          python3 << 'EOF'
          import os
          import re

          version = os.environ["VERSION"]
          release_file = os.environ["RELEASE_FILE"]

          try:
              with open(release_file, encoding="utf-8") as f:
                  content = f.read()

              # Find the section for this version
              # Match from "## vX.Y.Z" until the next "## v" or end of file
              pattern = rf"## v{re.escape(version)}\n(.*?)(?=\n## v|\Z)"
              match = re.search(pattern, content, re.DOTALL)
              notes = match.group(1).strip() if match else ""
          except FileNotFoundError:
              notes = ""

          with open("release_notes.md", "w", encoding="utf-8") as f:
              f.write(notes)
          EOF

      - name: Publish release (convert draft to published)
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          # Update release: remove draft status, add release notes
          gh release edit "$GITHUB_REF_NAME" \
            --draft=false \
            --notes-file release_notes.md

          # Upload signatures to the release
          gh release upload "$GITHUB_REF_NAME" dist/*.sigstore.json --clobber

  # After the release has been published, add a tag named after the Git tag
  # ($GITHUB_REF_NAME) to each of the three Docker Hub images by pointing it
  # at whatever the image's :latest tag currently references. Nothing is
  # rebuilt here.
  # NOTE(review): this assumes :latest was built from the commit being
  # released - confirm against the build workflow's ordering.
  docker_tag:
    name: Tag Docker images with release version
    needs: [publish]
    runs-on: ubuntu-latest
    steps:
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: jbarlow83
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      # Provides the `docker buildx imagetools` command used below.
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Tag ocrmypdf (Ubuntu) image
        run: |
          docker buildx imagetools create \
            --tag "jbarlow83/ocrmypdf:$GITHUB_REF_NAME" \
            "jbarlow83/ocrmypdf:latest"

      - name: Tag ocrmypdf-ubuntu image
        run: |
          docker buildx imagetools create \
            --tag "jbarlow83/ocrmypdf-ubuntu:$GITHUB_REF_NAME" \
            "jbarlow83/ocrmypdf-ubuntu:latest"

      - name: Tag ocrmypdf-alpine image
        run: |
          docker buildx imagetools create \
            --tag "jbarlow83/ocrmypdf-alpine:$GITHUB_REF_NAME" \
            "jbarlow83/ocrmypdf-alpine:latest"


================================================
FILE: .github/workflows/triage.yml
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

name: Remove Triage Label on Reply

on:
  issue_comment:
    types:
      - created

jobs:
  # When a new issue comment is created by the repository owner, remove the
  # "triage" label from that issue. Comments from anyone else leave the
  # labels untouched.
  remove-triage-label:
    runs-on: ubuntu-latest

    steps:
      # Sets the step output `is_owner` to 'true' or 'false'.
      - name: Check if comment is by the repository owner
        id: check_comment
        env:
          # Pass the login through the environment instead of expanding the
          # expression inside the script text, so the value is only ever
          # treated as data by the shell.
          COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
        run: |
          # Outputs are written to the $GITHUB_OUTPUT file; the older
          # `::set-output` workflow command is deprecated.
          if [[ "$COMMENT_AUTHOR" == 'jbarlow83' ]]; then
            echo "is_owner=true" >> "$GITHUB_OUTPUT"
          else
            echo "is_owner=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Remove 'triage' label
        if: ${{ steps.check_comment.outputs.is_owner == 'true' }}
        uses: actions-ecosystem/action-remove-labels@v1
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          labels: triage


================================================
FILE: .gitignore
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: CC-BY-SA-4.0

# dotfiles
.coverage
.venv*/
.tox/
.vscode/
.hypothesis/
.ipynb_checkpoints/
.mypy_cache/
.pytest_cache/

# Dev scratch
*.ipynb
**/*.pyc
/*.pdf
/*.qdf
/*.png
/scratch.py
IDEAS
log/
tests/resources/private/
tmp/
venv*/
/debug_tests.py
*.traineddata
/private
/coverage.xml
/issuepdf

# Package building
*.egg-info/
build/
dist/
wheelhouse/
pip-wheel-metadata/

# Code coverage
htmlcov/

# Automatically generated files
docs/_build/
docs/_static/
docs/_templates/
docs/Makefile

# Local editor and AI coding-assistant files
.idea/
.aider*
CLAUDE.md
.claude/

================================================
FILE: .pre-commit-config.yaml
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

# Hooks run by pre-commit; each `rev` pins the hook repository to a fixed tag.
repos:
  # Generic repository hygiene checks.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: check-case-conflict
      - id: check-merge-conflict
      - id: check-toml
      - id: check-yaml
      - id: debug-statements
  # Ruff lint (run with --fix) and Ruff formatter.
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: "v0.14.11"
    hooks:
      - id: ruff-check
        args: [--fix]
      - id: ruff-format
  # Static type checking with mypy.
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.2.0
    hooks:
      - id: mypy
        # Extra packages for the mypy hook; these look like type-stub
        # distributions for third-party libraries - confirm they are all
        # still needed.
        additional_dependencies:
          - types-toml
          - types-setuptools
          - types-requests
          - types-Pillow


================================================
FILE: .readthedocs.yaml
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/conf.py

# Optionally set the version of Python and requirements required to build your docs
build:
  os: ubuntu-22.04
  tools:
    python: "3.13"
  # Custom job steps: install uv, create the virtualenv with it, then
  # install the project's dependencies with it.
  jobs:
    # Install the latest uv through asdf and make it the default version.
    pre_create_environment:
      - asdf plugin add uv
      - asdf install uv latest
      - asdf global uv latest
    # Create the virtualenv at the path given by READTHEDOCS_VIRTUALENV_PATH.
    create_environment:
      - uv venv "${READTHEDOCS_VIRTUALENV_PATH}"
    # Install into that same environment, including the `docs` dependency
    # group. --frozen uses the existing lockfile without updating it.
    install:
      - UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs

================================================
FILE: LICENSE
================================================
Mozilla Public License Version 2.0
==================================

1. Definitions
--------------

1.1. "Contributor"
    means each individual or legal entity that creates, contributes to
    the creation of, or owns Covered Software.

1.2. "Contributor Version"
    means the combination of the Contributions of others (if any) used
    by a Contributor and that particular Contributor's Contribution.

1.3. "Contribution"
    means Covered Software of a particular Contributor.

1.4. "Covered Software"
    means Source Code Form to which the initial Contributor has attached
    the notice in Exhibit A, the Executable Form of such Source Code
    Form, and Modifications of such Source Code Form, in each case
    including portions thereof.

1.5. "Incompatible With Secondary Licenses"
    means

    (a) that the initial Contributor has attached the notice described
        in Exhibit B to the Covered Software; or

    (b) that the Covered Software was made available under the terms of
        version 1.1 or earlier of the License, but not also under the
        terms of a Secondary License.

1.6. "Executable Form"
    means any form of the work other than Source Code Form.

1.7. "Larger Work"
    means a work that combines Covered Software with other material, in
    a separate file or files, that is not Covered Software.

1.8. "License"
    means this document.

1.9. "Licensable"
    means having the right to grant, to the maximum extent possible,
    whether at the time of the initial grant or subsequently, any and
    all of the rights conveyed by this License.

1.10. "Modifications"
    means any of the following:

    (a) any file in Source Code Form that results from an addition to,
        deletion from, or modification of the contents of Covered
        Software; or

    (b) any new file in Source Code Form that contains any Covered
        Software.

1.11. "Patent Claims" of a Contributor
    means any patent claim(s), including without limitation, method,
    process, and apparatus claims, in any patent Licensable by such
    Contributor that would be infringed, but for the grant of the
    License, by the making, using, selling, offering for sale, having
    made, import, or transfer of either its Contributions or its
    Contributor Version.

1.12. "Secondary License"
    means either the GNU General Public License, Version 2.0, the GNU
    Lesser General Public License, Version 2.1, the GNU Affero General
    Public License, Version 3.0, or any later versions of those
    licenses.

1.13. "Source Code Form"
    means the form of the work preferred for making modifications.

1.14. "You" (or "Your")
    means an individual or a legal entity exercising rights under this
    License. For legal entities, "You" includes any entity that
    controls, is controlled by, or is under common control with You. For
    purposes of this definition, "control" means (a) the power, direct
    or indirect, to cause the direction or management of such entity,
    whether by contract or otherwise, or (b) ownership of more than
    fifty percent (50%) of the outstanding shares or beneficial
    ownership of such entity.

2. License Grants and Conditions
--------------------------------

2.1. Grants

Each Contributor hereby grants You a world-wide, royalty-free,
non-exclusive license:

(a) under intellectual property rights (other than patent or trademark)
    Licensable by such Contributor to use, reproduce, make available,
    modify, display, perform, distribute, and otherwise exploit its
    Contributions, either on an unmodified basis, with Modifications, or
    as part of a Larger Work; and

(b) under Patent Claims of such Contributor to make, use, sell, offer
    for sale, have made, import, and otherwise transfer either its
    Contributions or its Contributor Version.

2.2. Effective Date

The licenses granted in Section 2.1 with respect to any Contribution
become effective for each Contribution on the date the Contributor first
distributes such Contribution.

2.3. Limitations on Grant Scope

The licenses granted in this Section 2 are the only rights granted under
this License. No additional rights or licenses will be implied from the
distribution or licensing of Covered Software under this License.
Notwithstanding Section 2.1(b) above, no patent license is granted by a
Contributor:

(a) for any code that a Contributor has removed from Covered Software;
    or

(b) for infringements caused by: (i) Your and any other third party's
    modifications of Covered Software, or (ii) the combination of its
    Contributions with other software (except as part of its Contributor
    Version); or

(c) under Patent Claims infringed by Covered Software in the absence of
    its Contributions.

This License does not grant any rights in the trademarks, service marks,
or logos of any Contributor (except as may be necessary to comply with
the notice requirements in Section 3.4).

2.4. Subsequent Licenses

No Contributor makes additional grants as a result of Your choice to
distribute the Covered Software under a subsequent version of this
License (see Section 10.2) or under the terms of a Secondary License (if
permitted under the terms of Section 3.3).

2.5. Representation

Each Contributor represents that the Contributor believes its
Contributions are its original creation(s) or it has sufficient rights
to grant the rights to its Contributions conveyed by this License.

2.6. Fair Use

This License is not intended to limit any rights You have under
applicable copyright doctrines of fair use, fair dealing, or other
equivalents.

2.7. Conditions

Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
in Section 2.1.

3. Responsibilities
-------------------

3.1. Distribution of Source Form

All distribution of Covered Software in Source Code Form, including any
Modifications that You create or to which You contribute, must be under
the terms of this License. You must inform recipients that the Source
Code Form of the Covered Software is governed by the terms of this
License, and how they can obtain a copy of this License. You may not
attempt to alter or restrict the recipients' rights in the Source Code
Form.

3.2. Distribution of Executable Form

If You distribute Covered Software in Executable Form then:

(a) such Covered Software must also be made available in Source Code
    Form, as described in Section 3.1, and You must inform recipients of
    the Executable Form how they can obtain a copy of such Source Code
    Form by reasonable means in a timely manner, at a charge no more
    than the cost of distribution to the recipient; and

(b) You may distribute such Executable Form under the terms of this
    License, or sublicense it under different terms, provided that the
    license for the Executable Form does not attempt to limit or alter
    the recipients' rights in the Source Code Form under this License.

3.3. Distribution of a Larger Work

You may create and distribute a Larger Work under terms of Your choice,
provided that You also comply with the requirements of this License for
the Covered Software. If the Larger Work is a combination of Covered
Software with a work governed by one or more Secondary Licenses, and the
Covered Software is not Incompatible With Secondary Licenses, this
License permits You to additionally distribute such Covered Software
under the terms of such Secondary License(s), so that the recipient of
the Larger Work may, at their option, further distribute the Covered
Software under the terms of either this License or such Secondary
License(s).

3.4. Notices

You may not remove or alter the substance of any license notices
(including copyright notices, patent notices, disclaimers of warranty,
or limitations of liability) contained within the Source Code Form of
the Covered Software, except that You may alter any license notices to
the extent required to remedy known factual inaccuracies.

3.5. Application of Additional Terms

You may choose to offer, and to charge a fee for, warranty, support,
indemnity or liability obligations to one or more recipients of Covered
Software. However, You may do so only on Your own behalf, and not on
behalf of any Contributor. You must make it absolutely clear that any
such warranty, support, indemnity, or liability obligation is offered by
You alone, and You hereby agree to indemnify every Contributor for any
liability incurred by such Contributor as a result of warranty, support,
indemnity or liability terms You offer. You may include additional
disclaimers of warranty and limitations of liability specific to any
jurisdiction.

4. Inability to Comply Due to Statute or Regulation
---------------------------------------------------

If it is impossible for You to comply with any of the terms of this
License with respect to some or all of the Covered Software due to
statute, judicial order, or regulation then You must: (a) comply with
the terms of this License to the maximum extent possible; and (b)
describe the limitations and the code they affect. Such description must
be placed in a text file included with all distributions of the Covered
Software under this License. Except to the extent prohibited by statute
or regulation, such description must be sufficiently detailed for a
recipient of ordinary skill to be able to understand it.

5. Termination
--------------

5.1. The rights granted under this License will terminate automatically
if You fail to comply with any of its terms. However, if You become
compliant, then the rights granted under this License from a particular
Contributor are reinstated (a) provisionally, unless and until such
Contributor explicitly and finally terminates Your grants, and (b) on an
ongoing basis, if such Contributor fails to notify You of the
non-compliance by some reasonable means prior to 60 days after You have
come back into compliance. Moreover, Your grants from a particular
Contributor are reinstated on an ongoing basis if such Contributor
notifies You of the non-compliance by some reasonable means, this is the
first time You have received notice of non-compliance with this License
from such Contributor, and You become compliant prior to 30 days after
Your receipt of the notice.

5.2. If You initiate litigation against any entity by asserting a patent
infringement claim (excluding declaratory judgment actions,
counter-claims, and cross-claims) alleging that a Contributor Version
directly or indirectly infringes any patent, then the rights granted to
You by any and all Contributors for the Covered Software under Section
2.1 of this License shall terminate.

5.3. In the event of termination under Sections 5.1 or 5.2 above, all
end user license agreements (excluding distributors and resellers) which
have been validly granted by You or Your distributors under this License
prior to termination shall survive termination.

************************************************************************
*                                                                      *
*  6. Disclaimer of Warranty                                           *
*  -------------------------                                           *
*                                                                      *
*  Covered Software is provided under this License on an "as is"       *
*  basis, without warranty of any kind, either expressed, implied, or  *
*  statutory, including, without limitation, warranties that the       *
*  Covered Software is free of defects, merchantable, fit for a        *
*  particular purpose or non-infringing. The entire risk as to the     *
*  quality and performance of the Covered Software is with You.        *
*  Should any Covered Software prove defective in any respect, You     *
*  (not any Contributor) assume the cost of any necessary servicing,   *
*  repair, or correction. This disclaimer of warranty constitutes an   *
*  essential part of this License. No use of any Covered Software is   *
*  authorized under this License except under this disclaimer.         *
*                                                                      *
************************************************************************

************************************************************************
*                                                                      *
*  7. Limitation of Liability                                          *
*  --------------------------                                          *
*                                                                      *
*  Under no circumstances and under no legal theory, whether tort      *
*  (including negligence), contract, or otherwise, shall any           *
*  Contributor, or anyone who distributes Covered Software as          *
*  permitted above, be liable to You for any direct, indirect,         *
*  special, incidental, or consequential damages of any character      *
*  including, without limitation, damages for lost profits, loss of    *
*  goodwill, work stoppage, computer failure or malfunction, or any    *
*  and all other commercial damages or losses, even if such party      *
*  shall have been informed of the possibility of such damages. This   *
*  limitation of liability shall not apply to liability for death or   *
*  personal injury resulting from such party's negligence to the       *
*  extent applicable law prohibits such limitation. Some               *
*  jurisdictions do not allow the exclusion or limitation of           *
*  incidental or consequential damages, so this exclusion and          *
*  limitation may not apply to You.                                    *
*                                                                      *
************************************************************************

8. Litigation
-------------

Any litigation relating to this License may be brought only in the
courts of a jurisdiction where the defendant maintains its principal
place of business and such litigation shall be governed by laws of that
jurisdiction, without reference to its conflict-of-law provisions.
Nothing in this Section shall prevent a party's ability to bring
cross-claims or counter-claims.

9. Miscellaneous
----------------

This License represents the complete agreement concerning the subject
matter hereof. If any provision of this License is held to be
unenforceable, such provision shall be reformed only to the extent
necessary to make it enforceable. Any law or regulation which provides
that the language of a contract shall be construed against the drafter
shall not be used to construe this License against a Contributor.

10. Versions of the License
---------------------------

10.1. New Versions

Mozilla Foundation is the license steward. Except as provided in Section
10.3, no one other than the license steward has the right to modify or
publish new versions of this License. Each version will be given a
distinguishing version number.

10.2. Effect of New Versions

You may distribute the Covered Software under the terms of the version
of the License under which You originally received the Covered Software,
or under the terms of any subsequent version published by the license
steward.

10.3. Modified Versions

If you create software not governed by this License, and you want to
create a new license for such software, you may create and use a
modified version of this License if you rename the license and remove
any references to the name of the license steward (except to note that
such modified license differs from this License).

10.4. Distributing Source Code Form that is Incompatible With Secondary
Licenses

If You choose to distribute Source Code Form that is Incompatible With
Secondary Licenses under the terms of this version of the License, the
notice described in Exhibit B of this License must be attached.

Exhibit A - Source Code Form License Notice
-------------------------------------------

  This Source Code Form is subject to the terms of the Mozilla Public
  License, v. 2.0. If a copy of the MPL was not distributed with this
  file, You can obtain one at http://mozilla.org/MPL/2.0/.

If it is not possible or desirable to put the notice in a particular
file, then You may include the notice in a location (such as a LICENSE
file in a relevant directory) where a recipient would be likely to look
for such a notice.

You may add additional accurate notices of copyright ownership.

Exhibit B - "Incompatible With Secondary Licenses" Notice
---------------------------------------------------------

  This Source Code Form is "Incompatible With Secondary Licenses", as
  defined by the Mozilla Public License, v. 2.0.


================================================
FILE: LICENSES/AGPL-3.0-or-later.txt
================================================
GNU AFFERO GENERAL PUBLIC LICENSE
Version 3, 19 November 2007

Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>

Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.

                            Preamble

The GNU Affero General Public License is a free, copyleft license for software and other kinds of works, specifically designed to ensure cooperation with the community in the case of network server software.

The licenses for most software and other practical works are designed to take away your freedom to share and change the works.  By contrast, our General Public Licenses are intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users.

When we speak of free software, we are referring to freedom, not price.  Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things.

Developers that use our General Public Licenses protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License which gives you legal permission to copy, distribute and/or modify the software.

A secondary benefit of defending all users' freedom is that improvements made in alternate versions of the program, if they receive widespread use, become available for other developers to incorporate.  Many developers of free software are heartened and encouraged by the resulting cooperation.  However, in the case of software used on network servers, this result may fail to come about. The GNU General Public License permits making a modified version and letting the public access it on a server without ever releasing its source code to the public.

The GNU Affero General Public License is designed specifically to ensure that, in such cases, the modified source code becomes available to the community.  It requires the operator of a network server to provide the source code of the modified version running there to the users of that server.  Therefore, public use of a modified version, on a publicly accessible server, gives the public access to the source code of the modified version.

An older license, called the Affero General Public License and published by Affero, was designed to accomplish similar goals.  This is a different license, not a version of the Affero GPL, but Affero has released a new version of the Affero GPL which permits relicensing under this license.

The precise terms and conditions for copying, distribution and modification follow.

                       TERMS AND CONDITIONS

0. Definitions.

"This License" refers to version 3 of the GNU Affero General Public License.

"Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks.

"The Program" refers to any copyrightable work licensed under this License.  Each licensee is addressed as "you".  "Licensees" and "recipients" may be individuals or organizations.

To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy.  The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work.

A "covered work" means either the unmodified Program or a work based on the Program.

To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy.  Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well.

To "convey" a work means any kind of propagation that enables other parties to make or receive copies.  Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying.

An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License.  If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion.

1. Source Code.
The "source code" for a work means the preferred form of the work for making modifications to it.  "Object code" means any non-source form of a work.

A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language.

The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form.  A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it.

The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities.  However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work.  For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those
subprograms and other parts of the work.

The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source.

The Corresponding Source for a work in source code form is that same work.

2. Basic Permissions.
All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met.  This License explicitly affirms your unlimited permission to run the unmodified Program.  The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work.  This License acknowledges your rights of fair use or other equivalent, as provided by copyright law.

You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force.  You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright.  Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you.

Conveying under any other circumstances is permitted solely under the conditions stated below.  Sublicensing is not allowed; section 10 makes it unnecessary.

3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures.

When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures.

4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program.

You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee.

5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7.  This requirement modifies the requirement in section 4 to "keep intact all notices".

    c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy.  This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged.  This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so.

A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit.  Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate.

6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways:

    a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source.  This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b.

    d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge.  You need not require recipients to copy the Corresponding Source along with the object code.  If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source.  Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d.

A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work.

A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling.  In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage.  For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product.  A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product.

"Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source.  The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made.

If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information.  But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM).

The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed.  Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network.

Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying.

7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law.  If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions.

When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it.  (Additional permissions may be written to require their own removal in certain cases when you modify the work.)  You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission.

Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or authors of the material; or

    e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors.

All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10.  If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term.  If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying.

If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms.

Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way.

8. Termination.

You may not propagate or modify a covered work except as expressly provided under this License.  Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11).

However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation.

Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice.

Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License.  If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10.

9. Acceptance Not Required for Having Copies.

You are not required to accept this License in order to receive or run a copy of the Program.  Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance.  However, nothing other than this License grants you permission to propagate or modify any covered work.  These actions infringe copyright if you do not accept this License.  Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so.

10. Automatic Licensing of Downstream Recipients.

Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License.  You are not responsible for enforcing compliance by third parties with this License.

An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations.  If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts.

You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License.  For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it.

11. Patents.

A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based.  The work thus licensed is called the contributor's "contributor version".

A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version.  For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License.

Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version.

In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement).  To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party.

If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients.  "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid.

If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it.

A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License.  You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007.

Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law.

12. No Surrender of Others' Freedom.

If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License.  If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all.  For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program.

13. Remote Network Interaction; Use with the GNU General Public License.

Notwithstanding any other provision of this License, if you modify the Program, your modified version must prominently offer all users interacting with it remotely through a computer network (if your version supports such interaction) an opportunity to receive the Corresponding Source of your version by providing access to the Corresponding Source from a network server at no charge, through some standard or customary means of facilitating copying of software.  This Corresponding Source shall include the Corresponding Source for any work covered by version 3 of the GNU General Public License that is incorporated pursuant to the following paragraph.

Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU General Public License into a single combined work, and to convey the resulting work.  The terms of this License will continue to apply to the part which is the covered work, but the work with which it is combined will remain governed by version 3 of the GNU General Public License.

14. Revised Versions of this License.

The Free Software Foundation may publish revised and/or new versions of the GNU Affero General Public License from time to time.  Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns.

Each version is given a distinguishing version number.  If the Program specifies that a certain numbered version of the GNU Affero General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation.  If the Program does not specify a version number of the GNU Affero General Public License, you may choose any version ever published by the Free Software Foundation.

If the Program specifies that a proxy can decide which future versions of the GNU Affero General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program.

Later license versions may give you additional or different permissions.  However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version.

15. Disclaimer of Warranty.

THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

16. Limitation of Liability.

IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

17. Interpretation of Sections 15 and 16.

If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee.

END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms.

To do so, attach the following notices to the program.  It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found.

     <one line to give the program's name and a brief idea of what it does.>
     Copyright (C) <year>  <name of author>

     This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

     This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more details.

     You should have received a copy of the GNU Affero General Public License along with this program.  If not, see <http://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

If your software can interact with users remotely through a computer network, you should also make sure that it provides a way for users to get its source.  For example, if your program is a web application, its interface could display a "Source" link that leads users to an archive of the code.  There are many ways you could offer source, and different solutions will be better for different programs; see section 13 for the specific requirements.

You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU AGPL, see <http://www.gnu.org/licenses/>.


================================================
FILE: LICENSES/Apache-2.0.txt
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.

"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:

     (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and

     (b) You must cause any modified files to carry prominent notices stating that You changed the files; and

     (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and

     (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.

     You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!)  The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


================================================
FILE: LICENSES/CC-BY-SA-1.0.txt
================================================
Creative Commons Attribution-ShareAlike 1.0

 CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS DRAFT LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.

License

THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE IS PROHIBITED.

BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.

1. Definitions

     a. "Collective Work" means a work, such as a periodical issue, anthology or encyclopedia, in which the Work in its entirety in unmodified form, along with a number of other contributions, constituting separate and independent works in themselves, are assembled into a collective whole. A work that constitutes a Collective Work will not be considered a Derivative Work (as defined below) for the purposes of this License.

     b. "Derivative Work" means a work based upon the Work or upon the Work and other pre-existing works, such as a translation, musical arrangement, dramatization, fictionalization, motion picture version, sound recording, art reproduction, abridgment, condensation, or any other form in which the Work may be recast, transformed, or adapted, except that a work that constitutes a Collective Work will not be considered a Derivative Work for the purpose of this License.

     c. "Licensor" means the individual or entity that offers the Work under the terms of this License.

     d. "Original Author" means the individual or entity who created the Work.

     e. "Work" means the copyrightable work of authorship offered under the terms of this License.

     f. "You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.

2. Fair Use Rights. Nothing in this license is intended to reduce, limit, or restrict any rights arising from fair use, first sale or other limitations on the exclusive rights of the copyright owner under copyright law or other applicable laws.

3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:

     a. to reproduce the Work, to incorporate the Work into one or more Collective Works, and to reproduce the Work as incorporated in the Collective Works;

     b. to create and reproduce Derivative Works;

     c. to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission the Work including as incorporated in Collective Works;

     d. to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission Derivative Works;

The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. All rights not expressly granted by Licensor are hereby reserved.

4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:

     a. You may distribute, publicly display, publicly perform, or publicly digitally perform the Work only under the terms of this License, and You must include a copy of, or the Uniform Resource Identifier for, this License with every copy or phonorecord of the Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Work that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Work itself to be made subject to the terms of this License. If You create a Collective Work, upon notice from any Licensor You must, to the extent practicable, remove from the Collective Work any reference to such Licensor or the Original Author, as requested. If You create a Derivative Work, upon notice from any Licensor You must, to the extent practicable, remove from the Derivative Work any reference to such Licensor or the Original Author, as requested.

     b. You may distribute, publicly display, publicly perform, or publicly digitally perform a Derivative Work only under the terms of this License, and You must include a copy of, or the Uniform Resource Identifier for, this License with every copy or phonorecord of each Derivative Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Derivative Works that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder, and You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Derivative Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Derivative Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Derivative Work itself to be made subject to the terms of this License.

     c. If you distribute, publicly display, publicly perform, or publicly digitally perform the Work or any Derivative Works or Collective Works, You must keep intact all copyright notices for the Work and give the Original Author credit reasonable to the medium or means You are utilizing by conveying the name (or pseudonym if applicable) of the Original Author if supplied; the title of the Work if supplied; in the case of a Derivative Work, a credit identifying the use of the Work in the Derivative Work (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). Such credit may be implemented in any reasonable manner; provided, however, that in the case of a Derivative Work or Collective Work, at a minimum such credit will appear where any other comparable authorship credit appears and in a manner at least as prominent as such other comparable authorship credit.

5. Representations, Warranties and Disclaimer

     a. By offering the Work for public release under this License, Licensor represents and warrants that, to the best of Licensor's knowledge after reasonable inquiry:

           i. Licensor has secured all rights in the Work necessary to grant the license rights hereunder and to permit the lawful exercise of the rights granted hereunder without You having any obligation to pay any royalties, compulsory license fees, residuals or any other payments;

          ii. The Work does not infringe the copyright, trademark, publicity rights, common law rights or any other right of any third party or constitute defamation, invasion of privacy or other tortious injury to any third party.

     b. EXCEPT AS EXPRESSLY STATED IN THIS LICENSE OR OTHERWISE AGREED IN WRITING OR REQUIRED BY APPLICABLE LAW, THE WORK IS LICENSED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES REGARDING THE CONTENTS OR ACCURACY OF THE WORK.

6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, AND EXCEPT FOR DAMAGES ARISING FROM LIABILITY TO A THIRD PARTY RESULTING FROM BREACH OF THE WARRANTIES IN SECTION 5, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

7. Termination

     a. This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Derivative Works or Collective Works from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.

     b. Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.

8. Miscellaneous

     a. Each time You distribute or publicly digitally perform the Work or a Collective Work, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.

     b. Each time You distribute or publicly digitally perform a Derivative Work, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.

     c. If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.

     d. No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.

     e. This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.

Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor.

Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, neither party will use the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time.

Creative Commons may be contacted at http://creativecommons.org/.


================================================
FILE: LICENSES/CC-BY-SA-2.0.txt
================================================
Creative Commons Attribution-ShareAlike 2.0

 CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.

License

THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.

BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.

1. Definitions

     a. "Collective Work" means a work, such as a periodical issue, anthology or encyclopedia, in which the Work in its entirety in unmodified form, along with a number of other contributions, constituting separate and independent works in themselves, are assembled into a collective whole. A work that constitutes a Collective Work will not be considered a Derivative Work (as defined below) for the purposes of this License.

     b. "Derivative Work" means a work based upon the Work or upon the Work and other pre-existing works, such as a translation, musical arrangement, dramatization, fictionalization, motion picture version, sound recording, art reproduction, abridgment, condensation, or any other form in which the Work may be recast, transformed, or adapted, except that a work that constitutes a Collective Work will not be considered a Derivative Work for the purpose of this License. For the avoidance of doubt, where the Work is a musical composition or sound recording, the synchronization of the Work in timed-relation with a moving image ("synching") will be considered a Derivative Work for the purpose of this License.

     c. "Licensor" means the individual or entity that offers the Work under the terms of this License.

     d. "Original Author" means the individual or entity who created the Work.

     e. "Work" means the copyrightable work of authorship offered under the terms of this License.

     f. "You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.

     g. "License Elements" means the following high-level license attributes as selected by Licensor and indicated in the title of this License: Attribution, ShareAlike.

2. Fair Use Rights. Nothing in this license is intended to reduce, limit, or restrict any rights arising from fair use, first sale or other limitations on the exclusive rights of the copyright owner under copyright law or other applicable laws.

3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:

     a. to reproduce the Work, to incorporate the Work into one or more Collective Works, and to reproduce the Work as incorporated in the Collective Works;

     b. to create and reproduce Derivative Works;

     c. to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission the Work including as incorporated in Collective Works;

     d. to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission Derivative Works.

     e. For the avoidance of doubt, where the work is a musical composition:

          i. Performance Royalties Under Blanket Licenses. Licensor waives the exclusive right to collect, whether individually or via a performance rights society (e.g. ASCAP, BMI, SESAC), royalties for the public performance or public digital performance (e.g. webcast) of the Work.

          ii. Mechanical Rights and Statutory Royalties. Licensor waives the exclusive right to collect, whether individually or via a music rights society or designated agent (e.g. Harry Fox Agency), royalties for any phonorecord You create from the Work ("cover version") and distribute, subject to the compulsory license created by 17 USC Section 115 of the US Copyright Act (or the equivalent in other jurisdictions).

     f. Webcasting Rights and Statutory Royalties. For the avoidance of doubt, where the Work is a sound recording, Licensor waives the exclusive right to collect, whether individually or via a performance-rights society (e.g. SoundExchange), royalties for the public digital performance (e.g. webcast) of the Work, subject to the compulsory license created by 17 USC Section 114 of the US Copyright Act (or the equivalent in other jurisdictions).

The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. All rights not expressly granted by Licensor are hereby reserved.

4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:

     a. You may distribute, publicly display, publicly perform, or publicly digitally perform the Work only under the terms of this License, and You must include a copy of, or the Uniform Resource Identifier for, this License with every copy or phonorecord of the Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Work that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Work itself to be made subject to the terms of this License. If You create a Collective Work, upon notice from any Licensor You must, to the extent practicable, remove from the Collective Work any reference to such Licensor or the Original Author, as requested. If You create a Derivative Work, upon notice from any Licensor You must, to the extent practicable, remove from the Derivative Work any reference to such Licensor or the Original Author, as requested.

     b. You may distribute, publicly display, publicly perform, or publicly digitally perform a Derivative Work only under the terms of this License, a later version of this License with the same License Elements as this License, or a Creative Commons iCommons license that contains the same License Elements as this License (e.g. Attribution-ShareAlike 2.0 Japan). You must include a copy of, or the Uniform Resource Identifier for, this License or other license specified in the previous sentence with every copy or phonorecord of each Derivative Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Derivative Works that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder, and You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Derivative Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Derivative Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Derivative Work itself to be made subject to the terms of this License.

     c. If you distribute, publicly display, publicly perform, or publicly digitally perform the Work or any Derivative Works or Collective Works, You must keep intact all copyright notices for the Work and give the Original Author credit reasonable to the medium or means You are utilizing by conveying the name (or pseudonym if applicable) of the Original Author if supplied; the title of the Work if supplied; to the extent reasonably practicable, the Uniform Resource Identifier, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and in the case of a Derivative Work, a credit identifying the use of the Work in the Derivative Work (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). Such credit may be implemented in any reasonable manner; provided, however, that in the case of a Derivative Work or Collective Work, at a minimum such credit will appear where any other comparable authorship credit appears and in a manner at least as prominent as such other comparable authorship credit.

5. Representations, Warranties and Disclaimer

UNLESS OTHERWISE AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE MATERIALS, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.

6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

7. Termination

     a. This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Derivative Works or Collective Works from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.

     b. Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.

8. Miscellaneous

     a. Each time You distribute or publicly digitally perform the Work or a Collective Work, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.

     b. Each time You distribute or publicly digitally perform a Derivative Work, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.

     c. If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.

     d. No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.

     e. This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.

Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor.

Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, neither party will use the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time.

Creative Commons may be contacted at http://creativecommons.org/.


================================================
FILE: LICENSES/CC-BY-SA-2.5.txt
================================================
Creative Commons Attribution-ShareAlike 2.5

 CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.

License

THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.

BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.

1. Definitions

     a. "Collective Work" means a work, such as a periodical issue, anthology or encyclopedia, in which the Work in its entirety in unmodified form, along with a number of other contributions, constituting separate and independent works in themselves, are assembled into a collective whole. A work that constitutes a Collective Work will not be considered a Derivative Work (as defined below) for the purposes of this License.

     b. "Derivative Work" means a work based upon the Work or upon the Work and other pre-existing works, such as a translation, musical arrangement, dramatization, fictionalization, motion picture version, sound recording, art reproduction, abridgment, condensation, or any other form in which the Work may be recast, transformed, or adapted, except that a work that constitutes a Collective Work will not be considered a Derivative Work for the purpose of this License. For the avoidance of doubt, where the Work is a musical composition or sound recording, the synchronization of the Work in timed-relation with a moving image ("synching") will be considered a Derivative Work for the purpose of this License.

     c. "Licensor" means the individual or entity that offers the Work under the terms of this License.

     d. "Original Author" means the individual or entity who created the Work.

     e. "Work" means the copyrightable work of authorship offered under the terms of this License.

     f. "You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.

     g. "License Elements" means the following high-level license attributes as selected by Licensor and indicated in the title of this License: Attribution, ShareAlike.

2. Fair Use Rights. Nothing in this license is intended to reduce, limit, or restrict any rights arising from fair use, first sale or other limitations on the exclusive rights of the copyright owner under copyright law or other applicable laws.

3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:

     a. to reproduce the Work, to incorporate the Work into one or more Collective Works, and to reproduce the Work as incorporated in the Collective Works;

     b. to create and reproduce Derivative Works;

     c. to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission the Work including as incorporated in Collective Works;

     d. to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission Derivative Works.

     e. For the avoidance of doubt, where the work is a musical composition:

          i. Performance Royalties Under Blanket Licenses. Licensor waives the exclusive right to collect, whether individually or via a performance rights society (e.g. ASCAP, BMI, SESAC), royalties for the public performance or public digital performance (e.g. webcast) of the Work.

          ii. Mechanical Rights and Statutory Royalties. Licensor waives the exclusive right to collect, whether individually or via a music rights society or designated agent (e.g. Harry Fox Agency), royalties for any phonorecord You create from the Work ("cover version") and distribute, subject to the compulsory license created by 17 USC Section 115 of the US Copyright Act (or the equivalent in other jurisdictions).

     f. Webcasting Rights and Statutory Royalties. For the avoidance of doubt, where the Work is a sound recording, Licensor waives the exclusive right to collect, whether individually or via a performance-rights society (e.g. SoundExchange), royalties for the public digital performance (e.g. webcast) of the Work, subject to the compulsory license created by 17 USC Section 114 of the US Copyright Act (or the equivalent in other jurisdictions).

The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. All rights not expressly granted by Licensor are hereby reserved.

4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:

     a. You may distribute, publicly display, publicly perform, or publicly digitally perform the Work only under the terms of this License, and You must include a copy of, or the Uniform Resource Identifier for, this License with every copy or phonorecord of the Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Work that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Work itself to be made subject to the terms of this License. If You create a Collective Work, upon notice from any Licensor You must, to the extent practicable, remove from the Collective Work any credit as required by clause 4(c), as requested. If You create a Derivative Work, upon notice from any Licensor You must, to the extent practicable, remove from the Derivative Work any credit as required by clause 4(c), as requested.

     b. You may distribute, publicly display, publicly perform, or publicly digitally perform a Derivative Work only under the terms of this License, a later version of this License with the same License Elements as this License, or a Creative Commons iCommons license that contains the same License Elements as this License (e.g. Attribution-ShareAlike 2.5 Japan). You must include a copy of, or the Uniform Resource Identifier for, this License or other license specified in the previous sentence with every copy or phonorecord of each Derivative Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Derivative Works that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder, and You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Derivative Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Derivative Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Derivative Work itself to be made subject to the terms of this License.

     c. If you distribute, publicly display, publicly perform, or publicly digitally perform the Work or any Derivative Works or Collective Works, You must keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) the name of the Original Author (or pseudonym, if applicable) if supplied, and/or (ii) if the Original Author and/or Licensor designate another party or parties (e.g. a sponsor institute, publishing entity, journal) for attribution in Licensor's copyright notice, terms of service or by other reasonable means, the name of such party or parties; the title of the Work if supplied; to the extent reasonably practicable, the Uniform Resource Identifier, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and in the case of a Derivative Work, a credit identifying the use of the Work in the Derivative Work (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). Such credit may be implemented in any reasonable manner; provided, however, that in the case of a Derivative Work or Collective Work, at a minimum such credit will appear where any other comparable authorship credit appears and in a manner at least as prominent as such other comparable authorship credit.

5. Representations, Warranties and Disclaimer

UNLESS OTHERWISE AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE MATERIALS, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.

6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

7. Termination

     a. This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Derivative Works or Collective Works from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.

     b. Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.

8. Miscellaneous

     a. Each time You distribute or publicly digitally perform the Work or a Collective Work, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.

     b. Each time You distribute or publicly digitally perform a Derivative Work, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.

     c. If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.

     d. No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.

     e. This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.

Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor.

Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, neither party will use the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time.

Creative Commons may be contacted at http://creativecommons.org/.


================================================
FILE: LICENSES/CC-BY-SA-3.0.txt
================================================
Creative Commons Attribution-ShareAlike 3.0 Unported

 CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.

License

THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.

BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.

1. Definitions

     a. "Adaptation" means a work based upon the Work, or upon the Work and other pre-existing works, such as a translation, adaptation, derivative work, arrangement of music or other alterations of a literary or artistic work, or phonogram or performance and includes cinematographic adaptations or any other form in which the Work may be recast, transformed, or adapted including in any form recognizably derived from the original, except that a work that constitutes a Collection will not be considered an Adaptation for the purpose of this License. For the avoidance of doubt, where the Work is a musical work, performance or phonogram, the synchronization of the Work in timed-relation with a moving image ("synching") will be considered an Adaptation for the purpose of this License.

     b. "Collection" means a collection of literary or artistic works, such as encyclopedias and anthologies, or performances, phonograms or broadcasts, or other works or subject matter other than works listed in Section 1(f) below, which, by reason of the selection and arrangement of their contents, constitute intellectual creations, in which the Work is included in its entirety in unmodified form along with one or more other contributions, each constituting separate and independent works in themselves, which together are assembled into a collective whole. A work that constitutes a Collection will not be considered an Adaptation (as defined below) for the purposes of this License.

     c. "Creative Commons Compatible License" means a license that is listed at http://creativecommons.org/compatiblelicenses that has been approved by Creative Commons as being essentially equivalent to this License, including, at a minimum, because that license: (i) contains terms that have the same purpose, meaning and effect as the License Elements of this License; and, (ii) explicitly permits the relicensing of adaptations of works made available under that license under this License or a Creative Commons jurisdiction license with the same License Elements as this License.

     d. "Distribute" means to make available to the public the original and copies of the Work or Adaptation, as appropriate, through sale or other transfer of ownership.

     e. "License Elements" means the following high-level license attributes as selected by Licensor and indicated in the title of this License: Attribution, ShareAlike.

     f. "Licensor" means the individual, individuals, entity or entities that offer(s) the Work under the terms of this License.

     g. "Original Author" means, in the case of a literary or artistic work, the individual, individuals, entity or entities who created the Work or if no individual or entity can be identified, the publisher; and in addition (i) in the case of a performance the actors, singers, musicians, dancers, and other persons who act, sing, deliver, declaim, play in, interpret or otherwise perform literary or artistic works or expressions of folklore; (ii) in the case of a phonogram the producer being the person or legal entity who first fixes the sounds of a performance or other sounds; and, (iii) in the case of broadcasts, the organization that transmits the broadcast.

     h. "Work" means the literary and/or artistic work offered under the terms of this License including without limitation any production in the literary, scientific and artistic domain, whatever may be the mode or form of its expression including digital form, such as a book, pamphlet and other writing; a lecture, address, sermon or other work of the same nature; a dramatic or dramatico-musical work; a choreographic work or entertainment in dumb show; a musical composition with or without words; a cinematographic work to which are assimilated works expressed by a process analogous to cinematography; a work of drawing, painting, architecture, sculpture, engraving or lithography; a photographic work to which are assimilated works expressed by a process analogous to photography; a work of applied art; an illustration, map, plan, sketch or three-dimensional work relative to geography, topography, architecture or science; a performance; a broadcast; a phonogram; a compilation of data to the extent it is protected as a copyrightable work; or a work performed by a variety or circus performer to the extent it is not otherwise considered a literary or artistic work.

     i. "You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.

     j. "Publicly Perform" means to perform public recitations of the Work and to communicate to the public those public recitations, by any means or process, including by wire or wireless means or public digital performances; to make available to the public Works in such a way that members of the public may access these Works from a place and at a place individually chosen by them; to perform the Work to the public by any means or process and the communication to the public of the performances of the Work, including by public digital performance; to broadcast and rebroadcast the Work by any means including signs, sounds or images.

     k. "Reproduce" means to make copies of the Work by any means including without limitation by sound or visual recordings and the right of fixation and reproducing fixations of the Work, including storage of a protected performance or phonogram in digital form or other electronic medium.

2. Fair Dealing Rights. Nothing in this License is intended to reduce, limit, or restrict any uses free from copyright or rights arising from limitations or exceptions that are provided for in connection with the copyright protection under copyright law or other applicable laws.

3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:

     a. to Reproduce the Work, to incorporate the Work into one or more Collections, and to Reproduce the Work as incorporated in the Collections;

     b. to create and Reproduce Adaptations provided that any such Adaptation, including any translation in any medium, takes reasonable steps to clearly label, demarcate or otherwise identify that changes were made to the original Work. For example, a translation could be marked "The original work was translated from English to Spanish," or a modification could indicate "The original work has been modified.";

     c. to Distribute and Publicly Perform the Work including as incorporated in Collections; and,

     d. to Distribute and Publicly Perform Adaptations.

     e. For the avoidance of doubt:

          i. Non-waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme cannot be waived, the Licensor reserves the exclusive right to collect such royalties for any exercise by You of the rights granted under this License;

          ii. Waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme can be waived, the Licensor waives the exclusive right to collect such royalties for any exercise by You of the rights granted under this License; and,

          iii. Voluntary License Schemes. The Licensor waives the right to collect royalties, whether individually or, in the event that the Licensor is a member of a collecting society that administers voluntary licensing schemes, via that society, from any exercise by You of the rights granted under this License.

The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. Subject to Section 8(f), all rights not expressly granted by Licensor are hereby reserved.

4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:

     a. You may Distribute or Publicly Perform the Work only under the terms of this License. You must include a copy of, or the Uniform Resource Identifier (URI) for, this License with every copy of the Work You Distribute or Publicly Perform. You may not offer or impose any terms on the Work that restrict the terms of this License or the ability of the recipient of the Work to exercise the rights granted to that recipient under the terms of the License. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties with every copy of the Work You Distribute or Publicly Perform. When You Distribute or Publicly Perform the Work, You may not impose any effective technological measures on the Work that restrict the ability of a recipient of the Work from You to exercise the rights granted to that recipient under the terms of the License. This Section 4(a) applies to the Work as incorporated in a Collection, but this does not require the Collection apart from the Work itself to be made subject to the terms of this License. If You create a Collection, upon notice from any Licensor You must, to the extent practicable, remove from the Collection any credit as required by Section 4(c), as requested. If You create an Adaptation, upon notice from any Licensor You must, to the extent practicable, remove from the Adaptation any credit as required by Section 4(c), as requested.

     b. You may Distribute or Publicly Perform an Adaptation only under the terms of: (i) this License; (ii) a later version of this License with the same License Elements as this License; (iii) a Creative Commons jurisdiction license (either this or a later license version) that contains the same License Elements as this License (e.g., Attribution-ShareAlike 3.0 US); (iv) a Creative Commons Compatible License. If you license the Adaptation under one of the licenses mentioned in (iv), you must comply with the terms of that license. If you license the Adaptation under the terms of any of the licenses mentioned in (i), (ii) or (iii) (the "Applicable License"), you must comply with the terms of the Applicable License generally and the following provisions: (I) You must include a copy of, or the URI for, the Applicable License with every copy of each Adaptation You Distribute or Publicly Perform; (II) You may not offer or impose any terms on the Adaptation that restrict the terms of the Applicable License or the ability of the recipient of the Adaptation to exercise the rights granted to that recipient under the terms of the Applicable License; (III) You must keep intact all notices that refer to the Applicable License and to the disclaimer of warranties with every copy of the Work as included in the Adaptation You Distribute or Publicly Perform; (IV) when You Distribute or Publicly Perform the Adaptation, You may not impose any effective technological measures on the Adaptation that restrict the ability of a recipient of the Adaptation from You to exercise the rights granted to that recipient under the terms of the Applicable License. This Section 4(b) applies to the Adaptation as incorporated in a Collection, but this does not require the Collection apart from the Adaptation itself to be made subject to the terms of the Applicable License.

     c. If You Distribute, or Publicly Perform the Work or any Adaptations or Collections, You must, unless a request has been made pursuant to Section 4(a), keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) the name of the Original Author (or pseudonym, if applicable) if supplied, and/or if the Original Author and/or Licensor designate another party or parties (e.g., a sponsor institute, publishing entity, journal) for attribution ("Attribution Parties") in Licensor's copyright notice, terms of service or by other reasonable means, the name of such party or parties; (ii) the title of the Work if supplied; (iii) to the extent reasonably practicable, the URI, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and (iv) consistent with Section 3(b), in the case of an Adaptation, a credit identifying the use of the Work in the Adaptation (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). The credit required by this Section 4(c) may be implemented in any reasonable manner; provided, however, that in the case of an Adaptation or Collection, at a minimum such credit will appear, if a credit for all contributing authors of the Adaptation or Collection appears, then as part of these credits and in a manner at least as prominent as the credits for the other contributing authors.
For the avoidance of doubt, You may only use the credit required by this Section for the purpose of attribution in the manner set out above and, by exercising Your rights under this License, You may not implicitly or explicitly assert or imply any connection with, sponsorship or endorsement by the Original Author, Licensor and/or Attribution Parties, as appropriate, of You or Your use of the Work, without the separate, express prior written permission of the Original Author, Licensor and/or Attribution Parties.

     d. Except as otherwise agreed in writing by the Licensor or as may be otherwise permitted by applicable law, if You Reproduce, Distribute or Publicly Perform the Work either by itself or as part of any Adaptations or Collections, You must not distort, mutilate, modify or take other derogatory action in relation to the Work which would be prejudicial to the Original Author's honor or reputation. Licensor agrees that in those jurisdictions (e.g. Japan), in which any exercise of the right granted in Section 3(b) of this License (the right to make Adaptations) would be deemed to be a distortion, mutilation, modification or other derogatory action prejudicial to the Original Author's honor and reputation, the Licensor will waive or not assert, as appropriate, this Section, to the fullest extent permitted by the applicable national law, to enable You to reasonably exercise Your right under Section 3(b) of this License (right to make Adaptations) but not otherwise.

5. Representations, Warranties and Disclaimer

UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.

6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

7. Termination

     a. This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Adaptations or Collections from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.

     b. Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.

8. Miscellaneous

     a. Each time You Distribute or Publicly Perform the Work or a Collection, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.

     b. Each time You Distribute or Publicly Perform an Adaptation, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.

     c. If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.

     d. No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.

     e. This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.

     f. The rights granted under, and the subject matter referenced, in this License were drafted utilizing the terminology of the Berne Convention for the Protection of Literary and Artistic Works (as amended on September 28, 1979), the Rome Convention of 1961, the WIPO Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and the Universal Copyright Convention (as revised on July 24, 1971). These rights and subject matter take effect in the relevant jurisdiction in which the License terms are sought to be enforced according to the corresponding provisions of the implementation of those treaty provisions in the applicable national law. If the standard suite of rights granted under applicable copyright law includes additional rights not granted under this License, such additional rights are deemed to be included in the License; this License is not intended to restrict the license of any rights under applicable law.

Creative Commons Notice

Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor.

Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, Creative Commons does not authorize the use by either party of the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time. For the avoidance of doubt, this trademark restriction does not form part of the License.

Creative Commons may be contacted at http://creativecommons.org/.


================================================
FILE: LICENSES/CC-BY-SA-4.0.txt
================================================
Creative Commons Attribution-ShareAlike 4.0 International

 Creative Commons Corporation (“Creative Commons”) is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an “as-is” basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible.

Using Creative Commons Public Licenses

Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses.

Considerations for licensors: Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. More considerations for licensors: wiki.creativecommons.org/Considerations_for_licensors

Considerations for the public: By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor’s permission is not necessary for any reason–for example, because of any applicable exception or limitation to copyright–then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described.

Although not required by our licenses, you are encouraged to respect those requests where reasonable. More considerations for the public: wiki.creativecommons.org/Considerations_for_licensees

Creative Commons Attribution-ShareAlike 4.0 International Public License

By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.

Section 1 – Definitions.

     a.	Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.

     b.	Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.

     c.	BY-SA Compatible License means a license listed at creativecommons.org/compatiblelicenses, approved by Creative Commons as essentially the equivalent of this Public License.

     d.	Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.

     e.	Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.

     f.	Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.

     g.	License Elements means the license attributes listed in the name of a Creative Commons Public License. The License Elements of this Public License are Attribution and ShareAlike.

     h.	Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License.

     i.	Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.

     j.	Licensor means the individual(s) or entity(ies) granting rights under this Public License.

     k.	Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.

     l.	Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.

     m.	You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.

Section 2 – Scope.

     a.	License grant.

          1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:

               A. reproduce and Share the Licensed Material, in whole or in part; and

               B. produce, reproduce, and Share Adapted Material.

          2. Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.

          3. Term. The term of this Public License is specified in Section 6(a).

          4. Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.

          5. Downstream recipients.

               A. Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.

               B. Additional offer from the Licensor – Adapted Material. Every recipient of Adapted Material from You automatically receives an offer from the Licensor to exercise the Licensed Rights in the Adapted Material under the conditions of the Adapter’s License You apply.

               C. No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.

          6. No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).

     b.	Other rights.

          1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.

          2. Patent and trademark rights are not licensed under this Public License.

          3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties.

Section 3 – License Conditions.

Your exercise of the Licensed Rights is expressly made subject to the following conditions.

     a.	Attribution.

          1. If You Share the Licensed Material (including in modified form), You must:

               A. retain the following if it is supplied by the Licensor with the Licensed Material:

                    i.	identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);

                    ii.	a copyright notice;

                    iii.	a notice that refers to this Public License;

                    iv.	a notice that refers to the disclaimer of warranties;

                    v.	a URI or hyperlink to the Licensed Material to the extent reasonably practicable;

               B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and

               C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.

          2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.

          3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.

     b.	ShareAlike. In addition to the conditions in Section 3(a), if You Share Adapted Material You produce, the following conditions also apply.

          1. The Adapter’s License You apply must be a Creative Commons license with the same License Elements, this version or later, or a BY-SA Compatible License.

          2. You must include the text of, or the URI or hyperlink to, the Adapter's License You apply. You may satisfy this condition in any reasonable manner based on the medium, means, and context in which You Share Adapted Material.

          3. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, Adapted Material that restrict exercise of the rights granted under the Adapter's License You apply.

Section 4 – Sui Generis Database Rights.

Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:

     a.	for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database;

     b.	if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material, including for purposes of Section 3(b); and

     c.	You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.

For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.

Section 5 – Disclaimer of Warranties and Limitation of Liability.

     a.	Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.

     b.	To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.

     c.	The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.

Section 6 – Term and Termination.

     a.	This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.

     b.	Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:

          1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or

          2. upon express reinstatement by the Licensor.

     c.	For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.

     d.	For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.

     e.	Sections 1, 5, 6, 7, and 8 survive termination of this Public License.

Section 7 – Other Terms and Conditions.

     a.	The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.

     b.	Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.

Section 8 – Interpretation.

     a.	For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.

     b.	To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.

     c.	No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.

     d.	Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.

Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at creativecommons.org/policies, Creative Commons does not authorize the use of the trademark “Creative Commons” or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses.

Creative Commons may be contacted at creativecommons.org.


================================================
FILE: LICENSES/GFDL-1.2-or-later.txt
================================================
GNU Free Documentation License
Version 1.2, November 2002

Copyright (C) 2000,2001,2002 Free Software Foundation, Inc. 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.

0. PREAMBLE

The purpose of this License is to make a manual, textbook, or other functional and useful document "free" in the sense of freedom: to assure everyone the effective freedom to copy and redistribute it, with or without modifying it, either commercially or noncommercially. Secondarily, this License preserves for the author and publisher a way to get credit for their work, while not being considered responsible for modifications made by others.

This License is a kind of "copyleft", which means that derivative works of the document must themselves be free in the same sense. It complements the GNU General Public License, which is a copyleft license designed for free software.

We have designed this License in order to use it for manuals for free software, because free software needs free documentation: a free program should come with manuals providing the same freedoms that the software does. But this License is not limited to software manuals; it can be used for any textual work, regardless of subject matter or whether it is published as a printed book. We recommend this License principally for works whose purpose is instruction or reference.

1. APPLICABILITY AND DEFINITIONS

This License applies to any manual or other work, in any medium, that contains a notice placed by the copyright holder saying it can be distributed under the terms of this License. Such a notice grants a world-wide, royalty-free license, unlimited in duration, to use that work under the conditions stated herein. The "Document", below, refers to any such manual or work. Any member of the public is a licensee, and is addressed as "you". You accept the license if you copy, modify or distribute the work in a way requiring permission under copyright law.

A "Modified Version" of the Document means any work containing the Document or a portion of it, either copied verbatim, or with modifications and/or translated into another language.

A "Secondary Section" is a named appendix or a front-matter section of the Document that deals exclusively with the relationship of the publishers or authors of the Document to the Document's overall subject (or to related matters) and contains nothing that could fall directly within that overall subject. (Thus, if the Document is in part a textbook of mathematics, a Secondary Section may not explain any mathematics.) The relationship could be a matter of historical connection with the subject or with related matters, or of legal, commercial, philosophical, ethical or political position regarding them.

The "Invariant Sections" are certain Secondary Sections whose titles are designated, as being those of Invariant Sections, in the notice that says that the Document is released under this License. If a section does not fit the above definition of Secondary then it is not allowed to be designated as Invariant. The Document may contain zero Invariant Sections. If the Document does not identify any Invariant Sections then there are none.

The "Cover Texts" are certain short passages of text that are listed, as Front-Cover Texts or Back-Cover Texts, in the notice that says that the Document is released under this License. A Front-Cover Text may be at most 5 words, and a Back-Cover Text may be at most 25 words.

A "Transparent" copy of the Document means a machine-readable copy, represented in a format whose specification is available to the general public, that is suitable for revising the document straightforwardly with generic text editors or (for images composed of pixels) generic paint programs or (for drawings) some widely available drawing editor, and that is suitable for input to text formatters or for automatic translation to a variety of formats suitable for input to text formatters. A copy made in an otherwise Transparent file format whose markup, or absence of markup, has been arranged to thwart or discourage subsequent modification by readers is not Transparent. An image format is not Transparent if used for any substantial amount of text. A copy that is not "Transparent" is called "Opaque".

Examples of suitable formats for Transparent copies include plain ASCII without markup, Texinfo input format, LaTeX input format, SGML or XML using a publicly available DTD, and standard-conforming simple HTML, PostScript or PDF designed for human modification. Examples of transparent image formats include PNG, XCF and JPG. Opaque formats include proprietary formats that can be read and edited only by proprietary word processors, SGML or XML for which the DTD and/or processing tools are not generally available, and the machine-generated HTML, PostScript or PDF produced by some word processors for output purposes only.

The "Title Page" means, for a printed book, the title page itself, plus such following pages as are needed to hold, legibly, the material this License requires to appear in the title page. For works in formats which do not have any title page as such, "Title Page" means the text near the most prominent appearance of the work's title, preceding the beginning of the body of the text.

A section "Entitled XYZ" means a named subunit of the Document whose title either is precisely XYZ or contains XYZ in parentheses following text that translates XYZ in another language. (Here XYZ stands for a specific section name mentioned below, such as "Acknowledgements", "Dedications", "Endorsements", or "History".) To "Preserve the Title" of such a section when you modify the Document means that it remains a section "Entitled XYZ" according to this definition.

The Document may include Warranty Disclaimers next to the notice which states that this License applies to the Document. These Warranty Disclaimers are considered to be included by reference in this License, but only as regards disclaiming warranties: any other implication that these Warranty Disclaimers may have is void and has no effect on the meaning of this License.

2. VERBATIM COPYING

You may copy and distribute the Document in any medium, either commercially or noncommercially, provided that this License, the copyright notices, and the license notice saying this License applies to the Document are reproduced in all copies, and that you add no other conditions whatsoever to those of this License. You may not use technical measures to obstruct or control the reading or further copying of the copies you make or distribute. However, you may accept compensation in exchange for copies. If you distribute a large enough number of copies you must also follow the conditions in section 3.

You may also lend copies, under the same conditions stated above, and you may publicly display copies.

3. COPYING IN QUANTITY

If you publish printed copies (or copies in media that commonly have printed covers) of the Document, numbering more than 100, and the Document's license notice requires Cover Texts, you must enclose the copies in covers that carry, clearly and legibly, all these Cover Texts: Front-Cover Texts on the front cover, and Back-Cover Texts on the back cover. Both covers must also clearly and legibly identify you as the publisher of these copies. The front cover must present the full title with all words of the title equally prominent and visible. You may add other material on the covers in addition. Copying with changes limited to the covers, as long as they preserve the title of the Document and satisfy these conditions, can be treated as verbatim copying in other respects.

If the required texts for either cover are too voluminous to fit legibly, you should put the first ones listed (as many as fit reasonably) on the actual cover, and continue the rest onto adjacent pages.

If you publish or distribute Opaque copies of the Document numbering more than 100, you must either include a machine-readable Transparent copy along with each Opaque copy, or state in or with each Opaque copy a computer-network location from which the general network-using public has access to download using public-standard network protocols a complete Transparent copy of the Document, free of added material. If you use the latter option, you must take reasonably prudent steps, when you begin distribution of Opaque copies in quantity, to ensure that this Transparent copy will remain thus accessible at the stated location until at least one year after the last time you distribute an Opaque copy (directly or through your agents or retailers) of that edition to the public.

It is requested, but not required, that you contact the authors of the Document well before redistributing any large number of copies, to give them a chance to provide you with an updated version of the Document.

4. MODIFICATIONS

You may copy and distribute a Modified Version of the Document under the conditions of sections 2 and 3 above, provided that you release the Modified Version under precisely this License, with the Modified Version filling the role of the Document, thus licensing distribution and modification of the Modified Version to whoever possesses a copy of it. In addition, you must do these things in the Modified Version:

     A. Use in the Title Page (and on the covers, if any) a title distinct from that of the Document, and from those of previous versions (which should, if there were any, be listed in the History section of the Document). You may use the same title as a previous version if the original publisher of that version gives permission.
     B. List on the Title Page, as authors, one or more persons or entities responsible for authorship of the modifications in the Modified Version, together with at least five of the principal authors of the Document (all of its principal authors, if it has fewer than five), unless they release you from this requirement.
     C. State on the Title page the name of the publisher of the Modified Version, as the publisher.
     D. Preserve all the copyright notices of the Document.
     E. Add an appropriate copyright notice for your modifications adjacent to the other copyright notices.
     F. Include, immediately after the copyright notices, a license notice giving the public permission to use the Modified Version under the terms of this License, in the form shown in the Addendum below.
     G. Preserve in that license notice the full lists of Invariant Sections and required Cover Texts given in the Document's license notice.
     H. Include an unaltered copy of this License.
     I. Preserve the section Entitled "History", Preserve its Title, and add to it an item stating at least the title, year, new authors, and publisher of the Modified Version as given on the Title Page. If there is no section Entitled "History" in the Document, create one stating the title, year, authors, and publisher of the Document as given on its Title Page, then add an item describing the Modified Version as stated in the previous sentence.
     J. Preserve the network location, if any, given in the Document for public access to a Transparent copy of the Document, and likewise the network locations given in the Document for previous versions it was based on. These may be placed in the "History" section. You may omit a network location for a work that was published at least four years before the Document itself, or if the original publisher of the version it refers to gives permission.
     K. For any section Entitled "Acknowledgements" or "Dedications", Preserve the Title of the section, and preserve in the section all the substance and tone of each of the contributor acknowledgements and/or dedications given therein.
     L. Preserve all the Invariant Sections of the Document, unaltered in their text and in their titles. Section numbers or the equivalent are not considered part of the section titles.
     M. Delete any section Entitled "Endorsements". Such a section may not be included in the Modified Version.
     N. Do not retitle any existing section to be Entitled "Endorsements" or to conflict in title with any Invariant Section.
     O. Preserve any Warranty Disclaimers.

If the Modified Version includes new front-matter sections or appendices that qualify as Secondary Sections and contain no material copied from the Document, you may at your option designate some or all of these sections as invariant. To do this, add their titles to the list of Invariant Sections in the Modified Version's license notice. These titles must be distinct from any other section titles.

You may add a section Entitled "Endorsements", provided it contains nothing but endorsements of your Modified Version by various parties--for example, statements of peer review or that the text has been approved by an organization as the authoritative definition of a standard.

You may add a passage of up to five words as a Front-Cover Text, and a passage of up to 25 words as a Back-Cover Text, to the end of the list of Cover Texts in the Modified Version. Only one passage of Front-Cover Text and one of Back-Cover Text may be added by (or through arrangements made by) any one entity. If the Document already includes a cover text for the same cover, previously added by you or by arrangement made by the same entity you are acting on behalf of, you may not add another; but you may replace the old one, on explicit permission from the previous publisher that added the old one.

The author(s) and publisher(s) of the Document do not by this License give permission to use their names for publicity for or to assert or imply endorsement of any Modified Version.

5. COMBINING DOCUMENTS

You may combine the Document with other documents released under this License, under the terms defined in section 4 above for modified versions, provided that you include in the combination all of the Invariant Sections of all of the original documents, unmodified, and list them all as Invariant Sections of your combined work in its license notice, and that you preserve all their Warranty Disclaimers.

The combined work need only contain one copy of this License, and multiple identical Invariant Sections may be replaced with a single copy. If there are multiple Invariant Sections with the same name but different contents, make the title of each such section unique by adding at the end of it, in parentheses, the name of the original author or publisher of that section if known, or else a unique number. Make the same adjustment to the section titles in the list of Invariant Sections in the license notice of the combined work.

In the combination, you must combine any sections Entitled "History" in the various original documents, forming one section Entitled "History"; likewise combine any sections Entitled "Acknowledgements", and any sections Entitled "Dedications". You must delete all sections Entitled "Endorsements".

6. COLLECTIONS OF DOCUMENTS

You may make a collection consisting of the Document and other documents released under this License, and replace the individual copies of this License in the various documents with a single copy that is included in the collection, provided that you follow the rules of this License for verbatim copying of each of the documents in all other respects.

You may extract a single document from such a collection, and distribute it individually under this License, provided you insert a copy of this License into the extracted document, and follow this License in all other respects regarding verbatim copying of that document.

7. AGGREGATION WITH INDEPENDENT WORKS

A compilation of the Document or its derivatives with other separate and independent documents or works, in or on a volume of a storage or distribution medium, is called an "aggregate" if the copyright resulting from the compilation is not used to limit the legal rights of the compilation's users beyond what the individual works permit. When the Document is included in an aggregate, this License does not apply to the other works in the aggregate which are not themselves derivative works of the Document.

If the Cover Text requirement of section 3 is applicable to these copies of the Document, then if the Document is less than one half of the entire aggregate, the Document's Cover Texts may be placed on covers that bracket the Document within the aggregate, or the electronic equivalent of covers if the Document is in electronic form. Otherwise they must appear on printed covers that bracket the whole aggregate.

8. TRANSLATION

Translation is considered a kind of modification, so you may distribute translations of the Document under the terms of section 4. Replacing Invariant Sections with translations requires special permission from their copyright holders, but you may include translations of some or all Invariant Sections in addition to the original versions of these Invariant Sections. You may include a translation of this License, and all the license notices in the Document, and any Warranty Disclaimers, provided that you also include the original English version of this License and the original versions of those notices and disclaimers. In case of a disagreement between the translation and the original version of this License or a notice or disclaimer, the original version will prevail.

If a section in the Document is Entitled "Acknowledgements", "Dedications", or "History", the requirement (section 4) to Preserve its Title (section 1) will typically require changing the actual title.

9. TERMINATION

You may not copy, modify, sublicense, or distribute the Document except as expressly provided for under this License. Any other attempt to copy, modify, sublicense or distribute the Document is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance.

10. FUTURE REVISIONS OF THIS LICENSE

The Free Software Foundation may publish new, revised versions of the GNU Free Documentation License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. See http://www.gnu.org/copyleft/.

Each version of the License is given a distinguishing version number. If the Document specifies that a particular numbered version of this License "or any later version" applies to it, you have the option of following the terms and conditions either of that specified version or of any later version that has been published (not as a draft) by the Free Software Foundation. If the Document does not specify a version number of this License, you may choose any version ever published (not as a draft) by the Free Software Foundation.

ADDENDUM: How to use this License for your documents

To use this License in a document you have written, include a copy of the License in the document and put the following copyright and license notices just after the title page:

 Copyright (c) YEAR YOUR NAME. Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.2 or any later version published by the Free Software Foundation; with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. A copy of the license is included in the section entitled "GNU Free Documentation License".

If you have Invariant Sections, Front-Cover Texts and Back-Cover Texts, replace the "with...Texts." line with this:

 with the Invariant Sections being LIST THEIR TITLES, with the Front-Cover Texts being LIST, and with the Back-Cover Texts being LIST.

If you have Invariant Sections without Cover Texts, or some other combination of the three, merge those two alternatives to suit the situation.

If your document contains nontrivial examples of program code, we recommend releasing these examples in parallel under your choice of free software license, such as the GNU General Public License, to permit their use in free software.


================================================
FILE: LICENSES/MIT.txt
================================================
MIT License

Copyright (c) <year> <copyright holders>

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


================================================
FILE: LICENSES/MPL-2.0.txt
================================================
Mozilla Public License Version 2.0
==================================

1. Definitions
--------------

1.1. "Contributor"
    means each individual or legal entity that creates, contributes to
    the creation of, or owns Covered Software.

1.2. "Contributor Version"
    means the combination of the Contributions of others (if any) used
    by a Contributor and that particular Contributor's Contribution.

1.3. "Contribution"
    means Covered Software of a particular Contributor.

1.4. "Covered Software"
    means Source Code Form to which the initial Contributor has attached
    the notice in Exhibit A, the Executable Form of such Source Code
    Form, and Modifications of such Source Code Form, in each case
    including portions thereof.

1.5. "Incompatible With Secondary Licenses"
    means

    (a) that the initial Contributor has attached the notice described
        in Exhibit B to the Covered Software; or

    (b) that the Covered Software was made available under the terms of
        version 1.1 or earlier of the License, but not also under the
        terms of a Secondary License.

1.6. "Executable Form"
    means any form of the work other than Source Code Form.

1.7. "Larger Work"
    means a work that combines Covered Software with other material, in 
    a separate file or files, that is not Covered Software.

1.8. "License"
    means this document.

1.9. "Licensable"
    means having the right to grant, to the maximum extent possible,
    whether at the time of the initial grant or subsequently, any and
    all of the rights conveyed by this License.

1.10. "Modifications"
    means any of the following:

    (a) any file in Source Code Form that results from an addition to,
        deletion from, or modification of the contents of Covered
        Software; or

    (b) any new file in Source Code Form that contains any Covered
        Software.

1.11. "Patent Claims" of a Contributor
    means any patent claim(s), including without limitation, method,
    process, and apparatus claims, in any patent Licensable by such
    Contributor that would be infringed, but for the grant of the
    License, by the making, using, selling, offering for sale, having
    made, import, or transfer of either its Contributions or its
    Contributor Version.

1.12. "Secondary License"
    means either the GNU General Public License, Version 2.0, the GNU
    Lesser General Public License, Version 2.1, the GNU Affero General
    Public License, Version 3.0, or any later versions of those
    licenses.

1.13. "Source Code Form"
    means the form of the work preferred for making modifications.

1.14. "You" (or "Your")
    means an individual or a legal entity exercising rights under this
    License. For legal entities, "You" includes any entity that
    controls, is controlled by, or is under common control with You. For
    purposes of this definition, "control" means (a) the power, direct
    or indirect, to cause the direction or management of such entity,
    whether by contract or otherwise, or (b) ownership of more than
    fifty percent (50%) of the outstanding shares or beneficial
    ownership of such entity.

2. License Grants and Conditions
--------------------------------

2.1. Grants

Each Contributor hereby grants You a world-wide, royalty-free,
non-exclusive license:

(a) under intellectual property rights (other than patent or trademark)
    Licensable by such Contributor to use, reproduce, make available,
    modify, display, perform, distribute, and otherwise exploit its
    Contributions, either on an unmodified basis, with Modifications, or
    as part of a Larger Work; and

(b) under Patent Claims of such Contributor to make, use, sell, offer
    for sale, have made, import, and otherwise transfer either its
    Contributions or its Contributor Version.

2.2. Effective Date

The licenses granted in Section 2.1 with respect to any Contribution
become effective for each Contribution on the date the Contributor first
distributes such Contribution.

2.3. Limitations on Grant Scope

The licenses granted in this Section 2 are the only rights granted under
this License. No additional rights or licenses will be implied from the
distribution or licensing of Covered Software under this License.
Notwithstanding Section 2.1(b) above, no patent license is granted by a
Contributor:

(a) for any code that a Contributor has removed from Covered Software;
    or

(b) for infringements caused by: (i) Your and any other third party's
    modifications of Covered Software, or (ii) the combination of its
    Contributions with other software (except as part of its Contributor
    Version); or

(c) under Patent Claims infringed by Covered Software in the absence of
    its Contributions.

This License does not grant any rights in the trademarks, service marks,
or logos of any Contributor (except as may be necessary to comply with
the notice requirements in Section 3.4).

2.4. Subsequent Licenses

No Contributor makes additional grants as a result of Your choice to
distribute the Covered Software under a subsequent version of this
License (see Section 10.2) or under the terms of a Secondary License (if
permitted under the terms of Section 3.3).

2.5. Representation

Each Contributor represents that the Contributor believes its
Contributions are its original creation(s) or it has sufficient rights
to grant the rights to its Contributions conveyed by this License.

2.6. Fair Use

This License is not intended to limit any rights You have under
applicable copyright doctrines of fair use, fair dealing, or other
equivalents.

2.7. Conditions

Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
in Section 2.1.

3. Responsibilities
-------------------

3.1. Distribution of Source Form

All distribution of Covered Software in Source Code Form, including any
Modifications that You create or to which You contribute, must be under
the terms of this License. You must inform recipients that the Source
Code Form of the Covered Software is governed by the terms of this
License, and how they can obtain a copy of this License. You may not
attempt to alter or restrict the recipients' rights in the Source Code
Form.

3.2. Distribution of Executable Form

If You distribute Covered Software in Executable Form then:

(a) such Covered Software must also be made available in Source Code
    Form, as described in Section 3.1, and You must inform recipients of
    the Executable Form how they can obtain a copy of such Source Code
    Form by reasonable means in a timely manner, at a charge no more
    than the cost of distribution to the recipient; and

(b) You may distribute such Executable Form under the terms of this
    License, or sublicense it under different terms, provided that the
    license for the Executable Form does not attempt to limit or alter
    the recipients' rights in the Source Code Form under this License.

3.3. Distribution of a Larger Work

You may create and distribute a Larger Work under terms of Your choice,
provided that You also comply with the requirements of this License for
the Covered Software. If the Larger Work is a combination of Covered
Software with a work governed by one or more Secondary Licenses, and the
Covered Software is not Incompatible With Secondary Licenses, this
License permits You to additionally distribute such Covered Software
under the terms of such Secondary License(s), so that the recipient of
the Larger Work may, at their option, further distribute the Covered
Software under the terms of either this License or such Secondary
License(s).

3.4. Notices

You may not remove or alter the substance of any license notices
(including copyright notices, patent notices, disclaimers of warranty,
or limitations of liability) contained within the Source Code Form of
the Covered Software, except that You may alter any license notices to
the extent required to remedy known factual inaccuracies.

3.5. Application of Additional Terms

You may choose to offer, and to charge a fee for, warranty, support,
indemnity or liability obligations to one or more recipients of Covered
Software. However, You may do so only on Your own behalf, and not on
behalf of any Contributor. You must make it absolutely clear that any
such warranty, support, indemnity, or liability obligation is offered by
You alone, and You hereby agree to indemnify every Contributor for any
liability incurred by such Contributor as a result of warranty, support,
indemnity or liability terms You offer. You may include additional
disclaimers of warranty and limitations of liability specific to any
jurisdiction.

4. Inability to Comply Due to Statute or Regulation
---------------------------------------------------

If it is impossible for You to comply with any of the terms of this
License with respect to some or all of the Covered Software due to
statute, judicial order, or regulation then You must: (a) comply with
the terms of this License to the maximum extent possible; and (b)
describe the limitations and the code they affect. Such description must
be placed in a text file included with all distributions of the Covered
Software under this License. Except to the extent prohibited by statute
or regulation, such description must be sufficiently detailed for a
recipient of ordinary skill to be able to understand it.

5. Termination
--------------

5.1. The rights granted under this License will terminate automatically
if You fail to comply with any of its terms. However, if You become
compliant, then the rights granted under this License from a particular
Contributor are reinstated (a) provisionally, unless and until such
Contributor explicitly and finally terminates Your grants, and (b) on an
ongoing basis, if such Contributor fails to notify You of the
non-compliance by some reasonable means prior to 60 days after You have
come back into compliance. Moreover, Your grants from a particular
Contributor are reinstated on an ongoing basis if such Contributor
notifies You of the non-compliance by some reasonable means, this is the
first time You have received notice of non-compliance with this License
from such Contributor, and You become compliant prior to 30 days after
Your receipt of the notice.

5.2. If You initiate litigation against any entity by asserting a patent
infringement claim (excluding declaratory judgment actions,
counter-claims, and cross-claims) alleging that a Contributor Version
directly or indirectly infringes any patent, then the rights granted to
You by any and all Contributors for the Covered Software under Section
2.1 of this License shall terminate.

5.3. In the event of termination under Sections 5.1 or 5.2 above, all
end user license agreements (excluding distributors and resellers) which
have been validly granted by You or Your distributors under this License
prior to termination shall survive termination.

************************************************************************
*                                                                      *
*  6. Disclaimer of Warranty                                           *
*  -------------------------                                           *
*                                                                      *
*  Covered Software is provided under this License on an "as is"       *
*  basis, without warranty of any kind, either expressed, implied, or  *
*  statutory, including, without limitation, warranties that the       *
*  Covered Software is free of defects, merchantable, fit for a        *
*  particular purpose or non-infringing. The entire risk as to the     *
*  quality and performance of the Covered Software is with You.        *
*  Should any Covered Software prove defective in any respect, You     *
*  (not any Contributor) assume the cost of any necessary servicing,   *
*  repair, or correction. This disclaimer of warranty constitutes an   *
*  essential part of this License. No use of any Covered Software is   *
*  authorized under this License except under this disclaimer.         *
*                                                                      *
************************************************************************

************************************************************************
*                                                                      *
*  7. Limitation of Liability                                          *
*  --------------------------                                          *
*                                                                      *
*  Under no circumstances and under no legal theory, whether tort      *
*  (including negligence), contract, or otherwise, shall any           *
*  Contributor, or anyone who distributes Covered Software as          *
*  permitted above, be liable to You for any direct, indirect,         *
*  special, incidental, or consequential damages of any character      *
*  including, without limitation, damages for lost profits, loss of    *
*  goodwill, work stoppage, computer failure or malfunction, or any    *
*  and all other commercial damages or losses, even if such party      *
*  shall have been informed of the possibility of such damages. This   *
*  limitation of liability shall not apply to liability for death or   *
*  personal injury resulting from such party's negligence to the       *
*  extent applicable law prohibits such limitation. Some               *
*  jurisdictions do not allow the exclusion or limitation of           *
*  incidental or consequential damages, so this exclusion and          *
*  limitation may not apply to You.                                    *
*                                                                      *
************************************************************************

8. Litigation
-------------

Any litigation relating to this License may be brought only in the
courts of a jurisdiction where the defendant maintains its principal
place of business and such litigation shall be governed by laws of that
jurisdiction, without reference to its conflict-of-law provisions.
Nothing in this Section shall prevent a party's ability to bring
cross-claims or counter-claims.

9. Miscellaneous
----------------

This License represents the complete agreement concerning the subject
matter hereof. If any provision of this License is held to be
unenforceable, such provision shall be reformed only to the extent
necessary to make it enforceable. Any law or regulation which provides
that the language of a contract shall be construed against the drafter
shall not be used to construe this License against a Contributor.

10. Versions of the License
---------------------------

10.1. New Versions

Mozilla Foundation is the license steward. Except as provided in Section
10.3, no one other than the license steward has the right to modify or
publish new versions of this License. Each version will be given a
distinguishing version number.

10.2. Effect of New Versions

You may distribute the Covered Software under the terms of the version
of the License under which You originally received the Covered Software,
or under the terms of any subsequent version published by the license
steward.

10.3. Modified Versions

If you create software not governed by this License, and you want to
create a new license for such software, you may create and use a
modified version of this License if you rename the license and remove
any references to the name of the license steward (except to note that
such modified license differs from this License).

10.4. Distributing Source Code Form that is Incompatible With Secondary
Licenses

If You choose to distribute Source Code Form that is Incompatible With
Secondary Licenses under the terms of this version of the License, the
notice described in Exhibit B of this License must be attached.

Exhibit A - Source Code Form License Notice
-------------------------------------------

  This Source Code Form is subject to the terms of the Mozilla Public
  License, v. 2.0. If a copy of the MPL was not distributed with this
  file, You can obtain one at https://mozilla.org/MPL/2.0/.

If it is not possible or desirable to put the notice in a particular
file, then You may include the notice in a location (such as a LICENSE
file in a relevant directory) where a recipient would be likely to look
for such a notice.

You may add additional accurate notices of copyright ownership.

Exhibit B - "Incompatible With Secondary Licenses" Notice
---------------------------------------------------------

  This Source Code Form is "Incompatible With Secondary Licenses", as
  defined by the Mozilla Public License, v. 2.0.


================================================
FILE: LICENSES/Zlib.txt
================================================
zlib License

This software is provided 'as-is', without any express or implied warranty.  In no event will the authors be held liable for any damages arising from the use of this software.

Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:

     1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.

     2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.

     3. This notice may not be removed or altered from any source distribution.


================================================
FILE: README.md
================================================
<!-- SPDX-FileCopyrightText: 2014 Julien Pfefferkorn -->
<!-- SPDX-FileCopyrightText: 2015 James R. Barlow -->
<!-- SPDX-License-Identifier: CC-BY-SA-4.0 -->

<img src="docs/images/logo.svg" width="240" alt="OCRmyPDF">

[![Build Status](https://github.com/ocrmypdf/OCRmyPDF/actions/workflows/build.yml/badge.svg)](https://github.com/ocrmypdf/OCRmyPDF/actions/workflows/build.yml) [![PyPI version][pypi]](https://pypi.org/project/ocrmypdf/) ![Homebrew version][homebrew] ![ReadTheDocs][docs] ![Python versions][pyversions]

[pypi]: https://img.shields.io/pypi/v/ocrmypdf.svg "PyPI version"
[homebrew]: https://img.shields.io/homebrew/v/ocrmypdf.svg "Homebrew version"
[docs]: https://readthedocs.org/projects/ocrmypdf/badge/?version=latest "RTD"
[pyversions]: https://img.shields.io/pypi/pyversions/ocrmypdf "Supported Python versions"

OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched or copy-pasted.

```bash
ocrmypdf                      # it's a scriptable command line program
   -l eng+fra                 # it supports multiple languages
   --rotate-pages             # it can fix pages that are misrotated
   --deskew                   # it can deskew crooked PDFs!
   --title "My PDF"           # it can change output metadata
   --jobs 4                   # it uses multiple cores by default
   --output-type pdfa         # it produces PDF/A by default
   input_scanned.pdf          # takes PDF input (or images)
   output_searchable.pdf      # produces validated PDF output
```

[See the release notes for details on the latest changes](https://ocrmypdf.readthedocs.io/en/latest/releasenotes/index.html).

## Main features

- Generates a searchable [PDF/A](https://en.wikipedia.org/?title=PDF/A) file from a regular PDF
- Places OCR text accurately below the image to ease copy / paste
- Keeps the exact resolution of the original embedded images
- When possible, inserts OCR information as a "lossless" operation without disrupting any other content
- Optimizes PDF images, often producing files smaller than the input file
- If requested, deskews and/or cleans the image before performing OCR
- Validates input and output files
- Distributes work across all available CPU cores
- Uses [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) engine to recognize more than [100 languages](https://github.com/tesseract-ocr/tessdata)
- Keeps your private data private
- Scales properly to handle files with thousands of pages
- Battle-tested on millions of PDFs

<img src="misc/screencast/demo.svg" alt="Demo of OCRmyPDF in a terminal session">

For details: please consult the [documentation](https://ocrmypdf.readthedocs.io/en/latest/).

## Motivation

I searched the web for a free command line tool to OCR PDF files: I found many, but none of them were really satisfying:

- Either they produced PDF files with misplaced text under the image (making copy/paste impossible)
- Or they did not handle accents and multilingual characters
- Or they changed the resolution of the embedded images
- Or they generated ridiculously large PDF files
- Or they crashed when trying to OCR
- Or they did not produce valid PDF files
- On top of that, none of them produced PDF/A files (a format designed for long-term storage)

...so I decided to develop my own tool.

## Installation

Linux, Windows, macOS and FreeBSD are supported. Docker images are also available, for both x64 and ARM.

| Operating system              | Install command               |
| ----------------------------- | ------------------------------|
| Debian, Ubuntu                | ``apt install ocrmypdf``      |
| Windows Subsystem for Linux   | ``apt install ocrmypdf``      |
| Fedora                        | ``dnf install ocrmypdf``      |
| macOS (Homebrew)              | ``brew install ocrmypdf``     |
| macOS (MacPorts)              | ``port install ocrmypdf``     |
| macOS (nix)                   | ``nix-env -i ocrmypdf``       |
| LinuxBrew                     | ``brew install ocrmypdf``     |
| FreeBSD                       | ``pkg install py-ocrmypdf``   |
| OpenBSD                       | ``pkg_add ocrmypdf``          |
| Ubuntu Snap                   | ``snap install ocrmypdf``     |

For everyone else, [see our documentation](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for installation steps.

## Languages

OCRmyPDF uses Tesseract for OCR, and relies on its language packs. For Linux users, you can often find packages that provide language packs:

```bash

# Debian/Ubuntu users
apt-cache search tesseract-ocr # Display a list of all Tesseract language packs
apt-get install tesseract-ocr-chi-sim  # Example: Install Chinese Simplified language pack


# Arch Linux users
pacman -S tesseract-data-eng tesseract-data-deu # Example: Install the English and German language packs

# OpenBSD users
pkg_info -aQ tesseract  # Display a list of all Tesseract language packs
pkg_add tesseract-cym  # Example: Install the Welsh language pack

# brew macOS users
brew install tesseract-lang

# Fedora users
dnf search tesseract-langpack # Display a list of all Tesseract language packs 
dnf install tesseract-langpack-ita # Example: Install the Italian language pack


```

You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple languages can be requested.

OCRmyPDF supports Tesseract 4.1.1+. It will automatically use whichever version it finds first on the `PATH` environment variable. On Windows, if `PATH` does not provide a Tesseract binary, we use the highest version number that is installed according to the Windows Registry.

## Documentation and support

Once OCRmyPDF is installed, the built-in help which explains the command syntax and options can be accessed via:

```bash
ocrmypdf --help
```

Our [documentation is served on Read the Docs](https://ocrmypdf.readthedocs.io/en/latest/index.html).

Please report issues on our [GitHub issues](https://github.com/ocrmypdf/OCRmyPDF/issues) page, and follow the issue template for a quick response.

## Feature demo

```bash
# Add an OCR layer and require PDF/A
ocrmypdf --output-type pdfa input.pdf output.pdf

# Convert an image to single page PDF
ocrmypdf input.jpg output.pdf

# Add OCR to a file in place (only modifies file on success)
ocrmypdf myfile.pdf myfile.pdf

# OCR with non-English languages (look up your language's ISO 639-3 code)
ocrmypdf -l fra LeParisien.pdf LeParisien.pdf

# OCR multilingual documents
ocrmypdf -l eng+fra Bilingual-English-French.pdf Bilingual-English-French.pdf

# Deskew (straighten crooked pages)
ocrmypdf --deskew input.pdf output.pdf
```

For more features, see the [documentation](https://ocrmypdf.readthedocs.io/en/latest/index.html).

## Requirements

In addition to the required Python version, OCRmyPDF requires external program installations of Ghostscript and Tesseract OCR. OCRmyPDF is pure Python, and runs on pretty much everything: Linux, macOS, Windows and FreeBSD.

## Plugins

OCRmyPDF provides a plugin interface allowing its capabilities to be extended or replaced. Here are some plugins we are aware of:

- [OCRmyPDF-AppleOCR](https://github.com/mkyt/ocrmypdf-AppleOCR): replaces the standard Tesseract OCR engine with Apple Vision Framework. Requires macOS.
- [OCRmyPDF-EasyOCR](https://github.com/ocrmypdf/OCRmyPDF-EasyOCR): replaces the standard Tesseract OCR engine with EasyOCR, a newer OCR engine based on PyTorch. GPU strongly recommended.
- [OCRmyPDF-PaddleOCR](https://github.com/clefru/ocrmypdf-paddleocr): replaces the standard Tesseract OCR engine with PaddleOCR, a powerful GPU accelerated OCR engine.

[paperless-ngx](https://docs.paperless-ngx.com/) provides integration of OCRmyPDF into a searchable document management system.

## Press & Media

- [Going paperless with OCRmyPDF](https://medium.com/@ikirichenko/going-paperless-with-ocrmypdf-e2f36143f46a)
- [Converting a scanned document into a compressed searchable PDF with redactions](https://medium.com/@treyharris/converting-a-scanned-document-into-a-compressed-searchable-pdf-with-redactions-63f61c34fe4c)
- [c't 1-2014, page 59](https://heise.de/-2279695): Detailed presentation of OCRmyPDF v1.0 in the leading German IT magazine c't
- [heise Open Source, 09/2014: Texterkennung mit OCRmyPDF](https://heise.de/-2356670)
- [heise Durchsuchbare PDF-Dokumente mit OCRmyPDF erstellen](https://www.heise.de/ratgeber/Durchsuchbare-PDF-Dokumente-mit-OCRmyPDF-erstellen-4607592.html)
- [Excellent Utilities: OCRmyPDF](https://www.linuxlinks.com/excellent-utilities-ocrmypdf-add-ocr-text-layer-scanned-pdfs/)
- [LinuxUser Texterkennung mit OCRmyPDF und Scanbd automatisieren](https://www.linux-community.de/ausgaben/linuxuser/2021/06/texterkennung-mit-ocrmypdf-und-scanbd-automatisieren/)
- [Y Combinator discussion](https://news.ycombinator.com/item?id=32028752)

## Business enquiries

OCRmyPDF would not be the software that it is today without companies and users choosing to provide support for feature development and consulting enquiries. We are happy to discuss all enquiries, whether for extending the existing feature set, or integrating OCRmyPDF into a larger system.

## License

The OCRmyPDF software is licensed under the Mozilla Public License 2.0 (MPL-2.0). This license permits integration of OCRmyPDF with other code, including commercial and closed source code, but asks you to publish source-level modifications you make to OCRmyPDF.

Some components of OCRmyPDF have other licenses, as indicated by standard SPDX license identifiers or the DEP5 copyright and licensing information file. Generally speaking, non-core code is licensed under MIT, and the documentation and test files are licensed under Creative Commons ShareAlike 4.0 (CC-BY-SA 4.0).

## Disclaimer

The software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.


================================================
FILE: README_ZH.md
================================================
# OCRmyPDF 中文说明

以下是基于原始英文版 [README](README.md) 的中文翻译。

<!-- SPDX-FileCopyrightText: 2014 Julien Pfefferkorn -->
<!-- SPDX-FileCopyrightText: 2015 James R. Barlow -->
<!-- SPDX-License-Identifier: CC-BY-SA-4.0 -->

<img src="docs/images/logo.svg" width="240" alt="OCRmyPDF">

[![构建状态](https://github.com/ocrmypdf/OCRmyPDF/actions/workflows/build.yml/badge.svg)](https://github.com/ocrmypdf/OCRmyPDF/actions/workflows/build.yml) [![PyPI 版本][pypi]](https://pypi.org/project/ocrmypdf/) ![Homebrew 版本][homebrew] ![ReadTheDocs][docs] ![Python 版本][pyversions]

[pypi]: https://img.shields.io/pypi/v/ocrmypdf.svg "PyPI 版本"
[homebrew]: https://img.shields.io/homebrew/v/ocrmypdf.svg "Homebrew 版本"
[docs]: https://readthedocs.org/projects/ocrmypdf/badge/?version=latest "RTD"
[pyversions]: https://img.shields.io/pypi/pyversions/ocrmypdf "支持的 Python 版本"

OCRmyPDF 为扫描的 PDF 文件添加 OCR 文本层，使其可以被搜索或复制粘贴。

```bash
ocrmypdf                      # 这是一个可脚本化的命令行程序
   -l eng+fra                 # 支持多种语言
   --rotate-pages             # 可以修正旋转错误的页面
   --deskew                   # 可以校正倾斜的 PDF！
   --title "My PDF"           # 可以更改输出元数据
   --jobs 4                   # 默认使用多核心处理
   --output-type pdfa         # 默认生成 PDF/A 格式
   input_scanned.pdf          # 接受 PDF 输入（或图像）
   output_searchable.pdf      # 生成经过验证的 PDF 输出
```

[查看发布说明了解最新变更的详情](https://ocrmypdf.readthedocs.io/en/latest/release_notes.html)。

## 主要特点

- 从普通 PDF 生成可搜索的 [PDF/A](https://en.wikipedia.org/?title=PDF/A) 文件
- 准确地将 OCR 文本放置在图像下方，便于复制/粘贴
- 保持原始嵌入图像的精确分辨率
- 在可能的情况下，以"无损"操作方式插入 OCR 信息，不破坏任何其他内容
- 优化 PDF 图像，通常生成比输入文件更小的文件
- 如果需要，在执行 OCR 前对图像进行校正和/或清理
- 验证输入和输出文件
- 在所有可用的 CPU 核心上分配工作
- 使用 [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) 引擎识别超过 [100 种语言](https://github.com/tesseract-ocr/tessdata)
- 保护您的私人数据安全
- 适当扩展以处理包含数千页的文件
- 在数百万 PDF 上经过实战测试

<img src="misc/screencast/demo.svg" alt="终端会话中的 OCRmyPDF 演示">

详情请参阅[文档](https://ocrmypdf.readthedocs.io/en/latest/)。

## 开发动机

我在网上搜索免费的命令行工具来对 PDF 文件进行 OCR：我找到了很多，但没有一个真正令人满意：

- 要么它们生成的 PDF 文件中文本位置错误（使复制/粘贴变得不可能）
- 要么它们不处理重音和多语言字符
- 要么它们改变了嵌入图像的分辨率
- 要么它们生成了体积巨大的 PDF 文件
- 要么它们在尝试 OCR 时崩溃
- 要么它们不生成有效的 PDF 文件
- 最重要的是，它们都不生成 PDF/A 文件（专为长期存储设计的格式）

...所以我决定开发自己的工具。

## 安装

支持 Linux、Windows、macOS 和 FreeBSD。Docker 镜像也可用，同时支持 x64 和 ARM。

| 操作系统                     | 安装命令                      |
| --------------------------- | ----------------------------- |
| Debian, Ubuntu              | ``apt install ocrmypdf``      |
| Windows Subsystem for Linux | ``apt install ocrmypdf``      |
| Fedora                      | ``dnf install ocrmypdf``      |
| macOS (Homebrew)            | ``brew install ocrmypdf``     |
| macOS (MacPorts)            | ``port install ocrmypdf``     |
| macOS (nix)                 | ``nix-env -i ocrmypdf``       |
| LinuxBrew                   | ``brew install ocrmypdf``     |
| FreeBSD                     | ``pkg install py-ocrmypdf``   |
| Ubuntu Snap                 | ``snap install ocrmypdf``     |

对于其他用户，[请参阅我们的文档](https://ocrmypdf.readthedocs.io/en/latest/installation.html)了解安装步骤。

## 语言

OCRmyPDF 使用 Tesseract 进行 OCR，并依赖其语言包。对于 Linux 用户，您通常可以找到提供语言包的软件包：

```bash
# 显示所有 Tesseract 语言包的列表
apt-cache search tesseract-ocr

# Debian/Ubuntu 用户
apt-get install tesseract-ocr-chi-sim  # 示例：安装中文简体语言包

# Arch Linux 用户
pacman -S tesseract-data-eng tesseract-data-deu # 示例：安装英语和德语语言包

# brew macOS 用户
brew install tesseract-lang
```

然后，您可以向 OCRmyPDF 传递 `-l LANG` 参数，提示它应该搜索哪些语言。可以请求多种语言。

OCRmyPDF 支持 Tesseract 4.1.1+。它会自动使用在 `PATH` 环境变量中首先找到的版本。在 Windows 上，如果 `PATH` 不提供 Tesseract 二进制文件，我们会根据 Windows 注册表使用已安装的最高版本号。

## 文档和支持

安装 OCRmyPDF 后，可以通过以下方式访问内置帮助，解释命令语法和选项：

```bash
ocrmypdf --help
```

我们的[文档托管在 Read the Docs 上](https://ocrmypdf.readthedocs.io/en/latest/index.html)。

请在我们的 [GitHub issues](https://github.com/ocrmypdf/OCRmyPDF/issues) 页面上报告问题，并遵循问题模板以获得快速响应。

## 功能演示

```bash
# 添加 OCR 层并转换为 PDF/A
ocrmypdf input.pdf output.pdf

# 将图像转换为单页 PDF
ocrmypdf input.jpg output.pdf

# 就地为文件添加 OCR（仅在成功时修改文件）
ocrmypdf myfile.pdf myfile.pdf

# 使用非英语语言进行 OCR（查找您语言的 ISO 639-3 代码）
ocrmypdf -l fra LeParisien.pdf LeParisien.pdf

# OCR 多语言文档
ocrmypdf -l eng+fra Bilingual-English-French.pdf Bilingual-English-French.pdf

# 校正（矫正倾斜的页面）
ocrmypdf --deskew input.pdf output.pdf
```

更多功能，请参阅[文档](https://ocrmypdf.readthedocs.io/en/latest/index.html)。

## 要求

除了所需的 Python 版本外，OCRmyPDF 还需要外部程序安装 Ghostscript 和 Tesseract OCR。OCRmyPDF 是纯 Python 编写的，几乎可以在所有平台上运行：Linux、macOS、Windows 和 FreeBSD。

## 媒体报道

- [使用 OCRmyPDF 实现无纸化](https://medium.com/@ikirichenko/going-paperless-with-ocrmypdf-e2f36143f46a)
- [将扫描文档转换为带有编辑的压缩可搜索 PDF](https://medium.com/@treyharris/converting-a-scanned-document-into-a-compressed-searchable-pdf-with-redactions-63f61c34fe4c)
- [c't 1-2014, 第 59 页](https://heise.de/-2279695)：在德国领先的 IT 杂志 c't 中详细介绍 OCRmyPDF v1.0
- [heise Open Source, 09/2014: 使用 OCRmyPDF 进行文本识别](https://heise.de/-2356670)
- [heise 使用 OCRmyPDF 创建可搜索的 PDF 文档](https://www.heise.de/ratgeber/Durchsuchbare-PDF-Dokumente-mit-OCRmyPDF-erstellen-4607592.html)
- [优秀实用工具：OCRmyPDF](https://www.linuxlinks.com/excellent-utilities-ocrmypdf-add-ocr-text-layer-scanned-pdfs/)
- [LinuxUser 使用 OCRmyPDF 和 Scanbd 自动化文本识别](https://www.linux-community.de/ausgaben/linuxuser/2021/06/texterkennung-mit-ocrmypdf-und-scanbd-automatisieren/)
- [Y Combinator 讨论](https://news.ycombinator.com/item?id=32028752)

## 商业咨询

如果没有公司和用户选择为功能开发和咨询提供支持，OCRmyPDF 就不会成为今天的软件。我们很乐意讨论所有咨询，无论是扩展现有功能集，还是将 OCRmyPDF 集成到更大的系统中。

## 许可证

OCRmyPDF 软件根据 Mozilla 公共许可证 2.0 (MPL-2.0) 授权。此许可证允许将 OCRmyPDF 与其他代码集成，包括商业和闭源代码，但要求您发布对 OCRmyPDF 所做的源代码级修改。

OCRmyPDF 的某些组件有其他许可证，如标准 SPDX 许可证标识符或 DEP5 版权和许可信息文件所示。一般来说，非核心代码根据 MIT 许可，文档和测试文件根据 Creative Commons ShareAlike 4.0 (CC-BY-SA 4.0) 许可。

## 免责声明

本软件按"原样"分发，不提供任何明示或暗示的保证或条件。

本中文版 README 译自英文原版，保留了原始文档的所有重要信息，包括功能介绍、安装说明、语言支持、使用示例等内容，并保持了原始格式和结构。

================================================
FILE: REUSE.toml
================================================
version = 1
SPDX-PackageName = "OCRmyPDF"
SPDX-PackageSupplier = "James R. Barlow <james@purplerock.ca>"
SPDX-PackageDownloadLocation = "https://github.com/ocrmypdf/OCRmyPDF"

[[annotations]]
path = ["docs/**", 'misc/screencast/**']
precedence = "aggregate"
SPDX-FileCopyrightText = "(C) 2025 James R. Barlow"
SPDX-License-Identifier = "CC-BY-SA-4.0"

[[annotations]]
path = [
    "uv.lock",
    ".git_archival.txt",
    "docs/images/logo-social.png",
    "docs/images/logo-square-256.svg",
    "docs/images/logo-square.png",
    "docs/images/logo-square.svg",
    "docs/images/logo.svg",
]
precedence = "aggregate"
SPDX-FileCopyrightText = "(C) 2025 James R. Barlow"
SPDX-License-Identifier = "MPL-2.0"

[[annotations]]
path = [".github/ISSUE_TEMPLATE/**.yml"]
precedence = "aggregate"
SPDX-FileCopyrightText = "(C) 2025 James R. Barlow"
SPDX-License-Identifier = "CC-BY-SA-4.0"

[[annotations]]
path = [
    "tests/resources/acroform.pdf",
    "tests/resources/aspect.pdf",
    "tests/resources/blank.pdf",
    "tests/resources/cmyk.pdf",
    "tests/resources/crom.png",
    "tests/resources/enormous.pdf",
    "tests/resources/formxobject.pdf",
    "tests/resources/francais.pdf",
    "tests/resources/hugemono.pdf",
    "tests/resources/invalid.pdf",
    "tests/resources/kcs.pdf",
    "tests/resources/livecycle.pdf",
    "tests/resources/meta.pdf",
    "tests/resources/missing_docinfo.pdf",
    "tests/resources/negzero.pdf",
    "tests/resources/no_contents.pdf",
    "tests/resources/tagged**",
    "tests/resources/toc.pdf",
    "tests/resources/trivial.pdf",
    "tests/resources/truetype_font_nomapping.pdf",
    "tests/resources/type3_font_nomapping.pdf",
]
precedence = "aggregate"
SPDX-FileCopyrightText = "(C) 2025 James R. Barlow"
SPDX-License-Identifier = "CC-BY-SA-4.0"

[[annotations]]
path = ["tests/resources/graph.pdf", "tests/resources/graph_ocred.pdf"]
precedence = "aggregate"
SPDX-FileCopyrightText = "(C) 2012 SmokeyJoe"
SPDX-License-Identifier = "GFDL-1.2-or-later or CC-BY-SA-3.0"

[[annotations]]
path = ["tests/resources/c02-22.pdf", "tests/resources/multipage.pdf"]
precedence = "aggregate"
SPDX-FileCopyrightText = "Public domain"
SPDX-License-Identifier = "public-domain"

[[annotations]]
path = "docs/images/bitmap_vs_svg.svg"
precedence = "aggregate"
SPDX-FileCopyrightText = "(C) 2006 Yug"
SPDX-License-Identifier = "CC-BY-SA-2.5"

[[annotations]]
path = "tests/cache/**"
precedence = "aggregate"
SPDX-FileCopyrightText = "(C) 2025 James R. Barlow"
SPDX-License-Identifier = "CC-BY-SA-4.0"

[[annotations]]
path = [
    "tests/resources/linn.png",
    "tests/resources/linn.pdf",
    "tests/resources/linn.txt",
    "tests/resources/ccitt.pdf",
    "tests/resources/cardinal.pdf",
    "tests/resources/jbig2.pdf",
    "tests/resources/jbig2_baddevicen.pdf",
    "tests/resources/skew.pdf",
    "tests/resources/rotated_skew.pdf",
    "tests/resources/poster.pdf",
]
precedence = "aggregate"
SPDX-FileCopyrightText = "(C) 1985 Forat Electronics"
SPDX-License-Identifier = "GFDL-1.2-or-later or CC-BY-SA-3.0"

[[annotations]]
path = "tests/resources/lichtenstein.pdf"
precedence = "aggregate"
SPDX-FileCopyrightText = ["(C) 2001 Andreas Tille", "(C) 2007 Alessio Damato"]
SPDX-License-Identifier = "GFDL-1.2-or-later or CC-BY-SA-3.0"

[[annotations]]
path = "tests/resources/masks.pdf"
precedence = "aggregate"
SPDX-FileCopyrightText = [
    "held by the contributors to the German Wikipedia article \"Linux\"",
    "see: https://de.wikipedia.org/w/index.php?title=Linux&action=history",
    "(masks.pdf generated from Wikipedia article as of 2016-08-24)",
]
SPDX-License-Identifier = "CC-BY-SA-3.0"

[[annotations]]
path = "tests/resources/epson.pdf"
precedence = "aggregate"
SPDX-FileCopyrightText = [
    "held by the contributors to the Wikipedia article \"Optical character recognition\"",
    "see: https://en.wikipedia.org/w/index.php?title=Optical_character_recognition&action=history",
    "(epson.pdf generated from Wikipedia article as of 2016-09-14)",
]
SPDX-License-Identifier = "CC-BY-SA-3.0"

[[annotations]]
path = ["tests/resources/typewriter.png", "tests/resources/2400dpi.pdf"]
precedence = "aggregate"
SPDX-FileCopyrightText = "(C) 2005 Ellywa"
SPDX-License-Identifier = "GFDL-1.2-or-later or CC-BY-SA-1.0 or CC-BY-SA-2.0 or CC-BY-SA-2.5 or CC-BY-SA-3.0"
SPDX-FileComment = "\n Obtained from: https://commons.wikimedia.org/wiki/File:Triumph.typewriter_text_Linzensoep.gif"

[[annotations]]
path = "tests/resources/overlay.pdf"
precedence = "aggregate"
SPDX-FileCopyrightText = "(C) 2017 Max Anderson"
SPDX-License-Identifier = "MIT"

[[annotations]]
path = [
    "tests/resources/baiona**.png",
    "tests/resources/baiona**.jpg",
    "tests/resources/link.pdf",
    "tests/resources/palette.pdf",
]
precedence = "aggregate"
SPDX-FileCopyrightText = "(C) 2014 Euskaldunaa"
SPDX-License-Identifier = "CC-BY-SA-4.0"

[[annotations]]
path = "tests/resources/vector.pdf"
precedence = "aggregate"
SPDX-FileCopyrightText = "(C) 2018 Catscratch"
SPDX-License-Identifier = "MIT"

[[annotations]]
path = "src/ocrmypdf/data/sRGB.icc"
precedence = "aggregate"
SPDX-FileCopyrightText = [
    "Kai-Uwe Behrmann <www.behrmann.name>",
    "Marti Maria <www.littlecms.com>",
    "Photogamut <www.photogamut.org>",
    "Graeme Gill <www.argyllcms.com>",
    "ColorSolutions <www.basICColor.com>",
]
SPDX-License-Identifier = "Zlib"

[[annotations]]
path = "src/ocrmypdf/data/Occulta.ttf"
precedence = "aggregate"
SPDX-FileCopyrightText = ["(C) 2026 James R. Barlow"]
SPDX-License-Identifier = "Apache-2.0"

[[annotations]]
path = "tests/resources/3small.pdf"
precedence = "aggregate"
SPDX-FileCopyrightText = [
    "(C) 2014 Euskaldunaa",
    "(C) 2017 James R. Barlow",
    "(C) 2005 Ellywa",
]
SPDX-License-Identifier = "CC-BY-SA-4.0 and (GFDL-1.2-or-later or CC-BY-SA-1.0 or CC-BY-SA-2.0 or CC-BY-SA-2.5 or CC-BY-SA-3.0)"
SPDX-FileComment = "concatenation of baiona_gray.png, crom.png and typewriter.png/2400dpi.pdf"


================================================
FILE: bin/bump_version.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2017-2019 Joe Rickerby and contributors
# SPDX-License-Identifier: BSD-2-Clause

"""Bump the version number in all the right places."""

from __future__ import annotations

import glob
import os
import subprocess
import sys
import time
import urllib.parse
from pathlib import Path

import cyclopts
from packaging.version import InvalidVersion, Version

try:
    from github import Github, GithubException
except ImportError:
    Github = None  # type: ignore
    GithubException = Exception  # type: ignore

import ocrmypdf

# Files that embed the version string, paired with the template used to find
# and rewrite it. Each entry is (glob pattern, str.format template); the "{}"
# is filled with the current version to search for and with the new version
# to substitute.
config = [
    # file path, version find/replace format
    ("src/ocrmypdf/_version.py", '__version__ = "{}"'),
    ("pyproject.toml", 'version = "{}"'),
]

# ANSI escape sequences used to colorize terminal output.
RED = "\u001b[31m"
GREEN = "\u001b[32m"
YELLOW = "\u001b[33m"
OFF = "\u001b[0m"  # resets text attributes back to the terminal default

# GitHub "owner/name" slug, used for API lookups and when building URLs.
REPO_NAME = "ocrmypdf/OCRmyPDF"


def validate_release_notes(new_version: str) -> bool:
    """Check that the release notes contain a heading for ``new_version``.

    The notes are read from ``docs/releasenotes/versionNN.md``, where ``NN``
    is the zero-padded major version. The file must contain the text
    ``## v<new_version>``, and that text must not continue straight into more
    version characters: a ``## v1.2.3`` heading does not satisfy a request
    for ``1.2``.

    Args:
        new_version: Version string without the leading ``v``.

    Returns:
        True if a matching heading is found. False, after printing an error,
        if the release notes file is missing or has no such heading.

    Raises:
        InvalidVersion: If ``new_version`` cannot be parsed by ``Version``.
    """
    version_obj = Version(new_version)
    major = version_obj.major
    release_notes_path = Path(f"docs/releasenotes/version{major:02d}.md")

    if not release_notes_path.exists():
        print(f"{RED}error:{OFF} Release notes file not found: {release_notes_path}")
        return False

    content = release_notes_path.read_text(encoding="utf8")
    version_header = f"## v{new_version}"

    # A plain substring test would accept "## v1.2.3" when looking for
    # "## v1.2", so every candidate must end at a version boundary: either the
    # end of the file or a character that cannot extend a version string.
    # A tuple is used (not a str) so that the empty string at end-of-file is
    # never treated as a continuation character.
    continuation_chars = (".", "-", "+", "_")
    header_len = len(version_header)
    start = content.find(version_header)
    while start != -1:
        next_char = content[start + header_len : start + header_len + 1]
        if not (next_char.isalnum() or next_char in continuation_chars):
            break
        start = content.find(version_header, start + 1)

    if start == -1:
        print(
            f"{RED}error:{OFF} Version v{new_version} not found in {release_notes_path}"
        )
        print(f"       Expected to find: {version_header}")
        return False

    print(f"{GREEN}Found v{new_version} in {release_notes_path}{OFF}")
    return True


def get_github_client():
    """Build a GitHub API client from whatever credentials are available.

    The token is taken from the ``GITHUB_TOKEN`` environment variable when it
    is set and non-empty; otherwise it is requested from ``gh auth token``.

    Returns:
        A ``Github`` client, or None (after printing an error) when PyGithub
        is not installed, no token can be obtained, or constructing the
        client raises ``GithubException``.
    """
    if Github is None:
        print(f"{RED}error:{OFF} PyGithub is not installed")
        print("       Install with: pip install PyGithub")
        return None

    # An explicit token in the environment wins; the gh CLI is the fallback.
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        gh_command = ["gh", "auth", "token"]
        try:
            completed = subprocess.run(
                gh_command,
                capture_output=True,
                encoding="utf8",
                check=True,
            )
        except (FileNotFoundError, subprocess.CalledProcessError):
            # gh is either not installed or reported a failure.
            print(f"{RED}error:{OFF} No GitHub authentication found")
            print("       Set GITHUB_TOKEN env var or run: gh auth login")
            return None
        token = completed.stdout.strip()

    try:
        client = Github(token)
    except GithubException as exc:
        print(f"{RED}error:{OFF} Failed to authenticate with GitHub: {exc}")
        return None
    return client


def wait_for_ci_completion(commit_sha: str, timeout_minutes: int = 30) -> bool:
    """Poll GitHub until the CI workflow for ``commit_sha`` finishes.

    Polling happens in two phases that share a single time budget: first wait
    for a workflow run named "Test and deploy" to appear for the commit, then
    wait for that run to reach the "completed" status.

    Args:
        commit_sha: Full SHA of the commit whose workflow run is awaited.
        timeout_minutes: Total time allowed for both phases together.

    Returns:
        True only if the run completed with conclusion "success". False if
        no client or repository could be obtained, the run never appeared,
        the run concluded otherwise, or the time budget ran out.
    """
    client = get_github_client()
    if client is None:
        return False

    try:
        repo = client.get_repo(REPO_NAME)
    except GithubException as exc:
        print(f"{RED}error:{OFF} Failed to access repository: {exc}")
        return False

    workflow_name = "Test and deploy"
    started = time.time()
    budget_seconds = timeout_minutes * 60
    poll_interval = 30  # seconds

    def within_budget() -> bool:
        return time.time() - started < budget_seconds

    def seconds_elapsed() -> int:
        return int(time.time() - started)

    print(f"Waiting for CI workflow '{workflow_name}' on commit {commit_sha[:8]}...")

    # Phase 1: the run is created asynchronously after the push, so keep
    # listing the commit's runs until one with the expected name shows up.
    run = None
    while within_budget():
        try:
            matching = (
                candidate
                for candidate in repo.get_workflow_runs(head_sha=commit_sha)
                if candidate.name == workflow_name
            )
            run = next(matching, None)
        except GithubException as exc:
            print(f"{YELLOW}Warning:{OFF} Error checking workflow runs: {exc}")
        if run:
            break

        print(f"  Waiting for workflow to start... ({seconds_elapsed()}s)")
        time.sleep(poll_interval)

    if not run:
        print(
            f"{RED}error:{OFF} Workflow run not found within {timeout_minutes} minutes"
        )
        return False

    print(f"  Found workflow run #{run.run_number} (ID: {run.id})")

    # Phase 2: re-fetch the run until it reports completion.
    while within_budget():
        try:
            run = repo.get_workflow_run(run.id)
        except GithubException as exc:
            # Treat a failed refresh as transient and retry after a pause.
            print(f"{YELLOW}Warning:{OFF} Error refreshing workflow run: {exc}")
            time.sleep(poll_interval)
            continue

        state = run.status
        outcome = run.conclusion
        elapsed = seconds_elapsed()

        if state != "completed":
            print(f"  Status: {state} ({elapsed}s elapsed)")
            time.sleep(poll_interval)
            continue

        if outcome == "success":
            print(f"{GREEN}CI passed!{OFF} (took {elapsed}s)")
            return True

        print(f"{RED}CI failed!{OFF} Conclusion: {outcome}")
        print(f"  View details: {run.html_url}")
        return False

    print(f"{RED}error:{OFF} CI did not complete within {timeout_minutes} minutes")
    return False


def push_and_wait_for_ci(branch: str) -> bool:
    """Push ``branch`` to ``origin`` and block until CI reports a result.

    Args:
        branch: Name of the branch to push.

    Returns:
        False if the push fails; otherwise the result of waiting for CI on
        the commit that ``HEAD`` points to.

    Raises:
        subprocess.CalledProcessError: If ``git rev-parse HEAD`` fails.
    """
    print("Pushing to GitHub...")

    push = subprocess.run(
        ["git", "push", "origin", branch],
        capture_output=True,
        encoding="utf8",
    )
    if push.returncode != 0:
        print(f"{RED}error:{OFF} Failed to push: {push.stderr}")
        return False

    # CI is looked up by commit, so resolve HEAD to its SHA.
    head_sha = subprocess.run(
        ["git", "rev-parse", "HEAD"],
        capture_output=True,
        encoding="utf8",
        check=True,
    ).stdout.strip()

    print(f"Pushed commit {head_sha[:8]}")

    return wait_for_ci_completion(head_sha)


def push_tag(tag: str) -> bool:
    """Push ``tag`` to ``origin``, reporting the outcome on stdout.

    Args:
        tag: Name of an existing local tag.

    Returns:
        True if ``git push`` exited with status 0, False otherwise.
    """
    print(f"Pushing tag {tag} to trigger release...")

    outcome = subprocess.run(
        ["git", "push", "origin", tag],
        capture_output=True,
        encoding="utf8",
    )

    if outcome.returncode == 0:
        print(f"{GREEN}Tag {tag} pushed successfully!{OFF}")
        return True

    print(f"{RED}error:{OFF} Failed to push tag: {outcome.stderr}")
    return False


def bump_version() -> None:
    """Interactively bump the version, then commit, tag, and release.

    Steps, in order:

    1. Print a GitHub search URL for pull requests merged since the tag of
       the currently installed ``ocrmypdf`` version.
    2. Refuse to continue if the working tree has uncommitted changes.
    3. Prompt for the new version; it must parse as a PEP 440 version and
       appear in the release notes.
    4. Show the planned edits to the files listed in ``config`` and, after
       confirmation, apply them.
    5. Commit all changes and create an annotated ``v<version>`` tag.
    6. Push the branch, wait for CI, and push the tag only if CI passed.

    Calls ``sys.exit(1)`` when a check fails, the user declines, or a push or
    CI fails.

    Raises:
        subprocess.CalledProcessError: If ``git commit``, ``git tag`` or
            ``git rev-parse`` fails.
    """
    current_version = ocrmypdf.__version__  # type: ignore
    try:
        # Commit date of the current release tag, e.g. "2024-01-01 12:00:00 +0000".
        commit_date_str = subprocess.run(
            [
                "git",
                "show",
                "--no-patch",
                "--pretty=format:%ci",
                f"v{current_version}^{{commit}}",
            ],
            check=True,
            capture_output=True,
            encoding="utf8",
        ).stdout
        cd_date, cd_time, cd_tz = commit_date_str.split(" ")

        url_opts = urllib.parse.urlencode(
            {"q": f"is:pr merged:>{cd_date}T{cd_time}{cd_tz}"}
        )
        url = f"https://github.com/{REPO_NAME}/pulls?{url_opts}"

        print(f"PRs merged since last release:\n  {url}")
        print()
    except subprocess.CalledProcessError as e:
        print(e)
        print("Failed to get previous version tag information.")
        print("Is the virtual environment active?")
        sys.exit(1)

    # Pass git its arguments as a real list and skip the shell. The previous
    # one-element list combined with shell=True depended on POSIX-specific
    # handling of sequence arguments.
    git_changes_result = subprocess.run(
        ["git", "diff-index", "--quiet", "HEAD", "--"]
    )
    # With --quiet, a non-zero exit status signals differences from HEAD.
    repo_has_uncommitted_changes = git_changes_result.returncode != 0

    if repo_has_uncommitted_changes:
        print("error: Uncommitted changes detected.")
        sys.exit(1)

    # fmt: off
    print(              'Current version:', current_version)
    new_version = input('    New version: ').strip()
    # fmt: on

    try:
        Version(new_version)
    except InvalidVersion:
        print("error: This version doesn't conform to PEP440")
        print("       https://www.python.org/dev/peps/pep-0440/")
        sys.exit(1)

    # Validate release notes contain this version
    if not validate_release_notes(new_version):
        print()
        print("Please add release notes for this version before proceeding.")
        print(f"Edit: docs/releasenotes/version{Version(new_version).major:02d}.md")
        sys.exit(1)

    # Planned edits, collected first so nothing is written until confirmed.
    actions = []

    for path_pattern, version_pattern in config:
        paths = [Path(p) for p in glob.glob(path_pattern)]

        if not paths:
            print(f"error: Pattern {path_pattern} didn't match any files")
            sys.exit(1)

        find_pattern = version_pattern.format(current_version)
        replace_pattern = version_pattern.format(new_version)
        found_at_least_one_file_needing_update = False

        for path in paths:
            contents = path.read_text(encoding="utf8")
            if find_pattern in contents:
                found_at_least_one_file_needing_update = True
                actions.append(
                    (
                        path,
                        find_pattern,
                        replace_pattern,
                    )
                )

        # Every configured pattern must hit at least one file; otherwise the
        # version would be bumped in some places but not others.
        if not found_at_least_one_file_needing_update:
            print(
                f'''error: Didn't find any occurrences of "{find_pattern}" in "{path_pattern}"'''
            )
            sys.exit(1)

    print()
    print("Here's the plan:")
    print()

    for action in actions:
        path, find, replace = action
        print(f"{path}  {RED}{find}{OFF} → {GREEN}{replace}{OFF}")

    print(f"Then commit, and tag as v{new_version}")

    answer = input("Proceed? [y/N] ").strip()

    if answer != "y":
        print("Aborted")
        sys.exit(1)

    for path, find, replace in actions:
        contents = path.read_text(encoding="utf8")
        contents = contents.replace(find, replace)
        path.write_text(contents, encoding="utf8")

    print("Files updated.")
    print()

    # Pause so further manual changes can be made before the commit below,
    # which uses --all and therefore picks them up.
    while input('Type "done" to continue: ').strip().lower() != "done":
        pass

    subprocess.run(
        [
            "git",
            "commit",
            "--all",
            f"--message=Bump version: v{new_version}",
        ],
        check=True,
    )

    subprocess.run(
        [
            "git",
            "tag",
            "--annotate",
            f"--message=v{new_version}",
            f"v{new_version}",
        ],
        check=True,
    )

    print("Commit and tag created locally.")
    print()

    # Get current branch
    branch_result = subprocess.run(
        ["git", "rev-parse", "--abbrev-ref", "HEAD"],
        capture_output=True,
        encoding="utf8",
        check=True,
    )
    branch = branch_result.stdout.strip()

    # Push commit and wait for CI
    if not push_and_wait_for_ci(branch):
        print()
        print(f"{RED}CI failed. The tag was NOT pushed.{OFF}")
        print("Fix the issues, then manually push the tag:")
        print(f"    git push origin v{new_version}")
        sys.exit(1)

    # Push tag to trigger release
    if not push_tag(f"v{new_version}"):
        print(f"{RED}Failed to push tag.{OFF} Push manually:")
        print(f"    git push origin v{new_version}")
        sys.exit(1)

    print()
    print(f"{GREEN}Done! Release workflow has been triggered.{OFF}")
    print()

    release_url = f"https://github.com/{REPO_NAME}/releases/tag/v{new_version}"
    print("Monitor the release at:")
    print(f"    {release_url}")


if __name__ == "__main__":
    # Switch to the directory two levels above this script so the relative
    # paths used by the functions above (docs/..., src/..., pyproject.toml)
    # resolve no matter where the script is launched from.
    os.chdir(Path(__file__).parent.parent.resolve())
    # Hand bump_version to cyclopts to run as the command-line entry point.
    cyclopts.run(bump_version)


================================================
FILE: docs/advanced.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# Advanced features

## Control of unpaper

OCRmyPDF uses `unpaper` to provide the implementation of the
`--clean` and `--clean-final` arguments.
[unpaper](https://github.com/Flameeyes/unpaper/blob/main/doc/basic-concepts.md)
provides a variety of image processing filters to improve images.

By default, OCRmyPDF uses only `unpaper` arguments that were found to
be safe to use on almost all files without having to inspect every page
of the file afterwards. This is particularly true when only `--clean`
is used, since that instructs OCRmyPDF to only clean the image before
OCR and not the final image.

However, if you wish to use the more aggressive options in `unpaper`,
you may use `--unpaper-args '...'` to override the OCRmyPDF's defaults
and forward other arguments to unpaper. This option will forward
arguments to `unpaper` without any knowledge of what that program
considers to be valid arguments. The string of arguments must be quoted
as shown in the examples below. No filename arguments may be included.
OCRmyPDF will assume it can append the input and output filenames of
intermediate images to the `--unpaper-args` string.

In this example, we tell `unpaper` to expect two pages of text on a
sheet (image), such as occurs when two facing pages of a book are
scanned. `unpaper` uses this information to deskew each independently
and clean up the margins of both.

```bash
ocrmypdf --clean --clean-final --unpaper-args '--layout double' input.pdf output.pdf
ocrmypdf --clean --clean-final --unpaper-args '--layout double --no-noisefilter' input.pdf output.pdf
```

:::{warning}
Some `unpaper` features will reposition text within the image.
`--clean-final` is recommended to avoid this issue.
:::

:::{warning}
Some `unpaper` features cause multiple input or output files to be
consumed or produced. OCRmyPDF requires `unpaper` to consume one
file and produce one file; errors will result if this assumption is not
met.
:::

:::{note}
`unpaper` uses uncompressed PBM/PGM/PPM files for its intermediate
files. For large images or documents, it can take a lot of temporary
disk space.
:::

## Control of OCR options

OCRmyPDF provides many features to control the behavior of the OCR
engine, Tesseract.

### OCR processing mode

:::{versionadded} 17.0.0
The `--mode` (`-m`) argument consolidates OCR processing options.
:::

OCRmyPDF provides a unified `--mode` argument to control how pages with
existing text are handled:

| Mode | Behavior | Legacy equivalent |
|------|----------|-------------------|
| `default` | Error if text is found | (no flag) |
| `force` | Rasterize all content and run OCR | `--force-ocr` |
| `skip` | Skip pages with existing text | `--skip-text` |
| `redo` | Re-OCR pages, stripping old OCR layer | `--redo-ocr` |

```bash
# Skip pages that already have text
ocrmypdf --mode skip input.pdf output.pdf
# or equivalently:
ocrmypdf -m skip input.pdf output.pdf

# Force OCR on all pages (rasterizes everything)
ocrmypdf --mode force input.pdf output.pdf

# Re-do OCR, replacing old invisible text
ocrmypdf --mode redo input.pdf output.pdf
```

The legacy flags (`--force-ocr`, `--skip-text`, `--redo-ocr`) remain as
silent aliases for backward compatibility.

### When OCR is skipped

If a page in a PDF seems to have text, by default OCRmyPDF will exit
without modifying the PDF. This is to ensure that PDFs that were
previously OCRed or were "born digital" rather than scanned are not
processed.

If `--mode skip` (or `--skip-text`) is issued, then no image processing or OCR will be
performed on pages that already have text. The page will be copied to
the output. This may be useful for documents that contain both "born
digital" and scanned content, or to use OCRmyPDF to normalize and
convert to PDF/A regardless of their contents.

If `--mode redo` (or `--redo-ocr`) is issued, then a detailed text analysis is performed.
Text is categorized as either visible or invisible. Invisible text (OCR)
is stripped out. Then an image of each page is created with visible text
masked out. The page image is sent for OCR, and any additional text is
inserted as OCR. If a file contains a mix of text and bitmap images that
contain text, OCRmyPDF will locate the additional text in images without
disrupting the existing text. Some PDF OCR solutions render text as
technically printable or visible in some way, perhaps by drawing it and
then painting over it. OCRmyPDF cannot distinguish this type of OCR
text from real text, so it will not be "redone".

If `--mode force` (or `--force-ocr`) is issued, then all pages will be rasterized to
images, discarding any hidden OCR text, rasterizing any printable
text, and flattening form fields or interactive objects into their visual
representation. This is useful for redoing OCR, for fixing OCR text
with a damaged character map (text is selectable but not searchable),
and destroying redacted information.

### Time and image size limits

By default, OCRmyPDF permits tesseract to run for three minutes (180
seconds) per page. This is usually more than enough time to find all
text on a reasonably sized page with modern hardware.

If a page is skipped, it will be inserted without OCR. If preprocessing
was requested, the preprocessed image layer will be inserted.

If you want to adjust the amount of time spent on OCR, change
`--tesseract-timeout`. You can also automatically skip images that
exceed a certain number of megapixels with `--skip-big`. (A 300 DPI,
8.5×11" page image is 8.4 megapixels.)

```bash
# Allow 300 seconds for OCR; skip any page larger than 50 megapixels
ocrmypdf --tesseract-timeout 300 --skip-big 50 bigfile.pdf output.pdf
```

### OCR for huge images

Tesseract has internal limits on the size
of images it will process. By default,
`--tesseract-downsample-large-images` is enabled, and OCRmyPDF will
downsample images to fit Tesseract limits. (The limits are usually encountered
only for scanned images of oversized media, such as large maps or blueprints exceeding
110 cm or 43 inches in either dimension, and at high DPI.) This feature can be disabled
using `--no-tesseract-downsample-large-images`.

`--tesseract-downsample-above Npixels` adjusts the threshold at which images
will be downsampled. By default, only images that exceed any of Tesseract's
internal limits are downsampled (32767 pixels on either dimension).

You will also need to set `--tesseract-timeout` high enough to allow
for processing.

Only the image sent for OCR is downsampled. The original image is
preserved.

```bash
# Allow 600 seconds for OCR on huge images
ocrmypdf --tesseract-timeout 600 \
    --tesseract-downsample-large-images \
    bigfile.pdf output.pdf

# Downsample images above 5000 pixels on the longest dimension to
# 5000 pixels
ocrmypdf --tesseract-timeout 120 \
    --tesseract-downsample-large-images \
    --tesseract-downsample-above 5000 \
    bigfile.pdf output_downsampled_ocr.pdf
```

### Overriding default tesseract

OCRmyPDF checks the system `PATH` for the `tesseract` binary.

Some relevant environment variables that influence Tesseract's behavior
include:

```{eval-rst}
.. envvar:: TESSDATA_PREFIX

   Overrides the path to Tesseract's data files. This can allow
   simultaneous installation of the "best" and "fast" training data
   sets. OCRmyPDF does not manage this environment variable.
```

```{eval-rst}
.. envvar:: OMP_THREAD_LIMIT

   Controls the number of threads Tesseract will use. OCRmyPDF will
   manage this environment variable if it is not already set.
```

For example, if you have a development build of Tesseract and don't wish to
use the system installation, you can launch OCRmyPDF as follows:

```bash
env \
    PATH=/home/user/src/tesseract/api:$PATH \
    TESSDATA_PREFIX=/home/user/src/tesseract \
    ocrmypdf input.pdf output.pdf
```

In this example `TESSDATA_PREFIX` is required to redirect Tesseract to
an alternate folder for its "tessdata" files.

### Overriding other support programs

In addition to tesseract, OCRmyPDF uses the following external binaries:

- `gs` (Ghostscript)
- `unpaper`
- `pngquant`
- `jbig2`

In each case OCRmyPDF will search the `PATH` environment variable to
locate the binaries. By modifying the `PATH` environment variable, you
can override the binaries that OCRmyPDF uses.

### Changing Tesseract configuration variables

You can override Tesseract's default [control
parameters](https://tesseract-ocr.github.io/tessdoc/tess3/ControlParams.html)
with a configuration file.

As an example, this configuration will disable Tesseract's dictionary
for the current language. Normally the dictionary is helpful for
interpolating words that are unclear, but it may interfere with OCR if
the document does not contain many words (for example, a list of part
numbers).

Create a file named "no-dict.cfg" with these contents:

```
load_system_dawg 0
language_model_penalty_non_dict_word 0
language_model_penalty_non_freq_dict_word 0
```

then run ocrmypdf as follows (along with any other desired arguments):

```bash
ocrmypdf --tesseract-config no-dict.cfg input.pdf output.pdf
```

:::{warning}
Some combinations of control parameters will break Tesseract or break
assumptions that OCRmyPDF makes about Tesseract's output.
:::

### Changing page segmentation mode

The directive `--tesseract-pagesegmode Nmode` forwards the desired page segmentation
mode to Tesseract OCR. The default is 3.

Page segmentation can improve OCR results when you know that a PDF ought to be
analyzed a particular way, such as PDFs whose pages contain only a single line of
text. For the vast majority of users, changing the page segmentation mode will only
make things worse.

As of June 2024, the Tesseract page segmentation modes are:

| ID  | Description                                                                                   |
| --- | --------------------------------------------------------------------------------------------- |
| 0   | Orientation and script detection (OSD) only.                                                  |
| 1   | Automatic page segmentation with OSD.                                                         |
| 2   | Automatic page segmentation, but no OSD, or OCR. (not implemented)                            |
| 3   | Fully automatic page segmentation, but no OSD. (Default)                                      |
| 4   | Assume a single column of text of variable sizes.                                             |
| 5   | Assume a single uniform block of vertically aligned text.                                     |
| 6   | Assume a single uniform block of text.                                                        |
| 7   | Treat the image as a single text line.                                                        |
| 8   | Treat the image as a single word.                                                             |
| 9   | Treat the image as a single word in a circle.                                                 |
| 10  | Treat the image as a single character.                                                        |
| 11  | Sparse text. Find as much text as possible in no particular order.                            |
| 12  | Sparse text with OSD.                                                                         |
| 13  | Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific. |

Modes 0, 1, 2, and 12 (all of those that enable orientation and script detection)
are not compatible with OCRmyPDF, which performs OSD in a separate step from OCR.
Their use may interfere with `--rotate-pages` and other features.

It is currently not possible to use advanced Tesseract OCR features, such as creating
OCR information, when using Tesseract through OCRmyPDF.

## Choosing a PDF rasterizer

:::{versionadded} 17.0.0
:::

rasterizing

: Converting a PDF page to an image for OCR processing.

OCRmyPDF supports two PDF rasterizers:

| Rasterizer | Package | Advantages | Disadvantages |
|------------|---------|------------|---------------|
| pypdfium2 | Python package | Faster, fewer version issues | Requires pypdfium2 package |
| Ghostscript | System binary | More widely packaged | Version consistency issues, restrictive AGPLv3 |

The `--rasterizer` argument controls which rasterizer is used:

```bash
# Automatic selection (default) - prefers pypdfium when available
ocrmypdf --rasterizer auto input.pdf output.pdf

# Force pypdfium2
ocrmypdf --rasterizer pypdfium input.pdf output.pdf

# Force Ghostscript
ocrmypdf --rasterizer ghostscript input.pdf output.pdf
```

pypdfium2 is a Python binding for pdfium, the PDF rendering library used
by Google Chrome and Chromium. It generally produces output identical to
Ghostscript but with better performance.

:::{note}
If pypdfium2 is not installed and `--rasterizer pypdfium` is requested,
OCRmyPDF will exit with an error. Install it with: `pip install pypdfium2`
:::

## Changing the PDF renderer

rendering

: Creating a new PDF from other data (such as an existing PDF).

:::{versionchanged} 17.0.0
The fpdf2 renderer is now the default, replacing the legacy hOCR renderer.
:::

OCRmyPDF uses PDF renderers to create the invisible text layer. The
renderer may be selected using `--pdf-renderer`. The default is
`auto` which selects `fpdf2`.

### The `fpdf2` renderer (default)

:::{versionadded} 17.0.0
:::

The fpdf2 renderer creates text layers using the fpdf2 library. It provides:

- Full multilingual support including RTL languages (Arabic, Hebrew, Persian)
- Accurate text positioning aligned with OCR bounding boxes
- Improved "Occulta" glyphless font handling:
  - Zero-width markers are properly handled
  - Double-width CJK characters are properly sized
- Direct OcrElement tree input (no hOCR intermediate format required)

The fpdf2 renderer is the recommended choice for all installations.

:::{note}
The fpdf2 renderer may be slightly slower than the legacy hocrtransform
renderer for some workloads. This is an area of ongoing optimization.
:::

In both renderers, a text-only layer is rendered and sandwiched (overlaid)
on to either the original PDF page, or a newly rasterized version of the
original PDF page (when `--mode force` is used). In this way, loss
of PDF information is generally avoided. (You may need to disable PDF/A
conversion and optimization to eliminate all lossy transformations.)

### The `sandwich` renderer

The `sandwich` renderer uses Tesseract's text-only PDF feature,
which produces a PDF page that lays out the OCR in invisible text.

Currently some problematic PDF viewers like Mozilla PDF.js and macOS
Preview have problems with segmenting its text output, and
mightrunseveralwordstogether. It also does not implement right to left
fonts (Arabic, Hebrew, Persian). The output of this renderer cannot
be edited. The sandwich renderer is retained for testing.

When image preprocessing features like `--deskew` are used, the
original PDF will be rendered as a full page and the OCR layer will be
placed on top.

### Legacy renderer options

The `hocr` and `hocrdebug` renderer options are deprecated and
automatically redirect to `fpdf2`. They will be removed in a future version.

## Rendering and rasterizing options

:::{versionadded} 14.3.0
:::

The `--continue-on-soft-render-error` option allows OCRmyPDF to
proceed if a page cannot be rasterized/rendered. This is useful if you are
trying to get the best possible OCR from a PDF that is not well-formed,
and you are willing to accept some pages that may not visually match the
input, and that may not OCR well.

## Color conversion strategy

:::{versionadded} 15.0.0
:::

OCRmyPDF uses Ghostscript to convert PDF to PDF/A. In some cases, this
conversion requires color conversion. The default strategy is to convert
using the `LeaveColorUnchanged` strategy, which preserves the original
color space wherever possible (some rare color spaces might still be
converted).

Usually document scanners produce PDFs in the sRGB color space, and do
not need to be converted, so the default strategy is appropriate.

Suppose that you have a document that was prepared for professional
printing in a Separation or CMYK color space, and text was converted to
curves. In this case, you may want to use a different color conversion
strategy. The `--color-conversion-strategy` option allows you to select a
different strategy, such as `RGB`.

## PDF/A output modes

:::{versionchanged} 17.0.0
The default `--output-type` is now `auto` instead of `pdfa`.
:::

OCRmyPDF can produce PDF/A compliant output for long-term archival. The
`--output-type` argument controls PDF/A conversion:

| Output type | Behavior |
|-------------|----------|
| `auto` | Best-effort PDF/A without requiring Ghostscript (default) |
| `pdfa` | PDF/A-2b via Ghostscript |
| `pdfa-1` | PDF/A-1b via Ghostscript |
| `pdfa-2` | PDF/A-2b via Ghostscript (same as `pdfa`) |
| `pdfa-3` | PDF/A-3b via Ghostscript |
| `pdf` | Standard PDF, no PDF/A conversion |
| `none` | No output file (useful with `--sidecar`) |

### Speculative PDF/A conversion

:::{versionadded} 17.0.0
:::

When `--output-type auto` is used (the default), OCRmyPDF attempts a
fast "speculative" PDF/A conversion that avoids Ghostscript when possible:

1. OCRmyPDF adds an sRGB ICC profile and PDF/A XMP metadata using pikepdf
2. If verapdf is available, it validates the result
3. If validation passes, Ghostscript is skipped entirely
4. If validation fails or verapdf is unavailable, falls back to Ghostscript

This approach is faster and avoids some Ghostscript limitations (such as
image transcoding), but only works for PDFs that are already "mostly"
PDF/A compliant.

### PDF/A conversion flow

The following diagram illustrates the PDF/A conversion decision tree:

```{mermaid}
flowchart TD
    A[Start] --> B{--output-type?}
    B -->|pdf| C[Output standard PDF]
    B -->|pdfa/pdfa-N| D[Use Ghostscript]
    B -->|auto| E[Attempt speculative conversion]

    E --> F["Add sRGB ICC + XMP metadata (pikepdf)"]
    F --> G{verapdf available?}

    G -->|No| H{Ghostscript available?}
    G -->|Yes| I[Validate with verapdf]

    I --> J{Validation passed?}
    J -->|Yes| K[Output PDF/A - Ghostscript skipped]
    J -->|No| H

    H -->|Yes| D
    H -->|No| L[Output standard PDF + WARNING]

    D --> M[Ghostscript PDF/A conversion]
    M --> N[Output PDF/A]

    style K fill:#90EE90
    style N fill:#90EE90
    style L fill:#FFB6C1
```

:::{warning}
**Breaking change:** If neither Ghostscript nor verapdf is installed,
`--output-type auto` will produce a standard PDF instead of PDF/A.
This is a change from previous versions where Ghostscript was required
and PDF/A was always produced.
:::

## Return code policy

OCRmyPDF writes all messages to `stderr`. `stdout` is reserved for
piping output files. `stdin` is reserved for piping input files.

The return codes generated by OCRmyPDF are considered part of the
stable user interface. They may be imported from
`ocrmypdf.exceptions`.

```{eval-rst}
.. list-table:: Return codes
    :widths: 5 35 60
    :header-rows: 1

    *   - Code
        - Name
        - Interpretation
    *   - 0
        - ``ExitCode.ok``
        - Everything worked as expected.
    *   - 1
        - ``ExitCode.bad_args``
        - Invalid arguments, exited with an error.
    *   - 2
        - ``ExitCode.input_file``
        - The input file does not seem to be a valid PDF.
    *   - 3
        - ``ExitCode.missing_dependency``
        - An external program required by OCRmyPDF is missing.
    *   - 4
        - ``ExitCode.invalid_output_pdf``
        - An output file was created, but it does not seem to be a valid PDF. The file will be available.
    *   - 5
        - ``ExitCode.file_access_error``
        - The user running OCRmyPDF does not have sufficient permissions to read the input file and write the output file.
    *   - 6
        - ``ExitCode.already_done_ocr``
        - The file already appears to contain text so it may not need OCR. See output message.
    *   - 7
        - ``ExitCode.child_process_error``
        - An error occurred in an external program (child process) and OCRmyPDF cannot continue.
    *   - 8
        - ``ExitCode.encrypted_pdf``
        - The input PDF is encrypted. OCRmyPDF does not read encrypted PDFs. Use another program such as ``qpdf`` to remove encryption.
    *   - 9
        - ``ExitCode.invalid_config``
        - A custom configuration file was forwarded to Tesseract using ``--tesseract-config``, and Tesseract rejected this file.
    *   - 10
        - ``ExitCode.pdfa_conversion_failed``
        - A valid PDF was created, but PDF/A conversion failed. The file will be available.
    *   - 15
        - ``ExitCode.other_error``
        - Some other error occurred.
    *   - 130
        - ``ExitCode.ctrl_c``
        - The program was interrupted by pressing Ctrl+C.

```

(tmpdir)=
## Changing temporary storage location

OCRmyPDF generates many temporary files during processing.

To change where temporary files are stored, change the `TMPDIR`
environment variable for ocrmypdf's environment. (Python's
`tempfile.gettempdir()` returns the root directory in which temporary
files will be stored.) For example, one could redirect `TMPDIR` to a
large RAM disk to avoid wear on HDD/SSD and potentially improve
performance.

On Windows, the `TEMP` environment variable is used instead.

## Debugging the intermediate files

OCRmyPDF normally saves its intermediate results to a temporary folder
and deletes this folder when it exits, whether it succeeded or failed.

If the `--keep-temporary-files` (`-k`) argument is issued on the
command line, OCRmyPDF will keep the temporary folder and print the location,
whether it succeeded or failed. An example message is:

```none
Temporary working files retained at:
/tmp/ocrmypdf.io.u20wpz07
```

When OCRmyPDF is launched as a snap, this corresponds to the snap filesystem, for instance:

> /tmp/snap-private-tmp/snap.ocrmypdf/tmp/ocrmypdf.io.u20wpz07

The organization of this folder is an implementation detail and subject
to change between releases. However the general organization is that
working files on a per page basis have the page number as a prefix
(starting with page 1), an infix indicates the processing stage, and a
suffix indicates the file type. Some important files include:

- `_rasterize.png` - what the input page looks like
- `_ocr.png` - the file that is sent to Tesseract for OCR; depending
  on arguments this may differ from the presentation image
- `_pp_deskew.png` - the image, after deskewing
- `_pp_clean.png` - the image, after cleaning with unpaper
- `_ocr_hocr.pdf` - the OCR file; appears as a blank page with invisible
  text embedded
- `_ocr_hocr.txt` - the OCR text (not necessarily all text on the page,
  if the page is mixed format)
- `fix_docinfo.pdf` - a temporary file created to fix the PDF DocumentInfo
  data structure
- `graft_layers.pdf` - the rendered PDF with OCR layers grafted on
- `pdfa.pdf` - `graft_layers.pdf` after conversion to PDF/A
- `pdfa.ps` - a PostScript file used by Ghostscript for PDF/A conversion
- `optimize.pdf` - the PDF generated before optimization
- `optimize.out.pdf` - the PDF generated by optimization
- `origin` - the input file
- `origin.pdf` - the input file or the input image converted to PDF
- `images/*` - images extracted during the optimization process; here
  the prefix indicates a PDF object ID not a page number


================================================
FILE: docs/api.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# Using the OCRmyPDF API

OCRmyPDF originated as a command line program and continues to have this
legacy, but parts of it can be imported and used in other Python
applications.

Some applications may want to consider running ocrmypdf from a
subprocess call anyway, as this provides isolation of its activities.

## Example

OCRmyPDF provides one high-level function to run its main engine from an
application.

```{versionchanged} 17.0
The {func}`ocrmypdf.ocr` function now accepts an {class}`~ocrmypdf.OcrOptions`
object as its first argument, providing a cleaner API with full type hints
and validation. The previous positional argument style remains supported.
```

### Modern API (recommended)

The recommended way to call {func}`ocrmypdf.ocr` is to construct an
{class}`~ocrmypdf.OcrOptions` object with all settings, then pass it
as the sole argument:

```python
import ocrmypdf
from ocrmypdf import OcrOptions

if __name__ == '__main__':  # To ensure correct behavior on Windows and macOS
    options = OcrOptions(
        input_file='input.pdf',
        output_file='output.pdf',
        deskew=True,
        languages=['eng'],
    )
    ocrmypdf.ocr(options)
```

{class}`~ocrmypdf.OcrOptions` is a Pydantic model that provides:

- Full type hints and IDE autocompletion
- Validation of option values at construction time
- Clear documentation of all available options

```{versionadded} 17.0
The {class}`~ocrmypdf.OcrOptions` class is now exported from the top-level
`ocrmypdf` module.
```

### Legacy API

For compatibility with OCRmyPDF < v17, the traditional calling style
with positional arguments is still fully supported:

```python
import ocrmypdf

if __name__ == '__main__':  # To ensure correct behavior on Windows and macOS
    ocrmypdf.ocr('input.pdf', 'output.pdf', deskew=True)
```

With this style, all of the command line arguments are available
and may be passed as equivalent keywords.

A few differences are that `verbose` and `quiet` are not available.
Instead, output should be managed by configuring logging.

### Parent process requirements

The {func}`ocrmypdf.ocr` function runs OCRmyPDF similar to command line
execution. To do this, it will:

- create worker processes or threads
- manage the signal flags of its worker processes
- execute other subprocesses (forking and executing other programs)

The Python process that calls {func}`ocrmypdf.ocr()` must be sufficiently
privileged to perform these actions.

There currently is no option to manage how jobs are scheduled other
than the argument `jobs=` which will limit the number of worker
processes.

Creating a child process to call {func}`ocrmypdf.ocr()` is suggested. That
way your application will survive and remain interactive even if
OCRmyPDF fails for any reason. For example:

```python
from multiprocessing import Process
import ocrmypdf
from ocrmypdf import OcrOptions

def ocrmypdf_process():
    options = OcrOptions(input_file='input.pdf', output_file='output.pdf')
    ocrmypdf.ocr(options)

def call_ocrmypdf_from_my_app():
    p = Process(target=ocrmypdf_process)
    p.start()
    p.join()
```

Programs that call {func}`ocrmypdf.ocr()` should also install a SIGBUS signal
handler (except on Windows), to raise an exception if access to a memory
mapped file fails. OCRmyPDF may use memory mapping.

{func}`ocrmypdf.ocr()` will take a threading lock to prevent multiple runs of itself
in the same Python interpreter process. Concurrent runs within one interpreter are not
thread-safe, because of how OCRmyPDF's plugins and Python's library import system work.
If you need to parallelize OCRmyPDF, use processes.

:::{warning}
On Windows and macOS, the script that calls {func}`ocrmypdf.ocr()` must be
protected by an "ifmain" guard (`if __name__ == '__main__'`). If you do
not do this, process semantics will prevent
OCRmyPDF from working correctly.
:::

### Logging

OCRmyPDF will log under loggers named `ocrmypdf`. In addition, it
imports `pdfminer` and `PIL`, both of which post log messages under
those logging namespaces.

You can configure the logging as desired for your application or call
{func}`ocrmypdf.configure_logging` to configure logging the same way
OCRmyPDF itself does. The command line parameters such as `--quiet`
and `--verbose` have no equivalents in the API; you must use the
provided configuration function or do configuration in a way that suits
your use case.

### Progress monitoring

OCRmyPDF uses the `rich` package to implement its progress bars.
{func}`ocrmypdf.configure_logging` will set up logging output to
`sys.stderr` in a way that is compatible with the display of the
progress bar. Use `ocrmypdf.ocr(...progress_bar=False)` to disable
the progress bar.

### Standard output

OCRmyPDF is strict about not writing to standard output so that
users can safely use it in a pipeline and produce a valid output
file. A caller application will have to ensure it does not write to
standard output either, if it wants to be compatible with this
behavior and support piping to a file. Another benefit of running
OCRmyPDF in a child process, as recommended above, is that it will
not interfere with the parent process's standard output.

### Exceptions

OCRmyPDF may throw standard Python exceptions, `ocrmypdf.exceptions.*`
exceptions, some exceptions related to multiprocessing, and
{exc}`KeyboardInterrupt`. The parent process should provide an exception
handler. OCRmyPDF will clean up its temporary files and worker processes
automatically when an exception occurs.

When OCRmyPDF succeeds conditionally, it returns an integer exit code.

### Plugin Development Changes

```{versionchanged} 16.13
Plugin hooks now receive {class}`~ocrmypdf.OcrOptions` objects instead of
`argparse.Namespace`.
```

- {class}`~ocrmypdf.OcrOptions` provides the same attribute access as `Namespace` (duck-typing compatible)
- Plugin developers should update type hints: `from ocrmypdf import OcrOptions`
- Built-in plugins no longer modify options in-place for better immutability

Most existing plugins will continue working without modification due to the
duck-typing compatibility between {class}`~ocrmypdf.OcrOptions` and `Namespace`.


================================================
FILE: docs/apiref.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# API reference

This page summarizes the rest of the public API. Generally speaking this
should be mainly of interest to plugin developers.

## ocrmypdf.api

```{eval-rst}
.. automodule:: ocrmypdf.api
    :members:
```

## ocrmypdf._options

```{eval-rst}
.. automodule:: ocrmypdf._options
    :members: OcrOptions
```

## ocrmypdf.exceptions

```{eval-rst}
.. automodule:: ocrmypdf.exceptions
    :members:
    :undoc-members:
```

## ocrmypdf.helpers

```{eval-rst}
.. automodule:: ocrmypdf.helpers
    :members:
```

## ocrmypdf.hocrtransform

```{eval-rst}
.. automodule:: ocrmypdf.hocrtransform
    :members:
```

## ocrmypdf.pdfa

```{eval-rst}
.. automodule:: ocrmypdf.pdfa
    :members:
```

## ocrmypdf.quality

```{eval-rst}
.. automodule:: ocrmypdf.quality
    :members:
```

## ocrmypdf.subprocess

```{eval-rst}
.. automodule:: ocrmypdf.subprocess
    :members:
```


================================================
FILE: docs/batch.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

Batch processing
================

This article provides information about running OCRmyPDF on multiple
files or configuring it as a service triggered by file system events.

Batch jobs
----------

Consider using the excellent [GNU
Parallel](https://www.gnu.org/software/parallel/) to apply OCRmyPDF to
multiple files at once.

Both `parallel` and `ocrmypdf` will try to use all available processors.
To maximize parallelism without overloading your system with processes,
consider using `parallel -j 2` to limit parallel to running two jobs at
once.

This command will run `ocrmypdf` on all files named `*.pdf` in the
current directory and write them to the previously created `output/`
folder. It will not search subdirectories.

The `--tag` argument tells parallel to print the filename as a prefix
whenever a message is printed, so that one can trace any errors to the
file that produced them.

:::{code} bash
parallel --tag -j 2 ocrmypdf '{}' 'output/{}' ::: *.pdf
:::

OCRmyPDF automatically repairs PDFs before parsing and gathering
information from them.

Directory trees
---------------

This will walk through a directory tree and run OCR on all files in
place, printing each filename in between runs:

:::{code} bash
find . -name '*.pdf' -printf '%p\n' -exec ocrmypdf '{}' '{}' \;
:::

This only runs one `ocrmypdf` process at a time. This variation uses
`find` to create a directory list and `parallel` to parallelize runs of
`ocrmypdf`, again updating files in place.

:::{code} bash
find . -name '*.pdf' | parallel --tag -j 2 ocrmypdf '{}' '{}'
:::

In a Windows batch file, use

:::{code} bat
for /r %%f in (*.pdf) do ocrmypdf %%f %%f
:::

With a Docker container, you will need to stream through standard input
and output:

:::{code} bash
find . -name '*.pdf' -print0 | xargs -0 | while read pdf; do
    pdfout=$(mktemp)
    docker run --rm -i jbarlow83/ocrmypdf - - <$pdf >$pdfout && cp $pdfout $pdf
done
:::

### Sample script

This user contributed script also provides an example of batch
processing.

:::{literalinclude} ../misc/batch.py
---
caption: misc/batch.py
---
:::

### Synology DiskStations

Synology DiskStations (Network Attached Storage devices) can run the
Docker image of OCRmyPDF if the Synology [Docker
package](https://www.synology.com/en-global/dsm/packages/Docker) is
installed. Attached is a script to address particular quirks of using
OCRmyPDF on one of these devices.

At the time this script was written, it only worked for x86-based
Synology products. It is not known if it will work on ARM-based Synology
products. Further adjustments might be needed to deal with the
Synology\'s relatively limited CPU and RAM.

:::{literalinclude} ../misc/synology.py
---
caption: misc/synology.py - Sample script for Synology DiskStations
---
:::

### Huge batch jobs

If you have thousands of files to work with, contact the author.
Consulting work related to OCRmyPDF helps fund this open source project
and all inquiries are appreciated.

Hot (watched) folders
---------------------

### Watched folders with watcher.py

OCRmyPDF has a folder watcher called watcher.py, which is currently
included in source distributions but not part of the main program. It
may be used natively or may run in a Docker container. Native instances
tend to give better performance. watcher.py works on all platforms.

Users may need to customize the script to meet their requirements.

:::{code} bash
# Using uv (recommended)
uv sync --extra watcher

# Or using pip
pip3 install ocrmypdf[watcher]

env OCR_INPUT_DIRECTORY=/mnt/input-pdfs \
    OCR_OUTPUT_DIRECTORY=/mnt/output-pdfs \
    OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \
    python3 watcher.py
:::

:::{list-table} watcher.py environment variables
---
header-rows: 1
---

* - Environment variable
  - Description
* - OCR\_INPUT\_DIRECTORY
  - Set input directory to monitor (recursive)
* - OCR\_OUTPUT\_DIRECTORY
  - Set output directory (should not be under input)
* - OCR\_ARCHIVE\_DIRECTORY
  - Set archive directory for processed originals (should not be under input, requires `OCR_ON_SUCCESS_ARCHIVE` to be set)
* - OCR\_ON\_SUCCESS\_DELETE
  - This will delete the input file if the exit code is 0 (OK)
* - OCR\_ON\_SUCCESS\_ARCHIVE
  - This will move the processed original file to `OCR_ARCHIVE_DIRECTORY` if the exit code is 0 (OK). Note that `OCR_ON_SUCCESS_DELETE` takes precedence over this option, i.e. if both options are set, the input file will be deleted.
* - OCR\_OUTPUT\_DIRECTORY\_YEAR\_MONTH
  - This will place files in the output in `{output}/{year}/{month}/{filename}`
* - OCR\_DESKEW
  - Apply deskew to crooked input PDFs
* - OCR\_JSON\_SETTINGS
  - A JSON string specifying any other arguments for `ocrmypdf.ocr`, e.g. `'OCR_JSON_SETTINGS={"rotate_pages": true, "optimize": "3"}'`.
* - OCR\_POLL\_NEW\_FILE\_SECONDS
  - Polling interval
* - OCR\_LOGLEVEL
  - Level of log messages to display
:::

One could configure a networked scanner or scanning computer to drop
files in the watched folder.

### Watched folders with Docker

The watcher service is included in the OCRmyPDF Docker image. To run it:

:::{code} bash
docker run \
    --volume <path to files to convert>:/input \
    --volume <path to store results>:/output \
    --volume <path to store processed originals>:/processed \
    --env OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \
    --env OCR_ON_SUCCESS_ARCHIVE=1 \
    --env OCR_DESKEW=1 \
    --env PYTHONUNBUFFERED=1 \
    --interactive --tty --entrypoint python3 \
    jbarlow83/ocrmypdf \
    watcher.py
:::

This service will watch for a file that matches `/input/*.pdf`, convert
it to an OCRed PDF in `/output/`, and move the processed original to
`/processed`. The parameters to this image are:

:::{list-table} Watcher Docker Parameters
:header-rows: 1

* - Parameter
  - Description
* - `--volume <path to files to convert>:/input`
  - Files placed in this location will be OCRed
* - `--volume <path to store results>:/output`
  - This is where OCRed files will be stored
* - `--volume <path to store processed originals>:/processed`
  - Archive processed originals here
* - `--env OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1`
  - Define environment variable `OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1` to place files in the output in `{output}/{year}/{month}/{filename}`
* - `--env OCR_ON_SUCCESS_ARCHIVE=1`
  - Define environment variable `OCR_ON_SUCCESS_ARCHIVE` to move processed originals
* - `--env OCR_DESKEW=1`
  - Define environment variable `OCR_DESKEW` to apply deskew to crooked input PDFs
* - `--env PYTHONUNBUFFERED=1`
  - This will force `STDOUT` to be unbuffered and allow you to see messages in docker logs
* - `--env OCR_LOGLEVEL='DEBUG'`
  - Level of log messages
* - `--env OCR_JSON_SETTINGS={"language":"deu+eng", "rotate_pages": true}`
  - A JSON string specifying any other arguments for `ocrmypdf.ocr`
:::

This service relies on polling to check for changes to the filesystem.
It may not be suitable for some environments, such as filesystems shared
on a slow network.

A configuration manager such as Docker Compose could be used to ensure
that the service is always available.

:::{literalinclude} ../misc/docker-compose.example.yml
---
caption: misc/docker-compose.example.yml
---
:::

### Caveats

-   `watchmedo` may not work properly on a networked file system,
    depending on the capabilities of the file system client and server.
-   This simple recipe does not filter for the type of file system
    event, so file copies, deletes and moves, and directory operations,
    will all be sent to ocrmypdf, producing errors in several cases.
    Disable your watched folder if you are doing anything other than
    copying files to it.
-   If the source and destination directory are the same, watchmedo may
    create an infinite loop.
-   On BSD, FreeBSD and older versions of macOS, you may need to
    increase the number of file descriptors to monitor more files, using
    `ulimit -n 1024` to watch a folder of up to 1024 files.

### Alternatives

-   On Linux, [systemd user
    services](https://wiki.archlinux.org/index.php/Systemd/User) can be
    configured to automatically perform OCR on a collection of files.
-   [Watchman](https://facebook.github.io/watchman/) is a more powerful
    alternative to `watchmedo`.

macOS Automator
---------------

You can use the Automator app with macOS, to create a Workflow or Quick
Action. Use a *Run Shell Script* action in your workflow. In the context
of Automator, the `PATH` may be set differently from your Terminal\'s `PATH`;
you may need to explicitly set the PATH to include `ocrmypdf`. The
following example may serve as a starting point:

![](images/macos-workflow.png)

You may customize the command sent to ocrmypdf.


================================================
FILE: docs/cloud.md
================================================
% SPDX-FileCopyrightText: 2025 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

(ocr-service)=

# Online deployments

OCRmyPDF is designed to be used as a command line tool, but it can be
used in a web service. This document describes some considerations for
doing so.

A basic web service implementation is provided in the source code
repository, as `misc/webservice.py`. It is only demonstration quality
and is not intended for production use.

OCRmyPDF is not designed for use as a public web service where a
malicious user could upload a chosen PDF. In particular, it is not
necessarily secure against PDF malware or PDFs that cause denial of
service. For further discussion of security, see
[security](security).

OCRmyPDF relies on Ghostscript, and therefore, if deployed online one
should be prepared to comply with Ghostscript\'s Affero GPL license, and
any other licenses.

Setting aside these concerns, a side effect of OCRmyPDF is that it may
incidentally sanitize PDFs containing certain types of malware. It
repairs the PDF with pikepdf/libqpdf, which could correct malformed PDF
structures that are part of an attack. When PDF/A output is selected
(the default), the input PDF is partially reconstructed by Ghostscript.
When `--force-ocr` is used, all pages are rasterized and reconverted to
PDF, which could remove malware in embedded images.

## Limiting CPU usage

OCRmyPDF will attempt to use all available CPUs and storage, so
executing `nice ocrmypdf` or limiting the number of jobs with the
`--jobs` argument may ensure the server remains responsive. Another
option would be to run OCRmyPDF jobs inside a Docker container, a
virtual machine, or a cloud instance, which can impose its own limits on
CPU usage and be terminated \"from orbit\" if it fails to complete.

## Temporary storage requirements

OCRmyPDF will use a large amount of temporary storage for its work,
proportional to the total number of pixels needed to rasterize the PDF.
The raster image of an 8.5×11\" color page at 300 DPI takes 25 MB
uncompressed; OCRmyPDF saves its intermediates as PNG, but that still
means it requires about 9 MB per intermediate based on average
compression ratios. Multiple intermediates per page are also required,
depending on the command line given. A rule of thumb would be to allow
100 MB of temporary storage per page in a file -- meaning that small
cloud servers or small VM partitions should be provisioned with plenty
of extra space, if say, a 500 page file might be sent.

To change the temporary directory, see [tmpdir](#tmpdir).

On Amazon Web Services or other cloud vendors, consider setting your
temporary directory to [ephemeral
storage](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html).

## Timeouts

To prevent excessively long OCR jobs consider setting
`--tesseract-timeout` and/or `--skip-big` arguments. `--skip-big` is
particularly helpful if your PDFs include documents such as reports on
standard page sizes with large images attached - often large images are
not worth OCR\'ing anyway.

## Document management systems

If you are looking for a full document management system, consider
[paperless-ngx](https://github.com/paperless-ngx/paperless-ngx), which
is a web application that uses OCRmyPDF to automatically OCR and archive
documents.

## Commercial OCR alternatives

The author also provides professional services that include OCR and
building databases around PDFs, and is happy to provide consultation.

Abbyy Cloud OCR is a viable commercial alternative with a web services
API. Amazon Textract, Google Cloud Vision, and Microsoft Azure Computer
Vision provide advanced OCR but have less PDF rendering capability.


================================================
FILE: docs/conf.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: CC-BY-SA-4.0

# ruff: noqa: E402

# ocrmypdf documentation build configuration file, created by
# sphinx-quickstart on Sun Sep  4 14:29:43 2016.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))

# -- General configuration ------------------------------------------------
from __future__ import annotations

needs_sphinx = '8'

import datetime as dt

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'myst_parser',
    # Extensions that ship with Sphinx itself
    'sphinx.ext.autodoc',
    'sphinx.ext.intersphinx',
    'sphinx.ext.autosummary',
    'sphinx.ext.napoleon',
    'sphinx.ext.imgconverter',  # PDF docs needs this for SVG to PNG conversion
    # Other extensions
    'sphinx_issues',
    'sphinx_reredirects',
    'sphinxcontrib.mermaid',
]

# Optional MyST syntax features switched on for the Markdown sources.
myst_enable_extensions = [
    'colon_fence',
    'attrs_block',
    'attrs_inline',
    'substitution',
]

# Per-extension settings
intersphinx_mapping = {
    'python': ('https://docs.python.org/3', None),
}
napoleon_use_rtype = False
issues_github_path = 'ocrmypdf/OCRmyPDF'
redirects = {'release_notes': 'releasenotes/index.html'}

# Template directories, given relative to the directory holding this file.
templates_path = ['_templates']

# Maps each source filename suffix to the parser used for it.
source_suffix = {
    '.rst': 'restructuredtext',
    '.md': 'markdown',
    '.txt': 'markdown',
}

# Name of the document that holds the master toctree.
master_doc = 'index'

# General information about the project.
project = 'ocrmypdf'
author = 'James R. Barlow'

# The copyright notice always carries the year in which the docs are built.
year = f'{dt.date.today().year}'
copyright = (
    f'{year}, {author}. '
    'Licensed under Creative Commons Attribution-ShareAlike 4.0'
)

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.

import os
from importlib.metadata import version as package_version

# True only when the READTHEDOCS environment variable is exactly 'True'.
on_rtd = os.getenv('READTHEDOCS') == 'True'

if on_rtd:
    # On ReadTheDocs, register stand-ins for the modules listed below so that
    # binary extension modules do not have to be installed for the build.
    import sys
    from unittest.mock import MagicMock

    class Mock(MagicMock):
        # Any attribute lookup that reaches __getattr__ yields a new MagicMock.
        @classmethod
        def __getattr__(cls, name):
            return MagicMock()

    MOCK_MODULES = [
        'pikepdf',
        'pikepdf.canvas',
        'pikepdf.models',
        'pikepdf.models.metadata',
    ]
    sys.modules.update({mod_name: Mock() for mod_name in MOCK_MODULES})


# Full version string reported by the installed ocrmypdf distribution,
# including alpha/beta/rc tags.
release = package_version('ocrmypdf')
# Short X.Y version: the first two dot-separated components of the release.
version = '.'.join(release.split('.', maxsplit=2)[:2])


# Language of the content that Sphinx generates automatically; the Sphinx
# documentation lists the supported languages.
#
# Content translation via gettext catalogs uses this value as well, although
# in that case "language" is usually set from the command line.
language = 'en'

# |today| can be replaced in two ways. If today is set to some non-false
# value, that value is used:
#
# today = ''
#
# Otherwise today_fmt is passed to strftime as the format.
#
today_fmt = '%Y-%m-%d'

# Patterns, relative to the source directory, that match files and
# directories to ignore when looking for source files.
# These patterns also affect html_static_path and html_extra_path.
exclude_patterns = [
    '_build',
    'Thumbs.db',
    '.DS_Store',
]

# The reST default role (used for this markup: `text`) to use for all
# documents.
#
# default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
#
# add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#
# add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#
# show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# A list of ignored prefixes for module index sorting.
# modindex_common_prefix = []

# If true, keep warnings as "system message" paragraphs in the built documents.
# keep_warnings = False

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


# -- Options for HTML output ----------------------------------------------

import sphinx_rtd_theme  # noqa: F401

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#
html_theme_options = {}

# Add any paths that contain custom themes here, relative to this directory.
# html_theme_path = []

# The name for this set of Sphinx documents.
# "<project> v<release> documentation" by default.
#
# html_title = 'ocrmypdf v4.2'

# A shorter title for the navigation bar.  Default is the same as html_title.
#
# html_short_title = None

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#
# html_logo = "images/logo.svg"  # looks bad

# The name of an image file (relative to this directory) to use as a favicon of
# the docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#
# html_favicon = None

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']

# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#
# html_extra_path = []

# If not None, a 'Last updated on:' timestamp is inserted at every page
# bottom, using the given strftime format.
# The empty string is equivalent to '%b %d, %Y'.
#
# html_last_updated_fmt = None

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#
# html_use_smartypants = True

# Custom sidebar templates, maps document names to template names.
#
# html_sidebars = {}

# Additional templates that should be rendered to pages, maps page names to
# template names.
#
# html_additional_pages = {}

# If false, no module index is generated.
#
# html_domain_indices = True

# If false, no index is generated.
#
# html_use_index = True

# If true, the index is split into individual pages for each letter.
#
# html_split_index = False

# If true, links to the reST sources are added to the pages.
#
# html_show_sourcelink = True

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#
# html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#
# html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it.  The value of this option must be the
# base URL from which the finished HTML is served.
#
# html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
# html_file_suffix = None

# Language to be used for generating the HTML full-text search index.
# Sphinx supports the following languages:
#   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
#
# html_search_language = 'en'

# A dictionary with options for the search language support, empty by default.
# 'ja' uses this config value.
# 'zh' user can custom change `jieba` dictionary path.
#
# html_search_options = {'type': 'default'}

# The name of a javascript file (relative to the configuration directory) that
# implements a search results scorer. If empty, the default will be used.
#
# html_search_scorer = 'scorer.js'

# Output file base name for HTML help builder.
htmlhelp_basename = 'ocrmypdfdoc'

# -- Options for LaTeX output ---------------------------------------------

latex_elements = {  # type: ignore
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',
    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',
    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',
    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# One tuple per LaTeX output file, laid out as
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (
        master_doc,
        'ocrmypdf.tex',
        'ocrmypdf Documentation',
        'James R. Barlow',
        'manual',
    ),
]

# The name of an image file (relative to this directory) to place at the top of
# the title page.
#
# latex_logo = None

# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#
# latex_use_parts = False

# If true, show page references after internal links.
#
# latex_show_pagerefs = False

# If true, show URL addresses after external links.
#
# latex_show_urls = False

# Documents to append as an appendix to all manuals.
#
# latex_appendices = []

# If false, will not define \strong, \code, \titleref, \crossref ... but only
# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added
# packages.
#
# latex_keep_old_macro_names = True

# If false, no module index is generated.
#
# latex_domain_indices = True


# -- Options for manual page output ---------------------------------------

# One tuple per manual page, laid out as
# (source start file, name, description, authors, manual section).
man_pages = [
    (
        master_doc,
        'ocrmypdf',
        'ocrmypdf Documentation',
        [author],
        1,
    ),
]

# If true, show URL addresses after external links.
#
# man_show_urls = False


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (
        master_doc,
        'ocrmypdf',
        'ocrmypdf Documentation',
        author,
        'ocrmypdf',
        # Replaces the placeholder text left behind by sphinx-quickstart.
        'Adds an OCR text layer to scanned PDF files.',
        'Miscellaneous',
    )
]

# Documents to append as an appendix to all manuals.
#
# texinfo_appendices = []

# If false, no module index is generated.
#
# texinfo_domain_indices = True

# How to display URL addresses: 'footnote', 'no', or 'inline'.
#
# texinfo_show_urls = 'footnote'

# If true, do not generate a @detailmenu in the "Top" node's menu.
#
# texinfo_no_detailmenu = False


================================================
FILE: docs/contributing.md
================================================
% SPDX-FileCopyrightText: 2025 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# Contributing guidelines

Contributions are welcome!

## Big changes

Please open a new issue to discuss or propose a major change. Not only
is it fun to discuss big ideas, but we might save each other\'s time
too. Perhaps some of the work you\'re contemplating is already half-done
in a development branch.

## Code style

We use `ruff` for code formatting.
The settings for this program are in `pyproject.toml`. Pull requests
should follow the style guide. One difference we use from \"black\"
style is that strings shown to the user are always in double quotes
(`"`) and strings for internal uses are in single quotes (`'`).

## Tests

New features should come with tests that confirm their correctness.

## New dependencies

If you are proposing a change that will require a new dependency, we
prefer dependencies that are already packaged by Debian or Red Hat. This
makes life much easier for our downstream package maintainers. A package
that is only available on PyPI or GitHub, and not more widely packaged,
may not be accepted.

We are unlikely to accept a dependency on CUDA or other GPU-based
libraries, because these are still difficult to package and install on
many systems. We recommend implementing these changes as plugins.

Python dependencies must also be license-compatible. GPLv3 or AGPLv3 are
likely incompatible with the project\'s license, but LGPLv3 is
compatible.

## New non-Python dependencies

OCRmyPDF uses several external programs (Tesseract, Ghostscript and
others) for its functionality. In general we prefer to avoid adding new
external programs, and if we are to add external programs, we prefer
those that are already packaged by Debian or Red Hat.

## Plugins

Some new features may be a good fit for a plugin. Plugins are a way to
add features to OCRmyPDF without adding them to the core program.
Plugins are installed separately from OCRmyPDF. They are written in
Python and can be installed from PyPI. See the [plugin
documentation](https://ocrmypdf.readthedocs.io/en/latest/plugins.html).

We are happy to link users to your plugin from the documentation.

## Style guide: Is it OCRmyPDF or ocrmypdf?

The program/project is OCRmyPDF and the name of the executable or
library is ocrmypdf.

## Copyright and license

For contributions over 10 lines of code, please add your name to the list of
copyright holders for that file. The core program is licensed under
MPL-2.0, test files and documentation under CC-BY-SA 4.0, and
miscellaneous files under MIT, with a few minor exceptions. Please
contribute only content that you own or have the right to contribute
under these licenses.


================================================
FILE: docs/cookbook.md
================================================
% SPDX-FileCopyrightText: 2025 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# Cookbook

## Basic examples

### Help!

ocrmypdf has built-in help.

```bash
ocrmypdf --help
```

### Add an OCR layer and convert to PDF/A

```bash
ocrmypdf input.pdf output.pdf
```

### Add an OCR layer and output a standard PDF

```bash
ocrmypdf --output-type pdf input.pdf output.pdf
```

### Create a PDF/A with all color and grayscale images converted to JPEG

```bash
ocrmypdf --output-type pdfa --pdfa-image-compression jpeg input.pdf output.pdf
```

### Modify a file in place

The file will only be overwritten if OCRmyPDF is successful.

```bash
ocrmypdf myfile.pdf myfile.pdf
```

### Correct page rotation

OCR will attempt to automatically correct the rotation of each page. This
can help fix a scanning job that contains a mix of landscape and
portrait pages.

```bash
ocrmypdf --rotate-pages myfile.pdf myfile.pdf
```

You can increase (decrease) the parameter `--rotate-pages-threshold` to
make page rotation less (more) aggressive. The threshold number is the
ratio of how confident the OCR engine is that the document image should
be changed, compared to being kept the same. The default value is quite
conservative; on some files it may not attempt rotations at all unless
it is very confident that the current rotation is wrong. A lower value
of `2.0` will produce more rotations, and more false positives. Run with
`-v1` to see the confidence level for each page to see if there may be a
better value for your files.

If the page is \"just a little off horizontal\", like a crooked picture,
then you want `--deskew`. `--rotate-pages` is for when the cardinal
angle is wrong.

### OCR languages other than English

OCRmyPDF assumes the document is in English unless told otherwise. OCR
quality may be poor if the wrong language is used.

```bash
ocrmypdf -l fra LeParisien.pdf LeParisien.pdf
ocrmypdf -l eng+fra Bilingual-English-French.pdf Bilingual-English-French.pdf
```

Language packs must be installed for all languages specified. See
{ref}`Installing additional language packs <lang-packs>`.

Unfortunately, the Tesseract OCR engine has no ability to detect the
language when it is unknown.

### Produce PDF and text file containing OCR text

This produces a file named \"output.pdf\" and a companion text file
named \"output.txt\".

```bash
ocrmypdf --sidecar output.txt input.pdf output.pdf
```

:::{note}
The sidecar file contains the **OCR text** found by OCRmyPDF. If the
document contains pages that already have text, that text will not
appear in the sidecar. If the option `--pages` is used, only those pages
on which OCR was performed will be included in the sidecar. If certain
pages were skipped because of options like `--skip-big` or
`--tesseract-timeout`, those pages will not be in the sidecar.

If you don\'t want to generate the output PDF, use `--output-type=none`
to avoid generating one. Set the output filename to `-` (i.e. redirect
to stdout).

To extract all text from a PDF, whether generated from OCR or otherwise,
use a program like Poppler\'s `pdftotext` or `pdfgrep`.
:::

### OCR images, not PDFs

#### Option: use Tesseract

If you are starting with images, you can just use Tesseract directly to
convert images to PDFs:

```bash
tesseract my-image.jpg output-prefix pdf
```

```bash
# When there are multiple images
tesseract text-file-containing-list-of-image-filenames.txt output-prefix pdf
```

Tesseract\'s PDF output is quite good -- OCRmyPDF uses it internally, in
some cases. However, OCRmyPDF has many features not available in
Tesseract like image processing, metadata control, and PDF/A generation.

#### Option: use img2pdf

You can also use a program like
[img2pdf](https://gitlab.mister-muffin.de/josch/img2pdf) to convert your
images to PDFs, and then pipe the results to run ocrmypdf. The `-` tells
ocrmypdf to read standard input.

```bash
img2pdf my-images*.jpg | ocrmypdf - myfile.pdf
```

`img2pdf` is recommended because it does an excellent job at generating
PDFs without transcoding images.

#### Option: use OCRmyPDF (single images only)

For convenience, OCRmyPDF can also convert single images to PDFs on its
own. If the resolution (dots per inch, DPI) of an image is not set or is
incorrect, it can be overridden with `--image-dpi`. (As 1 inch is 2.54
cm, 1 dpi = 0.39 dpcm).

```bash
ocrmypdf --image-dpi 300 image.png myfile.pdf
```

If you have multiple images, you must use `img2pdf` to convert the
images to PDF.

#### Not recommended

We caution against using ImageMagick or Ghostscript to convert images to
PDF, since they may transcode images or produce downsampled images,
sometimes without warning.

(image-processing)=

## Image processing

OCRmyPDF performs some image processing on each page of a PDF, if
desired. The same processing is applied to each page. It is suggested
that the user review files after image processing as these commands
might remove desirable content, especially from poor quality scans.

-   `--rotate-pages` attempts to determine the correct orientation for
    each page and rotates the page if necessary.
-   `--remove-background` attempts to detect and remove a noisy
    background from grayscale or color images. Monochrome images are
    ignored. This should not be used on documents that contain color
    photos as it may remove them.
-   `--deskew` will correct pages that were scanned at a skewed angle by
    rotating them back into place.
-   `--clean` uses [unpaper](https://www.flameeyes.eu/projects/unpaper)
    to clean up pages before OCR, but does not alter the final output.
    This makes it less likely that OCR will try to find text in
    background noise.
-   `--clean-final` uses unpaper to clean up pages before OCR and
    inserts the page into the final output. You will want to review each
    page to ensure that unpaper did not remove something important.

:::{note}
In many cases image processing will rasterize PDF pages as images,
potentially losing quality.
:::

:::{warning}
`--clean-final` and `--remove-background` may leave undesirable visual
artifacts in some images where their algorithms have shortcomings. Files
should be visually reviewed after using these options.
:::

### Example: OCR and correct document skew (crooked scan)

Deskew:

```bash
ocrmypdf --deskew input.pdf output.pdf
```

Image processing commands can be combined. The order in which options
are given does not matter. OCRmyPDF always applies the steps of the
image processing pipeline in the same order (rotate, remove background,
deskew, clean).

```bash
ocrmypdf --deskew --clean --rotate-pages input.pdf output.pdf
```

Don\'t actually OCR my PDF
--------------------------

If you set `--ocr-engine none` OCRmyPDF will apply its image processing without
performing OCR. This works if all you want is to apply image processing or PDF/A
conversion.

```bash
ocrmypdf --ocr-engine none --deskew --output-type pdfa input.pdf output.pdf
```

:::{versionchanged} v17.0.0

Prior to this version, `--tesseract-timeout 0` was recommended as an idiom
to turn off OCR. This is no longer recommended, as we move away from
Tesseract OCR as the primary OCR engine.

:::

:::{versionchanged} v14.1.0

Prior to this version, `--tesseract-timeout 0` would prevent other uses
of Tesseract, such as deskewing, from working. This is no longer the
case. Use `--tesseract-non-ocr-timeout` to control the timeout for
non-OCR operations, if needed.
:::

### Remove all text or OCR from my PDF

This is getting ridiculous, but OCRmyPDF can completely strip all textual
information from a PDF and reconstruct it as a \"bag of images\" PDF.

```bash
ocrmypdf --ocr-engine none --force-ocr input.pdf output.pdf
```

Why would you want to do this? Perhaps you have a PDF where OCR fails to
produce useful results, and just want to get rid of all OCR information.
This command also removes OCR generated by third party tools.

### Optimize images without performing OCR

You can also optimize all images without performing any OCR:

```bash
ocrmypdf --ocr-engine none --optimize 3 --skip-text input.pdf output.pdf
```

## Using v17 features

### Select a rasterizer

:::{versionadded} 17.0.0
:::

OCRmyPDF can use pypdfium2 or Ghostscript to rasterize PDF pages. pypdfium2
is generally faster and is preferred when available.

```bash
# Automatic selection (default) - prefers pypdfium when available
ocrmypdf --rasterizer auto input.pdf output.pdf

# Explicitly use pypdfium2 (requires pip install pypdfium2)
ocrmypdf --rasterizer pypdfium input.pdf output.pdf

# Explicitly use Ghostscript
ocrmypdf --rasterizer ghostscript input.pdf output.pdf
```

### PDF/A without Ghostscript

:::{versionadded} 17.0.0
:::

With verapdf installed, OCRmyPDF can produce PDF/A without using Ghostscript
for conversion. This is faster and avoids some Ghostscript limitations.

```bash
# Uses speculative conversion with verapdf validation (default)
ocrmypdf --output-type auto input.pdf output.pdf

# Explicitly request Ghostscript-based PDF/A conversion
ocrmypdf --output-type pdfa input.pdf output.pdf
```

### Using --mode instead of legacy flags

:::{versionadded} 17.0.0
:::

The `--mode` (`-m`) flag consolidates OCR behavior options:

```bash
# Instead of --skip-text
ocrmypdf --mode skip input.pdf output.pdf

# Instead of --force-ocr
ocrmypdf --mode force input.pdf output.pdf

# Instead of --redo-ocr
ocrmypdf --mode redo input.pdf output.pdf

# Short form
ocrmypdf -m skip input.pdf output.pdf
```

The legacy flags continue to work as aliases.

### Process only certain pages

You can ask OCRmyPDF to only apply [image processing](#image-processing)
and OCR to certain pages.

```bash
ocrmypdf --pages 2,3,13-17 input.pdf output.pdf
```

Hyphens denote a range of pages and commas separate page numbers. If you
prefer to use spaces, quote all of the page numbers:
`--pages '2, 3, 5, 7'`.

OCRmyPDF will warn if your list of page numbers contains duplicates or
overlapping pages. OCRmyPDF does not currently account for document page
numbers, such as an introduction section of a book that uses Roman
numerals. It simply counts the number of virtual pieces of paper since
the start. If your list of pages is out of numerical order, OCRmyPDF
will sort it for you.

Regardless of the argument to `--pages`, OCRmyPDF will optimize all
pages/images in the file and convert it to PDF/A, unless you disable
those options. Both of these steps are \"whole file\" operations. In
this example, we want to OCR only the title and otherwise change the PDF
as little as possible:

```bash
ocrmypdf --pages 1 --output-type pdf --optimize 0 input.pdf output.pdf
```

## Redo existing OCR

To redo OCR on a file OCRed with other OCR software or a previous
version of OCRmyPDF and/or Tesseract, you may use the `--redo-ocr`
argument. (Normally, OCRmyPDF will exit with an error if asked to modify
a file with OCR.)

This may be helpful for users who want to take advantage of accuracy
improvements in Tesseract for files they previously OCRed with an
earlier version of Tesseract and OCRmyPDF.

```bash
ocrmypdf --redo-ocr input.pdf output.pdf
```

This method will replace OCR without rasterizing, reducing quality or
removing vector content. If a file contains a mix of pure digital text
and OCR, digital text will be ignored and OCR will be replaced. As such
this mode is incompatible with image processing options, since they
alter the appearance of the file.

In some cases, existing OCR cannot be detected or replaced. Files
produced by OCRmyPDF v2.2 or earlier, for example, are internally
represented as having visible text with an opaque image drawn on top.
This situation cannot be detected.

If `--redo-ocr` does not work, you can use `--force-ocr`, which will
force rasterization of all pages, potentially reducing quality or losing
vector content.

Improving OCR quality
---------------------

The [Image processing](#image-processing) features can improve OCR
quality.

Rotating pages and deskewing helps to ensure that the page orientation
is correct before OCR begins. Removing the background and/or cleaning
the page can also improve results. The `--oversample DPI` argument can
be specified to resample images to higher resolution before attempting
OCR; this can improve results as well.

OCR quality will suffer if the resolution of input images is not correct
(since the range of pixel sizes that will be checked for possible fonts
will also be incorrect).

## PDF optimization

By default OCRmyPDF will attempt to perform lossless optimizations on
the images inside PDFs after OCR is complete. Optimization is performed
even if no OCR text is found.

The `--optimize N` (short form `-O`) argument controls optimization,
where `N` ranges from 0 to 3 inclusive, analogous to the optimization
levels in the GCC compiler. `-O1` is the default.

For further details, see the section on [PDF optimization](optimizer).

```bash
ocrmypdf --optimize 3 in.pdf out.pdf  # Make it small
```

Some users may consider enabling lossy JBIG2. See:
`jbig2-lossy`{.interpreted-text role="ref"}.

:::{note}
Image processing and PDF/A conversion can also introduce lossy
transformations to your PDF images, even when `--optimize 1` is in use.
:::

Digitally signed PDFs
---------------------

OCRmyPDF cannot preserve digital signatures in PDFs and also add OCR to
them. By default, it will refuse to modify a signed PDF regardless of
other settings. You can override this behavior with
`--invalidate-digital-signatures`; as the name suggests, any digital
signatures will be invalidated.

OCRmyPDF cannot open documents that are encrypted with a digital
certificate.

Versions of OCRmyPDF prior to 14.4.0 would invalidate existing digital
signatures without warning.


================================================
FILE: docs/design_notes.md
================================================
% SPDX-FileCopyrightText: 2023 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# Design notes

## Why doesn\'t OCRmyPDF use PyTesseract?

PyTesseract is a Python wrapper around the Tesseract OCR engine. When
OCRmyPDF was first written, PyTesseract used ABI bindings to call the
Tesseract library. This was not a good fit for OCRmyPDF because ABI
bindings can be fragile.

PyTesseract has since evolved to call the Tesseract executable,
abandoning the ABI approach and using the CLI instead, just like
OCRmyPDF does. If it were written from scratch today, OCRmyPDF might use
PyTesseract.

PyTesseract has more features for users who don\'t particularly need PDF
output, but fewer features than OCRmyPDF\'s API for creating PDFs.

## What is `executor()`?

OCRmyPDF uses a custom concurrent executor which can support either
threads or processes with the same interface. This is useful because
OCRmyPDF can use either threads or processes to parallelize work,
whichever is more appropriate for the task at hand.

The interface is currently private and subject to change. In particular,
if experiments with asyncio and anyio are successful, the interface will
change.


================================================
FILE: docs/docker.md
================================================
# OCRmyPDF Docker image {#docker}

OCRmyPDF is also available in Docker images that package recent
versions of all dependencies.

For users who already have Docker installed this may be an easy and
convenient option.

On platforms other than Linux, Docker runs in a virtual machine, and so
may be less performant. You may also want to adjust the Docker virtual
machine\'s memory and CPU allocation. On Linux, the Docker image runs
natively and performance is comparable to a system installation.

{#docker-install}
## Installing the Docker image

If you have [Docker](https://docs.docker.com/) installed on your system,
you can install a Docker image of the latest release.

If you can run this command successfully, your system is ready to
download and execute the image:

:::{code} bash
docker run hello-world
:::

:::{list-table} Docker Images
:header-rows: 1

* - Image
  - Architecture
  - Description
* - `jbarlow83/ocrmypdf-alpine`
  - x86_64 and arm64
  - Recommended image, based on Alpine Linux.
* - `jbarlow83/ocrmypdf-ubuntu`
  - x86_64 and arm64
  - Alternate image, based on Ubuntu. When the Alpine image is considered stable and available for arm64, this image will be deprecated.
* - `jbarlow83/ocrmypdf`
  - x86_64 and arm64
  - Currently an alias for ocrmypdf-ubuntu. When the Alpine image is considered stable and available for arm64, this name will point to the Alpine image. If you don\'t know about the difference between Alpine and Ubuntu, use this image.
:::

To install:

:::{code} bash
docker pull jbarlow83/ocrmypdf-alpine
:::

The `ocrmypdf` image is also available, but is deprecated and will be
removed in the future.

OCRmyPDF will use all available CPU cores. See the Docker documentation
for [adjusting memory and CPU on other
platforms](https://docs.docker.com/config/containers/resource_constraints/)
if you are using Docker on macOS or Windows, where you may need to
manually assign more resources. On Linux, all resources will be
available automatically.

The underlying operating system and other details in Docker images are
considered implementation details and **subject to change at minor
releases**. If you are modifying the image, you should pin the version
you intend to use.

## Using the Docker image on the command line

**Unlike typical Docker containers**, in this section the OCRmyPDF
Docker container is ephemeral -- it runs for one OCR job and terminates,
just like a command line program. We are using Docker to deliver an
application (as opposed to the more conventional case, where a Docker
container runs as a server). For that reason we usually use the `--rm`
argument to delete the container when it exits.

To start a Docker container (instance of the image):

:::{code} bash
docker run --rm -i jbarlow83/ocrmypdf-alpine (... all other arguments here...) - -
:::

For convenience, create a shell alias to hide the Docker command. It is
easier to send the input file as stdin and read the output from stdout
-- **this avoids the messy permission issues with Docker entirely**.

:::{code} bash
alias docker_ocrmypdf='docker run --rm -i jbarlow83/ocrmypdf-alpine'
docker_ocrmypdf --version  # runs docker version
docker_ocrmypdf - - <input.pdf >output.pdf
:::

Or in the wonderful [fish shell](https://fishshell.com/):

:::{code} fish
alias docker_ocrmypdf 'docker run --rm jbarlow83/ocrmypdf-alpine'
funcsave docker_ocrmypdf
:::

Alternately, you could mount the local current working directory as a
Docker volume:

:::{code} bash
alias docker_ocrmypdf='docker run --rm  -i --user "$(id -u):$(id -g)" --workdir /data -v "$PWD:/data" jbarlow83/ocrmypdf-alpine'
docker_ocrmypdf /data/input.pdf /data/output.pdf
:::

## Podman

Especially if you use [Podman](https://podman.io/) (or use Docker in
rootless mode), you may need to add `--userns keep-id` there,
otherwise you may get access errors, because the user ID is otherwise not
mapped to the same UID as on the host:

:::{code} bash
alias podman_ocrmypdf='podman run --rm -i --user "$(id -u):$(id -g)" --userns keep-id --workdir /data -v "$PWD:/data" jbarlow83/ocrmypdf-alpine'
podman_ocrmypdf /data/input.pdf /data/output.pdf
:::

If you have SELinux enabled, you may additionally need to add the `:Z` [suffix to
the
volume](https://docs.podman.io/en/stable/markdown/podman-run.1.html#volume-v-source-volume-host-dir-container-dir-options)
or disable SELinux for the container using
`--security-opt label=disable`, which is suggested for system files as
they should not be re-labelled. Please refer to the \"Note\" section at
the end of the linked podman documentation for details. This results in
the following full command:

:::{code} bash
alias podman_ocrmypdf='podman run --rm -i --user "$(id -u):$(id -g)" --userns keep-id --workdir /data -v "$PWD:/data" --security-opt label=disable jbarlow83/ocrmypdf-alpine'
podman_ocrmypdf /data/input.pdf /data/output.pdf
:::

{#docker-lang-packs}
## Adding languages to the Docker image

By default the Docker image includes English, German, Simplified
Chinese, French, Portuguese and Spanish, the most popular languages for
OCRmyPDF users based on feedback. You may add other languages by
creating a new Dockerfile based on the public one.

:::{code} dockerfile
FROM jbarlow83/ocrmypdf

# Example: add Italian
RUN apt install tesseract-ocr-ita
:::

To install language packs (training data) such as the
[tessdata\_best](https://github.com/tesseract-ocr/tessdata_best) suite
or custom data, you first need to determine the version of Tesseract
data files, which may differ from the Tesseract program version. Use
this command to determine the data file version:

:::{code} bash
docker run -i --rm --entrypoint /bin/ls jbarlow83/ocrmypdf /usr/share/tesseract-ocr
:::

As of 2021, the data file version is probably `4.00`.

You can then add new data with either a Dockerfile:

:::{code} dockerfile
FROM jbarlow83/ocrmypdf:{TAG}

# Example: add a tessdata_best file
COPY chi_tra_vert.traineddata /usr/share/tesseract-ocr/<data version>/tessdata/
:::

When creating your own image, you should always pin a specific version
of the OCRmyPDF Docker image. This ensures that your image will not
break when a new version of OCRmyPDF is released.

Alternately, you can copy training data into a Docker container as
follows:

:::{code} bash
docker cp mycustomtraining.traineddata name_of_container:/usr/share/tesseract-ocr/<tesseract version>/tessdata/
:::

Extending the Docker image
--------------------------

You can extend the Docker image with your own customizations, similar to
the way it is extended to add language packs.

Note that the Docker image is subject to change at any time. For
example, the base image may be updated to a newer version of Ubuntu or
Debian. Such changes will be noted in the release notes but might occur
at minor version releases, unless they affect a \"casual\" user of the
Docker image.

If you extend the Docker image, you should pin a specific version of the
OCRmyPDF Docker image.

Executing the test suite
------------------------

The OCRmyPDF test suite is installed with the image. To run it:

:::{code} bash
docker run --rm --entrypoint python  jbarlow83/ocrmypdf -m pytest
:::

Accessing the shell
-------------------

To use the shell in the Docker image:

:::{code} bash
docker run -it --entrypoint sh  jbarlow83/ocrmypdf
:::

Using the OCRmyPDF web service wrapper
--------------------------------------

The OCRmyPDF Docker image includes an example, barebones HTTP web
service. The webservice may be launched as follows:

:::{code} bash
docker run --entrypoint python -p 5000:5000  jbarlow83/ocrmypdf webservice.py
:::

We omit the `--rm` parameter so that the container will not be
automatically deleted when it exits.

This will configure the machine to listen on port 5000. On Linux
machines this is port 5000 of localhost. On macOS or Windows machines
running Docker, this is port 5000 of the virtual machine that runs your
Docker images. You can find its IP address using the command
`docker-machine ip`.

Unlike command line usage this program will open a socket and wait for
connections.

:::{warning}
The OCRmyPDF web service wrapper is intended for demonstration or
development. It provides no security, no authentication, no protection
against denial of service attacks, and no load balancing. The default
Flask WSGI server is used, which is intended for development only. The
server is single-threaded and so can respond to only one client at a
time. While running OCR, it cannot respond to any other clients.
:::

Clients must keep their connection open while waiting for OCR to
complete. This may entail setting a long timeout; this interface is more
useful for internal HTTP API calls.

Unlike the rest of OCRmyPDF, this web service is licensed under the
Affero GPLv3 (AGPLv3) since Ghostscript is also licensed in this way.

In addition to the above, please read our
`general remarks on using OCRmyPDF as a service <ocr-service>`{.interpreted-text
role="ref"}.


================================================
FILE: docs/errors.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# Common error messages

## Page already has text

:::{code}
ERROR -    1: page already has text! – aborting (use --force-ocr to force OCR)
:::

You ran ocrmypdf on a file that already contains printable text or a
hidden OCR text layer (it can\'t quite tell the difference). You
probably don\'t want to do this, because the file is already searchable.

As the error message suggests, your options are:

-   `ocrmypdf --force-ocr` to
    `rasterize <raster-vector>`{.interpreted-text role="ref"} all vector
    content and run OCR on the images. This is useful if a previous OCR
    program failed, or if the document contains a text watermark.
-   `ocrmypdf --skip-text` to skip OCR and other processing on any pages
    that contain text. Text pages will be copied into the output PDF
    without modification.
-   `ocrmypdf --redo-ocr` to scan the file for any existing OCR
    (non-printing text), remove it, and do OCR again. This is one way to
    take advantage of improvements in OCR accuracy. Printable vector
    text is excluded from OCR, so this can be used on files that contain
    a mix of digital and scanned files.

## Input file \'filename\' is not a valid PDF

OCRmyPDF checks files with pikepdf, a library that in turn uses libqpdf
to fix errors in PDFs, before it tries to work on them. In most cases
this happens because the PDF is corrupt and truncated (incomplete file
copying) and not much can be done.

You can try rewriting the file with Ghostscript:

:::{code} bash
gs -o output.pdf -dSAFER -sDEVICE=pdfwrite input.pdf
:::

`pdftk` can also rewrite PDFs:

:::{code} bash
pdftk input.pdf cat output output.pdf
:::

Sometimes Acrobat can repair PDFs with its [Preflight
tool](https://helpx.adobe.com/acrobat/using/correcting-problem-areas-preflight-tool.html).


================================================
FILE: docs/index.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# OCRmyPDF documentation

:::{figure} images/logo.svg
:::

OCRmyPDF adds an optical character recognition (OCR) text layer to scanned PDF
files, allowing them to be searched.

PDF is the best format for storing and exchanging scanned documents.
Unfortunately, PDFs can be difficult to modify. OCRmyPDF makes it easy to apply
image processing and OCR (recognized, searchable text) to existing PDFs.

```{toctree}
:maxdepth: 1

introduction
releasenotes/index
installation
languages
jbig2
```

```{toctree}
:caption: Usage
:maxdepth: 2

cookbook
optimizer
docker
advanced
batch
cloud
performance
pdfsecurity
errors
```

```{toctree}
:caption: Developers
:maxdepth: 2

api
plugins
apiref
design_notes
contributing
maintainers
```

# Indices and tables

- {ref}`genindex`
- {ref}`modindex`
- {ref}`search`


================================================
FILE: docs/installation.md
================================================
---
myst:
  substitutions:
    deb_12: |-
      :::{image} https://repology.org/badge/version-for-repo/debian_12/ocrmypdf.svg
      :alt: Debian 12
      :::
    deb_13: |-
      :::{image} https://repology.org/badge/version-for-repo/debian_13/ocrmypdf.svg
      :alt: Debian 13
      :::
    deb_unstable: |-
      :::{image} https://repology.org/badge/version-for-repo/debian_unstable/ocrmypdf.svg
      :alt: Debian unstable
      :::
    fedora_40: |-
      :::{image} https://repology.org/badge/version-for-repo/fedora_40/ocrmypdf.svg
      :alt: Fedora 40
      :::
    fedora_41: |-
      :::{image} https://repology.org/badge/version-for-repo/fedora_41/ocrmypdf.svg
      :alt: Fedora 41
      :::
    fedora_rawhide: |-
      :::{image} https://repology.org/badge/version-for-repo/fedora_rawhide/ocrmypdf.svg
      :alt: Fedora Rawhide
      :::
    latest: |-
      :::{image} https://img.shields.io/pypi/v/ocrmypdf.svg
      :alt: OCRmyPDF latest released version on PyPI
      :::
    ubu_2204: |-
      :::{image} https://repology.org/badge/version-for-repo/ubuntu_22_04/ocrmypdf.svg
      :alt: Ubuntu 22.04 LTS
      :::
    ubu_2404: |-
      :::{image} https://repology.org/badge/version-for-repo/ubuntu_24_04/ocrmypdf.svg
      :alt: Ubuntu 24.04 LTS
      :::
---

% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# Installing OCRmyPDF

(latest)=

The easiest way to install OCRmyPDF is to follow the steps for your operating
system/platform. This version may be out of date, however.

These platforms have one-liner installs:

:::{list-table}
:header-rows: 0

* - Homebrew (macOS and Linux)
  - ``brew install ocrmypdf``
* - Debian, Ubuntu
  - ``apt install ocrmypdf``
* - Windows Subsystem for Linux
  - ``apt install ocrmypdf``
* - Fedora
  - ``dnf install ocrmypdf tesseract-osd``
* - macOS (MacPorts)
  - ``port install ocrmypdf``
* - FreeBSD
  - ``pkg install textproc/py-ocrmypdf``
* - Snap (snapcraft packaging)
  - ``snap install ocrmypdf``
:::

More detailed procedures are outlined below. If you want to do a manual
install, or install a more recent version than your platform provides, read on.

:::{contents} Platform-specific steps
:depth: 2
:local: true
:::

## Installing on Linux

### Debian and Ubuntu 22.04 or newer

:::{list-table}
:header-rows: 1

* - OCRmyPDF versions in Debian & Ubuntu
* - {{ latest }}
* - {{ deb_12 }} {{ deb_13 }} {{ deb_unstable }}
* - {{ ubu_2204 }} {{ ubu_2404 }}
:::

Users of Debian or Ubuntu may simply

```bash
apt install ocrmypdf
```

As indicated in the table above, Debian and Ubuntu releases may lag
behind the latest version. If the version available for your platform is
out of date, you could opt to install the latest version from source.
See [Installing HEAD revision from
sources](#installing-head-revision-from-sources).

For full details on version availability for your platform, check the
[Debian Package Tracker](https://tracker.debian.org/pkg/ocrmypdf) or
[Ubuntu launchpad.net](https://launchpad.net/ocrmypdf).

:::{note}
OCRmyPDF for Debian and Ubuntu currently omits the JBIG2 encoder.
OCRmyPDF works fine without it but will produce larger output files.
All JBIG2 patents expired in 2017, so if you build jbig2enc from source,
OCRmyPDF will automatically detect it on the `PATH`.
To add JBIG2 encoding, see {ref}`jbig2`.
:::

### Fedora

:::{list-table}
:header-rows: 1

* - OCRmyPDF version
* - {{latest}}
* - {{fedora_40}} {{fedora_41}} {{fedora_rawhide}}
:::

Users of Fedora may simply

```bash
dnf install ocrmypdf tesseract-osd
```

For full details on version availability, check the [Fedora Package
Tracker](https://packages.fedoraproject.org/pkgs/ocrmypdf/ocrmypdf/).

If the version available for your platform is out of date, you could opt
to install the latest version from source. See [Installing HEAD revision
from sources](#installing-head-revision-from-sources).

:::{note}
OCRmyPDF for Fedora currently omits the JBIG2 encoder. All JBIG2 patents
expired in 2017. OCRmyPDF works fine without it but will produce larger
output files. If you build jbig2enc from source, OCRmyPDF will automatically
detect it on the `PATH`. To add JBIG2 encoding, see {ref}`jbig2`.
:::

(ubuntu-lts-latest)=

### RHEL 9

Prepare the environment by getting Python 3.12:

```bash
dnf install python3.12 python3.12-pip
```

Then, follow [Requirements for pip and HEAD install](#requirements-for-pip-and-head-install) to install dependencies:

```bash
dnf install ghostscript tesseract
```

and build ocrmypdf in a virtual environment:

```bash
python3.12 -m venv .venv
```

To add JBIG2 encoding, see {ref}`Installing the JBIG2 encoder <jbig2>`.

Note that Fedora packages for language data haven't been branched for RHEL/EPEL, but you can get traineddata files directly from [tesseract](https://github.com/tesseract-ocr/tessdata/) and place them in `/usr/share/tesseract/tessdata`.

### Installing the latest version on Ubuntu 22.04/24.04 LTS

Ubuntu includes an older version of OCRmyPDF - you can install that with
`apt install ocrmypdf`. To install the latest version, we recommend using uv:

```bash
# Install system dependencies first
sudo apt-get update
sudo apt-get -y install ocrmypdf

# Install uv and upgrade to the latest OCRmyPDF
pip install uv
uv pip install --user --upgrade ocrmypdf
```

Alternatively, use Homebrew on Linux for a full-featured installation (see below).

To add JBIG2 encoding, see {ref}`jbig2`.

### Ubuntu 20.04 LTS (and other older distributions)

:::{note}
Ubuntu 20.04 is approaching end of life. Consider upgrading to Ubuntu 22.04 or 24.04 LTS.
:::

For older distributions, the most convenient way to install a recent version of
OCRmyPDF is to use Homebrew on Linux:

```bash
brew install ocrmypdf
```

See {ref}`homebrew-linux` for more information on using Homebrew on Linux.

### Arch Linux (AUR)

:::{image} https://repology.org/badge/version-for-repo/aur/ocrmypdf.svg
:alt: ArchLinux
:target: https://repology.org/metapackage/ocrmypdf
:::

There is an [Arch User Repository (AUR) package for OCRmyPDF](https://aur.archlinux.org/packages/ocrmypdf/).

Installing AUR packages as root is not allowed, so you must first [setup a
non-root user](https://wiki.archlinux.org/index.php/Users_and_groups#User_management) and
[configure sudo](https://wiki.archlinux.org/index.php/Sudo#Configuration).
The standard Docker image, `archlinux/base:latest`, does **not** have a
non-root user configured, so users of that image must follow these guides. If
you are using a VM image, such as [the official Vagrant image](https://app.vagrantup.com/archlinux/boxes/archlinux), this work may already
be completed for you.

Next you should install the [base-devel package group](https://archlinux.org/packages/core/any/base-devel/). This includes the
standard tooling needed to build packages, such as a compiler and binary tools.

```bash
sudo pacman -S --needed base-devel
```

Now you are ready to install the OCRmyPDF package.

```bash
curl -O https://aur.archlinux.org/cgit/aur.git/snapshot/ocrmypdf.tar.gz
tar xvzf ocrmypdf.tar.gz
cd ocrmypdf
makepkg -sri
```

At this point you will have a working install of OCRmyPDF, but the Tesseract
install won’t include any OCR language data. You can install [the
tesseract-data package group](https://www.archlinux.org/groups/any/tesseract-data/) to add all supported
languages, or use that package listing to identify the appropriate package for
your desired language.

```bash
sudo pacman -S tesseract-data-eng
```

As an alternative to this manual procedure, consider using an [AUR helper](https://wiki.archlinux.org/index.php/AUR_helpers). Such a tool will
automatically fetch, build and install the AUR package, resolve dependencies
(including dependencies on AUR packages), and ease the upgrade procedure.

If you have any difficulties with installation, check the repository package
page.

:::{note}
The OCRmyPDF AUR package currently omits the JBIG2 encoder. OCRmyPDF works
fine without it but will produce larger output files. The encoder is
available from [the jbig2enc-git AUR package](https://aur.archlinux.org/packages/jbig2enc-git/) and may be installed
using the same series of steps as for the installation of the OCRmyPDF AUR
package. Alternatively, it may be built manually from source following the
instructions in {ref}`Installing the JBIG2 encoder <jbig2>`. If JBIG2 is
installed, OCRmyPDF 7.0.0 and later will automatically detect it.
:::

### Alpine Linux

:::{image} https://repology.org/badge/version-for-repo/alpine_edge/ocrmypdf.svg
:alt: Alpine Linux
:target: https://repology.org/metapackage/ocrmypdf
:::

To install OCRmyPDF for Alpine Linux:

```bash
apk add ocrmypdf
```

### Gentoo Linux

:::{image} https://repology.org/badge/version-for-repo/gentoo_ovl_guru/ocrmypdf.svg
:alt: Gentoo Linux
:target: https://repology.org/metapackage/ocrmypdf
:::

To install OCRmyPDF on Gentoo Linux, use the following commands:

```bash
eselect repository enable guru
emaint sync --repo guru
emerge --ask app-text/OCRmyPDF
```

### Other Linux packages

See the
[Repology](https://repology.org/metapackage/ocrmypdf/versions) page.

In general, first install the OCRmyPDF package for your system, then
optionally use the procedure [Installing with Python
pip](#installing-with-python-pip) to install a more recent version.

(homebrew-linux)=

## Installing with Homebrew (macOS and Linux)

:::{image} https://img.shields.io/homebrew/v/ocrmypdf.svg
:alt: homebrew
:target: https://formulae.brew.sh/formula/ocrmypdf
:::

[Homebrew](https://brew.sh) provides a full-featured OCRmyPDF installation
on both macOS and Linux with all recommended dependencies. This is often
the easiest way to get a complete, up-to-date installation.

```bash
brew install ocrmypdf
```

This includes Tesseract, Ghostscript, and all required dependencies. English
language support is included by default. For other languages:

```bash
brew install tesseract-lang  # Optional: Install all language packs
```

:::{tip}
**For Linux users:** Homebrew on Linux is an excellent choice when your
distribution's package is outdated or missing optional dependencies like
jbig2enc, pngquant, or unpaper. Homebrew provides a consistent, full-featured
installation that works across many Linux distributions.

Install Homebrew on Linux: https://brew.sh
:::

## Installing on macOS

### Homebrew

See {ref}`homebrew-linux` above - the installation is identical on macOS.

### MacPorts

:::{image} https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fports.macports.org%2Fapi%2Fv1%2Fports%2Focrmypdf%2F%3Fformat%3Djson&query=version&label=MacPorts
:alt: Macports Version Information
:target: https://ports.macports.org/port/ocrmypdf
:::

OCRmyPDF is included in MacPorts:

```bash
sudo port install ocrmypdf
```

Note that while this will install tesseract you will need to install
the appropriate tesseract [language ports](https://ports.macports.org/search/?selected_facets=categories_exact%3Atextproc&installed_file=&q=tesseract&name=on).

### Manual installation on macOS

These instructions are for installing a more current version of OCRmyPDF than
is available from Homebrew. Note that Homebrew versions usually track
releases fairly closely.

If it's not already present, [install Homebrew](http://brew.sh/).

Update Homebrew and install dependencies:

```bash
brew update
```

Install or upgrade the required Homebrew packages, if any are missing.
To do this, use `brew edit ocrmypdf` to obtain a recent list of Homebrew
dependencies. You could also check `.github/workflows/build.yml`.

This will include the English, French, German and Spanish language
packs. If you need other languages you can optionally install them all:

(macos-all-languages)=

> ```bash
> brew install tesseract-lang  # Option 2: for all language packs
> ```

Install uv and OCRmyPDF:

```bash
pip install uv
uv pip install --user ocrmypdf
```

The command line program should now be available:

```bash
ocrmypdf --help
```

## Installing on Windows

### Native Windows

% If you have a Windows that is not the Home edition, you can use Windows Sandbox to test on a blank Windows instance.
% https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/

:::{note}
Administrator privileges will be required for some of these steps.
:::

You must install the following for Windows:

- Python 64-bit
- Tesseract 64-bit
- Ghostscript 64-bit

Using the [winget](https://docs.microsoft.com/en-us/windows/package-manager/winget/)
package manager:

- `winget install -e --id Python.Python.3.12`
- `winget install -e --id UB-Mannheim.TesseractOCR`

You will need to install Ghostscript manually, [since it does not support automated
installs anymore](https://artifex.com/news/ghostscript-10.01.0-disabling-silent-install-option).

- [Ghostscript download page](https://ghostscript.com/releases/gsdnld.html)

(Or alternately, using the [Chocolatey](https://chocolatey.org/) package manager, install
the following when running in an Administrator command prompt):

- `choco install python3`
- `choco install --pre tesseract`
- `choco install pngquant` (optional)

Either set of commands will install the required software. At the moment there is no
single command that installs everything on Windows.

You may then use `pip` to install ocrmypdf (this can be performed by a user or
Administrator):

- `python3 -m pip install ocrmypdf`

% The Windows Python versions do not place any python or python3 executable in the path.
% They add the py launcher to the path:
% https://docs.python.org/3/using/windows.html#python-launcher-for-windows

If you installed Python using WinGet, then use the following command instead:

- `py -m pip install ocrmypdf`

Then, to start OCRmyPDF, use:

- `py -m ocrmypdf`

If you intend to use more Python software on your Windows machine, consider the use of
[pipx](https://pipx.pypa.io/stable/) or a similar tool to create isolated Python
environments for each Python software that you want to use.

OCRmyPDF will check the Windows Registry and standard locations in your Program Files
for third party software it needs (specifically, Tesseract and Ghostscript). To
override the versions OCRmyPDF selects, you can modify the `PATH` environment
variable. [Follow these directions](https://www.computerhope.com/issues/ch000549.htm#dospath)
to change the PATH.

:::{warning}
32-bit Windows is not supported.
:::

### Windows Subsystem for Linux

1. Install Ubuntu 22.04 for Windows Subsystem for Linux, if not already installed.
2. Follow the procedure to install {ref}`OCRmyPDF on Ubuntu 22.04 <ubuntu-lts-latest>`.
3. Open the Windows command prompt and create a symlink:

```powershell
wsl sudo ln -s  /home/$USER/.local/bin/ocrmypdf /usr/local/bin/ocrmypdf
```

Then confirm that the expected version from PyPI ({{ latest }}) is installed:

```powershell
wsl ocrmypdf --version
```

You can then run OCRmyPDF in the Windows command prompt or PowerShell, prefixing
`wsl`, and call it from Windows programs or batch files.

### Cygwin64

First install the following prerequisite Cygwin packages using `setup-x86_64.exe`:

```
python311 (or later)
python3?-devel
python3?-pip
python3?-lxml
python3?-imaging

   (where 3? means match the version of python3 you installed)

gcc-g++
ghostscript
libexempi3
libexempi-devel
libffi6
libffi-devel
pngquant
qpdf
libqpdf-devel
tesseract-ocr
tesseract-ocr-devel
```

Then open a Cygwin terminal (i.e. `mintty`) and run the following commands. Note
that if you are using the version of `pip` that was installed with the Cygwin
Python package, the command name will be `pip3`. If you have since updated
`pip` (with, for instance, `pip3 install --upgrade pip`), then the command is
likely just `pip` instead of `pip3`:

```bash
pip3 install wheel
pip3 install ocrmypdf
```

The optional dependency "unpaper" is currently not available under Cygwin.
Without it, certain options such as `--clean` will produce an error message.
However, the OCR-to-text-layer functionality is available.

### Docker

You can also [Install the Docker image](docker) on Windows. Ensure that
your command prompt can run the docker "hello world" container.

## Installing on FreeBSD

:::{image} https://repology.org/badge/version-for-repo/freebsd/ocrmypdf.svg
:alt: FreeBSD
:target: https://repology.org/project/ocrmypdf/versions
:::

```bash
pkg install textproc/py-ocrmypdf
```

To install a more recent version, you could attempt to first install the system
version with `pkg`, then use `pip install --user ocrmypdf`.

## Installing the Docker image

For some users, installing the Docker image will be easier than
installing all of OCRmyPDF's dependencies.

See [Installing the Docker image](docker) for more information.

(installing-with-python-pip)=

## Installing with uv (recommended)

We recommend using [uv](https://docs.astral.sh/uv/) for installing OCRmyPDF from PyPI.
uv is a fast, modern Python package manager that provides better dependency resolution
and consistent behavior across all platforms.

For best results, first install [your platform's
version](https://repology.org/metapackage/ocrmypdf/versions) of
`ocrmypdf` using the instructions elsewhere in this document to satisfy system
dependencies. Then use uv to get the latest OCRmyPDF version.

```bash
# Install uv if you don't have it
pip install uv

# Install ocrmypdf in a virtual environment (recommended)
uv venv
source .venv/bin/activate  # On Windows: .venv\Scripts\activate
uv pip install ocrmypdf

# Or install globally
uv pip install --system ocrmypdf
```

Use `ocrmypdf --version` to confirm what version was installed.

### Installing with pip

If you prefer pip, you can still use it:

```bash
pip install --user ocrmypdf
```

(If the message appears `Requirement already satisfied: ocrmypdf in...`,
you will need to use `pip install --user --upgrade ocrmypdf`.)

### Installing with pipx

Some users may prefer pipx for isolated command-line tool installations:

```bash
pipx install ocrmypdf
```

Or run without permanent installation:

```bash
pipx run ocrmypdf
```

(requirements-for-pip-and-head-install)=

### Requirements for pip and HEAD install

OCRmyPDF currently requires these external programs and libraries to be
installed. These requirements must be satisfied using the operating system
package manager; `pip` cannot provide them.

:::{versionchanged} 17.0.0
Ghostscript is now optional. pypdfium2 can be used for PDF rasterization,
and verapdf can validate speculative PDF/A conversion.
:::

The following versions are required:

- Python 3.11 or newer (3.12+ recommended)
- Tesseract 4.1.1 or newer
- One of: Ghostscript 9.54+ **or** pypdfium2 (Python package)
- One of: Ghostscript 9.54+ **or** verapdf (for PDF/A output)
- fpdf2 2.8 or newer (Python package)
- uharfbuzz (Python package)
- fonts-noto or equivalent (system package, recommended)
- jbig2enc 0.29 or newer (optional)
- pngquant 2.5 or newer (optional)
- unpaper 6.1 (optional)

:::{note}
For the best user experience, install both Ghostscript and pypdfium2. pypdfium2 is
faster for rasterization, while Ghostscript is required for certain PDF/A
conversions.
:::

**Dependency summary:**

| Feature | Option 1 | Option 2 | Notes |
|---------|----------|----------|-------|
| PDF rasterization | pypdfium2 (Python) | Ghostscript (binary) | pypdfium2 preferred when available |
| PDF/A conversion | verapdf + pikepdf | Ghostscript | verapdf validates speculative conversion |
| Text rendering | fpdf2 + uharfbuzz | - | Required |
| OCR | tesseract-ocr | `--ocr-engine none` | Can be skipped entirely |

**Minimum viable installation:**
tesseract-ocr + (pypdfium2 OR Ghostscript) + fpdf2 + uharfbuzz

**Recommended installation:**
tesseract-ocr + pypdfium2 + Ghostscript + verapdf + fpdf2 + uharfbuzz + fonts-noto + unpaper + pngquant + jbig2enc

We recommend 64-bit versions of all software. (32-bit versions are not
supported, although on Linux, they may still work.)

**fpdf2** and **uharfbuzz** are required dependencies that provide the text
layer rendering engine. fpdf2 generates the PDF text layer, while uharfbuzz
provides text shaping for proper multilingual support. These replace the
legacy hOCR-based renderer. Install with: `pip install fpdf2 uharfbuzz`

**fonts-noto** (or an equivalent comprehensive font package) is recommended
for proper text rendering, especially for non-Latin scripts. On Debian/Ubuntu:
`apt install fonts-noto`. On Fedora: `dnf install google-noto-fonts-common`.
On macOS with Homebrew: `brew install font-noto`.

**pypdfium2**, if present, provides fast PDF page rasterization using
the pdfium library (the same library used by Google Chrome). It is
preferred over Ghostscript when available due to better performance.
Install with: `pip install pypdfium2`

**verapdf**, if present, enables fast speculative PDF/A conversion.
OCRmyPDF attempts to create PDF/A by adding metadata and ICC profiles
using pikepdf, then validates with verapdf. If validation passes,
Ghostscript is skipped entirely. See your distribution's package manager
or visit [verapdf.org](https://verapdf.org/).

**jbig2enc**, if present, will be used to optimize the encoding of
monochrome images. This can significantly reduce the file size of the
output file. It is not required.
[jbig2enc](https://github.com/agl/jbig2enc) is not available in some
distributions due to historical patent concerns, but all JBIG2 patents
expired in 2017. It can easily be built from source. To add JBIG2 encoding,
see {ref}`jbig2`.

:::{warning}
Lossy JBIG2 encoding (`--jbig2-lossy`) has been removed in v17.0.0 due to
well-documented risks of character substitution errors. Only lossless
JBIG2 compression is now supported.
:::

**pngquant**, if present, is optionally used to optimize the encoding of
PNG-style images in PDFs (actually, any that are losslessly
encoded) by lossily quantizing to a smaller color palette. It is only
activated when the `--optimize` argument is `2` or `3`.

**unpaper**, if present, enables the `--clean` and `--clean-final`
command line options.

These are in addition to the Python packaging dependencies, meaning that
unfortunately, the `pip install` command cannot satisfy all of them.

(installing-head-revision-from-sources)=

## Installing HEAD revision from sources

If you have `git` and Python 3.12 or newer installed, you can install
from source. (Python 3.11 is supported but 3.12+ is recommended.) When the `pip` installer runs, it will alert you if
dependencies are missing.

If you prefer to build everything from source, you will need to [build
pikepdf from
source](https://pikepdf.readthedocs.io/en/latest/installation.html#building-from-source).
First ensure you can build and install pikepdf.

We recommend using uv to install from sources:

```bash
git clone -b main https://github.com/ocrmypdf/OCRmyPDF.git
cd OCRmyPDF
pip install uv  # If not already installed
uv sync
```

This creates a virtual environment and installs all dependencies. Activate
the environment to use ocrmypdf:

```bash
source .venv/bin/activate
ocrmypdf --help
```

Alternatively, install directly from GitHub using pip:

```bash
pip install git+https://github.com/ocrmypdf/OCRmyPDF.git
```

Or, to install in editable mode allowing customization:

```bash
git clone -b main https://github.com/ocrmypdf/OCRmyPDF.git
cd OCRmyPDF
pip install -e .
```

Note: `ocrmypdf` will only be accessible when the virtual environment
is activated.

To run the program:

```bash
ocrmypdf --help
```

If not yet installed, the script will notify you about dependencies that
need to be installed. The script requires specific versions of the
dependencies. Versions older than the ones mentioned in the release notes
are unlikely to be compatible with OCRmyPDF.

## Optional Features

OCRmyPDF provides optional features and development tools. We recommend using `uv` as your package manager.

### Installing User Features

User features are available as optional dependencies. Install them with `uv` (recommended) or `pip`:

```bash
# Using uv (recommended)
uv sync --extra watcher        # File watching service
uv sync --extra webservice     # Streamlit web UI
uv sync --extra watcher --extra webservice  # Multiple features
```

### Development Tools

Development tools use dependency groups:

```bash
# Testing infrastructure
uv sync --group test

# Documentation building
uv sync --group docs

# Enhanced Streamlit development
uv sync --group streamlit-dev

# All development groups
uv sync
```

**Why use uv?**

- Modern, fast Python package manager
- Required for development (testing, docs)
- Better dependency resolution
- Consistent across all platforms

Install uv: `curl -LsSf https://astral.sh/uv/install.sh | sh` or visit https://docs.astral.sh/uv/

### For development

To install all of the development and test requirements:

```bash
git clone -b main https://github.com/ocrmypdf/OCRmyPDF.git
cd OCRmyPDF
uv sync --all-groups
```

To add JBIG2 encoding, see {ref}`jbig2`.

## Shell completions

Completions for `bash` and `fish` are available in the project's
`misc/completion` folder. The `bash` completions are likely `zsh`
compatible but this has not been confirmed. Package maintainers, please
install these at the appropriate locations for your system.

To manually install the `bash` completion, copy
`misc/completion/ocrmypdf.bash` to `/etc/bash_completion.d/ocrmypdf`
(rename the file).

To manually install the `fish` completion, copy
`misc/completion/ocrmypdf.fish` to
`~/.config/fish/completions/ocrmypdf.fish`.

## Note on 32-bit support

We don't support any 32-bit system, including 32-bit Python or 32-bit
Ghostscript on Windows.

================================================
FILE: docs/introduction.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# Introduction

OCRmyPDF is a Python application and library that adds text "layers" to images in
PDFs, making scanned image PDFs searchable. It uses OCR to guess the text
contained in images. OCRmyPDF also supports plugins
that enable customization of its processing steps, and it is highly tolerant
of PDFs containing scanned images and "born digital" content that doesn't
require text recognition.

## About OCR

[Optical character
recognition](https://en.wikipedia.org/wiki/Optical_character_recognition)
is a technology that converts images of typed or handwritten text, such as
in a scanned document, into computer text that can be selected, searched and copied.

OCRmyPDF uses
[Tesseract](https://github.com/tesseract-ocr/tesseract), a widely
available open source OCR engine, to perform OCR.

(raster-vector)=

## About PDFs

PDFs are page description files that attempt to preserve a layout
exactly. They contain [vector
graphics](http://vector-conversions.com/vectorizing/raster_vs_vector.html)
that can contain raster objects, such as scanned images. Because PDFs can
contain multiple pages (unlike many image formats) and can contain fonts
and text, they are a suitable format for exchanging scanned documents.

:::{image} images/bitmap_vs_svg.svg
:::

A PDF page may contain multiple images, even if it appears to have only
one image. Some scanners or scanning software may segment pages into
monochromatic text and color regions, for example, to enhance the compression
ratio and appearance of the page.

Rasterizing a PDF is the process of generating corresponding raster images.
OCR engines like Tesseract work with images, not scalable vector graphics
or mixed raster-vector-text graphics such as PDF.

## About PDF/A

[PDF/A](https://en.wikipedia.org/wiki/PDF/A) is an ISO-standardized
subset of the full PDF specification that is designed for archiving (the
'A' stands for Archive). PDF/A differs from PDF primarily by omitting
features that could complicate future file readability,
such as embedded Javascript, video, audio and references to external
fonts. All fonts and resources needed to interpret the PDF must be
contained within it. Because PDF/A disables Javascript and other types
of embedded content, it is likely more secure.

There are various conformance levels and versions, such as "PDF/A-2b".

In general, the preferred format for scanned documents is PDF/A. Some
governments and jurisdictions, US Courts in particular, [mandate the use
of PDF/A](https://pdfblog.com/2012/02/13/what-is-pdfa/) for scanned
documents.

Since most individuals scanning documents aim for long-term readability,
OCRmyPDF defaults to generating PDF/A-2b.

PDF/A does have a few drawbacks. Some PDF viewers display an alert
indicating that the file is in PDF/A format, which may confuse some users.
Additionally, it tends to result in larger files than standard PDFs because
it embeds certain resources, even if they are widely available. PDF/A
files can be digitally signed but may not be encrypted to ensure future
readability. Fortunately, converting from PDF/A to a regular PDF is
straightforward, and any PDF viewer can handle PDF/A files.

## What OCRmyPDF does

OCRmyPDF analyzes each page of a PDF to determine the required colorspace
and resolution (DPI) for capturing all the information on that page without
losing content. It uses a PDF rasterizer (pypdfium2 or
[Ghostscript](http://ghostscript.com/)) to convert each page to an image and
subsequently performs OCR on the rasterized image to generate an OCR "layer."
This layer is then integrated back into the original PDF.

:::{versionchanged} 17.0.0
OCRmyPDF now supports pypdfium2 as an alternative rasterizer to Ghostscript.
pypdfium2 is a Python binding for pdfium, the PDF rendering library used by
Google Chrome. The `--rasterizer auto` setting (default) prefers pypdfium2
when available.
:::

While it is possible to use a program like Ghostscript or ImageMagick to
obtain an image and then run that image through Tesseract OCR, this process
actually generates a new PDF, potentially resulting in the loss of various
details (such as the document's metadata). In contrast, OCRmyPDF can produce
a minimally altered PDF as the output.

OCRmyPDF also offers several image processing options, such as deskew, which
enhances the visual quality of files and the accuracy of OCR. When these
options are utilized, the OCR layer is integrated into the processed image.

By default, OCRmyPDF generates archival PDFs in the PDF/A format, which is
a more rigid subset of PDF features designed for long-term archives. If you
prefer regular PDFs, you can disable this feature using the
`--output-type pdf` option.

## Why you shouldn't do this manually

A PDF is similar to an HTML file, in that it contains document structure
along with images. While some PDFs may solely display a full-page image,
they often contain additional content that would be forfeited if not preserved.

A manual process could take one of these approaches:

1. Rasterize each page as an image, perform OCR on the images, and then merge the
   output into a PDF. This method preserves the layout of each page, but
   resamples all images potentially leading to quality loss, increased file size,
   and the introduction of compression artifacts, among other issues.
2. Extract each image, OCR, and combine the output into a PDF. This approach
   loses the context in which images are used in the PDF, potentially resulting
   in loss of information related to scaling and position of images. Some scanned
   PDFs contain multiple images segmented into black and white, grayscale
   and color regions, with stencil masks to prevent overlap, as this can
   enhance the appearance of a file while reducing file size.
   Reassembling these images can be challenging, and risks losing vector art
   or text that is not part of an image.

In cases where a PDF solely serves as a container for images without any
rotation, scaling, or cropping, the second approach can be lossless.

OCRmyPDF uses various strategies depending on input options and the input PDF
itself. Generally, it rasterizes a page for OCR and then integrates the OCR
data back into the original PDF. This approach allows it to handle complex
PDFs and preserve their content as much as possible.

Furthermore, OCRmyPDF supports a wide range of edge cases that have emerged
during several years of development. It accommodates PDF features like
images within Form XObjects and pages with UserUnit scaling. It also
supports less common image formats like non-monochrome 1-bit images and
provides warnings about files you may not want to OCR. Thanks to tools
like pikepdf and QPDF, it can auto-repair damaged PDFs. You don't need to
understand the intricacies of these issues; you should be able to use
OCRmyPDF with any PDF file, and expect reasonable results.

## Limitations

OCRmyPDF is subject to limitations imposed by the Tesseract OCR engine.
These limitations are inherent to any software relying on Tesseract:

- The OCR accuracy may not match that of commercial OCR solutions.
- It is incapable of recognizing handwriting.
- It may detect gibberish and report it as OCR output.
- Results may be subpar when a document contains languages not specified
  in the `-l LANG` argument.
- Tesseract may struggle to analyze the natural reading order of documents.
  For instance, it might fail to recognize two columns in a document and
  attempt to join text across columns.
- Poor quality scans can result in subpar OCR quality. In other words, the
  quality of the OCR output depends on the quality of the input.
- Tesseract does not provide information about the font family to which text
  belongs.
- Tesseract does not divide text into paragraphs or headings. It only provides
  the text and its bounding box. As such, the generated PDF does not
  contain any information about the document's structure.

### Ghostscript considerations

:::{versionchanged} 17.0.0
Ghostscript is no longer strictly required. OCRmyPDF can use pypdfium2
for rasterization and verapdf for PDF/A validation.
:::

While Ghostscript remains a capable and feature-rich tool with a long history,
recent releases have introduced some compatibility challenges that OCRmyPDF
v17 addresses through alternative codepaths. When Ghostscript is used:

- PDFs containing JPEG 2000-encoded content may be converted to JPEG
  encoding, which may introduce compression artifacts, if Ghostscript
  PDF/A is enabled.
- Ghostscript may transcode grayscale and color images, potentially
  lossily, based on an internal algorithm. This
  behavior can be suppressed by setting `--pdfa-image-compression` to
  `jpeg` or `lossless` to set all images to one type or the other.
  Ghostscript lacks an option to maintain the input image's format.
  (Modern Ghostscript can copy JPEG images without transcoding them.)
- Ghostscript's PDF/A conversion removes any XMP metadata that is not
  one of the standard XMP metadata namespaces for PDFs. In particular,
  PRISM Metadata is removed.
- Ghostscript's PDF/A conversion may remove or deactivate
  hyperlinks and other active content.

When pypdfium2 and verapdf are available, many of these limitations can be
avoided by using the speculative PDF/A conversion path (enabled by default
with `--output-type auto`).

You can use `--output-type pdf` to disable PDF/A conversion and produce
a standard, non-archival PDF.

Regarding OCRmyPDF itself:

- PDFs using transparency are not currently represented in the test
  suite

## Similar programs

To the author's knowledge, OCRmyPDF is the most feature-rich and
thoroughly tested command line OCR PDF conversion tool. If it does not
meet your needs, contributions and suggestions are welcome.

Ghostscript recently added three "pdfocr" output devices. They work by
rasterizing all content and converting all pages to a single colour space.

## Web front-ends

The Docker image of OCRmyPDF provides a web service front-end
that allows files to be submitted over HTTP and the results to be downloaded.
This is an HTTP server intended to demonstrate how OCRmyPDF can be
integrated into a web service. It is not intended to be deployed on the
public internet and does not provide any security measures.

In addition, the following third-party integrations are available:

- [Paperless-ngx](https://docs.paperless-ngx.com/) is a free software
  document management system that uses OCRmyPDF to perform OCR on
  uploaded documents.
- [Nextcloud OCR](https://github.com/janis91/ocr) is a free software
  plugin for the Nextcloud private cloud software.

OCRmyPDF is not designed to be secure against malware-bearing PDFs (see
[Using OCRmyPDF online](ocr-service)). Users should ensure they
comply with OCRmyPDF's licenses and the licenses of all dependencies. In
particular, Ghostscript, which OCRmyPDF uses when it is installed, is licensed
under AGPLv3.


================================================
FILE: docs/jbig2.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

(jbig2)=

# Installing the JBIG2 encoder

Most Linux distributions do not include a JBIG2 encoder since JBIG2
encoding was patented for a long time. All known JBIG2 US patents have
expired as of 2017, but it is possible that unknown patents exist.

JBIG2 encoding is recommended for OCRmyPDF and is used to losslessly
create smaller PDFs. If JBIG2 encoding is not available, lower quality
CCITT encoding will be used for monochrome images.

JBIG2 decoding is not patented and is performed automatically by most
PDF viewers. It is widely supported and has been part of the PDF
specification since 2001.

JBIG2 encoding is automatically provided by these OCRmyPDF packages:

- Docker image (both Ubuntu and Alpine)
- Snap package
- ArchLinux AUR package
- Alpine Linux package
- Homebrew on macOS

For all other platforms, you would need to build the JBIG2 encoder from
source:

:::{code} bash
git clone https://github.com/agl/jbig2enc
cd jbig2enc
./autogen.sh
./configure && make
[sudo] make install
:::

Dependencies include libtoolize and libleptonica, which on Ubuntu
systems are packaged as libtool and libleptonica-dev. On Fedora (35)
they are packaged as libtool and leptonica-devel. For this to work,
please make sure to install `autotools`, `automake`, `libtool`, `pkg-config`
and `leptonica` first if not already installed. Other dependencies might
be required depending on your system.

:::{code} bash
[sudo] apt install autotools-dev automake libtool libleptonica-dev pkg-config
:::

## JBIG2 Compression

OCRmyPDF uses JBIG2 lossless compression for bitonal (black and white)
images. This provides excellent compression ratios compared to the older
CCITT G4 standard, while preserving the exact pixel content of the
original image.

You can adjust the threshold for JBIG2 compression with
`--jbig2-threshold`. The default is 0.85.

:::{note}
Previous versions of OCRmyPDF supported a lossy JBIG2 mode
(`--jbig2-lossy`). This feature has been removed due to the well-known
risk of character substitution errors (e.g., 6/8 confusion). See
[JBIG2 disadvantages](https://en.wikipedia.org/wiki/JBIG2#Disadvantages)
for more information on why lossy JBIG2 is problematic. The `--jbig2-lossy`
and `--jbig2-page-group-size` arguments are now ignored with a warning.
:::


================================================
FILE: docs/languages.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

(lang-packs)=

# Installing additional language packs

OCRmyPDF uses Tesseract for OCR, and relies on its language packs for all languages.
On most platforms, English is installed with Tesseract by default, but not always.

Tesseract supports [most
languages](https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc#languages).
Languages are identified by standardized three-letter codes (called ISO 639-2 Alpha-3).
Tesseract's documentation also lists the three-letter code for your language.
Some are anglicized, e.g. Spanish is `spa` rather than `esp`, while others
are not, e.g. German is `deu` and French is `fra`.

Language packs (strictly speaking, Tesseract "traineddata" files) generally correspond
to the language in question, but different language packs are used in certain
situations. For German, the "Fraktur" language pack can assist with reading older
materials in the Fraktur typeface family (`deu_frak`). Some communities have changed
their script from Cyrillic to Latin; the Cyrillic version of Uzbek is available
as `uzb_cyrl` and the Latin version is `uzb`.

After you have installed a language pack, you can use it with `ocrmypdf -l <language>`,
for example `ocrmypdf -l spa`. For multilingual documents, you can specify
all languages to be expected, e.g. `ocrmypdf -l eng+fra` for English and French.
English is assumed by default unless other language(s) are specified.

For Linux users, you can often find packages that provide language
packs.

## Platform install steps

### Debian and Ubuntu (apt)

```bash
# Display a list of all Tesseract language packs
apt-cache search tesseract-ocr

# Install Chinese Simplified language pack
apt-get install tesseract-ocr-chi-sim
```

You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as
to what languages it should search for. Multiple languages can be
requested using either `-l eng+fra` (English and French) or
`-l eng -l fra`.

### Fedora

```bash
# Display a list of all Tesseract language packs
dnf search tesseract

# Install Chinese Simplified language pack
dnf install tesseract-langpack-chi_sim
```

You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as
to what languages it should search for. Multiple languages can be
requested using either `-l eng+fra` (English and French) or
`-l eng -l fra`.

### Arch Linux

```bash
# Display a list of all Tesseract language packs
pacman -Ss tesseract-data

# Install German language pack
pacman -S tesseract-data-deu
```

You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as
to what languages it should search for. Multiple languages can be
requested using either `-l eng+fra` (English and French) or
`-l eng -l fra`.

### Gentoo

On Gentoo the package `app-text/tessdata_fast`, which `app-text/tesseract` depends on, handles Tesseract languages.
It accepts USE flags to select what languages should be installed, these can be set in `/etc/portage/package.use`.
Alternatively one can globally set the [L10N use extension](https://wiki.gentoo.org/wiki/Localization/Guide#L10N) in `/etc/portage/make.conf`.
This enables these languages for all packages (e.g. including aspell).

```bash
# Display a list of all Tesseract language packs
equery uses app-text/tessdata_fast

# Add English and German language support for Tesseract only
echo 'app-text/tessdata_fast l10n_de l10n_en' >> /etc/portage/package.use

# Add global English and German language support (the `l10n_` from equery has to be omitted)
echo L10N="de en" >> /etc/portage/make.conf

# update system to reflect changed USE flags
emerge --update --deep --newuse @world
```

You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as
to what languages it should search for. Multiple languages can be
requested using either `-l eng+fra` (English and French) or
`-l eng -l fra`.

### macOS

You can install additional language packs by
{ref}`installing Tesseract using Homebrew with all language packs <macos-all-languages>`.

### Docker

Users of the OCRmyPDF Docker image should install language packs into a
derived Docker image as
{ref}`described in that section <docker-lang-packs>`.

### Windows

The Tesseract installer provided by Chocolatey currently includes only English language.
To install other languages, download the respective language pack (`.traineddata` file)
from <https://github.com/tesseract-ocr/tessdata/> and place it in
`C:\Program Files\Tesseract-OCR\tessdata` (or wherever Tesseract OCR is installed).

## Custom language packs

If you have fine-tuned or trained Tesseract and generated custom trained data, you can
copy your `customlang.traineddata` file into your Tesseract "tessdata" folder, and
then use the `-l customlang` argument to tell OCRmyPDF to pass that language on to
Tesseract.


================================================
FILE: docs/maintainers.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# Maintainer notes

This is for those who package OCRmyPDF for downstream use. (Thank you
for your hard work.)

## Known ports/packagers

OCRmyPDF has been ported to many platforms already. If you are
interested in porting to a new platform, check with
[Repology](https://repology.org/projects/?search=ocrmypdf) to see the
status of that platform.

### Make sure you can package pikepdf

pikepdf, created by the same author, is a mixed Python and C++14 package
with much stiffer build requirements. If you want to use OCRmyPDF on
some novel platform or distribution, first make sure you can package
pikepdf.

### Core dependencies

:::{versionchanged} 17.0.0
Ghostscript is no longer strictly required. OCRmyPDF now supports alternative
codepaths for both PDF rasterization and PDF/A conversion.
:::

OCRmyPDF has the following runtime dependencies:

**For PDF rasterization** (converting PDF pages to images for OCR):

- `pypdfium2` (Python package) - OR -
- `ghostscript` (system binary)
- Recommendation: Install both for best compatibility

**For PDF/A conversion**:

- `verapdf` (system binary) with pikepdf's speculative conversion - OR -
- `ghostscript` (system binary)
- Recommendation: Install both for best compatibility

**For OCR**:
- `tesseract-ocr` (system binary) - Required for MVP

**For text rendering** (expressing OCR results in PDF):
- `fpdf2` (Python package) - Required for text layer rendering
- `uharfbuzz` (Python package) - Required for text layer rendering
- `font-noto` (system package) - Recommended for text layer rendering

**Other dependencies**:
- `unpaper` (system binary) - Optional, enables `--clean` and `--clean-final`
- `pngquant` (system binary) - Optional, enables `--optimize 2` and `--optimize 3`
- `jbig2enc` (system binary) - Optional, improves compression of monochrome images

While Ghostscript remains a capable and feature-rich tool with a long history,
recent releases have introduced some compatibility challenges that OCRmyPDF v17
addresses through alternative codepaths. For the best user experience, packagers
should install both Ghostscript and the alternative tools (pypdfium2, verapdf)
when available.

On Windows, OCRmyPDF will also check the registry for Tesseract and Ghostscript
locations.

Tesseract OCR relies on SIMD for performance and only has proper support
for this on ARM and x86\_64. Performance may be poor on other processor
architectures.

### Versioning scheme

OCRmyPDF uses hatch-vcs for versioning, which derives the version from
Git as a single source of truth. This may be unsuitable for some
distributions, e.g. to indicate that your distribution modifies OCRmyPDF
in some way.

You can patch the `__version__` variable in `src/ocrmypdf/_version.py`
if necessary, or set the environment variable
`SETUPTOOLS_SCM_PRETEND_VERSION` to the required version, if you need to
override versioning for some reason.

### jbig2enc

OCRmyPDF will use jbig2enc, a JBIG2 encoder, if one can be found. Some
distributions have shied away from packaging JBIG2 because it contains
patented algorithms, but all patents have expired since 2017. If
possible, consider packaging it too to improve OCRmyPDF's compression.

:::{note}
Lossy JBIG2 encoding has been removed in v17.0.0 due to well-documented
risks of character substitution errors. Previously we provided this feature
on a "caveat emptor" basis but in the interest of focusing and eliminating
risks, we decided to remove this option. Now, only lossless JBIG2 compression
is supported.
:::

### Dependency matrix for packagers

:::{versionadded} 17.0.0
:::

The following table summarizes the dependency options introduced in v17.0.0:

| Feature | Option 1 | Option 2 | Notes |
|---------|----------|----------|-------|
| PDF rasterization | pypdfium2 (Python) | ghostscript (binary) | pypdfium2 preferred when available |
| PDF/A conversion | verapdf + pikepdf | ghostscript | verapdf validates speculative conversion |
| Text rendering | fpdf2 (Python) | - | Required, replaces legacy hOCR renderer |
| OCR | tesseract-ocr | `--ocr-engine none` | Can be skipped entirely |

**Minimum viable installation:**

- tesseract-ocr + (pypdfium2 OR ghostscript) + fpdf2

**Recommended installation:**

- tesseract-ocr + pypdfium2 + ghostscript + verapdf + fpdf2 + unpaper + pngquant + jbig2enc

:::{warning}
If Ghostscript is not installed and verapdf is not available, PDF/A output
cannot be produced. The output will be a standard PDF instead. This is a
breaking change for rare configurations that previously relied on PDF/A
output without Ghostscript alternatives.
:::

**Sample debian/control dependency specification**

```
Depends:
 fonts-noto,
 fpdf2 (>= 2.8),
 ghostscript (>= 9.55),  # Not strictly required, but best user experience
 icc-profiles-free,
 img2pdf,
 python3-coloredlogs,
 python3-deprecation,
 python3-pdfminer (>= 20181108+dfsg-3),
 python3-pikepdf (>= 8.14.0),
 python3-pil,
 python3-pluggy,
 python3-reportlab,
 python3-rich,
 python3-uharfbuzz,  # Not currently in Debian
 tesseract-ocr (>= 5.0.0),
 zlib1g,
 ${misc:Depends},
 ${python3:Depends},
Recommends:
 cyclopts,   # Not currently in Debian
 jbig2,
 paddleocr,  # Not currently in Debian
 pngquant,
 pypdfium2,  # Not currently in Debian
 unpaper,
 verapdf,    # Not currently in Debian
Suggests:
 ocrmypdf-doc,
 python-watchdog,
```

### Command line completions

Please ensure that command line completions are installed, as described
in the installation documentation.

### 32-bit Linux support

If you maintain a Linux distribution that supports 32-bit x86 or ARM,
OCRmyPDF should continue to work as long as all of its dependencies
continue to be available in 32-bit form. Please note we do not test on
32-bit platforms.

### HEIF/HEIC

OCRmyPDF defaults to installing the pi-heif PyPI package, which supports
converting HEIF (High Efficiency Image File Format) images to PDF from
the command line. If your distribution does not have this library
available, you can exclude it and OCRmyPDF will gracefully degrade
automatically, losing only support for this feature.


================================================
FILE: docs/optimizer.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# PDF optimization

OCRmyPDF includes an image-oriented PDF optimizer. By default, the
optimizer runs with safe settings with the goal of improving compression
at no loss of quality. At higher optimization levels, lossy
optimizations may be applied and tuned. Optimization occurs after OCR,
and only if OCR succeeded. It does not perform other possible
optimizations such as deduplicating resources, consolidating fonts,
simplifying vector drawings, or anything of that nature.

:::{list-table} OCRmyPDF optimization settings
---
widths: 33 6 60
header-rows: 1
---

* - Optimization level
  - Shorthand
  - Description
* - ``--optimize 0``
  - ``-O0``
  - Disable most optimizations.
* - ``--optimize 1`` (default)
  - ``-O1``
  - Enables lossless optimizations, such as transcoding images to more
      efficient formats. Also compress other uncompressed objects in the
      PDF and enables the more efficient "object streams" within the PDF.
* - ``--optimize 2``
  - ``-O2``
  - All of the above, and enables lossy optimizations and color quantization.
* - ``--optimize 3``
  - ``-O3``
  - All of the above, and enables more aggressive optimizations and targets lower
      image quality.
:::

The exact type of optimizations performed will vary over time, and
depend on what third party tools are installed.

Despite optimizations, OCRmyPDF might still increase the overall file
size, since it must embed information about the recognized text, and
depending on the settings chosen, may not be able to represent the
output file as compactly as the input file.

## Optimizations that always occur

OCRmyPDF will automatically replace obsolete or inferior compression
schemes such as RLE or LZW with superior schemes such as Deflate, and
convert monochrome images to CCITT G4. Since this is lossless, it always
occurs and there is no way to disable it. Other non-image compressed
objects are compressed as well.

## Fast web view

OCRmyPDF automatically optimizes PDFs for \"fast web view\" in Adobe
Acrobat\'s parlance, or equivalently, linearizes PDFs so that the
resources they reference are presented in the order a viewer needs them
for sequential display. This reduces the latency of viewing a PDF both
online and from local storage, in exchange for a slight increase in file
size.

To disable this optimization and all others, use
`ocrmypdf --optimize 0 ...` or the shorthand `-O0`.

Adobe Acrobat might not report the file as being \"fast web view\".

## Lossless optimizations

At optimization level `-O1` (the default), OCRmyPDF will also attempt
lossless image optimization.

If a JBIG2 encoder is available, then monochrome images will be
converted to JBIG2, with the potential for huge savings on large black
and white images, since JBIG2 is far more efficient than any other
monochrome (bi-level) compression. (All known US patents related to
JBIG2 have probably expired, but it remains the responsibility of the
user to supply a JBIG2 encoder such as
[jbig2enc](https://github.com/agl/jbig2enc). OCRmyPDF does not implement
JBIG2 encoding on its own.)

OCRmyPDF currently does not attempt to recompress losslessly compressed
objects more aggressively.

## Lossy optimizations

At optimization level `-O1`, `-O2` and `-O3`, OCRmyPDF will attempt some
lossy image optimization.

If Ghostscript is used to create a PDF/A (the default), Ghostscript will
optimize some images by converting them to JPEG, which are lossy. If
`--output-type pdf` is used, there are no lossy optimizations. Ghostscript's
JPEG conversion is quite safe.

If `pngquant` is installed, OCRmyPDF will use it to quantize
paletted images to reduce their size.

The quality of JPEGs may be lowered, on the assumption that a lower
quality image may be suitable for storage after OCR.

It is not possible to optimize all image types. Uncommon image types may
be skipped by the optimizer.


================================================
FILE: docs/pdfsecurity.md
================================================
(security)=

# PDF security issues

> OCRmyPDF should only be used on PDFs you trust. It is not designed to
> protect you against malware.

Recognizing that many users have an interest in handling PDFs and
applying OCR to PDFs they did not generate themselves, this article
discusses the security implications of PDFs and how users can protect
themselves.

The disclaimer applies: this software has no warranties of any kind.

## PDFs may contain malware

PDF is a rich, complex file format. The official PDF 1.7 specification,
ISO 32000:2008, is hundreds of pages long and references several annexes
each of which are similar in length. PDFs can contain video, audio, XML,
JavaScript and other programming, and forms. In some cases, they can
open internet connections to pre-selected URLs. All of these are
possible attack vectors.

In short, PDFs [may contain
viruses](https://security.stackexchange.com/questions/64052/can-a-pdf-file-contain-a-virus).

If you do not trust a PDF or its source, do not open it or use OCRmyPDF
on it. Consider using a Docker container or virtual machine to isolate
an untrusted PDF from your system.

## How OCRmyPDF processes PDFs

OCRmyPDF must open and interpret your PDF in order to insert an OCR
layer. First, it runs all PDFs through
[pikepdf](https://github.com/pikepdf/pikepdf), a library based on
[QPDF](https://github.com/qpdf/qpdf), a program that repairs PDFs with
syntax errors. This is done because, in the author\'s experience, a
significant number of PDFs in the wild, especially those created by
scanners, are not well-formed files. QPDF makes it more likely that
OCRmyPDF will succeed, but offers no security guarantees. QPDF is also
used to split the PDF into single page PDFs.

Finally, OCRmyPDF rasterizes each page of the PDF using
[Ghostscript](http://ghostscript.com/) in `-dSAFER` mode.

Depending on the options specified, OCRmyPDF may graft the OCR layer
into the existing PDF or it may essentially reconstruct (\"re-fry\") a
visually identical PDF that may be quite different at the binary level.
That said, OCRmyPDF is not a tool designed for sanitizing PDFs.

## Password protected PDFs

Password protected PDFs usually have two passwords, an owner and a user
password. When the user password is set to empty, PDF readers will open
the file automatically and mark it as \"(SECURED)\". Password security
can also request certain restrictions on the PDF, but anyone can remove
these restrictions if they have either the owner *or* user password.
Passwords mainly present a barrier for casual users.

OCRmyPDF cannot remove passwords from PDFs. If you want to remove a
password from a PDF, you must use other software, such as `qpdf`.

If the owner and user password are set, a password is required for
`qpdf`. If only the owner password is set, then the password can be
stripped, even if one does not have the owner password. To remove the
password from a PDF using QPDF, use:

:::{code} bash
qpdf --decrypt --password='abc123' input.pdf no_password.pdf
:::

Then you can run OCRmyPDF on the file.

In its default mode, OCRmyPDF generates PDF/A. Passwords may not be set
on PDF/A documents. If you want to set a password on the output PDF, you
must specify `--output-type pdf`.

## Signature images

Many programs exist which are capable of inserting an image of
someone\'s signature. On its own, this offers no security guarantees. It
is trivial to remove the signature image and apply it to other files.
This practice offers no real security.

## Digital signatures

Important documents can be digitally signed and certified to attest to
their authorship, approval or execution of a legal agreement. OCRmyPDF
will detect signed PDFs and will not modify them, unless the
`--invalidate-digital-signatures` option is used, which will invalidate
any signatures. (The signature may still be present in the PDF if
opened, but PDF readers will not validate it.)

A digital signature adds a cryptographic hash of the document to the
document, so tamper protection is provided. That also precludes OCRmyPDF
from modifying the document and preserving the signature.

Digital signatures are not the same as a signature image. A digital
signature is a cryptographic hash of the document that is encrypted with
the author\'s private key. The signature is decrypted with the author\'s
public key. The public key is usually distributed by a certificate
authority. The signature is then verified by the PDF reader. If the
document is modified, the signature will be invalidated.

## Certificate-encrypted PDFs

PDFs can be encrypted with a certificate. This is a more secure form of
encryption than a password. The certificate is usually issued by a
certificate authority. A certificate is used to encrypt the document
using the public key for the benefit of a specific recipient who
possesses the private key.

OCRmyPDF cannot open certificate-encrypted PDFs. If you have the
certificate, you can use other PDF software, such as Acrobat, to decrypt
the PDF.


================================================
FILE: docs/performance.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# Performance

Some users have noticed that current versions of OCRmyPDF do not run as
quickly as some older versions (specifically 6.x and older). This is
because OCRmyPDF added image optimization as a postprocessing step, and
it is enabled by default.

## Speed

If running OCRmyPDF quickly is your main goal, you can use settings such
as:

-   `--optimize 0` to disable file size optimization
-   `--output-type pdf` to disable PDF/A generation
-   `--fast-web-view 999999` to disable fast web view optimization
-   `--skip-big` to skip large images, if some pages have large images

You can also avoid:

-   `--force-ocr`
-   Image preprocessing


================================================
FILE: docs/plugins.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# Plugins

> The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL
> NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and
> "OPTIONAL" in this document are to be interpreted as described in
> RFC 2119.

You can use plugins to customize the behavior of OCRmyPDF at certain points of
interest.

Currently, it is possible to:

- add new command line arguments
- override the decision for whether or not to perform OCR on a particular file
- modify the image that is about to be sent for OCR
- modify the page image before it is converted to PDF
- replace the Tesseract OCR with another OCR engine that has similar behavior
- replace Ghostscript with another PDF to image converter (rasterizer) or
  PDF/A generator

OCRmyPDF plugins are based on the Python `pluggy` package and conform to its
conventions. Note that plugins installed as setuptools entrypoints are
not checked currently, because OCRmyPDF assumes you may not want to enable
plugins for all files.

See [OCRmyPDF-EasyOCR](https://github.com/ocrmypdf/OCRmyPDF-EasyOCR) for an
example of a straightforward, fully working plugin.

## Script plugins

Script plugins may be called from the command line, by specifying the name of a file.
Script plugins may be convenient for informal or "one-off" plugins, when a certain
batch of files needs a special processing step for example.

```bash
ocrmypdf --plugin ocrmypdf_example_plugin.py input.pdf output.pdf
```

Multiple plugins may be installed by issuing the `--plugin` argument multiple times.

## Packaged plugins

Packaged plugins may be installed into the same virtual environment as OCRmyPDF
is installed into. They may be invoked using Python standard module naming.
If you are intending to distribute a plugin, please package it.

```bash
ocrmypdf --plugin ocrmypdf_fancypants.pockets.contents input.pdf output.pdf
```

OCRmyPDF does not automatically import plugins, because the assumption is that
plugins affect different files differently and you may not want them activated
all the time. The command line or `ocrmypdf.ocr(plugin='...')` must call
for them.

Third parties that wish to distribute packages for ocrmypdf should package them
as packaged plugins, and these modules should begin with the name `ocrmypdf_`
similar to `pytest` packages such as `pytest-cov` (the package) and
`pytest_cov` (the module).

:::{note}
We recommend plugin authors name their plugins with the prefix
`ocrmypdf-` (for the package name on PyPI) and `ocrmypdf_` (for the
module), just like pytest plugins. At the same time, please make it clear
that your package is not official.
:::

## Plugins

You can also create a plugin that OCRmyPDF will always automatically load if both are
installed in the same virtual environment, using a project entrypoint.
OCRmyPDF uses the entrypoint namespace "ocrmypdf".

For example, `pyproject.toml` would need to contain the following, for a plugin named
`ocrmypdf-exampleplugin`:

```toml
[project]
name = "ocrmypdf-exampleplugin"

[project.entry-points."ocrmypdf"]
exampleplugin = "exampleplugin.pluginmodule"
```

## Plugin requirements

OCRmyPDF generally uses multiple worker processes. When a new worker is started,
Python will import all plugins again, including all plugins that were imported earlier.
This means that the global state of a plugin in one worker will not be shared with
other workers. As such, plugin hook implementations should be stateless, relying
only on their inputs. Hook implementations may use their input parameters to
obtain a reference to shared state prepared by another hook implementation.
Plugins must expect that other instances of the plugin will be running
simultaneously.

The `context` object that is passed to many hooks can be used to share information
about a file being worked on. Plugins must write private, plugin-specific data to
a subfolder named `{options.work_folder}/ocrmypdf-plugin-name`. Plugins MAY
read and write files in `options.work_folder`, but should be aware that their
semantics are subject to change.

OCRmyPDF will delete `options.work_folder` when it has finished OCRing
a file, unless invoked with `--keep-temporary-files`.

The documentation for some plugin hooks contains a detailed description of the
execution context in which they will be called.

Plugins should be prepared to work whether executed in worker threads or worker
processes. Generally, OCRmyPDF uses processes, but has a semi-hidden threaded
argument that simplifies debugging.

## Plugin hooks

A plugin may provide the following hooks. Hooks must be decorated with
`ocrmypdf.hookimpl`, for example:

```python
from ocrmypdf import hookimpl

@hookimpl
def add_options(parser):
    pass
```

The following is a complete list of hooks that are available, and when
they are called.

(firstresult)=

**Note on firstresult hooks**

If multiple plugins install implementations for this hook, they will be called in
the reverse of the order in which they are installed (i.e., last plugin wins).
When each hook implementation is called in order, the first implementation that
returns a value other than `None` will "win" and prevent execution of all other
hooks. As such, you cannot "chain" a series of plugin filters together in this
way. Instead, a single hook implementation should be responsible for any such
chaining operations.

## Examples

- OCRmyPDF's test suite contains several plugins that are used to simulate certain
  test conditions.
- [ocrmypdf-papermerge](https://github.com/papermerge/OCRmyPDF_papermerge) is
  a production plugin that integrates OCRmyPDF and the Papermerge document
  management system.

### Suppressing or overriding other plugins

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.initialize
```

### Custom command line arguments

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.add_options
```

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.check_options
```

### Plugin option models

Plugins can define their own option models using Pydantic. This allows plugins to:

- Define type-safe option structures with validation
- Add CLI arguments that map to their option model fields
- Access options via nested namespaces (e.g., `options.tesseract.timeout`)

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.register_options
```

Plugin options can be accessed in two ways:

1. **Flat access** (backward compatible): `options.tesseract_timeout`
2. **Nested access**: `options.tesseract.timeout`

Both access patterns are equivalent and return the same values.

:::{note}
**Plugin Interface Change**: Starting in OCRmyPDF v17.0.0, plugin hooks receive
`OcrOptions` objects instead of `argparse.Namespace` objects. Most plugins will
continue working due to duck-typing compatibility, but plugin developers should
update their type hints accordingly.
:::

### Migration guide for plugin developers

:::{versionadded} 17.0.0
:::

**Update imports:**

```python
from ocrmypdf._options import OcrOptions
```

**Update type hints:**

```python
# Before (v16 and earlier)
def check_options(options: argparse.Namespace) -> None:
    ...

# After (v17+)
def check_options(options: OcrOptions) -> None:
    ...
```

**Attribute access unchanged:**

```python
# These work exactly as before
options.languages
options.output_type
options.tesseract_timeout
```

**Remove in-place modifications:**

```python
# Before (v16 pattern - no longer recommended)
def check_options(options):
    options.some_computed_value = compute_value(options)

# After (v17 pattern - compute at point of use)
def some_function(options):
    computed = compute_value(options)
    use_computed(computed)
```

### Execution and progress reporting

```{eval-rst}
.. autoclass:: ocrmypdf.pluginspec.ProgressBar
    :members:
    :special-members: __init__, __enter__, __exit__
```

```{eval-rst}
.. autoclass:: ocrmypdf.pluginspec.Executor
    :members:
    :special-members: __call__
```

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.get_logging_console
```

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.get_executor
```

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.get_progressbar_class
```

### Applying special behavior before processing

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.validate
```

### PDF page to image

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.rasterize_pdf_page
```

### Modifying intermediate images

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.filter_ocr_image
```

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.filter_page_image
```

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.filter_pdf_page
```

### OCR engine

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.get_ocr_engine
```

```{eval-rst}
.. autoclass:: ocrmypdf.pluginspec.OcrEngine
    :members:

    .. automethod:: __str__
```

```{eval-rst}
.. autoclass:: ocrmypdf.pluginspec.OrientationConfidence
```

### PDF/A production

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.generate_pdfa
```

### PDF optimization

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.optimize_pdf
```

```{eval-rst}
.. autofunction:: ocrmypdf.pluginspec.is_optimization_enabled
```

### Working with OcrElement trees

:::{versionadded} 17.0.0
:::

OCRmyPDF v17 introduces the `OcrElement` dataclass for representing OCR
output in an engine-agnostic format. This enables plugins to work with
OCR results without parsing hOCR XML.

**Key classes:**

```python
from ocrmypdf import OcrElement, OcrClass, BoundingBox

# OcrElement - represents any OCR structural unit
page = OcrElement(
    ocr_class=OcrClass.PAGE,
    bbox=BoundingBox(0, 0, 612, 792),
    children=[...]
)

# BoundingBox - axis-aligned bounding box (left, top, right, bottom)
bbox = BoundingBox(left=100, top=50, right=300, bottom=80)

# OcrClass - constants for element types
OcrClass.PAGE      # "ocr_page"
OcrClass.LINE      # "ocr_line"
OcrClass.WORD      # "ocrx_word"
OcrClass.PARAGRAPH # "ocr_par"
```

**Navigating the tree:**

```python
# Get all words in a page
words = page.words  # Returns list[OcrElement]

# Get all lines
lines = page.lines

# Get combined text
text = page.get_text_recursive()

# Iterate by class
for para in page.paragraphs:
    print(para.get_text_recursive())
```

**OCR engine plugins:**

Plugins implementing custom OCR engines can now output `OcrElement` trees
directly via the `generate_ocr()` method, bypassing hOCR entirely:

```python
from pathlib import Path
from ocrmypdf.pluginspec import OcrEngine
from ocrmypdf import OcrElement, OcrClass, BoundingBox

class MyOcrEngine(OcrEngine):
    def generate_ocr(
        self,
        input_file: Path,
        options,
        context,
    ) -> OcrElement:
        # Perform OCR and return OcrElement tree directly
        # No need to generate hOCR XML
        return OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(0, 0, width, height),
            dpi=300,
            children=[
                OcrElement(
                    ocr_class=OcrClass.LINE,
                    bbox=BoundingBox(100, 50, 500, 80),
                    children=[
                        OcrElement(
                            ocr_class=OcrClass.WORD,
                            bbox=BoundingBox(100, 50, 200, 80),
                            text="Hello",
                        ),
                        # ... more words
                    ]
                ),
                # ... more lines
            ]
        )

    def supports_generate_ocr(self) -> bool:
        return True  # Indicate this engine uses generate_ocr()
```

This approach is simpler than generating hOCR and allows modern OCR
engines to integrate more naturally with OCRmyPDF.


================================================
FILE: docs/releasenotes/index.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# Release notes

OCRmyPDF uses [semantic versioning](http://semver.org/) for its
command line interface and its public API.

OCRmyPDF's output messages are not considered part of the stable interface -
that is, output messages may be improved at any release level, so parsing them
may be unreliable. Use the API to depend on precise behavior.

The public API may be useful in scripts that launch OCRmyPDF processes or that
wish to use some of its features for working with PDFs.

The most recent release of OCRmyPDF is ![version](https://img.shields.io/pypi/v/ocrmypdf.svg). Any newer versions
referred to in these notes may exist on the main branch but have not been
tagged yet.

OCRmyPDF typically supports the three most recent Python versions.

:::{note}
Attention maintainers: these release notes may be updated with information
about a forthcoming release that has not been tagged yet. A release is only
official when it's tagged and posted to PyPI.
:::

```{toctree}
:glob: true
:maxdepth: 1
:reversed: true

version*
```


================================================
FILE: docs/releasenotes/version02.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v2

## v2.2-stable (2014-09-29)

OCRmyPDF versions 1 and 2 were implemented as shell scripts. OCRmyPDF
3.0+ is a fork that gradually replaced all shell scripts with Python
while maintaining the existing command line arguments. No one is
maintaining old versions.

For details on older versions, see the [final version of its release
notes](https://github.com/fritz-hh/OCRmyPDF/blob/7fd3dbdf42ca53a619412ce8add7532c5e81a9d1/RELEASE_NOTES.md).


================================================
FILE: docs/releasenotes/version03.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v3

## v3.2.1

Changes

- Fixed {issue}`47`
  "convert() got an unexpected keyword argument 'dpi'" by upgrading to
  img2pdf 0.2
- Tweaked the Dockerfiles

## v3.2

New features

- Lossless reconstruction: when possible, OCRmyPDF will inject text
  layers without otherwise manipulating the content and layout of a PDF
  page. For example, a PDF containing a mix of vector and raster
  content would see the vector content preserved. Images may still be
  transcoded during PDF/A conversion. (`--deskew` and
  `--clean-final` disable this mode, necessarily.)
- New argument `--tesseract-pagesegmode` allows you to pass page
  segmentation arguments to Tesseract OCR. This helps for two column
  text and other situations that confuse Tesseract.
- Added a new "polyglot" version of the Docker image, that generates
  Tesseract with all languages packs installed, for the polyglots among
  us. It is much larger.

Changes

- JPEG transcoding quality is now 95 instead of the default 75. Bigger
  file sizes for less degradation.

## v3.1.1

Changes

- Fixed bug that caused incorrect page size and DPI calculations on
  documents with mixed page sizes

## v3.1

Changes

- Default output format is now PDF/A-2b instead of PDF/A-1b
- Python 3.5 and macOS El Capitan are now supported platforms - no
  changes were needed to implement support
- Improved some error messages related to missing input files
- Fixed {issue}`20`: uppercase .PDF extension not accepted
- Fixed an issue where OCRmyPDF failed to detect that certain pages
  contained previously OCR'ed text, such as OCR text produced by
  Tesseract 3.04
- Inserts /Creator tag into PDFs so that errors can be traced back to
  this project
- Added new option `--pdf-renderer=auto`, to let OCRmyPDF pick the
  best PDF renderer. Currently it always chooses the 'hocrtransform'
  renderer but that behavior may change.
- Set up Travis CI automatic integration testing

## v3.0

New features

- Easier installation with a Docker container or Python's `pip`
  package manager
- Eliminated many external dependencies, so it's easier to set up
- Now installs `ocrmypdf` to `/usr/local/bin` or equivalent for
  system-wide access and easier typing
- Improved command line syntax and usage help (`--help`)
- Tesseract 3.03+ PDF page rendering can be used instead for better
  positioning of recognized text (`--pdf-renderer tesseract`)
- PDF metadata (title, author, keywords) are now transferred to the
  output PDF
- PDF metadata can also be set from the command line (`--title`,
  etc.)
- Automatically repairs malformed input PDFs if possible
- Added test cases to confirm everything is working
- Added option to skip extremely large pages that take too long to OCR
  and are often not OCRable (e.g. large scanned maps or diagrams);
  other pages are still processed (`--skip-big`)
- Added option to kill Tesseract OCR process if it seems to be taking
  too long on a page, while still processing other pages
  (`--tesseract-timeout`)
- Less common colorspaces (CMYK, palette) are now supported by
  conversion to RGB
- Multiple images on the same PDF page are now supported

Changes

- New, robust rewrite in Python 3.4+ with
  [ruffus](http://www.ruffus.org.uk/index.html) pipelines

- Now uses Ghostscript 9.14's improved color conversion model to
  preserve PDF colors

- OCR text is now rendered in the PDF as invisible text. Previous
  versions of OCRmyPDF incorrectly rendered visible text with an image
  on top.

- All "tasks" in the pipeline can be executed in parallel on any
  available CPUs, increasing performance

- The `-o DPI` argument has been phased out, in favor of
  `--oversample DPI`, in case we need `-o OUTPUTFILE` in the future

- Removed several dependencies, so it's easier to install. We no longer
  use:

  - GNU [parallel](https://www.gnu.org/software/parallel/)
  - [ImageMagick](http://www.imagemagick.org/script/index.php)
  - Python 2.7
  - Poppler
  - [MuPDF](http://mupdf.com/docs/) tools
  - shell scripts
  - Java and [JHOVE](http://jhove.sourceforge.net/)
  - libxml2

- Some new external dependencies are required or optional, compared to
  v2.x:

  - Ghostscript 9.14+
  - [qpdf](http://qpdf.sourceforge.net/) 5.0.0+
  - [Unpaper](https://github.com/Flameeyes/unpaper) 6.1 (optional)
  - some automatically managed Python packages

Release candidates

- rc9:

  - Fix
    {issue}`118`:
    report error if ghostscript iccprofiles are missing
  - fixed another issue related to
    {issue}`111`: PDF
    rasterized to palette file
  - add support for image files with a palette
  - don't try to validate PDF file after an exception occurs

- rc8:

  - Fix
    {issue}`111`:
    exception thrown if PDF is missing DocumentInfo dictionary

- rc7:

  - fix error when installing direct from pip, "no such file
    'requirements.txt'"

- rc6:

  - dropped libxml2 (Python lxml) since Python 3's internal XML parser
    is sufficient
  - set up Docker container
  - fix Unicode errors if recognized text contains Unicode characters
    and system locale is not UTF-8

- rc5:

  - dropped Java and JHOVE in favour of qpdf
  - improved command line error output
  - additional tests and bug fixes
  - tested on Ubuntu 14.04 LTS

- rc4:

  - dropped MuPDF in favour of qpdf
  - fixed some installer issues and errors in installation
    instructions
  - improve performance: run Ghostscript with multithreaded rendering
  - improve performance: use multiple cores by default
  - bug fix: checking for wrong exception on process timeout

- rc3: skipping version number intentionally to avoid confusion with
  Tesseract

- rc2: first release for public testing to test-PyPI, Github

- rc1: testing release process

## Compatibility notes

- `./OCRmyPDF.sh` script is still available for now
- Stacking the verbosity option like `-vvv` is no longer supported
- The configuration file `config.sh` has been removed. Instead, you
  can feed a file to the arguments for common settings:

```
ocrmypdf input.pdf output.pdf @settings.txt
```

where `settings.txt` contains *one argument per line*, for example:

```
-l
deu
--author
A. Merkel
--pdf-renderer
tesseract
```

Fixes

- Handling of filenames containing spaces: fixed

Notes and known issues

- Some dependencies may work with lower versions than tested, so try
  overriding dependencies if they are "in the way" to see if they work.
- `--pdf-renderer tesseract` will output files with an incorrect page
  size in Tesseract 3.03, due to a bug in Tesseract.
- PDF files containing "inline images" are not supported and won't be
  for the 3.0 release. Scanned images almost never contain inline
  images.


================================================
FILE: docs/releasenotes/version04.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v4

## v4.5.6

- Fixed {issue}`156`,
  'NoneType' object has no attribute 'getObject' on pages with no
  optional /Contents record. This should resolve all issues related to
  pages with no /Contents record.
- Fixed {issue}`158`, ocrmypdf
  now stops and terminates if Ghostscript fails on an intermediate
  step, as it is not possible to proceed.
- Fixed {issue}`160`,
  exception thrown on certain invalid arguments instead of error
  message

## v4.5.5

- Automated update of macOS homebrew tap
- Fixed {issue}`154`, KeyError
  '/Contents' when searching for text on blank pages that have no
  /Contents record. Note: incomplete fix for this issue.

## v4.5.4

- Fixed `--skip-big` raising an exception if a page contains no images
  ({issue}`152`) (thanks
  to @TomRaz)
- Fixed an issue where pages with no images might trigger "cannot write
  mode P as JPEG"
  ({issue}`151`)

## v4.5.3

- Added a workaround for an issue where Ghostscript 9.21 and probably
  earlier versions would fail with the error message "VMerror -25", due
  to a Ghostscript bug in XMP metadata handling
- High Unicode characters (U+10000 and up) are no longer accepted for
  setting metadata on the command line, as Ghostscript may not handle
  them correctly.
- Fixed an issue where the `tess4` renderer would duplicate content
  onto output pages if tesseract failed or timed out
- Fixed `tess4` renderer not recognized when lossless reconstruction
  is possible

## v4.5.2

- Fixed {issue}`147`,
  `--pdf-renderer tess4 --clean` will produce an oversized page
  containing the original image in the bottom left corner, due to loss
  of DPI information.
- Make "using Tesseract 4.0" warning less ominous
- Set up machinery for homebrew OCRmyPDF tap

## v4.5.1

- Fixed {issue}`137`,
  proportions of images with a non-square pixel aspect ratio would be
  distorted in output for `--force-ocr` and some other combinations
  of flags

## v4.5

- PDFs containing "Form XObjects" are now supported (issue
  {issue}`134`; PDF
  reference manual 8.10), and images they contain are taken into
  account when determining the resolution for rasterizing
- The Tesseract 4 Docker image no longer includes all languages,
  because it took so long to build that something would tend to fail
- OCRmyPDF now warns about using `--pdf-renderer tesseract` with
  Tesseract 3.04 or lower due to issues with Ghostscript corrupting the
  OCR text in these cases

## v4.4.2

- The Docker images (ocrmypdf, ocrmypdf-polyglot, ocrmypdf-tess4) are
  now based on Ubuntu 16.10 instead of Debian stretch

  - This makes supporting the Tesseract 4 image easier
  - This could be a disruptive change for any Docker users who
    customized these images with their own changes, and made those
    changes in a way that depends on Debian and not Ubuntu

- OCRmyPDF now prevents running the Tesseract 4 renderer with Tesseract
  3.04, which was permitted in v4.4 and v4.4.1 but will not work

## v4.4.1

- To prevent a [TIFF output
  error](https://github.com/python-pillow/Pillow/issues/2206) caused
  by img2pdf >= 0.2.1 and Pillow \<= 3.4.2, dependencies have been
  tightened
- The Tesseract 4.00 simultaneous process limit was increased from 1 to
  2, since it was observed that 1 lowers performance
- Documentation improvements to describe the `--tesseract-config`
  feature
- Added test cases and fixed error handling for `--tesseract-config`
- Tweaks to setup.py to deal with issues in the v4.4 release

## v4.4

- Tesseract 4.00 is now supported on an experimental basis.

  - A new rendering option `--pdf-renderer tess4` exploits Tesseract
    4's new text-only output PDF mode. See the documentation on PDF
    Renderers for details.
  - The `--tesseract-oem` argument allows control over the Tesseract
    4 OCR engine mode (tesseract's `--oem`). Use
    `--tesseract-oem 2` to enforce the new LSTM mode.
  - Fixed poor performance with Tesseract 4.00 on Linux

- Fixed an issue that caused corruption of output to stdout in some
  cases

- Removed test for Pillow JPEG and PNG support, as the minimum
  supported version of Pillow now enforces this

- OCRmyPDF now tests that the intended destination file is writable
  before proceeding

- The test suite now requires `pytest-helpers-namespace` to run (but
  not install)

- Significant code reorganization to make OCRmyPDF re-entrant and
  improve performance. All changes should be backward compatible for
  the v4.x series.

  - However, OCRmyPDF's dependency "ruffus" is not re-entrant, so no
    Python API is available. Scripts should continue to use the
    command line interface.

## v4.3.5

- Update documentation to confirm Python 3.6.0 compatibility. No code
  changes were needed, so many earlier versions are likely supported.

## v4.3.4

- Fixed "decimal.InvalidOperation: quantize result has too many digits"
  for high DPI images

## v4.3.3

- Fixed PDF/A creation with Ghostscript 9.20 properly
- Fixed an exception on inline stencil masks with a missing optional
  parameter

## v4.3.2

- Fixed a PDF/A creation issue with Ghostscript 9.20 (note: this fix
  did not actually work)

## v4.3.1

- Fixed an issue where pages produced by the "hocr" renderer after a
  Tesseract timeout would be rotated incorrectly if the input page was
  rotated with a /Rotate marker
- Fixed a file handle leak in LeptonicaErrorTrap that would cause a
  "too many open files" error for files around a hundred pages
  long when `--deskew` or `--remove-background` or other Leptonica
  based image processing features were in use, depending on the system
  value of `ulimit -n`
- Ability to specify multiple languages for multilingual documents is
  now advertised in documentation
- Reduced the file sizes of some test resources
- Cleaned up debug output
- Tesseract caching in test cases is now more cautious about false
  cache hits and reproducing exact output, not that any problems were
  observed

## v4.3

- New feature `--remove-background` to detect and erase the
  background of color and grayscale images

- Better documentation

- Fixed an issue with PDFs that draw images when the raster stack depth
  is zero

- ocrmypdf can now redirect its output to stdout for use in a shell
  pipeline

  - This does not improve performance since temporary files are still
    used for buffering
  - Some output validation is disabled in this mode

## v4.2.5

- Fixed an issue
  ({issue}`100`) with
  PDFs that omit the optional /BitsPerComponent parameter on images
- Removed non-free file milk.pdf

## v4.2.4

- Fixed an error
  ({issue}`90`) caused by
  PDFs that use stencil masks properly
- Fixed handling of PDFs that try to draw images or stencil masks
  without properly setting up the graphics state (such images are now
  ignored for the purposes of calculating DPI)

## v4.2.3

- Fixed an issue with PDFs that store page rotation (/Rotate) in an
  indirect object

- Integrated a few fixes to simplify downstream packaging (Debian)

  - The test suite no longer assumes it is installed
  - If running Linux, skip a test that passes Unicode on the command
    line

- Added a test case to check explicit masks and stencil masks

- Added a test case for indirect objects and linearized PDFs

- Deprecated the OCRmyPDF.sh shell script

## v4.2.2

- Improvements to documentation

## v4.2.1

- Fixed an issue where PDF pages that contained stencil masks would
  report an incorrect DPI and cause Ghostscript to abort
- Implemented stdin streaming

## v4.2

- ocrmypdf will now try to convert single image files to PDFs if they
  are provided as input
  ({issue}`15`)

  - This is a basic convenience feature. It only supports a single
    image and always makes the image fill the whole page.
  - For better control over image to PDF conversion, use `img2pdf`
    (one of ocrmypdf's dependencies)

- New argument `--output-type {pdf|pdfa}` allows disabling
  Ghostscript PDF/A generation

  - `pdfa` is the default, consistent with past behavior
  - `pdf` provides a workaround for users concerned about the
    increase in file size from Ghostscript forcing JBIG2 images to
    CCITT and transcoding JPEGs
  - `pdf` preserves as much as it can about the original file,
    including problems that PDF/A conversion fixes

- PDFs containing images with "non-square" pixel aspect ratios, such as
  200x100 DPI, are now handled and converted properly (fixing a bug
  that caused them to be cropped)

- `--force-ocr` rasterizes pages even if they contain no images

  - supports users who want to use OCRmyPDF to reconstruct text
    information in PDFs with damaged Unicode maps (copy and paste text
    does not match displayed text)
  - supports reinterpreting PDFs where text was rendered as curves for
    printing, and text needs to be recovered
  - fixes issue
    {issue}`82`

- Fixes an issue where, with certain settings, monochrome images in
  PDFs would be converted to 8-bit grayscale, increasing file size
  ({issue}`79`)

- Support for Ubuntu 12.04 LTS "precise" has been dropped in favor of
  (roughly) Ubuntu 14.04 LTS "trusty"

  - Some Ubuntu "PPAs" (backports) are needed to make it work

- Support for some older dependencies dropped

  - Ghostscript 9.15 or later is now required (available in Ubuntu
    trusty with backports)
  - Tesseract 3.03 or later is now required (available in Ubuntu
    trusty)

- Ghostscript now runs in "safer" mode where possible

## v4.1.4

- Bug fix: monochrome images with an ICC profile attached were
  incorrectly converted to full color images if lossless reconstruction
  was not possible due to other settings; consequence was increased
  file size for these images

## v4.1.3

- More helpful error message for PDFs with version 4 security handler
- Update usage instructions for Windows/Docker users
- Fixed order of operations for matrix multiplication (no effect on most
  users)
- Add a few leptonica wrapper functions (no effect on most users)

## v4.1.2

- Replace IEC sRGB ICC profile with Debian's sRGB (from
  icc-profiles-free) which is more compatible with the MIT license
- More helpful error message for an error related to certain types of
  malformed PDFs

## v4.1

- `--rotate-pages` now only rotates pages when reasonably confident
  in the orientation. This behavior can be adjusted with the new
  argument `--rotate-pages-threshold`
- Fixed problems in error checking if `unpaper` is uninstalled or
  missing at run-time
- Fixed problems with "RethrownJobError" errors during error handling
  that suppressed the useful error messages

## v4.0.7

- Minor correction to Ghostscript output settings

## v4.0.6

- Update install instructions
- Provide a sRGB profile instead of using Ghostscript's

## v4.0.5

- Remove some verbose debug messages from v4.0.4
- Fixed temporary file that wasn't being deleted
- DPI is now calculated correctly for cropped images, along with other
  image transformations
- Inline images are now checked during DPI calculation instead of
  rejecting the image

## v4.0.4

Released with verbose debug message turned on. Do not use. Skip to
v4.0.5.

## v4.0.3

New features

- Page orientations detected are now reported in a summary comment

Fixes

- Show stack trace if unexpected errors occur
- Treat "too few characters" error message from Tesseract as a reason
  to skip that page rather than abort the file
- Docker: fix blank JPEG2000 issue by insisting on Ghostscript versions
  that have this fixed

## v4.0.2

Fixes

- Fixed compatibility with Tesseract 3.04.01 release, particularly its
  different way of outputting orientation information
- Improved handling of Tesseract errors and crashes
- Fixed use of chmod on Docker that broke most test cases

## v4.0.1

Fixes

- Fixed a KeyError if tesseract fails to find page orientation
  information

## v4.0

New features

- Automatic page rotation (`-r`) is now available. It ignores
  any prior rotation information on PDFs and sets rotation based on the
  dominant orientation of detectable text. This feature is fairly
  reliable but some false positives occur especially if there is not
  much text to work with.
  ({issue}`4`)
- Deskewing is now performed using Leptonica instead of unpaper.
  Leptonica is faster and more reliable at image deskewing than
  unpaper.

Fixes

- Fixed an issue where lossless reconstruction could cause some pages
  to appear incorrectly if the page was rotated by the user in
  Acrobat after being scanned (specifically if it has a /Rotate tag)
- Fixed an issue where lossless reconstruction could misalign the
  graphics layer with respect to text layer if the page had been
  cropped such that its origin is not (0, 0)
  ({issue}`49`)

Changes

- Logging output is now much easier to read
- `--deskew` is now performed by Leptonica instead of unpaper
  ({issue}`25`)
- libffi is now required
- Some changes were made to the Docker and Travis build environments to
  support libffi
- `--pdf-renderer=tesseract` now displays a warning if the Tesseract
  version is less than 3.04.01, the planned release that will include
  fixes to an important OCR text rendering bug in Tesseract 3.04.00.
  You can also manually install ./share/sharp2.ttf on top of pdf.ttf in
  your Tesseract tessdata folder to correct the problem.


================================================
FILE: docs/releasenotes/version05.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v5

## v5.7.0

- Fixed an issue that caused poor CPU utilization on machines with more
  than 4 cores when running Tesseract 4. (Related to {issue}`217`.)

- The 'hocr' renderer has been improved. The 'sandwich' and 'tesseract'
  renderers are still better for most use cases, but 'hocr' may be
  useful for people who work with the PDF.js renderer in English/ASCII
  languages. ({issue}`225`)

  - It now formats text in a manner that is easier for certain PDF
    viewers to select and extract copy and paste text. This should
    help macOS Preview and PDF.js in particular.
  - The appearance of selected text and behavior of selecting text is
    improved.
  - The PDF content stream now uses relative moves, making it more
    compact and easier for viewers to determine when two words are on
    the same line.
  - It can now deal with text on a skewed baseline.
  - Thanks to @cforcey for the pull request, @jbreiden for many
    helpful suggestions, @ctbarbour for another round of improvements,
    and @acaloiaro for an independent review.

## v5.6.3

- Suppress two debug messages that were too verbose

## v5.6.2

- Development branch accidentally tagged as release. Do not use.

## v5.6.1

- Fixed {issue}`219`: change
  how the final output file is created to avoid triggering permission
  errors when the output is a special file such as `/dev/null`
- Fixed test suite failures due to a qpdf 8.0.0 regression and Python
  3.5's handling of symlink
- The "encrypted PDF" error message was different depending on the type
  of PDF encryption. Now a single clear message appears for all types
  of PDF encryption.
- ocrmypdf is now in Homebrew. Homebrew users are advised to use the
  version of ocrmypdf in the official homebrew-core formulas rather
  than the private tap.
- Some linting

## v5.6.0

- Fixed {issue}`216`: preserve
  "text as curves" PDFs without rasterizing file
- Related to the above, messages about rasterizing are more consistent
- For consistency, minor releases will now get the trailing .0
  they always should have had.

## v5.5

- Add new argument `--max-image-mpixels`. Pillow 5.0 now raises an
  exception when images may be decompression bombs. This argument can
  be used to override the limit Pillow sets.
- Fixed output page cropped when using the sandwich renderer and OCR is
  skipped on a rotated and image-processed page
- A warning is now issued when old versions of Ghostscript are used in
  cases known to cause issues with non-Latin characters
- Fixed a few parameter validation checks for `--output-type pdfa-1` and
  `pdfa-2`

## v5.4.4

- Fixed {issue}`181`: fix
  final merge failure for PDFs with more pages than the system file
  handle limit (`ulimit -n`)
- Fixed {issue}`200`: an
  uncommon syntax for formatting decimal numbers in a PDF would cause
  qpdf to issue a warning, which ocrmypdf treated as an error. Now
  the warning is relayed.
- Fixed an issue where intermediate PDFs would be created at version 1.3
  instead of the version of the original file. It's possible but
  unlikely this had side effects.
- A warning is now issued when older versions of qpdf are used since
  issues like
  {issue}`200` cause
  qpdf to infinite-loop
- Address issue
  {issue}`140`: if
  Tesseract outputs invalid UTF-8, escape it and print its message
  instead of aborting with a Unicode error
- Adding previously unlisted setup requirement, pytest-runner
- Update documentation: fix an error in the example script for Synology
  with Docker images, improved security guidance, advised
  `pip install --user`

## v5.4.3

- If a subprocess fails to report its version when queried, exit
  cleanly with an error instead of throwing an exception
- Added test to confirm that the system locale is Unicode-aware and
  fail early if it's not
- Clarified some copyright information
- Updated pinned requirements.txt so the homebrew formula captures more
  recent versions

## v5.4.2

- Fixed a regression from v5.4.1 that caused sidecar files to be
  created as empty files

## v5.4.1

- Add workaround for Tesseract v4.00alpha crash when trying to obtain
  orientation and the latest language packs are installed

## v5.4

- Change wording of a deprecation warning to improve clarity
- Added option to generate PDF/A-1b output if desired
  (`--output-type pdfa-1`); default remains PDF/A-2b generation
- Update documentation

## v5.3.3

- Fixed missing error message that should occur when trying to force
  `--pdf-renderer sandwich` on old versions of Tesseract
- Update copyright information in test files
- Set system `LANG` to UTF-8 in Dockerfiles to avoid UTF-8 encoding
  errors

## v5.3.2

- Fixed a broken test case related to language packs

## v5.3.1

- Fixed wrong return code given for missing Tesseract language packs
- Fixed "brew audit" crashing on Travis when trying to auto-brew

## v5.3

- Added `--user-words` and `--user-patterns` arguments which are
  forwarded to Tesseract OCR as words and regular expressions
  respectively, to guide OCR. Supplying a list of subject-domain
  words should assist Tesseract with resolving words.
  ({issue}`165`)
- Using a non Latin-1 language with the "hocr" renderer now warns about
  possible OCR quality and recommends workarounds
  ({issue}`176`)
- Output file path added to error message when that location is not
  writable
  ({issue}`175`)
- Otherwise valid PDFs with leading whitespace at the beginning of the
  file are now accepted

## v5.2

- When using Tesseract 3.05.01 or newer, OCRmyPDF will select the
  "sandwich" PDF renderer by default, unless another PDF renderer is
  specified with the `--pdf-renderer` argument. The previous behavior
  was to select `--pdf-renderer=hocr`.
- The "tesseract" PDF renderer is now deprecated, since it can cause
  problems with Ghostscript on Tesseract 3.05.00
- The "tess4" PDF renderer has been renamed to "sandwich". "tess4" is
  now a deprecated alias for "sandwich".

## v5.1

- Files with pages larger than 200" (5080 mm) in either dimension are
  now supported with `--output-type=pdf` with the page size preserved
  (in the PDF specification this feature is called UserUnit scaling).
  Due to Ghostscript limitations this is not available in conjunction
  with PDF/A output.

## v5.0.1

- Fixed {issue}`169`,
  exception due to failure to create sidecar text files on some
  versions of Tesseract 3.04, including the jbarlow83/ocrmypdf Docker
  image

## v5.0

- Backward incompatible changes

  > - Support for Python 3.4 dropped. Python 3.5 is now required.
  > - Support for Tesseract 3.02 and 3.03 dropped. Tesseract 3.04 or
  >   newer is required. Tesseract 4.00 (alpha) is supported.
  > - The OCRmyPDF.sh script was removed.

- Add a new feature, `--sidecar`, which allows creating "sidecar"
  text files which contain the OCR results in plain text. This OCR
  text is more reliable than extracting text from PDFs. Closes
  {issue}`126`.

- New feature: `--pdfa-image-compression`, which allows overriding
  Ghostscript's lossy-or-lossless image encoding heuristic and making
  all images JPEG encoded or lossless encoded as desired. Fixes
  {issue}`163`.

- Fixed {issue}`143`, added
  `--quiet` to suppress "INFO" messages

- Fixed {issue}`164`, a typo

- Removed the command line parameters `-n` and `--just-print` since
  they have not worked for some time (reported as Ubuntu bug
  [#1687308](https://bugs.launchpad.net/ubuntu/+source/ocrmypdf/+bug/1687308))


================================================
FILE: docs/releasenotes/version06.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v6

## v6.2.5

- Disable a failing test due to Tesseract 4.0rc1 behavior change.
  Previously, Tesseract would exit with an error message if its
  configuration was invalid, and OCRmyPDF would intercept this message.
  Now Tesseract issues a warning, which OCRmyPDF v6.2.5 may relay or
  ignore. (In v7.x, OCRmyPDF will respond to the warning.)
- This release branch no longer supports using the optional PyMuPDF
  installation, since it was removed in v7.x.
- This release branch no longer supports macOS. macOS users should
  upgrade to v7.x.

## v6.2.4

- Backport Ghostscript 9.25 compatibility fixes, which removes support
  for setting Unicode metadata
- Backport blacklisting Ghostscript 9.24
- Older versions of Ghostscript are still supported

## v6.2.3

- Fixed compatibility with img2pdf >= 0.3.0 by rejecting input images
  that have an alpha channel
- This version will be included in Ubuntu 18.10

## v6.2.2

- Backport compatibility fixes for Python 3.7 and ruffus 2.7.0 from
  v7.0.0
- Backport fix to ignore masks when deciding what colors are on a page
- Backport some minor improvements from v7.0.0: better argument
  validation and warnings about the Tesseract 4.0.0 `--user-words`
  regression

## v6.2.1

- Fixed recent versions of Tesseract (after 4.0.0-beta1) not being
  detected as supporting the `sandwich` renderer ({issue}`271`).

## v6.2.0

- **Docker**: The Docker image `ocrmypdf-tess4` has been removed. The
  main Docker images, `ocrmypdf` and `ocrmypdf-polyglot` now use
  Ubuntu 18.04 as a base image, and as such Tesseract 4.0.0-beta1 is
  now the Tesseract version they use. There is no Docker image based on
  Tesseract 3.05 anymore.
- Creation of PDF/A-3 is now supported. However, there is no ability to
  attach files to PDF/A-3.
- Lists more reasons why the file size might grow.
- Fixed {issue}`262`,
  `--remove-background` error on PDFs containing colormapped
  (paletted) images.
- Fixed another XMP metadata validation issue, in cases where the input
  file's creation date has no timezone and the creation date is not
  overridden.

## v6.1.5

- Fixed {issue}`253`, a
  possible division by zero when using the `hocr` renderer.
- Fixed incorrectly formatted `<xmp:ModifyDate>` field inside XMP
  metadata for PDF/As. veraPDF flags this as a PDF/A validation
  failure. The error caused the timezone and final digit of the
  seconds of modified time to be omitted, so at worst the modification
  time stamp is rounded to the nearest 10 seconds.

## v6.1.4

- Fixed {issue}`248`
  `--clean` argument may remove OCR from left column of text on
  certain documents. We now set `--layout none` to suppress this.
- The test cache was updated to reflect the change above.
- Change test suite to accommodate Ghostscript 9.23's new ability to
  insert JPEGs into PDFs without transcoding.
- XMP metadata in PDFs is now examined using `defusedxml` for safety.
- If an external process exits with a signal when asked to report its
  version, we now print the system error message instead of suppressing
  it. This occurred when the required executable was found but was
  missing a shared library.
- qpdf 7.0.0 or newer is now required as the test suite can no longer
  pass without it.

### Notes

- An apparent [regression in Ghostscript
  9.23](https://bugs.ghostscript.com/show_bug.cgi?id=699216) will
  cause some ocrmypdf output files to become invalid in rare cases; the
  workaround for the moment is to set `--force-ocr`.

## v6.1.3

- Fixed {issue}`247`,
  `/CreationDate` metadata not copied from input to output.
- A warning is now issued when Python 3.5 is used on files with a large
  page count, as this case is known to regress to single core
  performance. The cause of this problem is unknown.

## v6.1.2

- Upgrade to PyMuPDF v1.12.5 which includes a more complete fix to
  {issue}`239`.
- Add `defusedxml` dependency.

## v6.1.1

- Fixed text being reported as found on all pages if PyMuPDF is not
  installed.

## v6.1.0

- PyMuPDF is now an optional but recommended dependency, to alleviate
  installation difficulties on platforms that have less access to
  PyMuPDF than the author anticipated. (For version 6.x only) install
  OCRmyPDF with `pip install ocrmypdf[fitz]` to use it to its full
  potential.
- Fixed `FileExistsError` that could occur if OCR timed out while it
  was generating the output file.
  ({issue}`218`)
- Fixed table of contents/bookmarks all being redirected to page 1 when
  generating a PDF/A (with PyMuPDF). (Without PyMuPDF the table of
  contents is removed in PDF/A mode.)
- Fixed "RuntimeError: invalid key in dict" when table of
  contents/bookmarks titles contained the character `)`.
  ({issue}`239`)
- Added a new argument `--skip-repair` to skip the initial PDF repair
  step if the PDF is already well-formed (because another program
  repaired it).

## v6.0.0

- The software license has been changed to GPLv3 [it has since changed again].
  Test resource files and some individual sources may have other licenses.

- OCRmyPDF now depends on
  [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/installation/).
  Including PyMuPDF is the primary reason for the change to GPLv3.

- Other backward incompatible changes

  - The `OCRMYPDF_TESSERACT`, `OCRMYPDF_QPDF`, `OCRMYPDF_GS` and
    `OCRMYPDF_UNPAPER` environment variables are no longer used.
    Change `PATH` if you need to override the external programs
    OCRmyPDF uses.
  - The `ocrmypdf` package has been moved to `src/ocrmypdf` to
    avoid issues with accidental import.
  - The function `ocrmypdf.exec.get_program` was removed.
  - The deprecated module `ocrmypdf.pageinfo` was removed.
  - The `--pdf-renderer tess4` alias for `sandwich` was removed.

- Fixed an issue where OCRmyPDF failed to detect existing text on
  pages, depending on how the text and fonts were encoded within the
  PDF. ({issue}`233,232`)

- Fixed an issue that caused dramatic inflation of file sizes when
  `--skip-text --output-type pdf` was used. OCRmyPDF now removes
  duplicate resources such as fonts, images and other objects that it
  generates. ({issue}`237`)

- Improved performance of the initial page splitting step. Originally
  this step was not believed to be expensive and ran in a single process.
  Large file testing revealed it to be a bottleneck, so it is now
  parallelized. On a 700 page file with quad core machine, this change
  saves about 2 minutes. ({issue}`234`)

- The test suite now includes a cache that can be used to speed up test
  runs across platforms. This also does not require computing
  checksums, so it's faster. ({issue}`217`)


================================================
FILE: docs/releasenotes/version07.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v7

## v7.4.0

- `--force-ocr` may now be used with the new `--threshold` and
  `--mask-barcodes` features
- pikepdf >= 0.9.1 is now required.
- Changed metadata handling to pikepdf 0.9.1. As a result, metadata
  handling of non-ASCII characters in Ghostscript 9.25 or later is
  fixed.
- chardet >= 3.0.4 is temporarily listed as required. pdfminer.six
  depends on it, but the most recent release does not specify this
  requirement.
  ({issue}`326`)
- python-xmp-toolkit and libexempi are no longer required.
- A new Docker image is now being provided for users who wish to access
  OCRmyPDF over a simple HTTP interface, instead of the command line.
- Increase tolerance of PDFs that overflow or underflow the PDF
  graphics stack.
  ({issue}`325`)

## v7.3.1

- Fixed performance regression from v7.3.0; fast page analysis was not
  selected when it should be.
- Fixed a few exceptions related to the new `--mask-barcodes` feature
  and improved argument checking
- Added missing detection of TrueType fonts that lack a Unicode mapping

## v7.3.0

- Added a new feature `--redo-ocr` to detect existing OCR in a file,
  remove it, and redo the OCR. This may be particularly helpful for
  anyone who wants to take advantage of OCR quality improvements in
  Tesseract 4.0. Note that OCR added by OCRmyPDF before version 3.0
  cannot be detected since it was not properly marked as invisible text
  in the earliest versions. OCR that constructs a font from visible
  text, such as Adobe Acrobat's ClearScan, also cannot be detected.

- OCRmyPDF's content detection is generally more sophisticated. It
  learns more about the contents of each PDF and makes better
  recommendations:

  - OCRmyPDF can now detect when a PDF contains text that cannot be
    mapped to Unicode (meaning it is readable to human eyes but
    copy-pastes as gibberish). In these cases it recommends
    `--force-ocr` to make the text searchable.
  - PDFs containing vector objects are now rendered at more
    appropriate resolution for OCR.
  - We now exit with an error for PDFs that contain Adobe LiveCycle
    Designer's dynamic XFA forms. Currently the open source community
    does not have tools to work with these files.
  - OCRmyPDF now warns when a PDF contains Adobe AcroForms, since
    such files probably do not need OCR. It can work with these files.

- Added three new **experimental** features to improve OCR quality in
  certain conditions. The name, syntax and behavior of these arguments
  is subject to change. They may also be incompatible with some other
  features.

  - `--remove-vectors` which strips out vector graphics. This can
    improve OCR quality since OCR will not search artwork for readable
    text; however, it currently removes "text as curves" as well.
  - `--mask-barcodes` to detect and suppress barcodes in files. We
    have observed that barcodes can interfere with OCR because they
    are "text-like" but not actually textual.
  - `--threshold` which uses a more sophisticated thresholding
    algorithm than is currently in use in Tesseract OCR. This works
    around a [known issue in Tesseract
    4.0](https://github.com/tesseract-ocr/tesseract/issues/1990)
    with dark text on bright backgrounds.

- Fixed an issue where an error message was not reported when the
  installed Ghostscript was very old.

- The PDF optimizer now saves files with object streams enabled when
  the optimization level is `--optimize 1` or higher (the default).
  This makes files a little bit smaller, but requires PDF 1.5. PDF 1.5
  was first released in 2003 and is broadly supported by PDF viewers,
  but some rudimentary PDF parsers such as PyPDF2 do not understand
  object streams. You can use the command line tool
  `qpdf --object-streams=disable` or
  [pikepdf](https://github.com/pikepdf/pikepdf) library to remove
  them.

- New dependency: pdfminer.six 20181108. Note this is a fork of the
  Python 2-only pdfminer.

- Deprecation notice: At the end of 2018, we will be ending support for
  Python 3.5 and Tesseract 3.x. OCRmyPDF v7 will continue to work with
  older versions.

## v7.2.1

- Fixed compatibility with an API change in pikepdf 0.3.5.
- A kludge to support Leptonica versions older than 1.72 in the test
  suite was dropped. Older versions of Leptonica are likely still
  compatible. The only impact is that a portion of the test suite will
  be skipped.

## v7.2.0

**Lossy JBIG2 behavior change**

A user reported that ocrmypdf was in fact using JBIG2 in **lossy**
compression mode. This was not the intended behavior. Users should
[review the technical concerns with JBIG2 in lossy
mode](https://abbyy.technology/en:kb:tip:jbig2_compression_and_ocr)
and decide if this is a concern for their use case.

JBIG2 lossy mode does achieve higher compression ratios than any other
monochrome compression technology; for large text documents the savings
are considerable. JBIG2 lossless still gives great compression ratios
and is a major improvement over the older CCITT G4 standard.

Only users who have reviewed the concerns with JBIG2 in lossy mode
should opt-in. As such, lossy mode JBIG2 is only turned on when the new
argument `--jbig2-lossy` is issued. This is independent of the setting
for `--optimize`.

Users who did not install an optional JBIG2 encoder are unaffected.

(Thanks to user 'bsdice' for reporting this issue.)

**Other issues**

- When the image optimizer quantizes an image to 1 bit per pixel, it
  will now attempt to further optimize that image as CCITT or JBIG2,
  instead of keeping it in the "flate" encoding which is not efficient
  for 1 bpp images.
  ({issue}`297`)
- Images in PDFs that are used as soft masks (i.e. transparency masks
  or alpha channels) are now excluded from optimization.
- Fixed handling of Tesseract 4.0-rc1 which now accepts invalid
  Tesseract configuration files, which broke the test suite.

## v7.1.0

- Improve the performance of initial text extraction, which is done to
  determine if a file contains existing text of some kind or not. On
  large files, this initial processing is now about 20 times faster.
  ({issue}`299`)
- pikepdf 0.3.3 is now required.
- Fixed {issue}`231`, a
  problem with JPEG2000 images where image metadata was only available
  inside the JPEG2000 file.
- Fixed some additional Ghostscript 9.25 compatibility issues.
- Improved handling of KeyboardInterrupt error messages.
  ({issue}`301`)
- README.md is now served in GitHub markdown instead of
  reStructuredText.

## v7.0.6

- Blacklist Ghostscript 9.24, now that 9.25 is available and fixes many
  regressions in 9.24.

## v7.0.5

- Improve compatibility with Ghostscript 9.24, and enable the JPEG
  passthrough feature when this version is installed.
- Ghostscript 9.24 lost the ability to set PDF title, author, subject
  and keyword metadata to Unicode strings. OCRmyPDF will set ASCII
  strings and warn when Unicode is suppressed. Other software may be
  used to update metadata. This is a short term work around.
- PDFs generated by Kodak Capture Desktop, or generally PDFs that
  contain indirect references to null objects in their table of
  contents, would have an invalid table of contents after processing by
  OCRmyPDF that might interfere with other viewers. This has been
  fixed.
- Detect PDFs generated by Adobe LiveCycle, which can only be displayed
  in Adobe Acrobat and Reader currently. When these are encountered,
  exit with an error instead of performing OCR on the "Please wait"
  error message page.

## v7.0.4

- Fixed exception thrown when trying to optimize a certain type of PNG
  embedded in a PDF with the `-O2` option
- Update to pikepdf 0.3.2, to gain support for optimizing some
  additional image types that were previously excluded from
  optimization (CMYK and grayscale). Fixes
  {issue}`285`.

## v7.0.3

- Fixed {issue}`284`, an error
  when parsing inline images that are also image masks, by
  upgrading pikepdf to 0.3.1

## v7.0.2

- Fixed a regression with `--rotate-pages` on pages that already had
  rotations applied.
  ({issue}`279`)
- Improve quality of page rotation in some cases by rasterizing a
  higher quality preview image.
  ({issue}`281`)

## v7.0.1

- Fixed compatibility with img2pdf >= 0.3.0 by rejecting input images
  that have an alpha channel
- Add forward compatibility for pikepdf 0.3.0 (unrelated to img2pdf)
- Various documentation updates for v7.0.0 changes

## v7.0.0

- The core algorithm for combining OCR layers with existing PDF pages
  has been rewritten and improved considerably. PDFs are no longer
  split into single page PDFs for processing; instead, images are
  rendered and the OCR results are grafted onto the input PDF. The new
  algorithm uses less temporary disk space and is much more performant
  especially for large files.

- New dependency: [pikepdf](https://github.com/pikepdf/pikepdf).
  pikepdf is a powerful new Python PDF library driving the latest
  OCRmyPDF features, built on the QPDF C++ library (libqpdf).

- New feature: PDF optimization with `-O` or `--optimize`. After
  OCR, OCRmyPDF will perform image optimizations relevant to OCR PDFs.

  - If a JBIG2 encoder is available, then monochrome images will be
    converted, with the potential for huge savings on large black and
    white images, since JBIG2 is far more efficient than any other
    monochrome (bi-level) compression. (All known US patents related
    to JBIG2 have probably expired, but it remains the responsibility
    of the user to supply a JBIG2 encoder such as
    [jbig2enc](https://github.com/agl/jbig2enc). OCRmyPDF does not
    implement JBIG2 encoding.)
  - If `pngquant` is installed, OCRmyPDF will optionally use it to
    perform lossy quantization and compression of PNG images.
  - The quality of JPEGs can also be lowered, on the assumption that a
    lower quality image may be suitable for storage after OCR.
  - This image optimization component will eventually be offered as an
    independent command line utility.
  - Optimization ranges from `-O0` through `-O3`, where `0`
    disables optimization and `3` implements all options. `1`, the
    default, performs only safe and lossless optimizations. (This is
    similar to GCC's optimization parameter.) The exact type of
    optimizations performed will vary over time.

- Small amounts of text in the margins of a page, such as watermarks,
  page numbers, or digital stamps, will no longer prevent the rest of a
  page from being OCRed when `--skip-text` is issued. This behavior
  is based on a heuristic.

- Removed features

  - The deprecated `--pdf-renderer tesseract` PDF renderer was
    removed.
  - `-g`, the option to generate debug text pages, was removed
    because it was a maintenance burden and only worked in isolated
    cases. HOCR pages can still be previewed by running the
    hocrtransform.py with appropriate settings.

- Removed dependencies

  - `PyPDF2`
  - `defusedxml`
  - `PyMuPDF`

- The `sandwich` PDF renderer can be used with all supported versions
  of Tesseract, including those prior to v3.05 which don't support
  `-c textonly`. (Tesseract v4.0.0 is recommended and more
  efficient.)

- `--pdf-renderer auto` option and the diagnostics used to select a
  PDF renderer now work better with old versions, but may make
  different decisions than past versions.

- If everything succeeds but PDF/A conversion fails, a distinct return
  code is now returned (`ExitCode.pdfa_conversion_failed (10)`) where
  this situation previously returned
  `ExitCode.invalid_output_pdf (4)`. The latter is now returned only
  if there is some indication that the output file is invalid.

- Notes for downstream packagers

  - There is also a new dependency on `python-xmp-toolkit` which in
    turn depends on `libexempi3`.
  - It may be necessary to separately `pip install pycparser` to
    avoid [another Python 3.7
    issue](https://github.com/eliben/pycparser/pull/135).


================================================
FILE: docs/releasenotes/version08.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v8

## v8.3.2

- Dropped workaround for macOS that allowed it to work without pdfminer.six,
  now a proper sdist release of pdfminer.six is available.
- pikepdf 1.5.0 is now required.

## v8.3.1

- Fixed an issue where PDFs with malformed metadata would be rendered as
  blank pages. {issue}`398`.

## v8.3.0

- Improved the strategy for updating pages when a new image of the page
  was produced. We now attempt to preserve more content from the
  original file, for annotations in particular.
- For PDFs with more than 100 pages and a sequence where one PDF page
  was replaced and one or more subsequent ones were skipped, an
  intermediate file would be corrupted while grafting OCR text, causing
  processing to fail. This is a regression, likely introduced in
  v8.2.4.
- Previously, we resized the images produced by Ghostscript by a small
  number of pixels to ensure the output image size was exactly what
  we wanted. Having discovered a way to get Ghostscript to produce the
  exact image sizes we require, we eliminated the resizing step.
- Command line completions for `bash` are now available, in addition
  to `fish`, both in `misc/completion`. Package maintainers, please
  install these so users can take advantage.
- Updated requirements.
- pikepdf 1.3.0 is now required.

## v8.2.4

- Fixed a false positive while checking for a certain type of PDF that
  only Acrobat can read. We now more accurately detect Acrobat-only
  PDFs.
- OCRmyPDF holds fewer open file handles and is more prompt about
  releasing those it no longer needs.
- Minor optimization: we no longer traverse the table of contents to
  ensure all references in it are resolved, as changes to libqpdf have
  made this unnecessary.
- pikepdf 1.2.0 is now required.

## v8.2.3

- Fixed that `--mask-barcodes` would occasionally leave an unwanted
  temporary file named `junkpixt` in the current working folder.
- Fixed (hopefully) handling of Leptonica errors in an environment
  where a non-standard `sys.stderr` is present.
- Improved help text for `--verbose`.

## v8.2.2

- Fixed a regression from v8.2.0, an exception that occurred while
  attempting to report that `unpaper` or another optional dependency
  was unavailable.
- In some cases, `ocrmypdf [-c|--clean]` failed to exit with an error
  when `unpaper` is not installed.

## v8.2.1

- This release was canceled.

## v8.2.0

- A major improvement to our Docker image is now available thanks to
  hard work contributed by @mawi12345. The new Docker image,
  ocrmypdf-alpine, is based on Alpine Linux, and includes most of the
  functionality of three existing images in a smaller package. This
  image will replace the main Docker image eventually but for now all
  are being built. [See documentation for
  details](https://ocrmypdf.readthedocs.io/en/latest/docker.html).
- Documentation reorganized especially around the use of Docker images.
- Fixed a problem with PDF image optimization, where the optimizer
  would unnecessarily decompress and recompress PNG images, in some
  cases losing the benefits of the quantization it had just
  performed. The optimizer is now capable of embedding PNG images into
  PDFs without transcoding them.
- Fixed a minor regression with lossy JBIG2 image optimization. All
  JBIG2 candidate images were incorrectly placed into a single
  optimization group for the whole file, instead of grouping pages
  together. This usually makes a larger JBIG2Globals dictionary and
  results in inferior compression, so it worked less well than
  designed. However, quality would not be impacted. Lossless JBIG2 was
  entirely unaffected.
- Updated dependencies, including pikepdf to 1.1.0. This fixes
  {issue}`358`.
- The install-time version checks for certain external programs have
  been removed from setup.py. These tests are now performed at
  run-time.
- The non-standard option to override install-time checks
  (`setup.py install --force`) is now deprecated and prints a
  warning. It will be removed in a future release.

## v8.1.0

- Added a feature, `--unpaper-args`, which allows passing arbitrary
  arguments to `unpaper` when using `--clean` or `--clean-final`.
  The default, very conservative unpaper settings are suppressed.
- The argument `--clean-final` now implies `--clean`. It was
  possible to issue `--clean-final` on its own before this, but it would
  have no useful effect.
- Fixed an exception on traversing corrupt table of contents entries
  (specifically, those with invalid destination objects)
- Fixed an issue when using `--tesseract-timeout` and image
  processing features on a file with more than 100 pages.
  {issue}`347`
- OCRmyPDF now always calls `os.nice(5)` to signal to operating
  systems that it is a background process.

## v8.0.1

- Fixed an exception when parsing PDFs that are missing a required
  field. {issue}`325`
- pikepdf 1.0.5 is now required, to address some other PDF parsing
  issues.

## v8.0.0

No major features. The intent of this release is to sever support for
older versions of certain dependencies.

**Breaking changes**

- Dropped support for Tesseract 3.x. Tesseract 4.0 or newer is now
  required.
- Dropped support for Python 3.5.
- Some `ocrmypdf.pdfa` APIs that were deprecated in v7.x were
  removed. This functionality has been moved to pikepdf.

**Other changes**

- Fixed an unhandled exception when attempting to mask barcodes.
  {issue}`322`
- It is now possible to use ocrmypdf without pdfminer.six, to support
  distributions that do not have it or cannot currently use it (e.g.
  Homebrew). Downstream maintainers should include pdfminer.six if
  possible.
- A warning is now issued when PDF/A conversion removes some XMP
  metadata from the input PDF. (Only a "whitelist" of certain XMP
  metadata types are allowed in PDF/A.)
- Fixed several issues that caused PDF/As to be produced with
  nonconforming XMP metadata (would fail validation with veraPDF).
- Fixed some instances where invalid DocumentInfo from a PDF caused XMP
  metadata creation to fail.
- Fixed a few documentation problems.
- pikepdf 1.0.2 is now required.


================================================
FILE: docs/releasenotes/version09.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v9

## v9.8.2

- Fixed an issue where OCRmyPDF would ignore text inside Form XObject when
  making certain decisions about whether a document already had text.
- Fixed file size increase warning to take overhead of small files into account.
- Added instructions for installing on Cygwin.

## v9.8.1

- Fixed an issue where unexpected files in the `%PROGRAMFILES%\gs` directory
  (Windows) caused an exception.
- Mark pdfminer.six 20200517 as supported.
- If jbig2enc is missing and optimization is requested, a warning is issued
  instead of an error, which was the intended behavior.
- Documentation updates.

## v9.8.0

- Fixed issue where only the first PNG (FlateDecode) image in a file would be
  considered for optimization. File sizes should be improved from here on.
- Fixed a startup crash when the chosen language was Japanese ({issue}`543`).
- Added options to configure polling and log level to watcher.py.

## v9.7.2

- Fixed an issue with `ocrmypdf.ocr(...language=)` not accepting a list of
  languages as documented.
- Updated setup.py to confirm that pdfminer.six version 20200402 is supported.

## v9.7.1

- Fixed version check failing when used with qpdf 10.0.0.
- Added some missing type annotations.
- Updated documentation to warn about need for "ifmain" guard and Windows.

## v9.7.0

- Fixed an error in watcher.py if `OCR_JSON_SETTINGS` was not defined.
- Ghostscript 9.51 is now blacklisted, due to numerous problems with this version.
- Added a workaround for a problem with "txtwrite" in Ghostscript 9.52.
- Fixed an issue where the incorrect number of threads used was shown when
  `OMP_THREAD_LIMIT` was manipulated.
- Removed a possible performance bottleneck for files that use hundreds to
  thousands of images on the same page.
- Documentation improvements.
- Optimization will now be applied to some monochrome images that have a color
  profile defined instead of only black and white.
- ICC profiles are consulted when determining the simplified colorspace of an
  image.

## v9.6.1

- Documentation improvements - thanks to many users for their contributions!

  > - Fixed installation instructions for ArchLinux (@pigmonkey)
  > - Updated installation instructions for FreeBSD and other OSes (@knobix)
  > - Added instructions for using Docker Compose with watchdog (@ianalexander,
  >   @deisi)
  > - Other miscellany (@mb720, @toy, @caiofacchinato)
  > - Some scripts provided in the documentation have been migrated out so that
  >   they can be copied out as whole files, and to ensure syntax checking
  >   is maintained.

- Fixed an error that caused bash completions to fail on macOS. ({issue}`502,504`;
  @AlexanderWillner)

- Fixed a rare case where OCRmyPDF threw an exception while processing a PDF
  with the wrong object type in its `/Trailer /Info`. The error is now logged
  and incorrect object is ignored. ({issue}`497`)

- Removed potentially non-free file `enron1.pdf` and simplified the test that
  used it.

- Removed potentially non-free file `misc/media/logo.afdesign`.

## v9.6.0

- Fixed a regression with transferring metadata from the input PDF to the output
  PDF in certain situations.
- pdfminer.six is now supported up to version 2020-01-24.
- Messages explaining page rotation decisions are now shown at the standard
  verbosity level again when `--rotate-pages` is used. In some previous version they
  were set to debug level messages that only appeared with the parameter `-v1`.
- Improvements to `misc/watcher.py`. Thanks to @ianalexander and @svenihoney.
- Documentation improvements.

## v9.5.0

- Added API functions to measure OCR quality.
- Modest improvements to handling PDFs with difficult/non compliant metadata.

## v9.4.0

- Updated recommended dependency versions.
- Improvements to test coverage and changes to facilitate better measurement of
  test coverage, such as when tests run in subprocesses.
- Improvements to error messages when Leptonica is not installed correctly.
- Fixed use of pytest "session scope" that may have caused some intermittent
  CI failures.
- When the argument `--keep-temporary-files` or verbosity is set to `-v1`,
  a debug log file is generated in the working temporary folder.

## v9.3.0

- Improved native Windows support: we now check in the obvious places in
  the "Program Files" folders for installations of Tesseract and Ghostscript,
  rather than relying on the user to edit `PATH` to specify their location.
  The `PATH` environment variable can still be used to differentiate when
  multiple installations are present or the programs are installed to non-
  standard locations.
- Fixed an exception on parsing Ghostscript error messages.
- Added an improved example demonstrating how to set up a watched folder
  for automated OCR processing (thanks to @ianalexander for the contribution).

## v9.2.0

- Native Windows is now supported.
- Continuous integration moved to Azure Pipelines.
- Improved test coverage and speed of tests.
- Fixed an issue where a page that was originally a JPEG would be saved as a
  PNG, increasing file size. This occurred only when a preprocessing option
  was selected along with `--output-type=pdf` and all images on the original
  page were JPEGs. Regression since v7.0.0.
- OCRmyPDF no longer depends on the QPDF executable `qpdf` or `libqpdf`.
  It uses pikepdf (which in turn depends on `libqpdf`). Package maintainers
  should adjust dependencies so that OCRmyPDF no longer calls for libqpdf on
  its own. For users of Python binary wheels, this change means a separate
  installation of QPDF is no longer necessary. This change is mainly to
  simplify installation on Windows.
- Fixed a rare case where log messages from Tesseract would be discarded.
- Fixed incorrect function signature for pixFindPageForeground, causing
  exceptions on certain platforms/Leptonica versions.

## v9.1.1

- Expand the range of pdfminer.six versions that are supported.
- Fixed Docker build when using pikepdf 1.7.0.
- Fixed documentation to recommend using pip from get-pip.py.

## v9.1.0

- Improved diagnostics when file size increases at output. Now warns if JBIG2
  or pngquant were not available.
- pikepdf 1.7.0 is now required, to pick up changes that remove the need for
  a source install on Linux systems running Python 3.8.

## v9.0.5

- The Alpine Docker image (jbarlow83/ocrmypdf-alpine) has been dropped due to
  the difficulties of supporting Alpine Linux.
- The primary Docker image (jbarlow83/ocrmypdf) has been improved to take on
  the extra features that used to be exclusive to the Alpine image.
- No changes to application code.
- pdfminer.six version 20191020 is now supported.

## v9.0.4

- Fixed compatibility with Python 3.8 (but requires source install for the moment).
- Fixed Tesseract settings for `--user-words` and `--user-patterns`.
- Changed to pikepdf 1.6.5 (for Python 3.8).
- Changed to Pillow 6.2.0 (to mitigate a security vulnerability in earlier Pillow).
- A debug message now mentions when English is automatically selected if the locale
  is not English.

## v9.0.3

- Embed an encoded version of the sRGB ICC profile in the intermediate
  Postscript file (used for PDF/A conversion). Previously we included the
  filename, which required Postscript to run with file access enabled. For
  security, Ghostscript 9.28 enables `-dSAFER` and as such, no longer
  permits access to any file by default. This fix is necessary for
  compatibility with Ghostscript 9.28.
- Exclude a test that sometimes times out and fails in continuous integration
  from the standard test suite.

## v9.0.2

- The image optimizer now skips optimizing flate (PNG) encoded images in some
  situations where the optimization effort was likely wasted.
- The image optimizer now ignores images that specify arbitrary decode arrays,
  since these are rare.
- Fixed an issue that caused inversion of black and white in monochrome images.
  We are not certain but the problem seems to be linked to Leptonica 1.76.0 and
  older.
- Fixed some cases where the test suite failed if
  English or German Tesseract language packs were not installed.
- Fixed a runtime error if the Tesseract English language is not installed.
- Improved explicit closing of Pillow images after use.
- Actually fixed the Alpine Docker image build.
- Changed to pikepdf 1.6.3.

## v9.0.1

- Fixed test suite failing when either of optional dependencies unpaper and
  pngquant were missing.
- Attempted fix of Alpine Docker image build.
- Documented that FreeBSD ports are now available.
- Changed to pikepdf 1.6.1.

## v9.0.0

**Breaking changes**

- The `--mask-barcodes` experimental feature has been dropped due to poor
  reliability and occasional crashes, both due to the underlying library that
  implements this feature (Leptonica).
- The `-v` (verbosity level) parameter now accepts only `0`, `1`, and
  `2`.
- Dropped support for Tesseract 4.00.00-alpha releases. Tesseract 4.0 beta and
  later remain supported.
- Dropped the `ocrmypdf-polyglot` and `ocrmypdf-webservice` images.

**New features**

- Added a high level API for applications that want to integrate OCRmyPDF.
  Special thanks to Martin Wind (@mawi1988) who made significant contributions
  to this effort.
- Added progress bars for long-running steps. ■■■■■■■□□
- We now create linearized ("fast web view") PDFs by default. The new parameter
  `--fast-web-view` provides control over when this feature is applied.
- Added a new `--pages` feature to limit OCR to only a specific page range.
  The list may contain commas or single pages, such as `1, 3, 5-11`.
- When the number of pages is small compared to the number of allowed jobs, we
  run Tesseract in multithreaded (OpenMP) mode when available. This should
  improve performance on files with low page counts.
- Removed dependency on `ruffus`, and with that, the non-reentrancy
  restrictions that previously made an API impossible.
- Output and logging messages overhauled so that ocrmypdf may be integrated
  into applications that use the logging module.
- pikepdf 1.6.0 is required.
- Added a logo. 😊

**Bug fixes**

- Pages with vector artwork are treated as full color. Previously, vectors
  were ignored when considering the colorspace needed to cover a page, which
  could cause loss of color under certain settings.
- Test suite now spawns processes less frequently, allowing more accurate
  measurement of code coverage.
- Improved test coverage.
- Fixed a rare division by zero (if optimization produced an invalid file).
- Updated Docker images to use newer versions.
- Fixed images encoded as JBIG2 with a colorspace other than `/DeviceGray`
  were not interpreted correctly.
- Fixed an OCR text-image registration (i.e. alignment) problem when the page
  MediaBox had a nonzero corner.


================================================
FILE: docs/releasenotes/version10.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v10

## v10.3.3

- Fixed a "KeyError: 'dpi'" error message when using `--threshold` on an image.
  ({issue}`607`)

## v10.3.2

- Fixed a case where we reported "no reason" for a file size increase, when we
  could determine the reason.
- Enabled support for pdfminer.six 20200726.

## v10.3.1

- Fixed a number of test suite failures with pdfminer.six older than version 20200402.
- Enabled support for pdfminer.six 20200720.

## v10.3.0

- Fixed an issue where we would consider images that were already JBIG2-encoded
  for optimization, potentially producing a less optimized image than the original.
  We do not believe this issue would ever cause an image to lose fidelity.
- Where available, pikepdf memory mapping is now used. This improves performance.
- When Leptonica 1.79+ is installed, use its new error handling API to avoid
  a "messy" redirection of stderr which was necessary to capture its error
  messages.
- For older versions of Leptonica, added a new thread level lock. This fixes a
  possible race condition in handling error conditions in Leptonica (although
  there is no evidence it ever caused issues in practice).
- Documentation improvements and more type hinting.

## v10.2.1

- Disabled calculation of text box order with pdfminer. We never needed this result
  and it is expensive to calculate on files with complex pre-existing text.
- Fixed plugin manager to accept `Path(plugin)` as a path to a plugin.
- Fixed some typing errors.
- Documentation improvements.

## v10.2.0

- Update Docker image to use Ubuntu 20.04.
- Fixed issue PDF/A acquires title "Untitled" after conversion. ({issue}`582`)
- Fixed a problem where, when using `--pdf-renderer hocr`, some text would
  be missing from the output when using a more recent version of Tesseract.
  Tesseract began adding more detailed markup about the semantics of text
  that our HOCR transform did not recognize, so it ignored them. This option is
  not the default. If necessary, `--redo-ocr` can also be used to redo OCR and fix such issues.
- Fixed an error in Python 3.9 beta, due to removal of deprecated
  `Element.getchildren()`. ({issue}`584`)
- Implemented support using the API with `BytesIO` and other file stream objects.
  ({issue}`545`)

## v10.1.1

- Fixed `OMP_THREAD_LIMIT` set to invalid value error messages on some input
  files. (The error was harmless, apart from less than optimal performance in
  some cases.)

## v10.1.0

- Previously, `--clean-final` would cause an unpaper-cleaned page image to
  be produced twice, which was necessary in some cases but not in general. We
  now take this optimization opportunity and reuse the image if possible.
- We now provide PNG files as input to unpaper, since it accepts them, instead
  of generating PPM files which can be very large. This can improve performance
  and temporary disk usage.
- Documentation updated for plugins.

## v10.0.1

- Fixed regression when `-l lang1+lang2` is used from command line.

## v10.0.0

**Breaking changes**

- Support for pdfminer.six version 20181108 has been dropped, along with a
  monkeypatch that made this version work.
- Output messages are now displayed in color (when supported by the terminal)
  and prefixes describing the severity of the message are removed. As such
  programs that parse OCRmyPDF's log message will need to be revised. (Please
  consider using OCRmyPDF as a library instead.)
- The minimum version for certain dependencies has increased.
- Many API changes; see developer changes.
- The Python libraries pluggy and coloredlogs are now required.

**New features and improvements**

- PDF page scanning is now parallelized across CPUs, speeding up this phase
  dramatically for files with high page counts.
- PDF page scanning is optimized, addressing some performance regressions.
- PDF page scanning is no longer run on pages that are not selected when the
  `--pages` argument is used.
- PDF page scanning is now independent of Ghostscript, ending our past reliance
  on this occasionally unstable feature in Ghostscript.
- A plugin architecture has been added, currently allowing one to more easily
  use a different OCR engine or PDF renderer from Tesseract and Ghostscript,
  respectively. A plugin can also override some decisions, such as changing
  the OCR settings after initial scanning.
- Colored log messages.

**Developer changes**

- The test spoofing mechanism, used to test correct handling of failures in
  Tesseract and Ghostscript, has been removed in favor of using plugins for
  testing. The spoofing mechanism was fairly complex and required many special
  hacks for Windows.
- Code describing the resolution in DPI of images was refactored into a
  `ocrmypdf.helpers.Resolution` class.
- The module `ocrmypdf._exec` is now private to OCRmyPDF.
- The `ocrmypdf.hocrtransform` module has been updated to follow PEP8 naming
  conventions.
- Ghostscript is no longer used for finding the location of text in PDFs, and
  APIs related to this feature have been removed.
- Lots of internal reorganization to support plugins.


================================================
FILE: docs/releasenotes/version11.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v11

## v11.7.3

- Exclude CCITT Group 3 images from being optimized. Some libraries
  OCRmyPDF uses do not seem to handle this obscure compression format properly.
  You may get errors or possible corrupted output images without this fix.

## v11.7.2

- Updated pinned versions in main.txt, primarily to upgrade Pillow to 8.1.2, due
  to recently disclosed security vulnerabilities in that software.
- The `--sidecar` parameter now causes an exception if set to the same file as
  the input or output PDF.

## v11.7.1

- Some exceptions while attempting image optimization were only logged at the debug
  level, causing them to be suppressed. These errors are now logged appropriately.
- Improved the error message related to `--unpaper-args`.
- Updated documentation to mention the new conda distribution.

## v11.7.0

- We now support using `--sidecar` in conjunction with `--pages`; these arguments
  used to be mutually exclusive. ({issue}`735`)
- Fixed a possible issue with PDF/A-1b generation. Acrobat complained that our PDFs use
  object streams. More robust PDF/A validators like veraPDF don't consider this a
  problem, but we'll honor Acrobat's objection from here on. This may increase file
  size of PDF/A-1b files. PDF/A-2b files will not be affected.

## v11.6.2

- Fixed a regression where the wrong page orientation would be produced when using
  arguments such as `--deskew --rotate-pages` ({issue}`730`).

## v11.6.1

- Fixed an issue with attempting to optimize unusually narrow-width images by excluding
  these images from optimization ({issue}`732`).
- Remove an obsolete compatibility shim for a version of pikepdf that is no longer
  supported.

## v11.6.0

- OCRmyPDF will now automatically register plugins from the same virtual environment
  with an appropriate setuptools entrypoint.
- Refactor the plugin manager to remove unnecessary complications and make plugin
  registration more automatic.
- `PageContext` and `PdfContext` are now formally part of the API, as they
  should have been, since they were part of `ocrmypdf.pluginspec`.

## v11.5.0

- Fixed an issue where the output page size might differ by a fractional amount
  due to rounding, when `--force-ocr` was used and the page contained objects
  with multiple resolutions.
- When determining the resolution at which to rasterize a page, we now consider
  printed text on the page as requiring a higher resolution. This fixes issues
  with certain pages being rendered with unacceptably low resolution text, but
  may increase output file sizes in some workflows where low resolution text
  is acceptable.
- Added a workaround to fix an exception that occurs when trying to
  `import ocrmypdf.leptonica` on Apple ARM silicon (or potentially, other
  platforms that do not permit write+executable memory).

## v11.4.5

- Fixed an issue where files may not be closed when the API is used.
- Improved `setup.cfg` with better settings for test coverage.

## v11.4.4

- Fixed `AttributeError: 'NoneType' object has no attribute 'userunit'` ({issue}`700`),
  related to OCRmyPDF not properly forwarding an error message from pdfminer.six.
- Adjusted typing of some arguments.
- `ocrmypdf.ocr` now takes a `threading.Lock` for reasons outlined in the
  documentation.

## v11.4.3

- Removed a redundant debug message.
- Test suite now asserts that most patched functions are called when they should be.
- Test suite now skips a test that fails on two particular versions of pikepdf.

## v11.4.2

- Fixed support for Cygwin, hopefully.
- watcher.py: Fixed an issue with the OCR_LOGLEVEL not being interpreted.

## v11.4.1

- Fixed an issue where invalid page ranges passed using the `pages` argument,
  such as "1-0" would cause unhandled exceptions.
- Accepted a user contribution to the Synology demo script in misc/synology.py.
- Clarified documentation about change of temporary file location `ocrmypdf.io`.
- Fixed Python wheel tag which was incorrectly set to py35 even though we long
  since dropped support for Python 3.5.

## v11.4.0

- When looking for Tesseract and Ghostscript, we now check the Windows Registry to
  see if their installers registered the location of their executables. This should
  help Windows users who have installed these programs to non-standard
  locations.
- We now report on the progress of PDF/A conversion, since this operation is
  sometimes slow.
- Improved command line completions.
- The prefix of the temporary folder OCRmyPDF creates has been changed from
  `com.github.ocrmypdf` to `ocrmypdf.io`. Scripts that chose to depend on this
  prefix may need to be adjusted. (This has always been an implementation detail so is
  not considered part of the semantic versioning "contract".)
- Fixed {issue}`692`, where a particular file with malformed fonts would flood an
  internal message queue by generating so many debug messages.
- Fixed an exception on processing hOCR files with no page record. Tesseract
  is not known to generate such files.

## v11.3.4

- Fixed an error message 'called readLinearizationData for file that is not
  linearized' that may occur when pikepdf 2.1.0 is used. (Upgrading to pikepdf
  2.1.1 also fixes the issue.)
- File watcher now automatically includes `.PDF` in addition to `.pdf` to
  better support case sensitive file systems.
- Some documentation and comment improvements.

## v11.3.3

- If unpaper outputs non-UTF-8 data, quietly fix this rather than choke on the
  conversion. (Possibly addresses {issue}`671`.)

## v11.3.2

- Explicitly require pikepdf 2.0.0 or newer when running on Python 3.9. (There are
  concerns about the stability of pybind11 2.5.x with Python 3.9, which is used in
  pikepdf 1.x.)
- Fixed another issue related to page rotation.
- Fixed an issue where images marked as image masks were not properly considered
  as optimization candidates.
- On some systems, unpaper seems to be unable to process the PNGs we offer it
  as input. We now convert the input to PNM format, which unpaper always accepts.
  Fixes {issue}`665` and {issue}`667`.
- DPI sent to unpaper is now rounded to a more reasonable number of decimal digits.
- Debug and error messages from unpaper were being suppressed.
- Some documentation tweaks.

## v11.3.1

- Declare support for new versions: pdfminer.six 20201018 and pikepdf 2.x
- Fixed warning related to `--pdfa-image-compression` that appears at the wrong
  time.

## v11.3.0

- The "OCR" step is now described as "Image processing" in the output messages when
  OCR is disabled, to better explain the application's behavior.
- Debug logs are now only created when run as a command line, and not when OCR
  is performed for an API call. It is the calling application's responsibility
  to set up logging.
- For PDFs with a low number of pages, we gathered information about the input PDF
  in a thread rather than process (when there are more pages). When run as a
  thread, we did not close the file handle to the working PDF, leaking one file
  handle per call of `ocrmypdf.ocr`.
- Fixed an issue where debug messages sent by child worker processes did not match
  the log settings of parent process, causing messages to be dropped. This affected
  macOS and Windows only where the parent process is not forked.
- Fixed the hookspec of rasterize_pdf_page to remove default parameters that
  were not handled in an expected way by pluggy.
- Fixed another issue with automatic page rotation ({issue}`658`) due to the issue above.

## v11.2.1

- Fixed an issue where optimization of a 1-bit image with a color palette or
  associated ICC that was optimized to JBIG2 could have its colors inverted.

## v11.2.0

- Fixed an issue with optimizing PNG-type images that had soft masks or image masks.
  This is a regression introduced in (or about) v11.1.0.
- Improved type checking of the `plugins` parameter for the `ocrmypdf.ocr`
  API call.

## v11.1.2

- Fixed hOCR renderer writing the text in roughly reverse order. This should not
  affect reasonably smart PDF readers that properly locate the position of all
  text, but may confuse those that rely on the order of objects in the content
  stream. ({issue}`642`)

## v11.1.1

- We now avoid using named temporary files when using pngquant allowing containerized
  pngquant installs to be used.
- Clarified an error message.
- Highest number of 1's in a release ever!

## v11.1.0

- Fixed page rotation issues: {issue}`634,589`.
- Fixed some cases where optimization created an invalid image such as a
  1-bit "RGB" image: {issue}`629,620`.
- Page numbers are now displayed in debug logs when pages are being grafted.
- ocrmypdf.optimize.rewrite_png and ocrmypdf.optimize.rewrite_png_as_g4 were
  marked deprecated. Strictly speaking these should have been internal APIs,
  but they were never hidden.
- As a precaution, pikepdf mmap-based file access has been disabled due to a
  rare race condition that causes a crash when certain objects are deallocated.
  The problem is likely in pikepdf's dependency pybind11.
- Extended the example plugin to demonstrate conversion to mono.

## v11.0.2

- Fixed {issue}`612`, TypeError exception. Fixed by eliminating unnecessary repair of
  input PDF metadata in memory.

## v11.0.1

- Blacklist pdfminer.six 20200720, which has a regression fixed in 20200726.
- Approve img2pdf 0.4 as it passes tests.
- Clarify that the GPL-3 portion of pdfa.py was removed with the changes in v11.0.0;
  the debian/copyright file did not properly annotate this change.

## v11.0.0

- Project license changed to Mozilla Public License 2.0. Some miscellaneous
  code is now under MIT license and non-code content/media remains under
  CC-BY-SA 4.0. License changed with approval of all people who were found
  to have contributed to GPLv3 licensed sections of the project. ({issue}`600`)
- Because the license changed, this is being treated as a major version number
  change; however, there are no known breaking changes in functional behavior
  or API compared to v10.x.


================================================
FILE: docs/releasenotes/version12.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v12

## v12.7.2

- Fixed "invalid version number" error for Tesseract packaging with nonstandard
  version "5.0.0-rc1.20211030".
- Fixed use of deprecated `importlib.resources.read_binary`.
- Replace some uses of string paths with `pathlib.Path`.
- Fixed a leaked file handle when using `--output-type none`.
- Removed shims to support versions of pikepdf that are no longer supported.

## v12.7.1

- Declare support for pdfminer.six v20211012.

## v12.7.0

- Fixed test suite failure when using pikepdf 3.2.0 that was compiled with pybind11
  2.8.0. {issue}`843`
- Improve advice to user about using `--max-image-mpixels` if OCR fails for this
  reason.
- Minor documentation fixes. (Thanks to @mara004.)
- Don't require importlib-metadata and importlib-resources backports on versions of
  Python where the standard library implementation is sufficient.
  (Thanks to Marco Genasci.)

## v12.6.0

- Implemented `--output-type=none` to skip producing PDFs for applications that
  only want sidecar files ({issue}`787`).
- Fixed ambiguities in descriptions of behavior of `--jbig2-lossy`.
- Various improvements to documentation.

## v12.5.0

- Fixed build failure for the combination of PyPy 3.6 and pikepdf 3.0. This
  combination can work in a source build but does not work with wheels.
- Accepted bot that wanted to upgrade our deprecated requirements.txt.
- Documentation updates.
- Replace pkg_resources and install dependency on setuptools with
  importlib-metadata and importlib-resources.
- Fixed regression in hocrtransform causing text to be omitted when this
  renderer was used.
- Fixed some typing errors.

## v12.4.0

- When grafting text layers, use pikepdf's `unparse_content_stream` if available.
- Confirmed support for pluggy 1.0. (Thanks @QuLogic.)
- Fixed some typing issues, improved pre-commit settings, and fixed issues
  flagged by linters.
- PyPy 7.3.3 (=Python 3.6) is now supported. Note that PyPy does not necessarily
  run faster, because the vast majority of OCRmyPDF's execution time is spent
  running OCR or generally executing native code. However, PyPy may bring speed
  improvements in some areas.

## v12.3.3

- watcher.py: fixed interpretation of boolean env vars ({issue}`821`).
- Adjust CI scripts to test Tesseract 5 betas.
- Document our support for the Tesseract 5 betas.

## v12.3.2

- Indicate support for flask 2.x, watcher 2.x ({issue}`815, 816`).

## v12.3.1

- Fixed issue with selection of text when using the hOCR renderer ({issue}`813`).
- Fixed build errors with the Docker image by upgrading to a newer Ubuntu.
  Also set the timezone of this image to UTC.

## v12.3.0

- Fixed a regression introduced in Pillow 8.3.0. Pillow no longer rounds DPI
  for image resolutions. We now account for this ({issue}`802`).
- We no longer use some API calls that are deprecated in the latest versions of
  pikepdf.
- Improved error message when a language is requested that doesn't look like a
  typical ISO 639-2 code.
- Fixed some tests that attempted to symlink on Windows, breaking tests on a
  Windows desktop but not usually on CI.
- Documentation fixes (thanks to @mara004)

## v12.2.0

- Fixed invalid Tesseract version number on Windows ({issue}`795`).
- Documentation tweaks. Documentation build now depends on sphinx-issues package.

## v12.1.0

- For security reasons we now require Pillow >= 8.2.x. (Older versions will continue
  to work if upgrading is not an option.)
- The build system was reorganized to rely on `setup.cfg` instead of `setup.py`.
  All changes should work with previously supported versions of setuptools.
- The files in `requirements/*` are now considered deprecated but will be retained for v12.
  Use `pip install ocrmypdf[test]` instead of `requirements/test.txt`, etc.
  These files will be removed in v13.

## v12.0.3

- Expand the list of languages supported by the hocr PDF renderer.
  Several languages were previously considered not supported, particularly those
  non-European languages that use the Latin alphabet.
- Fixed a case where the exception stack trace was suppressed in verbose mode.
- Improved documentation around commercial OCR.

## v12.0.2

- Fixed exception thrown when using `--remove-background` on files containing small
  images ({issue}`769`).
- Improve documentation for description of adding language packs to the Docker image
  and corrected name of French language pack.

## v12.0.1

- Fixed "invalid version number" for untagged tesseract versions ({issue}`770`).

## v12.0.0

**Breaking changes**

- Due to recent security issues in pikepdf, Pillow and reportlab, we now require
  newer versions of these libraries and some of their dependencies. (If necessary,
  package maintainers may override these versions at their discretion; lower
  versions will often work.)
- We now use the "LeaveColorUnchanged" color conversion strategy when directing
  Ghostscript to create a PDF/A. Generally this is faster than performing a
  color conversion, which is not always necessary.
- OCR text is now packaged in a Form XObject. This makes it easier to isolate
  OCR from other document content. However, some poorly implemented PDF text
  extraction algorithms may fail to detect the text.
- Many API functions have stricter parameter checking or expect keyword arguments
  where they previously did not.
- Some deprecated functions in `ocrmypdf.optimize` were removed.
- The `ocrmypdf.leptonica` module is now deprecated, due to difficulties with
  the current strategy of ABI binding on newer platforms like Apple Silicon.
  It will be removed and replaced, either by repackaging Leptonica as an
  independent library or by using a different image processing library.
- Continuous integration moved to GitHub Actions.
- We no longer depend on `pytest_helpers_namespace` for testing.

**New features**

- New plugin hook: `get_progressbar_class`, for progress reporting,
  allowing developers to replace the standard console progress bar with some
  other mechanism, such as updating a GUI progress bar.
- New plugin hook: `get_executor`, for replacing the concurrency model.
  This is primarily to support execution on AWS Lambda, which does not support
  standard Python `multiprocessing` due to its lack of shared memory.
- New plugin hook: `get_logging_console`, for replacing the standard
  way OCRmyPDF outputs its messages.
- New plugin hook: `filter_pdf_page`, for modifying individual PDF
  pages produced by OCRmyPDF.
- OCRmyPDF now runs on nonstandard execution environments that do not have
  interprocess semaphores, such as AWS Lambda and Android Termux. If the environment
  does not have semaphores, OCRmyPDF will automatically select an alternate
  process executor that does not use semaphores.
- Continuous integration moved to GitHub Actions.
- We now generate an ARM64-compatible Docker image alongside the x64 image.
  Thanks to @andkrause for doing most of the work in a pull request several months
  ago, which we were finally able to integrate now. Also thanks to @0x326 for
  review comments.

**Fixes**

- Fixed a possible deadlock on attempting to flush `sys.stderr` when older
  versions of Leptonica are in use.
- Some worker processes inherited resources from their parents such as log
  handlers that may have also led to deadlocks. These resources are now released.
- Improvements to test coverage.
- Removed vestiges of support for Tesseract versions older than 4.0.0-beta1
  (which ships with Ubuntu 18.04).
- OCRmyPDF can now parse all of Tesseract version numbers, since several
  schemes have been in use.
- Fixed an issue with parsing PDFs that contain images drawn at a scale of 0. ({issue}`761`)
- Removed a frequently repeated message about disabling mmap.


================================================
FILE: docs/releasenotes/version13.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v13

## v13.7.0

- Fixed an exception raised when attempting to run while Tesseract is not installed.
- Changed to SPDX license tracking and information files.

## v13.6.2

- Added a shim to prevent an "error during error handling" for Python 3.7 and 3.8.
- Modernized some type annotations.
- Improved annotations on our \_windows module to help IDEs and mypy figure out what
  we're doing.

## v13.6.1

- Require setuptools-scm 7.0.5 to avoid possible issues with source distributions in
  earlier versions of setuptools-scm.
- Suppress a spurious warning, improve tests, improve typing and other miscellany.

## v13.6.0

- Added a new `initialize` plugin hook, making it possible to suppress built-in
  plugins more easily, among other possibilities.
- Fixed an issue where unpaper would exit with a "wrong stream" error, probably
  related to images with an odd integer width. {issue}`887, 665`

## v13.5.0

- Added a new `optimize_pdf` plugin hook, making it possible to create plugins that
  replace or enhance OCRmyPDF's PDF optimizer.
- Removed all max version restrictions. Our new policy is to blacklist known-bad releases
  and only block known-bad versions of dependencies.
- The naming schema for the object that holds all OCR text that OCRmyPDF inserts has
  changed. This has always been an implementation detail (and remains so), but possibly,
  someone was relying on it and would appreciate the heads-up.
- Cleanup.

## v13.4.7

- Fixed PermissionError when cleaning up temporary files in rare cases. {issue}`974`
- Fixed PermissionError when calling `os.nice` on platforms that lack it. {issue}`973`
- Suppressed some warnings from libxmp during tests.

## v13.4.6

- Convert error on corrupt ICC profiles into a warning. Thanks to @oscherler.

## v13.4.5

- Remove upper bound on pdfminer.six version.
- Documentation.

## v13.4.4

- Updated pdfminer.six version.
- Docker image changed to Ubuntu 22.04 now that it is released and provides the
  dependencies we need. This seems more consistent than our recent change to
  Debian.

## v13.4.3

- Fix error on pytest.skip() with older versions of pytest.
- Documentation updates.

## v13.4.2

- Worked around a
  [major regression in Ghostscript 9.56.0](https://bugs.ghostscript.com/show_bug.cgi?id=705187)
  where **all OCR text is stripped out of the PDF**. It simply removes all text,
  even generated by software other than OCRmyPDF. Fortunately, we can ask
  Ghostscript 9.56.0 to use its old behavior that worked correctly for our purposes.
  Users must avoid the combination (Ghostscript 9.56.0, ocrmypdf \<13.4.2) since
  older versions of OCRmyPDF have no way of detecting that this particular
  version of Ghostscript removes all OCR text.
- Marked pdfminer 20220319 as supported.
- Fixed some deprecation warnings from recent versions of Pillow and pytest.
- Test suite now covers Python 3.10 (Python 3.10 worked fine before, but was not
  being tested).
- Docker image now uses debian:bookworm-slim as the base image to fix the Docker
  image build.

## v13.4.1

- Temporarily make threads rather than processes the default executor worker, due
  to a persistent deadlock issue when processes are used. Add a new command line
  argument `--no-use-threads` to disable this.

## v13.4.0

- Fixed test failures when using pikepdf 5.0.0.
- Various improvements to the optimizer. In particular, we now recognize PDF images
  that are encoded with both deflate (PNG) and DCT (JPEG), and also produce PDF
  with images compressed with deflate and DCT, since this often yields file size
  improvements compared to plain DCT.

## v13.3.0

- Made a harmless but "scary" exception after failing to optimize an image less scary.
- Added a warning if a page image is too large for unpaper to clean. The image is
  passed through without cleaning. This is due to a hard-coded limitation in a
  C library used by unpaper so it cannot be rectified easily.
- We now use better default settings when calling img2pdf.
- We no longer try to optimize images that we failed to save in certain situations.
- We now account for some differences in text output from Tesseract 5 compared to
  Tesseract 4.
- Better handling of Ghostscript producing empty images when attempting to rasterize
  page images.

## v13.2.0

- Removed all runtime uses of distutils since it is deprecated in the standard library. We
  previously used `distutils.version` to examine version numbers of dependencies
  at run time, and now use `packaging.version` for this. This is a new
  dependency.
- Fixed an error message advising the user that Ghostscript was not installed being
  suppressed when this condition actually happens.
- Fixed an issue with incorrect page number and totals being displayed in the progress
  bar. This was purely a display/presentation issue. {issue}`876`.

## v13.1.1

- Fixed issue with attempting to deskew a blank page on Tesseract 5. {issue}`868`.

## v13.1.0

- Changed to using Python concurrent.futures-based parallel execution instead of
  pools, since futures have now exceeded pools in features.
- If a child worker is terminated (perhaps by the operating system or the user
  killing it in a task manager), the parallel task will fail with an error message.
  Previously, the main ocrmypdf process would "hang" indefinitely, waiting for the
  child to report.
- Added new argument `--tesseract-thresholding` to provide control over Tesseract 5's
  threshold parameter.
- Documentation updates and changes. Better documentation for `--output-type none`,
  added a few releases ago. Removed some obsolete documentation.
- Improved bash completions - thanks to @FPille.

## v13.0.0

**Breaking changes**

- The deprecated module `ocrmypdf.leptonica` has been removed.
- We no longer depend on Leptonica (`liblept`) or CFFI (`libffi`,
  `python3-cffi`). (Note that Tesseract still requires Leptonica; OCRmyPDF no longer
  directly uses this library.)
- The argument `--remove-background` is temporarily disabled while we search for an
  alternative to the Leptonica implementation of this feature.
- The `--threshold` argument has been removed, since this also depended on Leptonica.
  Tesseract 5.x has implemented improvements to thresholding, so this feature will be
  redundant anyway.
- `--deskew` was previously calculated by a Leptonica algorithm. We now use a feature
  of Tesseract to find the appropriate angle to deskew a page. The deskew angle
  according to Tesseract may differ from Leptonica's algorithm. At least in theory,
  Tesseract's deskew angle is informed by a more complex analysis than Leptonica,
  so this should improve results in general. We also use Pillow to perform the
  deskewing, which may affect the appearance of the image compared to Leptonica.
- Support for Python 3.6 was dropped, since this release is approaching end of life.
- We now require pikepdf 4.0 or newer. This, in turn, means that OCRmyPDF requires
  a system compatible with the manylinux2014 specification. This change was "forced"
  by Pillow not releasing manylinux2010 wheels anymore.
- We no longer provide requirements.txt-style files. Use `pip install ocrmypdf[...]`
  instead.
- Bumped required versions of several libraries.

**Fixes**

- Fixed an issue where OCRmyPDF failed to find Ghostscript on Windows even when
  installed, and would exit with an error.
- By removing Leptonica, we fixed all issues related to Leptonica on Apple
  Silicon or Leptonica failing to import on Windows.


================================================
FILE: docs/releasenotes/version14.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v14

## v14.4.0

- Digitally signed PDFs are now detected. If the PDF is signed, OCRmyPDF will
  refuse to modify it. Previously, only encrypted PDFs were detected, not
  those that were signed but not encrypted. {issue}`1040`
- In addition, `--invalidate-digital-signatures` can be used to override the
  above behavior and modify the PDF anyway. {issue}`1040`
- tqdm progress bars replaced with "rich" progress bars. The rich library is
  a new dependency. Certain APIs that used tqdm are now deprecated and will
  be removed in the next major release.
- Improved integration with GitHub Releases. Thanks to @stumpylog.

## v14.3.0

- Renamed master branch to main.
- Improve PDF rasterization accuracy by using the `-dPDFSTOPONERROR` option
  to Ghostscript. Use `--continue-on-soft-render-error` if you want to render
  the PDF anyway. The plugin specification was adjusted to support this feature;
  plugin authors may want to adapt PDF rasterizing and rendering
  plugins. {issue}`1083`
- The calculated deskew angle is now recorded in the logged output. {issue}`1101`
- Metadata can now be unset by setting a metadata type such as `--title` to an
  empty string. {issue}`1117,1059`
- Fixed random order of languages due to use of a set. This may have caused output
  to vary when multiple languages were set for OCR. {issue}`1113`
- Clarified the optimization ratio reported in the log output.
- Documentation improvements.

## v14.2.1

- Fixed {issue}`977`, where images inside Form XObjects were always excluded
  from image optimization.

## v14.2.0

- Added `--tesseract-downsample-above` to downsample larger images even when
  they do not exceed Tesseract's internal limits. This can be used to speed
  up OCR, possibly sacrificing accuracy.
- Fixed resampling AttributeError on older Pillow. {issue}`1096`
- Removed an error about using Ghostscript on PDFs that have the /UserUnit
  feature in use. Previously, Ghostscript would fail to process these PDFs,
  but in all supported versions it is now supported, so the error is no longer
  needed.
- Improved documentation around installing other language packs for Tesseract.

## v14.1.0

- Added `--tesseract-non-ocr-timeout`. This allows using Tesseract's deskew
  and other non-OCR features while disabling OCR using `--tesseract-timeout 0`.
- Added `--tesseract-downsample-large-images`. This downsamples large images
  that exceed the maximum image size Tesseract can handle. Large images may still
  take a long time to process, but this allows them to be processed if that
  is desired.
- Fixed {issue}`1082`, an issue with snap package building.
- Change linter to ruff, fix lint errors, update documentation.

## v14.0.4

- Fixed {issue}`1066, 1075`, an exception when processing certain malformed PDFs.

## v14.0.3

- Fixed {issue}`1068`, avoid deleting /dev/null when running as root.
- Other documentation fixes.

## v14.0.2

- Fixed {issue}`1052`, an exception on attempting to process certain nonconforming PDFs.
- Explicitly documented that Windows 32-bit is no longer supported.
- Fixed source installation instructions.
- Other documentation fixes.

## v14.0.1

- Fixed some version checks done with smart version comparison.
- Added missing jbig2dec to Docker image.

## v14.0.0

- Dropped support for Python 3.7.
- Dropped support for, generally speaking, all dependencies older than what Ubuntu 20.04
  provides.
- Ghostscript 9.50 or newer is now required. Shims to support old versions were
  removed.
- Tesseract 4.1.1 or newer is now required. Shims to support old versions were
  removed.
- Docker image now uses Tesseract 5.
- Dropped setup.cfg configuration for pyproject.toml.
- Removed deprecated exception PdfMergeFailedError.
- A few more public domain test files were removed or replaced. We are aiming for
  100% compliance with SPDX and generally towards simplifying copyright.


================================================
FILE: docs/releasenotes/version15.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v15

## v15.4.4

- Fixed documentation for installing Ghostscript on Windows. {issue}`1198`
- Added warning message about security issue in older versions of Ghostscript.

## v15.4.3

- Fixed deprecation warning in pikepdf older than 8.7.1; pikepdf >= 8.7.1 is
  now required.

## v15.4.2

- We now raise an exception on a certain class of PDFs that likely need an
  explicit color conversion strategy selected to display correctly
  for PDF/A conversion.
- Fixed an error that occurred while trying to write a log message after the
  debug log handler was removed.

## v15.4.1

- Fixed misc/watcher.py regressions: accept `--ocr-json-settings` as either
  filename or JSON string, as previously; and argument count mismatch.
  {issue}`1183,1185`
- We no longer attempt to set /ProcSet in the PDF output, since this is an
  obsolete PDF feature.
- Documentation improvements.

## v15.4.0

- Added new experimental APIs to support offline editing of the final text.
  Specifically, one can now generate hOCR files with OCRmyPDF, edit them with
  some other tool, and then finalize the PDF. They are experimental and
  subject to change, including details of how the working folder is used.
  There is no command line interface.
- Code reorganization: executors, progress bars, initialization and setup.
- Fixed test coverage in cases where the coverage tool did not properly trace
  into threads or subprocesses. This code was still being tested but appeared
  as not covered.
- In the test suite, reduced use of subprocesses and other techniques that
  interfere with coverage measurement.
- Improved error check for when we appear to be running inside a snap container
  and files are not available.
- Plugin specification now properly defines progress bars as a protocol rather
  than defining them as "tqdm-like".
- We now default to using "forkserver" process creation on POSIX platforms
  rather than fork, since this method is more robust and avoids some
  issues when threads are present.
- Fixed an instance where the user's request to `--no-use-threads` was ignored.
- If a PDF does not have language metadata on its top level object, we add
  the OCR language.
- Replace some cryptic test error messages with more helpful ones.
- Debug messages for how OCRmyPDF picks the colorspace for a page are now
  more descriptive.

## v15.3.1

- Fixed an issue with logging settings for misc/watcher.py introduced in the
  previous release. {issue}`1180`
- We now attempt to preserve the input's extended attributes when creating
  the output file.
- For some reason, the macOS build now needs OpenSSL explicitly installed.
- Updated documentation on Docker performance concerns.

## v15.3.0

- Update misc/watcher.py to improve command line interface using Typer, and
  support `.env` specification of environment variables. Improved error
  messages. Thanks to @mflagg2814 for the PR that prompted this improvement.
- Improved error message when a file cannot be read because we are running in
  a snap container.

## v15.2.0

- Added a Docker image based on Alpine Linux. This image is smaller than the
  Ubuntu-based image and may be useful in some situations. Currently hosted at
  jbarlow83/ocrmypdf-alpine. Currently not available in ARM flavor.
- The Ubuntu Docker is now aliased to jbarlow83/ocrmypdf-ubuntu.
- Updated Docker documentation.

## v15.1.0

- We now require Pillow 10.0.1, due to a serious security vulnerability in all earlier
  versions of that dependency. The vulnerability concerns WebP images and could
  be triggered in OCRmyPDF when creating a PDF from a malicious WebP image.
- Added some keyword arguments to `ocrmypdf.ocr` that were previously accepted
  but undocumented.
- Documentation updates and typing improvements.

## v15.0.2

- Added Python 3.12 to test matrix.
- Updated documentation for notes on Python 3.12, 32-bit support and some new
  features in v15.

## v15.0.1

- Wheels Python tag changed to py39.
- Marked as an expected failure a test that fails on recent Ghostscript versions.
- Clarified documentation and release notes around the extent of 32-bit support.
- Updated installation documentation to changes in v15.

## v15.0.0

- Dropped support for Python 3.8.
- Dropped support for some older dependencies, specifically `coloredlogs` and
  `tqdm` in favor of rich - see `pyproject.toml` for details.
  Generally speaking, Ubuntu 22.04 is our new baseline system.
- Tightened version requirements for some dependencies.
- Dropped support for 32-bit Linux wheels. We strongly recommend a 64-bit operating
  system, and 64-bit versions of Python, Tesseract and Ghostscript to use OCRmyPDF.
  Many of our dependencies are dropping 32-bit builds (e.g. Pillow), and we are
  following suit. (Maintainers may still build 32-bit versions from source.)
- Changed to trusted release for PyPI publishing.
- pikepdf memory mapping is enabled again for improved performance, now that an
  issue with this feature in pikepdf is fixed.
- `ocrmypdf.helpers.calculate_downsample` previously had two variants, one
  that took a `PIL.Image` and one that took a `tuple[int, int]`. The latter
  was removed.
- The snap version of ocrmypdf is now based on Ubuntu core22.
- We now account for situations where a small portion of an image on a page is drawn
  at high DPI (resolution). Previously, the entire page would be rasterized at the
  highest resolution of any feature, which caused performance problems. Now,
  the page is rasterized
  at a resolution based on the average DPI of the page, weighted by the area that
  each feature occupies. Typically, small areas of high resolution in PDFs are
  errors or quirks from the repeated use of assets and high resolution is not
  beneficial. {issue}`1010,1104,1004,1079,1010`
- Ghostscript color conversion strategy is now configurable using
  `--color-conversion-strategy`. {issue}`1143`
- JBIG2 threshold for optimization is now configurable using
  `--jbig2-threshold`. {issue}`1133`


================================================
FILE: docs/releasenotes/version16.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v16

## v16.13.0

- Added detection and repair for Ghostscript 10.6 JPEG corruption. When GS 10.6
  truncates JPEG data by 1-15 bytes, OCRmyPDF now restores the original image
  bytes from the input PDF. A warning is issued when GS 10.6+ is detected.
  {issue}`1603`
- We continue to force re-optimization of JPEGs, since this catches some issues with corruption for situations where Ghostscript modifies an image. It is likely there are still cases where we cannot mitigate all corruption issues. {issue}`1585`
- Fixed handling of PDF page boxes (ArtBox, BleedBox) which were not being
  processed correctly in some cases. {issue}`1181,1360`
- Documentation: clarified podman usage instructions.

## v16.12.0

- Disable Ghostscript's subset fonts feature, which was found to corrupt text in certain
  PDFs. Thanks @mnaegler for identifying this issue. {issue}`1592`
- Users of Ghostscript 10.6.0+ reported that Ghostscript seems to generate corrupted
  JPEGs. We force re-optimization of these JPEGs to mitigate the corruption until
  Ghostscript fixes the issue. {issue}`1585`
- OCRmyPDF now avoids applying flate compression to large JPEG images, unless maximum
  optimization is requested, since flate+DCT compression reduces performance in PDF
  viewers with large images.
- Updated Dockerfiles to use more recent base operating systems.
- Updated build and test matrix to include Python 3.14.
- Minor documentation improvements.
- pikepdf >= 10.0.0 is now required.

## v16.11.1

- Fixed issue with Tesseract changing an error message related to skew. {issue}`1576`
- Dropped macOS 13 from build-test matrix since it is no longer supported by Apple.

## v16.11.0

- Deprecated "semfree" plugin in favor of falling back to threads if the platform
  does not support semaphores. Fixes an issue with Python 3.14.
- Fixed references to PDF/A compliance levels to be consistent with ISO nomenclature.
  Thanks @5HT2. {issue}`1557`
- Fixed an issue around using plugin_manager as an argument. {issue}`1555`
- Added OpenBSD install steps to README. {issue}`1554`
- Removed PyPy from test matrix due to declining support in third party libraries.
- Documentation improvements.

## v16.10.4

- Corrected build errors in Python 3.13.3 and 3.13.4.

## v16.10.3 (not released)

- Blocked optimization of images with pre-blended soft masks. {issue}`1536`
- Fixed warning from hypothesis on running tests.
- Release incomplete due to new test failures in Python 3.13.3 and 3.13.4.

## v16.10.2

- Blacklist pikepdf 9.8.0 due to an incompatible change.

## v16.10.1

- No changes affecting OCRmyPDF functionality for command line end users.
- webservice: made page specification easier to find in UI.
- webservice: fix download button downloads wrong file.
- Converted project documentation from rST to Markdown.
- Added README translation to Simplified Chinese. Thanks @HuaPai.
- Modernized license specification in pyproject.toml.
- Modernized SPDX license to REUSE.toml.

## v16.10.0

- Added hocr textangle processing, improving handling of text at angles.
  Thanks @0dinD {issue}`1467`
- Docker documentation updates related to podman. Thanks @rugk. {issue}`1489,1488`
- Dropped webservice.py's fragile use of ttyd. Instead, messages from ocrmypdf are
  printed to the console.
- Fixed broken test test_hocrtransform_matches_sandwich, which had become
  an invalid test. Thanks @QuLogic for reporting.
- Improved install instructions for Windows. Thanks @alex.

## v16.9.0

- Added hocr caption processing. Thanks @0dinD {issue}`1466`
- ocrmypdf-alpine Docker image is now built with Alpine 3.21.
- Fixed error handling of PDFs that contain invalid images with both ImageMask
  and ColorSpace defined. {issue}`1453`
- Fixed test suite regression when only older Ghostscripts are installed.
- Improved documentation of \_progressbar.py. Thanks @QuentinFuxa. {issue}`1456`
- Disabling building of documentation as PDF on ReadTheDocs, as this caused
  complex build issues deemed not worth solving.

## v16.8.0

- Upgraded webservice.py demonstration using streamlit. It's now possible to
  exercise most of OCRmyPDF's functionality in a simple web UI.
- Added cache to Dockerfiles to improve build speed.
- Fixed numerous formatting errors in the documentation that prevented some
  parts of documentation from generating correctly.
- Improved OCR text rendering by suppressing negative-width spaces. Thanks
  @pajowu. {issue}`1446`
- Improved detecting of invisible text when using `--redo-ocr`. Thanks
  @pajowu. {issue}`1448`

## v16.7.0

- Fixed further issues with Docker build and updated some versions.
- Main Docker image returned to Ubuntu 24.04 since the fix in v16.6.2 resolved
  that concern.
- Code that previously sent Ghostscript output to stdout has been changed to
  output to temporary files, since Ghostscript was doing that anyway internally.
  This is a modest efficiency improvement.
- Fixed an issue with debug log output being parsed as rich markup. {issue}`1444`

## v16.6.2

- Remove invalid hyperlink annotations to satisfy Ghostscript 10.x during PDF/A
  conversion. {issue}`1425`

## v16.6.1

- Fixed some issues with Docker build, such as removing unnecessary content and using
  a stable Tesseract version.
- Reverted Docker image to Ubuntu 22.04 to access older/more stable Ghostscript
  for now.
- Clarified batch commands in documentation.
- Fixed an issue with JSON serialization and pickling of HOCRResult. {issue}`1427`

## v16.6.0

- Fixed an issue where damaged PDFs would fail with `--redo-ocr`. {issue}`1403`
- Fixed an error that prevented JBIG2 optimization on Windows if the image
  was optimized in an earlier step. {issue}`1396`
- Fixed an error detecting the version of unpaper 7.0.0. {issue}`1409`
- Fixed a performance regression when scanning pages. {issue}`1378`. Thanks @aliemjay.
- Fixed Alpine Docker image by enforcing Alpine 3.19. Alpine 3.20 includes a
  defective version of Tesseract OCR and so is not usable.
- Upgraded Ubuntu Docker image to use Ubuntu 24.04.
- Build and test scripts/actions switched to uv.
- When running in a container, we now remind the user that temporary folders
  are inside the container and may not be accessible.
- Fixed Linux test coverage matrix, which was missing some key versions.

## v16.5.0

- Fixed issue with interpreting PDFs that have images with array masks.
  {issue}`1377`
- Enabled testing on Python 3.13.
- Fixed a test that did not work correctly but still passed. {issue}`1382`
- Improved "PDF/A conversion failed" warning message to better describe implications.
- Updated documentation to better explain OCR_JSON_SETTINGS in batch processing.
- Build backend changed from setuptools to hatchling.

## v16.4.3

- Work around pdfminer.six issue where a token on the buffer boundary is incorrectly
  parsed as two tokens. {issue}`1361`
- New rules are applied to stencil masks and explicit masks when calculating the
  optimal page DPI for rendering. {issue}`1362`
- Fixed attempts to use an incompatible jbig2.EXE provided by TeX Live. {issue}`1363`

## v16.4.2

- Fixed order of filenames passed to Ghostscript for PDF/A generation. {issue}`1359`
- Suppressed missing jbig2dec warning message. {issue}`1358`
- Fixed calculation of image size when soft mask dimensions don't match image
  dimension. {issue}`1351`
- Several fixes to documentation. Thanks to users Iris and JoKalliauer
  who contributed these changes.
- Fixed error on processing PDFs that are missing certain image metadata. {issue}`1315`

## v16.4.1

- Fixed calculation of image printed area (used in finding weighted DPI for OCR).
  {issue}`1334`
- Fixed "NotImplementedError: not sure how to get colorspace" error
  messages in logs which simply records a failure to optimize images with
  print production colorspaces. {issue}`1315`

## v16.4.0

- Selecting the `osd` and `equ` pseudo-languages with `-l/--language` now
  exits with an error when using Tesseract OCR, because these are not
  regular Tesseract languages but implementation details.
  Using them can cause Tesseract to crash.
- The hOCR renderer is more tolerant of extra whitespace in input files.
- watcher.py now changes the output file extension to .pdf when the input is not
  .pdf.
- Improved handling of PDFs that contain circularly referenced Form XObjects.
  {issue}`1321`
- Fixed Alpine Docker image for ARM64, which was not building correctly.
- Docker images now use pikepdf 9.0.0.
- Prevent use of Tesseract OCR 5.4.0, a version with known regressions.
- Disabled progressbar for "Linearizing" when `--no-progress-bar` set.
- Fixed some tests that warn about missing JBIG2 decoding via pikepdf, by
  installing the necessary libraries during tests.

## v16.3.1

- Fixed a test suite failure with Ghostscript 10.03.0+. {issue}`1316`
- Fixed an issue with the presentation of the "OCR" progress bar. {issue}`1313`

## v16.3.0

- Fixed progress bar not displaying for Ghostscript PDF/A conversion. {issue}`1313`
- Added progress bar for linearization. {issue}`1313`
- If `--rotate-pages-threshold` issued without `--rotate-pages` we now exit with
  an error since the user likely intended to use `--rotate-pages`. {issue}`1309`
- If Tesseract hOCR gives an invalid line box, print an error message instead of
  exiting with an error. {issue}`1312`

## v16.2.0

- Fixed issue 'NoneType' object has no attribute 'get' when optimizing certain PDFs.
  {issue}`1293,1271`
- Switched formatting from black to ruff.
- Added support for sending sidecar output to io.BytesIO.
- Added support for converting HEIF/HEIC images (the native image of iPhones and
  some other devices) to PDFs, when the appropriate pi-heif library is installed.
  This library is marked as a dependency, but maintainers may opt out if needed.
- We now default to downsampling large images that would exceed Tesseract's internal
  limits, but only if they would cause processing to fail. Previously, this behavior only
  occurred if specifically requested on command line. It can still be configured
  and disabled. See the --tesseract command line options.
- Added Macports install instructions. Thanks @akierig.
- Improved logging output when an unexpected error occurs while trying to obtain
  the version of a third party program.

## v16.1.2

- Fixed test suite failure when using Ghostscript 10.3.
- Other minor corrections.

## v16.1.1

- Fixed PyPy 3.10 support.

## v16.1.0

- Improved hOCR renderer is now default for left to right languages.
- Improved handling of rotated pages. Previously, OCR text might be missing for
  pages that were rotated with a /Rotate tag on the page entry.
- Improved handling of cropped pages. Previously, in some cases a page with a
  crop box would not have its OCR applied correctly and misalignment between
  OCR text and visible text could occur.
- Documentation improvements, especially installation instructions for less
  common platforms.

## v16.0.4

- Fixed some issues for left-to-right text with the new hOCR renderer. It is still
  not default yet but will be made so soon. Right-to-left text is still in progress.
- Added an error to prevent use of several versions of Ghostscript that seem to
  corrupt existing text in input PDFs. Newly generated OCR is not affected.
  For best results, use Ghostscript 10.02.1 or newer, which contains the fix
  for the issue.

## v16.0.3

- Changed minimum required Ghostscript to 9.54, to support users of RHEL 9 and its
  derivatives, since that is the latest version available there.
- Removed warning message about CVE-2023-43115, on the assumption that most
  distributions have backported the patch by now.

## v16.0.2

- Temporarily changed PDF text renderer back to sandwich by default to address
  regressions in macOS Preview.

## v16.0.1

- Fixed text rendering issue with new hOCR text renderer - extraneous byte order
  marks.
- Tightened dependencies.

## v16.0.0

- Added a new OCR text renderer, combining the best ideas of Tesseract's PDF
  generator and the older hOCR transformer renderer. The result is a hopefully
  permanent fix for wordssmushedtogetherwithoutspaces issues in extracted text,
  better registration/position of text on skewed baselines {issue}`1009`,
  fixes to character output when the German Fraktur script is used {issue}`1191`,
  proper rendering of right to left languages (Arabic, Hebrew, Persian) {issue}`1157`.
  Asian languages may still have excessive word breaks compared to expectations.
  The new renderer is the default; the old sandwich renderer is still available
  using `--pdf-renderer sandwich`; the old hOCR renderer is no more.
- The `ocrmypdf.hocrtransform` API has changed substantially.
- Support for Python 3.9 has been dropped. Python 3.10+ is now required.
- pikepdf >= 8.8.0 is now required.


================================================
FILE: docs/releasenotes/version17.md
================================================
% SPDX-FileCopyrightText: 2022 James R. Barlow
% SPDX-License-Identifier: CC-BY-SA-4.0

# v17

## v17.3.0

- Fixed Python API ignoring the ``language`` parameter, always defaulting to
  ``eng``. The API now correctly maps ``language`` to OcrOptions ``languages``
  and splits ``+``-separated codes (e.g. ``eng+deu``) to match CLI behavior.
  {issue}`1640`
- Fixed Python API producing empty OCR output because ``tesseract_timeout``
  defaulted to 0, causing Tesseract to time out immediately. The default is
  now ``None``, falling back to the plugin's 180-second timeout. {issue}`1636`
- Fixed OCR text layer displacement on PDFs with non-zero MediaBox origins
  (e.g. JSTOR or cropped PDFs). The coordinate transformation matrix is now
  always computed, not skipped when rotation is zero. {issue}`1630`
- Restored image overlay support (``--image``) for the hocrtransform tool,
  enabling sandwich PDF output with the fpdf2 renderer. {issue}`1634`
- Docker: updated Alpine base image to 3.23.
- Documentation restructured into per-major-version release notes files.
- Release process improvements.

## v17.2.0

- Fixed incorrect word spacing in poppler-based PDF viewers and tools (Evince,
  pdftotext, and others) where words on the same line appeared separated by
  double newlines. This works around a poppler bug where Tz (horizontal scaling)
  is not carried across BT/ET boundaries. {issue}`1632`
- Fixed OCR text layer being visible instead of invisible due to incorrect fpdf2
  text rendering mode attribute. This caused OCR text to appear when images were
  removed from the PDF. {issue}`1631`
- Fixed OCR text layer misalignment with non-zero mediabox origins, which
  affected cropped PDFs and JSTOR PDFs generated by iText. The ``--redo-ocr``
  mode would shift text vertically on these files. {issue}`1630`
- Fixed Ghostscript rasterization failure with very low DPI values (below 10).
  OCRmyPDF now renders at a minimum of 10 DPI and resizes the output to match
  the originally requested dimensions. {issue}`1612`

## v17.1.0

- Added `--tagged-pdf-mode` to allow skipping the TaggedPDF error message, if desired.
- Fixed an issue where deflated JPEGs (FlateDecode + DCTDecode) were counted as
  lossless images for the purpose of determining whether to compress to JPEG,
  causing file size inflation with some workflows (`--mode force` in particular).

## v17.0.1

- Fixed output file size inflation when using pypdfium as rasterizer and force-ocr
  mode.

## v17.0.0

**Breaking changes**

- **Plugin interface migration**: Plugin hooks now receive `OcrOptions` objects instead of
  `argparse.Namespace` objects. Most plugins will continue working due to duck-typing
  compatibility, but plugin developers should update their type hints from `Namespace`
  to `OcrOptions`.
- Built-in plugins no longer modify options in-place, improving immutability and
  code clarity.
- **Lossy JBIG2 removed**: The `--jbig2-lossy` and `--jbig2-page-group-size` options have been
  removed due to well-documented risks of character substitution errors. These options are now
  deprecated and will emit warnings if used. Only lossless JBIG2 compression is supported.
- **PDF/A output behavior change**: If neither Ghostscript nor verapdf is installed,
  `--output-type auto` (the new default) will produce a standard PDF instead of PDF/A. This is
  a change from previous versions where Ghostscript was required and PDF/A was always produced.
  This configuration is rare but users should be aware of the change.

**New features**

- **pypdfium2 rasterizer**: Added optional pypdfium2-based PDF rasterization plugin as an
  alternative to Ghostscript for page rendering. Use `--rasterizer pypdfium` to enable
  (requires `pip install pypdfium2`). The default `--rasterizer auto` prefers pypdfium when
  available and falls back to Ghostscript.
- **Pluggable OCR engines**: New `--ocr-engine` option allows selecting OCR engines:
  - `auto` (default): Uses Tesseract
  - `tesseract`: Explicit Tesseract selection
  - `none`: Skip OCR entirely for PDF processing-only workflows

  This prepares the foundation for future third-party OCR engine plugins.
- **Smart PDF/A conversion**: New `--output-type auto` (now the default) produces best-effort
  PDF/A output without requiring Ghostscript when the verapdf validator is available. Falls back
  to traditional Ghostscript conversion when needed.
- **verapdf integration**: Added optional verapdf validation for fast PDF/A conversion. When
  available, OCRmyPDF attempts speculative PDF/A conversion using pikepdf, validates with verapdf,
  and skips Ghostscript if validation passes.
- **Optional Ghostscript**: As a consequence of the changes above, Ghostscript is no longer a required dependency. It is optional.
- **fpdf2 text renderer**: Replaced legacy hOCR text renderer with new fpdf2-based implementation,
  providing better multilingual support and more accurate text positioning.
- **Improved Occulta glyphless font**: The new Occulta font provides better handling of
  zero-width markers and double-width CJK characters for accurate text layer positioning.
- **Expanded multilingual font support**: Added FontProvider infrastructure with language-aware
  font selection for Devanagari (Hindi, Sanskrit, Marathi, Nepali), CJK (Chinese, Japanese,
  Korean), Arabic script, and many other scripts. System font discovery reduces package size.
- **Simplified mode selection**: New `--mode` (`-m`) argument consolidates processing options:
  - `default`: Error if text is found (standard behavior)
  - `force`: Rasterize all content and run OCR (replaces `--force-ocr`)
  - `skip`: Skip pages with existing text (replaces `--skip-text`)
  - `redo`: Re-OCR pages, stripping old text layer (replaces `--redo-ocr`)

  Legacy flags remain as silent aliases for backward compatibility.

**API improvements**

- Centralized validation logic in the `OcrOptions` Pydantic model
- Removed scattered option mutation throughout the codebase
- Better type safety for plugin development
- Simplified plugin option handling
- New `OcrElement`, `OcrClass`, and `BoundingBox` exports for OCR engine plugin developers
- Extended `OcrEngine` ABC with `generate_ocr()` method for direct OCR tree output, eliding the need to translate a modern engine's output to hOCR or directly write to PDF.

**Bug fixes**

- Fixed double-compression of already-deflated JPEGs.
- Fixed tesseract_cache plugin to properly handle cache misses.
- Fixed handling of PDF page boxes (ArtBox, BleedBox) which were not being processed correctly.
- Added thread safety lock to pypdfium plugin for concurrent operations.
- Improved pdfminer.six compatibility with explicit word spacing.

**Documentation**

- Updated cookbook to replace deprecated `--tesseract-timeout 0` with `--ocr-engine none`.
- Added comprehensive plugin documentation for new OCR engine framework.

**Dependency changes**

- Requires: one of `pypdfium2` or `ghostscript` for PDF rasterization (PDF to image)
  - Preferred: both
- Requires: one of `verapdf` or `ghostscript` for PDF/A generation
  - Preferred: both
- Recommended: `pypdfium2` for PDF rasterization (new dependency)
- Recommended: `ghostscript` (used to be Required)
- Recommended: Noto fonts for improved OCR text positioning
- Optional: `verapdf` for fast PDF/A validation (new dependency)
- Requires: `fpdf2` for text layer rendering (new dependency)
- Recommended: replace `typer` with `cyclopts` in misc scripts (new dependency)
- See docs/maintainers.md for details.

**Migration guide for plugin developers**

- Update imports: `from ocrmypdf._options import OcrOptions`
- Update type hints: `def check_options(options: OcrOptions)` instead of `options: Namespace`
- Attribute access remains unchanged: `options.languages`, `options.output_type`, etc.
- Remove any in-place option modifications - compute values at point of use instead
- Most existing plugins will continue working without changes due to duck-typing


================================================
FILE: misc/_webservice.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: AGPL-3.0-or-later

"""This is a simple web service/HTTP wrapper for OCRmyPDF.

This may be more convenient than the command line tool for some Docker users.
Note that OCRmyPDF uses Ghostscript, which is licensed under AGPLv3+. While
OCRmyPDF is under GPLv3, this file is distributed under the Affero GPLv3+ license,
to emphasize that SaaS deployments should make sure they comply with
Ghostscript's license as well as OCRmyPDF's.
"""

from __future__ import annotations

import os
import subprocess
import sys
from functools import partial
from operator import getitem
from pathlib import Path
from tempfile import NamedTemporaryFile

import pikepdf
import streamlit as st

from ocrmypdf._defaults import DEFAULT_ROTATE_PAGES_THRESHOLD


def get_host_url_with_port(port: int) -> str:
    """Build a protocol-relative URL for this host on a different port. Hacky.

    The host name is taken from the ``Host`` header of the current Streamlit
    request; any port already present in that header is discarded and replaced
    with ``port``.

    Args:
        port: Port number to place in the returned URL.

    Returns:
        A string of the form ``//host:port``. The scheme is omitted so the
        browser reuses the protocol of the current page.
    """
    host_url = st.context.headers["host"]
    if host_url.startswith("["):
        # Bracketed IPv6 literal such as "[::1]:8501". The address itself
        # contains colons, so keep everything through the closing bracket
        # instead of splitting on the first colon.
        closing = host_url.find("]")
        host = host_url[: closing + 1] if closing != -1 else host_url
    else:
        # "name" or "name:port" -- partition returns the whole string when
        # there is no colon.
        host, _sep, _streamlit_port = host_url.partition(":")
    return f"//{host}:{port}"  # Use the same protocol


# ---------------------------------------------------------------------------
# Page layout. Streamlit re-runs this script top to bottom on every
# interaction; each widget call below returns the widget's current value.
# ---------------------------------------------------------------------------
st.title("OCRmyPDF Web Service")

# NOTE(review): the label mentions images, but only the "pdf" type is accepted.
uploaded = st.file_uploader("Upload input PDF or image", type=["pdf"], key="file")

mode = st.selectbox("Mode", options=["normal", "skip-text", "force-ocr", "redo-ocr"])

pages = st.text_input(
    "Pages", value="", help="Comma-separated list of pages to process"
)

with st.expander("Input options"):
    invalidate_digital_signatures = st.checkbox(
        "Invalidate digital signatures", value=False
    )
    language = st.selectbox("Language", options=["eng", "deu", "fra", "spa"])

    image_dpi = st.slider(
        "Image DPI", value=300, key="image_dpi", min_value=1, max_value=5000, step=50
    )
with st.expander("Preprocessing"):
    skip_big = st.checkbox("Skip OCR on big pages", value=False, key="skip_big")
    oversample = st.slider("Oversample", min_value=0, max_value=5000, value=0, step=50)
    rotate_pages = st.checkbox("Rotate pages", value=False, key="rotate")
    deskew = st.checkbox("Deskew pages", value=False, key="deskew")
    clean = st.checkbox("Clean pages before OCR", value=False, key="clean")
    clean_final = st.checkbox("Clean final", value=False, key="clean_final")
    remove_vectors = st.checkbox("Remove vectors", value=False, key="remove_vectors")


with st.expander("Output options"):
    output_type = st.selectbox(
        "Output type", options=["pdfa", "pdf", "pdfa-1", "pdfa-2", "pdfa-3", "none"]
    )

    pdf_renderer = st.selectbox(
        "PDF renderer", options=["auto", "hocr", "hocrdebug", "sandwich"]
    )

    optimize = st.selectbox("Optimize", options=["0", "1", "2", "3"])

    # NOTE(review): the value of this widget is not stored, so the selection
    # currently has no effect on the command that is run.
    st.selectbox("PDF/A compression", options=["auto", "jpeg", "lossless"])

with st.expander("Metadata"):
    # Metadata fields only exist once a file has been uploaded; until then the
    # variables stay None so the argument builder skips them.
    title = author = keywords = subject = None
    if uploaded:
        with pikepdf.open(uploaded) as pdf, pdf.open_metadata() as meta:
            st.code(str(meta), language="xml")
            title = st.text_input("Title", value=meta.get('dc:title', ''))
            author = st.text_input("Author", value=meta.get('dc:creator', ''))
            keywords = st.text_input("Keywords", value=meta.get('dc:subject', ''))
            subject = st.text_input("Subject", value=meta.get('dc:description', ''))


with st.expander("Optimization after OCR"):
    jpeg_quality = st.slider(
        "JPEG quality", min_value=0, max_value=100, value=75, key="jpeg_quality"
    )
    png_quality = st.slider(
        "PNG quality", min_value=0, max_value=100, value=75, key="png_quality"
    )
    jbig2_threshold = st.number_input(
        "JBIG2 threshold", value=0.85, key="jbig2_threshold"
    )

with st.expander("Advanced options"):
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to a single worker in that case.
    cpu_count = os.cpu_count() or 1
    jobs = st.slider(
        "Threads",
        min_value=1,
        max_value=cpu_count,
        value=cpu_count,
        key="threads",
    )
    max_image_mpixels = st.number_input(
        "Max image size",
        value=250.0,
        min_value=0.0,
        help="Maximum image size in megapixels",
    )
    rotate_pages_threshold = st.number_input(
        "Rotate pages threshold",
        value=DEFAULT_ROTATE_PAGES_THRESHOLD,
        min_value=0.0,
        max_value=1000.0,
        help="Threshold for automatic page rotation",
    )
    fast_web_view = st.number_input(
        "Fast web view",
        value=1.0,
        min_value=0.0,
        help="Linearize files above this size in MB",
    )
    continue_on_soft_render_error = st.checkbox(
        "Continue on soft render error", value=True
    )
    verbose_labels = ["quiet", "default", "debug", "debug_all"]
    verbose = st.selectbox(
        "Verbosity level",
        options=[-1, 0, 1, 2],
        index=1,
        # Levels start at -1 while the label list starts at index 0, so shift
        # by one. Indexing with the raw level showed every label one step off
        # and wrapped -1 around to the last label.
        format_func=lambda level: verbose_labels[level + 1],
    )

if uploaded:
    # Translate the current widget values into ocrmypdf command line options.
    # Every option is its own list element and no shell is involved, so user
    # supplied text (title, pages, ...) is passed through verbatim.
    args = []
    if mode and mode != 'normal':
        args.append(f"--{mode}")
    if invalidate_digital_signatures:
        # This checkbox was previously never forwarded to ocrmypdf.
        args.append("--invalidate-digital-signatures")
    if language:
        args.append(f"--language={language}")
    if not uploaded.name.lower().endswith(".pdf") and image_dpi:
        args.append(f"--image-dpi={image_dpi}")
    if skip_big:
        args.append("--skip-big")
    if oversample:
        args.append(f"--oversample={oversample}")
    if rotate_pages:
        args.append("--rotate-pages")
    if deskew:
        args.append("--deskew")
    if clean:
        args.append("--clean")
    if clean_final:
        args.append("--clean-final")
    if remove_vectors:
        args.append("--remove-vectors")
    if output_type:
        args.append(f"--output-type={output_type}")
    if pdf_renderer:
        args.append(f"--pdf-renderer={pdf_renderer}")
    if optimize:
        args.append(f"--optimize={optimize}")
    if title:
        args.append(f"--title={title}")
    if author:
        args.append(f"--author={author}")
    if keywords:
        args.append(f"--keywords={keywords}")
    if subject:
        args.append(f"--subject={subject}")
    if pages:
        args.append(f"--pages={pages}")
    if max_image_mpixels:
        args.append(f"--max-image-mpixels={max_image_mpixels}")
    if rotate_pages_threshold:
        args.append(f"--rotate-pages-threshold={rotate_pages_threshold}")
    if fast_web_view:
        args.append(f"--fast-web-view={fast_web_view}")
    if continue_on_soft_render_error:
        args.append("--continue-on-soft-render-error")
    if verbose:
        args.append(f"--verbose={verbose}")
    # The quality settings are only passed when optimization is enabled.
    # Compare numerically rather than relying on string ordering.
    optimizing = int(optimize) > 0
    if optimizing and jpeg_quality:
        args.append(f"--jpeg-quality={jpeg_quality}")
    if optimizing and png_quality:
        args.append(f"--png-quality={png_quality}")
    if jbig2_threshold:
        args.append(f"--jbig2-threshold={jbig2_threshold}")
    if jobs:
        args.append(f"--jobs={jobs}")

    # The upload is spooled to a named temporary file so that the ocrmypdf
    # subprocess can open it by path; both temporary files are removed when
    # their ``with`` blocks exit (including via st.stop()).
    with NamedTemporaryFile(delete=True, suffix=f"_{uploaded.name}") as input_file:
        input_file.write(uploaded.getvalue())
        input_file.flush()
        input_file.seek(0)
        args.append(str(input_file.name))
        with NamedTemporaryFile(delete=True, suffix=".pdf") as output_file:
            args.append(str(output_file.name))

            # Disable the button while a previous click is still being handled.
            st.session_state['running'] = (
                'run_button' in st.session_state and st.session_state.run_button
            )
            if st.button(
                "Run OCRmyPDF",
                disabled=st.session_state.get("running", False),
                key='run_button',
            ):
                st.session_state['running'] = True
                args = [sys.executable, '-u', '-m', "ocrmypdf"] + args

                # stdout is discarded: nothing reads it, and an undrained pipe
                # can block the child once the pipe buffer fills up.
                with subprocess.Popen(
                    args, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE
                ) as proc:
                    with st.container(border=True):
                        # Iterate until EOF so that lines written just before
                        # the process exits are still shown (a poll()-guarded
                        # loop can stop with unread data left in the pipe).
                        for line in proc.stderr:
                            text = line.decode(errors="replace").strip()
                            st.html("<code>" + text + "</code>")
                # Leaving the Popen context waits for the process, so
                # returncode is populated here.

                if proc.returncode != 0:
                    st.error(f"ocrmypdf failed with exit code {proc.returncode}")
                    st.session_state['running'] = False
                    st.stop()

                output_path = Path(output_file.name)
                if output_path.stat().st_size == 0:
                    st.error("No output PDF file was generated")
                    st.session_state['running'] = False
                    st.stop()

                st.download_button(
                    label="Download output PDF",
                    # Read by path, matching the size check above, instead of
                    # through the handle opened before the subprocess ran.
                    data=output_path.read_bytes(),
                    file_name=uploaded.name,
                    mime="application/pdf",
                )
                st.session_state['running'] = False


================================================
FILE: misc/batch.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2016 findingorder <https://github.com/findingorder>
# SPDX-FileCopyrightText: 2024 nilsro <https://github.com/nilsro>
# SPDX-License-Identifier: MIT

"""Example of using ocrmypdf as a library in a script.

This script will recursively search a directory for PDF files and run OCR on
them. It will log the results. It runs OCR on every file, even if it already
has text. OCRmyPDF will detect files that already have text.

You should edit this script to meet your needs.
"""

from __future__ import annotations

import filecmp
import logging
import os
import posixpath
import shutil
import sys
from pathlib import Path

import ocrmypdf

# pylint: disable=logging-format-interpolation
# pylint: disable=logging-not-lazy


def filecompare(a, b):
    """Report whether files ``a`` and ``b`` compare equal.

    Uses ``filecmp.cmp`` in shallow mode. A missing file on either side is
    treated as "not equal" rather than as an error.
    """
    same = False
    try:
        same = filecmp.cmp(a, b, shallow=True)
    except FileNotFoundError:
        # Nothing to compare against, so the files cannot be equal.
        pass
    return same


script_dir = Path(__file__).parent
# Set archive_dir to a directory where original documents are backed up before
# they are overwritten by OCR. Leave empty ("") if no backup is required.
archive_dir = "/pdfbak"

# Usage: batch.py [start_dir] [log_file]
start_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")

if len(sys.argv) > 2:
    log_file = Path(sys.argv[2])
else:
    # with_name() replaces the last path component, so the default log file is
    # created beside the script's directory rather than inside it.
    log_file = script_dir.with_name("ocr-tree.log")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    filename=log_file,
    filemode="a",
)

logging.info(f"Start directory {start_dir}")

ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)

for filename in start_dir.glob("**/*.pdf"):
    logging.info(f"Processing {filename}")
    # NOTE(review): the check is for a PDF/A claim while the message talks
    # about existing text -- presumably PDF/A is used as a proxy for "already
    # processed by OCRmyPDF"; confirm.
    if ocrmypdf.pdfa.file_claims_pdfa(filename)["pass"]:
        logging.info("Skipped document because it already contained text")
        continue

    if archive_dir:
        # Mirror the file's absolute location underneath archive_dir. Plain
        # string concatenation produced paths such as "/pdfbaksub/a.pdf" when
        # the start directory was relative.
        source = filename.resolve()
        archive_filename = Path(archive_dir) / source.relative_to(source.anchor)
        if not filecompare(filename, archive_filename):
            logging.info(f"Archiving document to {archive_filename}")
            # exist_ok avoids a second failure when the directory is already
            # there and the copy failed for an unrelated reason.
            archive_filename.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(filename, archive_filename)

    try:
        # OCR in place: the input file is replaced by the OCR result.
        result = ocrmypdf.ocr(filename, filename, deskew=True)
        logging.info(result)
    except ocrmypdf.exceptions.EncryptedPdfError:
        logging.info("Skipped document because it is encrypted")
    except ocrmypdf.exceptions.PriorOcrFoundError:
        logging.info("Skipped document because it already contained text")
    except ocrmypdf.exceptions.DigitalSignatureError:
        logging.info("Skipped document because it has a digital signature")
    except ocrmypdf.exceptions.TaggedPDFError:
        logging.info(
            "Skipped document because it does not need ocr as it is tagged"
        )
    except Exception:
        # Keep going with the next file, but record the traceback so the
        # failure can be diagnosed from the log.
        logging.exception("Unhandled error occurred")
    logging.info("OCR complete")


================================================
FILE: misc/bisect_pdf.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MIT

"""Helper script for bisecting PDFs to find a page with an issue."""
from __future__ import annotations

import sys

import pikepdf

if len(sys.argv) != 2:
    print(f"Usage: {sys.argv[0]} <input.pdf>")
    sys.exit(1)

with pikepdf.open(sys.argv[1]) as pdf:
    num_pages = len(pdf.pages)
    # Interactive binary search over zero-based page indices. Each round writes
    # pages low..mid to a file and asks whether that file shows the problem:
    #   "good" -> the problem page lies after mid
    #   "bad"  -> the problem page lies within low..mid
    # When the loop ends, ``low`` is the index of the problem page, or
    # ``num_pages`` if no range was ever reported as bad.
    low = 0
    high = num_pages - 1
    while low <= high:
        mid = (low + high) // 2
        with pikepdf.new() as new_pdf:
            new_pdf.pages.extend(pdf.pages[low : mid + 1])
            new_pdf.save(f"bisect-issue-{low + 1}-{mid + 1}.pdf")
        print(f"Is bisect-issue-{low + 1}-{mid + 1}.pdf good or bad?", end=" ")
        while True:
            response = input().lower()
            if response == "good":
                low = mid + 1
                break
            elif response == "bad":
                high = mid - 1
                break
            else:
                print("Please respond with 'good' or 'bad'.")

    if low >= num_pages:
        # Every range was reported good (or the PDF has no pages); indexing
        # pdf.pages[low] below would raise IndexError.
        print("No page range was reported as bad; no problem page identified.")
        sys.exit(1)

    print(f"The issue is on page {low + 1} of the original PDF.")
    with pikepdf.new() as new_pdf:
        # A single page is appended; extend() expects an iterable of pages.
        new_pdf.pages.append(pdf.pages[low])
        new_pdf.save(f"bisect-issue-bad-{low + 1}.pdf")
    with pikepdf.new() as new_pdf:
        # Everything except the problem page.
        new_pdf.pages.extend(pdf.pages[:low])
        new_pdf.pages.extend(pdf.pages[low + 1 :])
        new_pdf.save(f"bisect-issue-good-{low + 1}.pdf")


================================================
FILE: misc/completion/ocrmypdf.bash
================================================
# SPDX-FileCopyrightText: 2021 Frank Pille
# SPDX-FileCopyrightText: 2020 Alex Willner
# SPDX-License-Identifier: MIT

# errexit is switched on while this file is being sourced and switched off
# again by the `set +o errexit` at the end of the file.
set -o errexit

# Complete ocrmypdf option names.
#
# Reads:  $cur (word being completed). The caller sets IFS to newline only, so
#         each "option (description)" line below is a single candidate.
# Writes: COMPREPLY. When exactly one candidate remains, its description is
#         stripped so only the option itself is inserted.
#
# compgen -W honors shell quoting inside the word list, which is why the
# apostrophe in "don\'t" is backslash-escaped.
__ocrmypdf_arguments()
{
    local arguments="\
--help                          (show help message)
--language                      (language(s) of the file to be OCRed)
--image-dpi                     (assume this DPI if input image DPI is unknown)
--output-type                   (select PDF output options)
--sidecar                       (write OCR to text file)
--version                       (print program version and exit)
--jobs                          (how many worker processes to use)
--quiet                         (suppress INFO messages)
--verbose                       (set verbosity level)
--title                         (set metadata)
--author                        (set metadata)
--subject                       (set metadata)
--keywords                      (set metadata)
--rotate-pages                  (rotate pages to correct orientation)
--deskew                        (fix small horizontal alignment skew)
--clean                         (clean document images before OCR)
--clean-final                   (clean document images and keep result)
--unpaper-args                  (a quoted string of arguments to pass to unpaper)
--oversample                    (oversample images to this DPI)
--remove-vectors                (don\'t send vector objects to OCR)
--mode                          (processing mode for pages with existing text)
--force-ocr                     (OCR documents that already have printable text)
--skip-text                     (skip OCR on any pages that already contain text)
--redo-ocr                      (redo OCR on any pages that seem to have OCR already)
--invalidate-digital-signatures (remove digital signatures from PDF)
--tagged-pdf-mode               (control behavior for Tagged PDFs)
--skip-big                      (skip OCR on pages larger than this many MPixels)
--optimize                      (select optimization level)
--jpeg-quality                  (JPEG quality [0..100])
--png-quality                   (PNG quality [0..100])
--jbig2-lossy                   (enable lossy JBIG2 (see docs))
--jbig2-threshold               (set JBIG2 threshold (see docs))
--pages                         (apply OCR to only the specified pages)
--max-image-mpixels             (image decompression bomb threshold)
--pdf-renderer                  (select PDF renderer options)
--ocr-engine                    (OCR engine to use)
--rasterizer                    (PDF page rasterizer)
--rotate-pages-threshold        (page rotation confidence)
--pdfa-image-compression        (set PDF/A image compression options)
--fast-web-view                 (if file size is above this amount in MB linearize PDF)
--continue-on-soft-render-error (continue after recoverable render errors)
--plugin                        (name of plugin to import)
--keep-temporary-files          (keep temporary files (debug))
--tesseract-config              (set custom tesseract config file)
--tesseract-pagesegmode         (set tesseract --psm)
--tesseract-oem                 (set tesseract --oem)
--tesseract-thresholding        (set tesseract image thresholding)
--tesseract-timeout             (maximum number of seconds to wait for OCR)
--tesseract-non-ocr-timeout     (maximum seconds for non-OCR operations)
--tesseract-downsample-large-images    (downsample large images before OCR)
--no-tesseract-downsample-large-images (do not downsample large images)
--tesseract-downsample-above    (downsample images larger than this pixel size)
--user-words                    (specify location of user words file)
--user-patterns                 (specify location of user patterns file)
--no-progress-bar               (disable the progress bar)
--color-conversion-strategy     (select color conversion strategy)
"

    COMPREPLY=( $( compgen -W "$arguments" -- "$cur") )

    # Remove description if only one completion exists
    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of --output-type.
# Reads $cur and relies on the caller's newline-only IFS so that every
# "value (description)" line is one candidate; writes COMPREPLY.
__ocrmypdf_output-type()
{
    local candidates="auto   (best-effort PDF/A without Ghostscript (default))
pdfa   (output a PDF/A-2b)
pdf    (output a standard PDF)
pdfa-1 (output a PDF/A-1b)
pdfa-2 (output a PDF/A-2b)
pdfa-3 (output a PDF/A-3b)
none   (do not produce an output PDF (for example, if you only care about --sidecar))"

    COMPREPLY=( $( compgen -W "$candidates" -- "$cur") )

    # A lone match is inserted verbatim, so drop its trailing description.
    if (( ${#COMPREPLY[@]} == 1 )); then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of --verbose.
# Reads $cur and relies on the caller's newline-only IFS so that every
# "value (description)" line is one candidate; writes COMPREPLY.
__ocrmypdf_verbose()
{
    local candidates="0  (standard output messages)
1  (troubleshooting output messages)
2  (debugging output messages)"

    COMPREPLY=( $( compgen -W "$candidates" -- "$cur") )

    # A lone match is inserted verbatim, so drop its trailing description.
    if (( ${#COMPREPLY[@]} == 1 )); then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of --optimize.
# Reads $cur and relies on the caller's newline-only IFS so that every
# "value (description)" line is one candidate; writes COMPREPLY.
__ocrmypdf_optimize()
{
    local candidates="0  (do not optimize)
1  (do safe, lossless optimizations (default))
2  (do some lossy optimizations)
3  (do aggressive lossy optimizations (including lossy JBIG2))"

    COMPREPLY=( $( compgen -W "$candidates" -- "$cur") )

    # A lone match is inserted verbatim, so drop its trailing description.
    if (( ${#COMPREPLY[@]} == 1 )); then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of --pdf-renderer.
# Reads $cur and relies on the caller's newline-only IFS so that every
# "value (description)" line is one candidate; writes COMPREPLY.
__ocrmypdf_pdf-renderer()
{
    local candidates="auto      (auto select PDF renderer, uses fpdf2)
fpdf2     (use fpdf2 renderer with full language support)
sandwich  (use sandwich renderer)
hocr      (use hOCR renderer - deprecated)
hocrdebug (uses hOCR renderer in debug mode - deprecated)"

    COMPREPLY=( $( compgen -W "$candidates" -- "$cur") )

    # A lone match is inserted verbatim, so drop its trailing description.
    if (( ${#COMPREPLY[@]} == 1 )); then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of --pdfa-image-compression.
# Reads $cur and relies on the caller's newline-only IFS so that every
# "value (description)" line is one candidate; writes COMPREPLY.
__ocrmypdf_pdfa-image-compression()
{
    local candidates="auto     (let Ghostscript decide how to compress images)
jpeg     (convert color and grayscale images to JPEG)
lossless (convert color and grayscale images to lossless (PNG))"

    COMPREPLY=( $( compgen -W "$candidates" -- "$cur") )

    # A lone match is inserted verbatim, so drop its trailing description.
    if (( ${#COMPREPLY[@]} == 1 )); then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of --tesseract-pagesegmode.
# Reads $cur and relies on the caller's newline-only IFS so that every
# "value (description)" line is one candidate; writes COMPREPLY.
__ocrmypdf_tesseract-pagesegmode()
{
    local candidates="0  (orientation and script detection (OSD) only)
1  (automatic page segmentation with OSD)
2  (automatic page segmentation, but no OSD, or OCR)
3  (fully automatic page segmentation, but no OSD (default))
4  (assume a single column of text of variable sizes)
5  (assume a single uniform block of vertically aligned text)
6  (assume a single uniform block of text)
7  (treat the image as a single text line)
8  (treat the image as a single word)
9  (treat the image as a single word in a circle)
10 (treat the image as a single character)
11 (sparse text - find as much text as possible in no particular order)
12 (sparse text with OSD)
13 (raw line - treat the image as a single text line)"

    COMPREPLY=( $( compgen -W "$candidates" -- "$cur") )

    # A lone match is inserted verbatim, so drop its trailing description.
    if (( ${#COMPREPLY[@]} == 1 )); then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of --tesseract-oem.
# Reads $cur and relies on the caller's newline-only IFS so that every
# "value (description)" line is one candidate; writes COMPREPLY.
__ocrmypdf_tesseract-oem()
{
    local candidates="0 (legacy engine only)
1 (neural nets LSTM engine only)
2 (legacy + LSTM engines)
3 (default, based on what is available)"

    COMPREPLY=( $( compgen -W "$candidates" -- "$cur") )

    # A lone match is inserted verbatim, so drop its trailing description.
    if (( ${#COMPREPLY[@]} == 1 )); then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of --tesseract-thresholding.
# Reads $cur and relies on the caller's newline-only IFS so that every
# "value (description)" line is one candidate; writes COMPREPLY.
__ocrmypdf_tesseract-thresholding()
{
    # The "otsu" entry previously carried a description copied from the PDF
    # renderer list ("use hOCR renderer").
    local choices="auto          (let OCRmyPDF pick thresholding - currently always uses otsu)
otsu          (use legacy Otsu thresholding)
adaptive-otsu (use adaptive Otsu thresholding)
sauvola       (use Sauvola thresholding)"

    COMPREPLY=( $( compgen -W "$choices" -- "$cur") )
    # Remove description if only one completion exists
    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of --color-conversion-strategy.
# Reads $cur and relies on the caller's newline-only IFS so that every
# "value (description)" line is one candidate; writes COMPREPLY.
__ocrmypdf_color-conversion-strategy()
{
    local candidates="LeaveColorUnchanged (default)
CMYK (convert to CMYK)
Gray (convert to grayscale)
RGB (convert to RGB)
UseDeviceIndependentColor (convert with device independent color)"

    COMPREPLY=( $( compgen -W "$candidates" -- "$cur") )
    # A lone match is inserted verbatim, so drop its trailing description.
    if (( ${#COMPREPLY[@]} == 1 )); then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of --mode.
# Reads $cur and relies on the caller's newline-only IFS so that every
# "value (description)" line is one candidate; writes COMPREPLY.
__ocrmypdf_mode()
{
    local candidates="default (error if text is found)
force   (rasterize all content and run OCR)
skip    (skip pages with existing text)
redo    (re-OCR pages, replacing old invisible text)"

    COMPREPLY=( $( compgen -W "$candidates" -- "$cur") )
    # A lone match is inserted verbatim, so drop its trailing description.
    if (( ${#COMPREPLY[@]} == 1 )); then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of --tagged-pdf-mode.
# Reads $cur and relies on the caller's newline-only IFS so that every
# "value (description)" line is one candidate; writes COMPREPLY.
__ocrmypdf_tagged-pdf-mode()
{
    local candidates="default (error if --mode is default, otherwise warn)
ignore  (always warn but continue processing)"

    COMPREPLY=( $( compgen -W "$candidates" -- "$cur") )
    # A lone match is inserted verbatim, so drop its trailing description.
    if (( ${#COMPREPLY[@]} == 1 )); then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of --ocr-engine.
# Reads $cur and relies on the caller's newline-only IFS so that every
# "value (description)" line is one candidate; writes COMPREPLY.
__ocrmypdf_ocr-engine()
{
    local candidates="auto      (select best available engine)
tesseract (use Tesseract OCR)
none      (skip OCR entirely)"

    COMPREPLY=( $( compgen -W "$candidates" -- "$cur") )
    # A lone match is inserted verbatim, so drop its trailing description.
    if (( ${#COMPREPLY[@]} == 1 )); then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of --rasterizer.
# Reads $cur and relies on the caller's newline-only IFS so that every
# "value (description)" line is one candidate; writes COMPREPLY.
__ocrmypdf_rasterizer()
{
    local candidates="auto        (prefer pypdfium, fall back to Ghostscript)
ghostscript (use Ghostscript rasterizer)
pypdfium    (use pypdfium rasterizer - faster)"

    COMPREPLY=( $( compgen -W "$candidates" -- "$cur") )
    # A lone match is inserted verbatim, so drop its trailing description.
    if (( ${#COMPREPLY[@]} == 1 )); then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}

# Complete the value of the option held in $prev, if that option takes one.
#
# Reads:   $prev (previous word) and $cur (word being completed).
# Writes:  COMPREPLY, via the per-option helper where one exists.
# Returns: 0 when $prev was recognised here (even if no candidates were
#          produced), 1 when the caller should fall back to completing option
#          names or files.
__ocrmypdf_check_previous()
{
    case $prev in
        -h|--help|--version)
            return 0
            ;;
        -l|--language)
            # Ask tesseract for its languages; the pattern removal drops
            # everything up to the last ':' (the header line ends with one).
            COMPREPLY=$( command tesseract --list-langs 2>/dev/null )
            COMPREPLY=( $( compgen -W '${COMPREPLY[@]##*:}' -- "$cur" ) )
            return 0
            ;;
        --output-type)
            __ocrmypdf_output-type
            return 0
            ;;
        -j|--jobs)
            COMPREPLY=( $( compgen -W '{1..'$( _ncpus )'}' -- "$cur" ) )
            return 0
            ;;
        -v|--verbose)
            __ocrmypdf_verbose
            return 0
            ;;
        -O|--optimize)
            __ocrmypdf_optimize
            return 0
            ;;
        --pdf-renderer)
            __ocrmypdf_pdf-renderer
            return 0
            ;;
        -m|--mode)
            __ocrmypdf_mode
            return 0
            ;;
        --tagged-pdf-mode)
            __ocrmypdf_tagged-pdf-mode
            return 0
            ;;
        --ocr-engine)
            __ocrmypdf_ocr-engine
            return 0
            ;;
        --rasterizer)
            __ocrmypdf_rasterizer
            return 0
            ;;
        --pdfa-image-compression)
            __ocrmypdf_pdfa-image-compression
            return 0
            ;;
        --tesseract-pagesegmode)
            __ocrmypdf_tesseract-pagesegmode
            return 0
            ;;
        --tesseract-oem)
            __ocrmypdf_tesseract-oem
            return 0
            ;;
        --tesseract-thresholding)
            __ocrmypdf_tesseract-thresholding
            return 0
            ;;

        --title|--author|--subject|--keywords|--unpaper-args|--pages|--plugin|\
        --jpeg-quality|--png-quality|--jbig2-threshold|--image-dpi|--oversample|\
        --skip-big|--max-image-mpixels|\
        --tesseract-timeout|--tesseract-non-ocr-timeout|--tesseract-downsample-above|\
        --rotate-pages-threshold|--fast-web-view)
            # argument required but no completions available
            # (--jbig2-threshold was missing here and fell through to file
            # completion)
            return 0
            ;;
        --tesseract-config|--user-words|--user-patterns|--sidecar)
            _filedir
            return 0
            ;;
        --color-conversion-strategy)
            __ocrmypdf_color-conversion-strategy
            return 0
            ;;
    esac

    return 1
}

# Completion entry point registered for the `ocrmypdf` command.
#
# Order of attempts:
#   1. a value for the preceding option (__ocrmypdf_check_previous)
#   2. option names, when the current word starts with '-'
#   3. file names otherwise
_ocrmypdf()
{
    # Candidate lists are newline separated and their entries contain spaces.
    # IFS is declared local, so the caller's value comes back automatically on
    # every return path and no manual save/restore is needed.
    local IFS=$'\n'

    local cur prev

    # Homebrew on Macs ships bash-completion 1.3, which lacks
    # _init_completion - see #502
    if declare -F _init_completion >/dev/null 2>&1; then
        _init_completion || return
    else
        COMPREPLY=()
        _get_comp_words_by_ref cur prev
    fi

    # Status 0 means the previous word was an option whose value has been
    # handled; nothing more to offer.
    if __ocrmypdf_check_previous; then
        return
    fi

    if [[ "$cur" == -* ]]; then
        __ocrmypdf_arguments
    else
        _filedir
    fi

    return 0
} &&
complete -F _ocrmypdf ocrmypdf

# Switch off the errexit that was enabled at the top of this file.
set +o errexit

# ex: filetype=sh


================================================
FILE: misc/completion/ocrmypdf.fish
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT

# --version and --help are only offered as the first argument.
complete -c ocrmypdf -x -n __fish_is_first_arg -l version
complete -c ocrmypdf -x -n __fish_is_first_arg -s h -s "?" -l help

# -r: the option needs a value and file names are valid candidates.
complete -c ocrmypdf -r -l sidecar -d "write OCR to text file"
# --quiet is a plain switch; -x would tell fish that it requires an argument.
complete -c ocrmypdf -s q -l quiet

# Plain switches for image preprocessing.
complete -c ocrmypdf -s r -l rotate-pages -d "rotate pages to correct orientation"
complete -c ocrmypdf -s d -l deskew -d "fix small horizontal alignment skew"
complete -c ocrmypdf -s c -l clean -d "clean document images before OCR"
complete -c ocrmypdf -s i -l clean-final -d "clean document images and keep result"
complete -c ocrmypdf -x -l unpaper-args -d "quoted string of arguments to pass to unpaper"
complete -c ocrmypdf -l remove-vectors -d "don't send vector objects to OCR"

# Candidates for --mode, one "value<TAB>description" pair per line.
function __fish_ocrmypdf_mode
    printf '%s\t%s\n' default (_ "error if text is found")
    printf '%s\t%s\n' force (_ "rasterize all content and run OCR")
    printf '%s\t%s\n' skip (_ "skip pages with existing text")
    printf '%s\t%s\n' redo (_ "re-OCR pages, replacing old invisible text")
end
complete --command ocrmypdf --exclusive --short-option m --long-option mode --arguments '(__fish_ocrmypdf_mode)' --description "processing mode for pages with existing text"
# Switches for handling existing text and digital signatures.
complete --command ocrmypdf --short-option f --long-option force-ocr --description "OCR documents that already have printable text"
complete --command ocrmypdf --short-option s --long-option skip-text --description "skip OCR on any pages that already contain text"
complete --command ocrmypdf --long-option redo-ocr --description "redo OCR on any pages that seem to have OCR already"
complete --command ocrmypdf --long-option invalidate-digital-signatures --description "invalidate digital signatures and allow OCR to proceed"

# Candidates for --tagged-pdf-mode, one "value<TAB>description" pair per line.
function __fish_ocrmypdf_tagged_pdf_mode
    printf '%s\t%s\n' default (_ "error if --mode is default, otherwise warn")
    printf '%s\t%s\n' ignore (_ "always warn but continue processing")
end
complete --command ocrmypdf --exclusive --long-option tagged-pdf-mode --arguments '(__fish_ocrmypdf_tagged_pdf_mode)' --description "control behavior for Tagged PDFs"

complete --command ocrmypdf --short-option k --long-option keep-temporary-files --description "keep temporary files (debug)"

# Candidates for --language: the language codes reported by tesseract.
# Prints nothing when tesseract is unavailable or reports no languages.
function __fish_ocrmypdf_languages
    # Use 2>/dev/null: the caret (^) stderr redirection was removed in
    # fish 3.0. Command substitution already yields one list element per
    # output line, so no further splitting is needed.
    set -l langs (tesseract --list-langs 2>/dev/null)
    # The first line is skipped (presumably the "List of available
    # languages" header); the remaining lines are offered as candidates.
    if test (count $langs) -gt 1
        printf '%s\n' $langs[2..-1]
    end
end
complete -c ocrmypdf -x -s l -l language -a '(__fish_ocrmypdf_languages)' -d language

complete -c ocrmypdf -x -l image-dpi -d "assume this DPI if input image DPI is unknown"

# Candidates for --output-type, one "value<TAB>description" pair per line.
function __fish_ocrmypdf_output_type
    printf '%s\t%s\n' auto (_ "best-effort PDF/A without requiring Ghostscript (default)")
    printf '%s\t%s\n' pdfa (_ "output a PDF/A-2b")
    printf '%s\t%s\n' pdf (_ "output a standard PDF")
    printf '%s\t%s\n' pdfa-1 (_ "output a PDF/A-1b")
    printf '%s\t%s\n' pdfa-2 (_ "output a PDF/A-2b")
    printf '%s\t%s\n' pdfa-3 (_ "output a PDF/A-3b")
    printf '%s\t%s\n' none (_ "do not produce an output PDF (for example, if you only care about --sidecar)")
end
complete --command ocrmypdf --exclusive --long-option output-type --arguments '(__fish_ocrmypdf_output_type)' --description "select PDF output options"

# Candidates for --pdf-renderer, one "value<TAB>description" pair per line.
function __fish_ocrmypdf_pdf_renderer
    printf '%s\t%s\n' auto (_ "auto select PDF renderer (default, uses fpdf2)")
    printf '%s\t%s\n' fpdf2 (_ "use fpdf2 renderer with full language support")
    printf '%s\t%s\n' sandwich (_ "use sandwich renderer")
    printf '%s\t%s\n' hocr (_ "use hOCR renderer (deprecated)")
    printf '%s\t%s\n' hocrdebug (_ "uses hOCR renderer in debug mode (deprecated)")
end
complete --command ocrmypdf --exclusive --long-option pdf-renderer --arguments '(__fish_ocrmypdf_pdf_renderer)' --description "select PDF renderer options"

# Candidates for --ocr-engine, one "value<TAB>description" pair per line.
function __fish_ocrmypdf_ocr_engine
    printf '%s\t%s\n' auto (_ "select best available engine (default)")
    printf '%s\t%s\n' tesseract (_ "use Tesseract OCR")
    printf '%s\t%s\n' none (_ "skip OCR entirely")
end
complete --command ocrmypdf --exclusive --long-option ocr-engine --arguments '(__fish_ocrmypdf_ocr_engine)' --description "OCR engine to use"

# Candidates for --rasterizer, one "value<TAB>description" pair per line.
function __fish_ocrmypdf_rasterizer
    printf '%s\t%s\n' auto (_ "prefer pypdfium, fall back to Ghostscript (default)")
    printf '%s\t%s\n' ghostscript (_ "use Ghostscript rasterizer")
    printf '%s\t%s\n' pypdfium (_ "use pypdfium rasterizer (faster)")
end
complete --command ocrmypdf --exclusive --long-option rasterizer --arguments '(__fish_ocrmypdf_rasterizer)' --description "PDF page rasterizer"

# Candidates for --optimize, one "value<TAB>description" pair per line.
function __fish_ocrmypdf_optimize
    printf '%s\t%s\n' 0 (_ "do not optimize")
    printf '%s\t%s\n' 1 (_ "do safe, lossless optimizations (default)")
    printf '%s\t%s\n' 2 (_ "do some lossy optimizations")
    printf '%s\t%s\n' 3 (_ "do aggressive lossy optimizations (including lossy JBIG2)")
end
complete --command ocrmypdf --exclusive --short-option O --long-option optimize --arguments '(__fish_ocrmypdf_optimize)' --description "select optimization level"

# Candidates for --verbose, one "value<TAB>description" pair per line.
function __fish_ocrmypdf_verbose
    echo -e "0\t"(_ "standard output messages")
    echo -e "1\t"(_ "troubleshooting output messages")
    echo -e "2\t"(_ "debugging output messages")
end
complete -c ocrmypdf -x -s v -l verbose -a '(__fish_ocrmypdf_verbose)' -d "set verbosity level"

# --no-progress-bar is a plain switch; -x would tell fish that it requires an
# argument.
complete -c ocrmypdf -l no-progress-bar -d "disable the progress bar"

# Candidates for --pdfa-image-compression, one "value<TAB>description" pair
# per line.
function __fish_ocrmypdf_pdfa_compression
    echo -e "auto\t"(_ "let Ghostscript decide how to compress images")
    echo -e "jpeg\t"(_ "convert color and grayscale images to JPEG")
    echo -e "lossless\t"(_ "convert color and grayscale images to lossless (PNG)")
end
complete -c ocrmypdf -x -l pdfa-image-compression -a '(__fish_ocrmypdf_pdfa_compression)' -d "set PDF/A image compression options"

# Options that take a free-form value (-x: value required, no file names).
complete -c ocrmypdf -x -s j -l jobs -d "how many worker processes to use"
complete -c ocrmypdf -x -l title -d "set metadata"
complete -c ocrmypdf -x -l author -d "set metadata"
complete -c ocrmypdf -x -l subject -d "set metadata"
complete -c ocrmypdf -x -l keywords -d "set metadata"
complete -c ocrmypdf -x -l oversample -d "oversample images to this DPI"
complete -c ocrmypdf -x -l skip-big -d "skip OCR on pages larger than this many MPixels"

complete -c ocrmypdf -x -l jpeg-quality -d "JPEG quality [0..100]"
complete -c ocrmypdf -x -l png-quality -d "PNG quality [0..100]"
# --jbig2-lossy is a plain switch, so it must not be marked as requiring an
# argument.
complete -c ocrmypdf -l jbig2-lossy -d "enable lossy JBIG2 (see docs)"
complete -c ocrmypdf -x -l jbig2-threshold -d "JBIG2 compression threshold (see docs)"
complete -c ocrmypdf -x -l max-image-mpixels -d "image decompression bomb threshold"
complete -c ocrmypdf -x -l pages -d "apply OCR to only the specified pages"
# The value is a file path: -r requires a value but keeps file name
# completion, which -x would suppress.
complete -c ocrmypdf -r -l tesseract-config -d "set custom tesseract config file"

# Print the candidate values (0-13) for --tesseract-pagesegmode as
# "value<TAB>description" lines; the value is handed to tesseract as --psm
# according to the option description below.
function __fish_ocrmypdf_tesseract_pagesegmode
    echo -e "0\t"(_ "orientation and script detection (OSD) only")
    echo -e "1\t"(_ "automatic page segmentation with OSD")
    echo -e "2\t"(_ "automatic page segmentation, but no OSD, or OCR")
    echo -e "3\t"(_ "fully automatic page segmentation, but no OSD (default)")
    echo -e "4\t"(_ "assume a single column of text of variable sizes")
    echo -e "5\t"(_ "assume a single uniform block of vertically aligned text")
    echo -e "6\t"(_ "assume a single uniform block of text")
    echo -e "7\t"(_ "treat the image as a single text line")
    echo -e "8\t"(_ "treat the image as a single word")
    echo -e "9\t"(_ "treat the image as a single word in a circle")
    echo -e "10\t"(_ "treat the image as a single character")
    echo -e "11\t"(_ "sparse text - find as much text as possible in no particular order")
    echo -e "12\t"(_ "sparse text with OSD")
    echo -e "13\t"(_ "raw line - treat the image as a single text line")
end
# -x: the option requires an argument and file completion is suppressed.
complete -c ocrmypdf -x -l tesseract-pagesegmode -a '(__fish_ocrmypdf_tesseract_pagesegmode)' -d "set tesseract --psm"

# Print the candidate values (0-3) for --tesseract-oem as
# "value<TAB>description" lines.
function __fish_ocrmypdf_tesseract_oem
    echo -e "0\t"(_ "legacy engine only")
    echo -e "1\t"(_ "neural nets LSTM engine only")
    echo -e "2\t"(_ "legacy + LSTM engines")
    echo -e "3\t"(_ "default, based on what is available")
end
# -x: the option requires an argument and file completion is suppressed.
complete -c ocrmypdf -x -l tesseract-oem -a '(__fish_ocrmypdf_tesseract_oem)' -d "set tesseract --oem"

# Print the candidate values for --tesseract-thresholding as
# "value<TAB>description" lines.
function __fish_ocrmypdf_tesseract_thresholding
    echo -e "auto\t"(_ "let OCRmyPDF pick thresholding (current always uses otsu)")
    echo -e "otsu\t"(_ "legacy Otsu thresholding")
    echo -e "adaptive-otsu\t"(_ "use adaptive Otsu thresholding")
    echo -e "sauvola\t"(_ "use Sauvola thresholding")
end
# -x: the option requires an argument and file completion is suppressed.
complete -c ocrmypdf -x -l tesseract-thresholding -a '(__fish_ocrmypdf_tesseract_thresholding)' -d "set tesseract thresholding method (needs Tesseract 5.x)"

# Tesseract tuning options. Entries with -x take a value (no file completion);
# the two *-downsample-large-images entries are plain switches and therefore
# carry neither -x nor -r.
complete -c ocrmypdf -x -l tesseract-timeout -d "maximum number of seconds to wait for OCR"
complete -c ocrmypdf -x -l tesseract-non-ocr-timeout -d "maximum seconds to wait for non-OCR operations"
complete -c ocrmypdf -l tesseract-downsample-large-images -d "downsample large images before OCR"
complete -c ocrmypdf -l no-tesseract-downsample-large-images -d "do not downsample large images"
complete -c ocrmypdf -x -l tesseract-downsample-above -d "downsample images larger than this pixel size"
complete -c ocrmypdf -x -l rotate-pages-threshold -d "page rotation confidence"

# -r: the option requires an argument but file completion stays enabled, which
# is what path-valued options need.
complete -c ocrmypdf -r -l user-words -d "specify location of user words file"
complete -c ocrmypdf -r -l user-patterns -d "specify location of user patterns file"
# Description fixed: it previously read "if file size if above ...".
complete -c ocrmypdf -x -l fast-web-view -d "if file size is above this amount in MB, linearize PDF"
# Plain switch: no argument, so neither -x nor -r.
complete -c ocrmypdf -l continue-on-soft-render-error -d "continue processing after recoverable render errors"
complete -c ocrmypdf -r -l plugin -d "name of plugin to import"

# Print the candidate values for --color-conversion-strategy as
# "value<TAB>description" lines.
function __fish_ocrmypdf_color_conversion_strategy
    echo -e "LeaveColorUnchanged\t"(_ "do not convert color spaces (default)")
    echo -e "CMYK\t"(_ "convert all color spaces to CMYK")
    echo -e "Gray\t"(_ "convert all color spaces to grayscale")
    echo -e "RGB\t"(_ "convert all color spaces to RGB")
    echo -e "UseDeviceIndependentColor\t"(_ "convert all color spaces to ICC-based color spaces")
end

# -x: the option requires an argument and file completion is suppressed.
complete -c ocrmypdf -x -l color-conversion-strategy -a '(__fish_ocrmypdf_color_conversion_strategy)' -d "set color conversion strategy"

# Succeed (status 0) when the command line typed so far already contains a
# token that is not an option and names an existing regular file; fail
# (status 1) otherwise.
function __fish_ocrmypdf_input_file_given
    # Tokens of the current process up to the cursor position.
    set -l tokens (commandline -opc)
    for token in $tokens
        # Skip anything that looks like an option.
        if string match -q -r '^-' -- $token
            continue
        end
        if test -f "$token"
            return 0
        end
    end
    return 1
end

# Offer *.pdf files as the input file, but only while the condition (-n)
# reports that no existing file has been given yet.
complete -c ocrmypdf -x -n 'not __fish_ocrmypdf_input_file_given' -a "(__fish_complete_suffix .pdf)" -d "input file"


================================================
FILE: misc/docker-compose.example.yml
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
---
version: "3.3"
services:
  ocrmypdf:
    restart: always
    container_name: ocrmypdf
    image: jbarlow83/ocrmypdf
    volumes:
      # "host path:container path" -- adjust the host side to your own folders.
      - "/media/scan:/input"
      - "/mnt/scan:/output"
    environment:
      # NOTE(review): presumably read by watcher.py to toggle year/month output
      # subfolders -- confirm against the watcher's options.
      - OCR_OUTPUT_DIRECTORY_YEAR_MONTH=0
    # Placeholder: replace with the user and group IDs the container should run as.
    user: "<SET TO YOUR USER ID>:<SET TO YOUR GROUP ID>"
    # Override the entrypoint so the container runs `python3 watcher.py`.
    entrypoint: python3
    command: watcher.py


================================================
FILE: misc/example_plugin.py
================================================
# SPDX-FileCopyrightText: 2022 James R Barlow: https://github.com/jbarlow83
# SPDX-License-Identifier: MIT

"""An example of an OCRmyPDF plugin.

This plugin adds two new command line arguments
    --grayscale-ocr: converts the image to grayscale before performing OCR on it
        (This is occasionally useful for images whose color confounds OCR. It only
        affects the image shown to OCR. The image is not saved.)
    --mono-page: converts all pages in the output file to black and white

To use this from the command line:
    ocrmypdf --plugin path/to/example_plugin.py --mono-page input.pdf output.pdf

To use this as an API:
    import ocrmypdf
    ocrmypdf.ocr('input.pdf', 'output.pdf',
        plugins=['path/to/example_plugin.py'], mono_page=True
    )
"""

from __future__ import annotations

import logging

from PIL import Image

from ocrmypdf import hookimpl

log = logging.getLogger(__name__)


@hookimpl
def add_options(parser):
    """Register this plugin's two boolean command line switches on *parser*."""
    for switch in ('--grayscale-ocr', '--mono-page'):
        parser.add_argument(switch, action='store_true')


@hookimpl
def prepare(options):
    """Hook placeholder: this example plugin does nothing in ``prepare``."""


@hookimpl
def validate(pdfinfo, options):
    """Hook placeholder: this example plugin performs no validation."""


@hookimpl
def filter_ocr_image(page, image):
    """Return the image to OCR, converted to grayscale if requested.

    When ``page.options.grayscale_ocr`` is set the image is converted to
    mode ``'L'``; otherwise it is returned unchanged.
    """
    if not page.options.grayscale_ocr:
        return image
    log.info("graying")
    return image.convert('L')


@hookimpl
def filter_page_image(page, image_filename):
    """Rewrite the page image and return the path of the file to use.

    With ``page.options.mono_page`` set, the image is converted to 1-bit
    (mode ``'1'``) and saved over the original file. Otherwise the image is
    re-saved next to the original with a ``.jpg`` suffix and that new path is
    returned.
    """
    if not page.options.mono_page:
        jpeg_path = image_filename.with_suffix('.jpg')
        with Image.open(image_filename) as source:
            source.save(jpeg_path)
        return jpeg_path

    with Image.open(image_filename) as source:
        bilevel = source.convert('1')
        bilevel.save(image_filename)
    return image_filename


================================================
FILE: misc/flatpak/io.ocrmypdf.ocrmypdf.metainfo.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<component type="console-application">
  <id>io.ocrmypdf.ocrmypdf</id>

  <name>OCRmyPDF</name>
  <summary>Adds an OCR text layer to scanned PDF files, allowing them to be searched</summary>

  <developer id="io.ocrmypdf">
      <name>OCRmyPDF Developers</name>
  </developer>

  <url type="homepage">https://github.com/ocrmypdf/ocrmypdf</url>
  <url type="bugtracker">https://github.com/ocrmypdf/OCRmyPDF/issues</url>

  <content_rating type="oars-1.1" />

  <metadata_license>CC0-1.0</metadata_license>
  <project_license>MPL-2.0</project_license>

  <description>
    <ul>
        <li>Generates a searchable PDF/A file from a regular PDF</li>
        <li>Places OCR text accurately below the image to ease copy / paste</li>
        <li>Keeps the exact resolution of the original embedded images</li>
        <li>When possible, inserts OCR information as a lossless operation without disrupting any other content</li>
        <li>Optimizes PDF images, often producing files smaller than the input file</li>
        <li>If requested, deskews and/or cleans the image before performing OCR</li>
        <li>Validates input and output files</li>
        <li>Distributes work across all available CPU cores</li>
        <li>Uses Tesseract OCR engine to recognize more than 100 languages</li>
        <li>Keeps your private data private</li>
        <li>Scales properly to handle files with thousands of pages</li>
        <li>Battle-tested on millions of PDFs</li>
    </ul>
  </description>

  <provides>
    <binary>ocrmypdf</binary>
  </provides>

  <icon type="stock">io.ocrmypdf.ocrmypdf</icon>

  <screenshots>
    <screenshot type="default">
      <image>https://raw.githubusercontent.com/ocrmypdf/OCRmyPDF/f7ad5f16bd0340b0b1803dada0c02f9f40542bd8/misc/flatpak/sample_screenshot.png</image>
      <caption>Sample usage of OCRmyPDF</caption>
    </screenshot>
  </screenshots>

  <categories>
    <category>Office</category>
    <category>Utility</category>
  </categories>

  <keywords>
    <keyword>ocr</keyword>
    <keyword>pdf</keyword>
    <keyword>tool</keyword>
  </keywords>

  <releases>
    <release version="16.8.0" date="2025-01-05"/>
  </releases>
</component>


================================================
FILE: misc/ocrmypdf_compare.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MIT

"""Run OCRmyPDF on the same PDF with different options."""

from __future__ import annotations

import os
import shlex
from io import BytesIO
from pathlib import Path
from subprocess import check_output, run
from tempfile import TemporaryDirectory

import pikepdf
import pymupdf
import streamlit as st
from lxml import etree
from streamlit_pdf_viewer import pdf_viewer


def do_column(label, suffix, d):
    """Render the input widgets for one comparison column.

    Shows a text area for the command line template and one for extra
    environment variables (``NAME=value`` per line), then displays the
    resulting environment and command.

    Args:
        label: Human-readable column label used in the widget captions.
        suffix: Distinguishes widget keys and the output file name
            (``output{suffix}.pdf``).
        d: Working directory that holds ``input.pdf`` and receives the output.

    Returns:
        A ``(env, args)`` tuple: the environment mapping and the argument list
        to execute.
    """
    template = st.text_area(
        f"Command line arguments for {label}",
        key=f"args{suffix}",
        value="ocrmypdf {in_} {out}",
    )
    env_text = st.text_area(f"Environment variables for {label}", key=f"env{suffix}")

    env = os.environ.copy()
    for entry in env_text.splitlines():
        if not entry:
            continue
        name, separator, value = entry.partition("=")
        if not separator:
            # Processing stops at the first malformed line.
            st.error(f"Invalid environment variable: {entry}")
            break
        env[name] = value

    input_path = os.path.join(d, "input.pdf")
    output_path = os.path.join(d, f"output{suffix}.pdf")
    args = shlex.split(template.format(in_=input_path, out=output_path))

    with st.expander("Environment variables", expanded=bool(env_text.strip())):
        st.code('\n'.join(f"{k}={v}" for k, v in env.items()))
    st.code(shlex.join(args))
    return env, args


def main():
    """Streamlit page: run two OCRmyPDF command lines on one PDF and compare.

    The uploaded PDF is written to a temporary directory, both user-supplied
    commands are executed on it, and the extracted text and rendered output of
    the two results are shown side by side.
    """
    st.set_page_config(layout="wide")

    st.title("OCRmyPDF Compare")
    st.write("Run OCRmyPDF on the same PDF with different options.")
    st.warning("This is a testing tool and is not intended for production use.")

    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_pdf is None:
        return

    pdf_bytes = uploaded_pdf.read()

    with pikepdf.open(BytesIO(pdf_bytes)) as p, TemporaryDirectory() as d:
        with st.expander("PDF Metadata"):
            with p.open_metadata() as meta:
                xml_txt = str(meta)
                parser = etree.XMLParser(remove_blank_text=True)
                tree = etree.fromstring(xml_txt, parser=parser)
                st.code(
                    etree.tostring(tree, pretty_print=True).decode("utf-8"),
                    language="xml",
                )
            st.write(p.docinfo)
            st.write("Number of pages:", len(p.pages))

        col1, col2 = st.columns(2)
        with col1:
            env1, args1 = do_column("A", "1", d)
        with col2:
            env2, args2 = do_column("B", "2", d)

        if not st.button("Execute and Compare"):
            return
        with st.spinner("Executing..."):
            Path(d, "input.pdf").write_bytes(pdf_bytes)
            run(args1, env=env1)
            run(args2, env=env2)

            col1, col2 = st.columns(2)
            with col1:
                st.text(
                    "Ghostscript version A: "
                    + check_output(
                        ["gs", "--version"],
                        env=env1,
                        text=True,
                    )
                )
            with col2:
                st.text(
                    "Ghostscript version B: "
                    + check_output(
                        ["gs", "--version"],
                        env=env2,
                        text=True,
                    )
                )

            # Open the results as context managers so the documents are closed
            # before the temporary directory is removed; previously they were
            # never closed, leaving handles open on files about to be deleted.
            with (
                pymupdf.open(os.path.join(d, "output1.pdf")) as doc1,
                pymupdf.open(os.path.join(d, "output2.pdf")) as doc2,
            ):
                # zip() stops at the shorter document, so only pages present
                # in both outputs are compared.
                for i, (page1, page2) in enumerate(zip(doc1, doc2, strict=False)):
                    st.write(f"Page {i+1}")
                    col1, col2 = st.columns(2)
                    with col1, st.container(border=True):
                        st.write(page1.get_text())
                    with col2, st.container(border=True):
                        st.write(page2.get_text())

            col1, col2 = st.columns(2)
            with col1, st.expander("PDF Viewer"):
                pdf_viewer(Path(d, "output1.pdf"))
            with col2, st.expander("PDF Viewer"):
                pdf_viewer(Path(d, "output2.pdf"))


if __name__ == "__main__":
    main()


================================================
FILE: misc/pdf_compare.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MIT

"""Compare two PDFs."""

from __future__ import annotations

import os
from io import BytesIO
from pathlib import Path
from tempfile import TemporaryDirectory

import pikepdf
import pymupdf
import streamlit as st
from lxml import etree
from streamlit_pdf_viewer import pdf_viewer


def do_metadata(pdf):
    """Show a PDF's XMP metadata, document info and page count in Streamlit.

    Args:
        pdf: Anything ``pikepdf.open`` accepts (the caller passes a BytesIO).
    """
    with pikepdf.open(pdf) as document:
        with document.open_metadata() as xmp:
            # Re-parse without blank text so pretty_print can re-indent the XML.
            root = etree.fromstring(
                str(xmp), parser=etree.XMLParser(remove_blank_text=True)
            )
            pretty_xml = etree.tostring(root, pretty_print=True).decode("utf-8")
            st.code(pretty_xml, language="xml")
        st.write(document.docinfo)
        st.write("Number of pages:", len(document.pages))


def main():
    """Streamlit page: upload two PDFs and compare them side by side.

    Shows metadata, per-page extracted text and a rendered viewer for each of
    the two uploaded files.
    """
    st.set_page_config(layout="wide")

    st.title("PDF Compare")
    st.write("Compare two PDFs.")

    col1, col2 = st.columns(2)
    with col1:
        uploaded_pdf1 = st.file_uploader("Upload a PDF", type=["pdf"], key='pdf1')
    with col2:
        uploaded_pdf2 = st.file_uploader("Upload a PDF", type=["pdf"], key='pdf2')
    if uploaded_pdf1 is None or uploaded_pdf2 is None:
        return

    pdf_bytes1 = uploaded_pdf1.getvalue()
    pdf_bytes2 = uploaded_pdf2.getvalue()

    with st.expander("PDF Metadata"):
        col1, col2 = st.columns(2)
        with col1:
            do_metadata(BytesIO(pdf_bytes1))
        with col2:
            do_metadata(BytesIO(pdf_bytes2))

    with TemporaryDirectory() as d:
        Path(d, "1.pdf").write_bytes(pdf_bytes1)
        Path(d, "2.pdf").write_bytes(pdf_bytes2)

        with st.expander("Text"):
            # Open the documents as context managers so they are closed before
            # the temporary directory is removed; previously they were never
            # closed, leaving handles open on files about to be deleted.
            with (
                pymupdf.open(os.path.join(d, "1.pdf")) as doc1,
                pymupdf.open(os.path.join(d, "2.pdf")) as doc2,
            ):
                # zip() stops at the shorter document, so only pages present
                # in both files are compared.
                for i, (page1, page2) in enumerate(zip(doc1, doc2, strict=False)):
                    st.write(f"Page {i+1}")
                    col1, col2 = st.columns(2)
                    with col1, st.container(border=True):
                        st.write(page1.get_text())
                    with col2, st.container(border=True):
                        st.write(page2.get_text())

        with st.expander("PDF Viewer"):
            col1, col2 = st.columns(2)
            with col1:
                pdf_viewer(Path(d, "1.pdf"), key='pdf_viewer1', render_text=True)
            with col2:
                pdf_viewer(Path(d, "2.pdf"), key='pdf_viewer2', render_text=True)


if __name__ == "__main__":
    main()


================================================
FILE: misc/pdf_text_diff.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Compare text in PDFs."""

from __future__ import annotations

from pathlib import Path
from subprocess import run
from tempfile import NamedTemporaryFile
from typing import Annotated

import cyclopts

app = cyclopts.App()


def _extract_text(engine: str, pdf: Path) -> bytes:
    """Return the text of *pdf* as written to stdout by ``engine -layout - -``."""
    with open(pdf, 'rb') as f:
        result = run(
            [engine, '-layout', '-', '-'],
            stdin=f,
            capture_output=True,
            check=True,
        )
    return result.stdout


@app.default
def main(
    pdf1: Annotated[Path, cyclopts.Parameter()],
    pdf2: Annotated[Path, cyclopts.Parameter()],
    *,
    engine: Annotated[str, cyclopts.Parameter()] = 'pdftotext',
):
    """Compare text in PDFs.

    Extracts the text of both files, shows a colored side-by-side diff in
    ``less`` and returns 1 if the texts differ (ignoring leading and trailing
    whitespace), 0 otherwise.

    Parameters
    ----------
    pdf1
        First PDF to compare.
    pdf2
        Second PDF to compare.
    engine
        Text extraction executable. It is invoked as ``engine -layout - -``
        and must therefore accept pdftotext-compatible arguments. This
        parameter was previously ignored in favor of a hard-coded
        ``pdftotext``.
    """
    text1 = _extract_text(engine, pdf1)
    text2 = _extract_text(engine, pdf2)

    with NamedTemporaryFile() as t1, NamedTemporaryFile() as t2:
        # flush() so that diff, which reopens the files by name, sees the data.
        t1.write(text1)
        t1.flush()
        t2.write(text2)
        t2.flush()
        # No check=True here: diff exits non-zero when the inputs differ.
        diff = run(
            ['diff', '--color=always', '--side-by-side', t1.name, t2.name],
            capture_output=True,
        )
        run(['less', '-R'], input=diff.stdout, check=True)
        if text1.strip() != text2.strip():
            return 1

    return 0


if __name__ == '__main__':
    app()


================================================
FILE: misc/screencast/README.md
================================================
<!-- SPDX-License-Identifier: CC-BY-SA-4.0 -->

To regenerate
=============

Using asciinema and svg-term (`npm install -g svg-term-cli`).

Create `~/.config/asciinema/config` to disable prompt.

```
[record]

command = fish --init-command 'alias fish_prompt="echo \>\ "'
```

Run asciinema

```
asciinema rec new_input.cast
```

Re-record faster version with fewer pauses

```
asciinema rec demo.cast -c "asciinema play new_input.cast --speed 2 --idle-time-limit 0.5"
```

Convert to SVG
```
svg-term --in=misc/screencast/demo.cast --out=misc/screencast/demo.svg --window
```


================================================
FILE: misc/screencast/demo.cast
================================================
{"version": 2, "width": 131, "height": 24, "timestamp": 1687247006, "env": {"SHELL": "/usr/bin/fish", "TERM": "xterm-256color"}}
[0.103649, "o", "\u001b[?2004h\u001b]7; \u0007"]
[0.104223, "o", "\u001b]0;fish  \u0007\u001b[30m\u001b(B\u001b[m\r> \u001b[K\r\u001b[C\u001b[C"]
[0.604542, "o", "o\r\u001b[3C\b\u001b[38;2;255;0;0mo\r\u001b[3C\u001b[30m\u001b(B\u001b[m\u001b[38;2;85;85;85mcrmypdf multipage.pdf multipage_with_ocr.pdf\r\u001b[3C\u001b[30m\u001b(B\u001b[m"]
[0.679571, "o", "\u001b[38;2;255;0;0mc\u001b[38;2;85;85;85mrmypdf multipage.pdf multipage_with_ocr.pdf\r\u001b[4C\u001b[30m\u001b(B\u001b[m"]
[0.767271, "o", "\u001b[38;2;255;0;0mr\u001b[38;2;85;85;85mmypdf multipage.pdf multipage_with_ocr.pdf\r\u001b[5C\u001b[30m\u001b(B\u001b[m"]
[0.814505, "o", "\u001b[38;2;255;0;0mm\u001b[38;2;85;85;85mypdf multipage.pdf multipage_with_ocr.pdf\r\u001b[6C\u001b[30m\u001b(B\u001b[m"]
[0.938919, "o", "\u001b[38;2;255;0;0my\u001b[38;2;85;85;85mpdf multipage.pdf multipage_with_ocr.pdf\r\u001b[7C\u001b[30m\u001b(B\u001b[m"]
[0.967347, "o", "\u001b[38;2;255;0;0mp\u001b[38;2;85;85;85mdf multipage.pdf multipage_with_ocr.pdf\r\u001b[8C\u001b[30m\u001b(B\u001b[m"]
[1.009954, "o", "\u001b[38;2;255;0;0md\u001b[38;2;85;85;85mf multipage.pdf multipage_with_ocr.pdf\r\u001b[9C\u001b[30m\u001b(B\u001b[m"]
[1.034488, "o", "\u001b[38;2;255;0;0mf\u001b[38;2;85;85;85m multipage.pdf multipage_with_ocr.pdf\r\u001b[10C\u001b[30m\u001b(B\u001b[m\b\b\b\b\b\b\b\b\u001b[38;2;0;95;215mocrmypdf\u001b[38;2;85;85;85m multipage.pdf multipage_with_ocr.pdf\r\u001b[10C\u001b[30m\u001b(B\u001b[m"]
[1.069226, "o", "\u001b[38;2;0;95;215m \u001b[38;2;85;85;85mmultipage.pdf multipage_with_ocr.pdf\r\u001b[11C\u001b[30m\u001b(B\u001b[m\b \u001b[38;2;85;85;85mmultipage.pdf multipage_with_ocr.pdf\r\u001b[11C\u001b[30m\u001b(B\u001b[m"]
[1.569682, "o", "-\u001b[K\r\u001b[12C\u001b[38;2;85;85;85m-version\r\u001b[12C\u001b[30m\u001b(B\u001b[m\b\u001b[38;2;0;175;255m-\u001b[38;2;85;85;85m-version\r\u001b[12C\u001b[30m\u001b(B\u001b[m"]
[1.642096, "o", "\u001b[38;2;0;175;255m-\u001b[38;2;85;85;85mversion\r\u001b[13C\u001b[30m\u001b(B\u001b[m"]
[1.71793, "o", "\u001b[38;2;0;175;255ms\u001b[30m\u001b(B\u001b[m\u001b[K\r\u001b[14C"]
[1.771483, "o", "\u001b[38;2;0;175;255mk\r\u001b[15C\u001b[30m\u001b(B\u001b[m"]
[1.864664, "o", "\u001b[38;2;0;175;255mi\r\u001b[16C\u001b[30m\u001b(B\u001b[m"]
[1.876085, "o", "\u001b[38;2;0;175;255mp\r\u001b[17C\u001b[30m\u001b(B\u001b[m"]
[2.092979, "o", "\u001b[38;2;0;175;255m-\r\u001b[18C\u001b[30m\u001b(B\u001b[m"]
[2.138821, "o", "\u001b[38;2;0;175;255mt\r\u001b[19C\u001b[30m\u001b(B\u001b[m"]
[2.18017, "o", "\u001b[38;2;0;175;255me\r\u001b[20C\u001b[30m\u001b(B\u001b[m"]
[2.268222, "o", "\u001b[38;2;0;175;255mx\r\u001b[21C\u001b[30m\u001b(B\u001b[m"]
[2.277031, "o", "\u001b[38;2;0;175;255mt\r\u001b[22C\u001b[30m\u001b(B\u001b[m"]
[2.322469, "o", "\u001b[38;2;0;175;255m \r\u001b[23C\u001b[30m\u001b(B\u001b[m\b \r\u001b[23C"]
[2.824696, "o", "m\r\u001b[24C\b\u001b[38;2;0;175;255m\u001b[4mm\r\u001b[24C\u001b[30m\u001b(B\u001b[m\u001b[38;2;85;85;85masks.pdf \r\u001b[24C\u001b[30m\u001b(B\u001b[m"]
[2.923234, "o", "\u001b[38;2;0;175;255m\u001b[4mu\u001b[30m\u001b(B\u001b[m\u001b[K\r\u001b[25C\u001b[38;2;85;85;85mltipage.pdf \r\u001b[25C\u001b[30m\u001b(B\u001b[m"]
[2.960685, "o", "\u001b[38;2;0;175;255m\u001b[4ml\u001b[38;2;85;85;85m\u001b[24mtipage.pdf \r\u001b[26C\u001b[30m\u001b(B\u001b[m"]
[3.03365, "o", "\u001b[38;2;0;175;255m\u001b[4mt\u001b[38;2;85;85;85m\u001b[24mipage.pdf \r\u001b[27C\u001b[30m\u001b(B\u001b[m"]
[3.479338, "o", "\u001b[38;2;0;175;255m\u001b[4mipage.pdf \r\u001b[37C\u001b[30m\u001b(B\u001b[m\b \r\u001b[37C"]
[3.754818, "o", "m\r\u001b[38C\b\u001b[38;2;0;175;255m\u001b[4mm\r\u001b[38C\u001b[30m\u001b(B\u001b[m\u001b[38;2;85;85;85masks.pdf \r\u001b[38C\u001b[30m\u001b(B\u001b[m"]
[3.873318, "o", "\u001b[38;2;0;175;255m\u001b[4mu\u001b[30m\u001b(B\u001b[m\u001b[K\r\u001b[39C\u001b[38;2;85;85;85mltipage.pdf \r\u001b[39C\u001b[30m\u001b(B\u001b[m"]
[3.926829, "o", "\u001b[38;2;0;175;255m\u001b[4ml\u001b[38;2;85;85;85m\u001b[24mtipage.pdf \r\u001b[40C\u001b[30m\u001b(B\u001b[m"]
[4.272251, "o", "\u001b[38;2;0;175;255m\u001b[4mtipage.pdf \r\u001b[51C\u001b[30m\u001b(B\u001b[m\b \r\u001b[51C"]
[4.343464, "o", "\r\u001b[50C"]
[4.416286, "o", "\r\u001b[49C"]
[4.490574, "o", "\r\u001b[48C"]
[4.564115, "o", "\r\u001b[47C"]
[4.630398, "o", "\r\u001b[46C"]
[4.76825, "o", "\u001b[38;2;0;175;255m\u001b[4m_.pd\u001b[30m\u001b(B\u001b[mf \r\u001b[47C\u001b[10D\u001b[38;2;0;175;255mmultipage_.pdf\u001b[30m\u001b(B\u001b[m \r\u001b[47C"]
[5.012506, "o", "\u001b[38;2;0;175;255mo.pd\u001b[30m\u001b(B\u001b[mf \r\u001b[48C\u001b[3C\u001b[38;2;0;175;255mf\u001b[30m\u001b(B\u001b[m \r\u001b[48C"]
[5.053615, "o", "\u001b[38;2;0;175;255mc.pd\u001b[30m\u001b(B\u001b[mf \r\u001b[49C\u001b[3C\u001b[38;2;0;175;255mf\u001b[30m\u001b(B\u001b[m \r\u001b[49C"]
[5.103957, "o", "\u001b[38;2;0;175;255mr.pd\u001b[30m\u001b(B\u001b[mf \r\u001b[50C\u001b[3C\u001b[38;2;0;175;255mf\u001b[30m\u001b(B\u001b[m \r\u001b[50C"]
[5.226183, "o", "\r\u001b[55C"]
[5.728321, "o", "\r\n\u001b[30m\u001b(B\u001b[m\u001b[?2004l\u001b]0;ocrmypdf --skip-text multipage.pdf multipage_ocr.pdf  /home/jb/src/ocrmypdf/tests/resources\u0007\u001b[30m\u001b(B\u001b[m\r"]
[5.801032, "o", "\rScanning contents:   0%|                                                                                   | 0/6 [00:00<?, ?page/s]"]
[5.802664, "o", "\rScanning contents: 100%|█████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1270.68page/s]\r\n"]
[5.802747, "o", "Start processing 6 pages concurrently\r\n"]
[5.803488, "o", "\rOCR:   0%|                                                                                             | 0.0/6.0 [00:00<?, ?page/s]"]
[5.804896, "o", "\r                                                                                                                                   \r    4 skipping all processing on this page\r\n\rOCR:   0%|                                                                                             | 0.0/6.0 [00:00<?, ?page/s]"]
[5.896969, "o", "\rOCR:  25%|█████████████████████▎                                                               | 1.5/6.0 [00:00<00:00,  8.12page/s]"]
[6.170021, "o", "\rOCR:  42%|███████████████████████████████████▍                                                 | 2.5/6.0 [00:00<00:01,  3.05page/s]"]
[6.292338, "o", "\rOCR:  58%|█████████████████████████████████████████████████▌                                   | 3.5/6.0 [00:00<00:00,  3.39page/s]"]
[6.586017, "o", "\rOCR:  75%|███████████████████████████████████████████████████████████████▊                     | 4.5/6.0 [00:01<00:00,  2.49page/s]"]
[7.087058, "o", "\rOCR:  92%|█████████████████████████████████████████████████████████████████████████████▉       | 5.5/6.0 [00:06<00:00,  1.98s/page]\rOCR: 100%|█████████████████████████████████████████████████████████████████████████████████████| 6.0/6.0 [00:06<00:00,  1.09s/page]\r\nPostprocessing...\r\n"]
[7.104927, "o", "\rPDF/A conversion:   0%|                                                                                    | 0/6 [00:00<?, ?page/s]"]
[7.607392, "o", "\rPDF/A conversion:  50%|██████████████████████████████████████                                      | 3/6 [00:01<00:01,  1.61page/s]"]
[7.653781, "o", "\rPDF/A conversion:  83%|███████████████████████████████████████████████████████████████▎            | 5/6 [00:01<00:00,  2.90page/s]"]
[7.774532, "o", "\rPDF/A conversion: 100%|████████████████████████████████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.71page/s]\r\n"]
[7.778252, "o", "\u001b[33mSome input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.\u001b[0m\r\n"]
[8.280789, "o", "\rRecompressing JPEGs: 0image [00:00, ?image/s]\rRecompressing JPEGs: 0image [00:00, ?image/s]\r\n\rDeflating JPEGs:   0%|                                                                                    | 0/4 [00:00<?, ?image/s]\rDeflating JPEGs: 100%|███████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 238.28image/s]\r\n"]
[8.28149, "o", "\rJBIG2: 0item [00:00, ?item/s]\rJBIG2: 0item [00:00, ?item/s]\r\n"]
[8.289998, "o", "Image optimization ratio: 1.01 savings: 1.3%\r\nTotal file size ratio: 1.02 savings: 1.6%\r\n"]
[8.291209, "o", "Output file is a PDF/A-2b (as expected)\r\n"]
[8.361316, "o", "\u001b[2m⏎\u001b(B\u001b[m                                                                                                                                  \r⏎ \r\u001b[K\u001b[?2004h\u001b]0;fish /home/jb/src/ocrmypdf/tests/resources\u0007\u001b[30m\u001b(B\u001b[m> \u001b[K\r\u001b[C\u001b[C"]
[8.862206, "o", "\r\n\u001b[30m\u001b(B\u001b[m\u001b[30m\u001b(B\u001b[m\u001b[?2004l"]


================================================
FILE: misc/synology.py
================================================
#!/bin/env python3
# SPDX-FileCopyrightText: 2017 Enantiomerie
# SPDX-License-Identifier: MIT

"""Example OCRmyPDF for Synology NAS."""

from __future__ import annotations

# This script must be edited to meet your needs.
import logging
import os
import shutil
import subprocess
import sys
import time

# pylint: disable=logging-format-interpolation
# pylint: disable=logging-not-lazy

# Usage: synology.py START_DIR ARCHIVE_DIR
#
# Walks START_DIR, runs every ``*.pdf`` through the OCRmyPDF docker image and
# moves the OCR result to ARCHIVE_DIR and the original to ARCHIVE_DIR/no_ocr.
# A timestamped log file is written next to this script.

script_dir = os.path.dirname(os.path.realpath(__file__))
timestamp = time.strftime("%Y-%m-%d-%H%M_")
log_file = script_dir + '/' + timestamp + 'ocrmypdf.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(message)s',
    filename=log_file,
    filemode='w',
)

# Fail before doing any work: the archive directory used to be read only after
# the first file had been OCRed, which raised IndexError half-way through.
if len(sys.argv) < 3:
    logging.error('Missing archive directory argument')
    sys.exit(f'usage: {sys.argv[0]} START_DIR ARCHIVE_DIR')

# Absolute paths keep everything independent of the working directory.
# os.walk() yields paths relative to its argument, and the previous
# os.chdir() into each directory invalidated them for relative start dirs.
start_dir = os.path.abspath(sys.argv[1])
full_path_ocr_archive = os.path.abspath(sys.argv[2])
full_path_archive = os.path.join(full_path_ocr_archive, 'no_ocr')
# Make sure both destinations are directories; otherwise shutil.move() would
# rename the first moved file to "no_ocr" instead of moving it into a folder.
os.makedirs(full_path_archive, exist_ok=True)

for dir_name, _subdirs, file_list in os.walk(start_dir):
    logging.info(dir_name)
    for filename in file_list:
        file_stem, file_ext = os.path.splitext(filename)
        if file_ext != '.pdf':
            continue
        full_path = os.path.join(dir_name, filename)
        timestamp_ocr = time.strftime("%Y-%m-%d-%H%M_OCR_")
        filename_ocr = timestamp_ocr + file_stem + '.pdf'
        # The PDF is streamed through the container via stdin/stdout ("- -"),
        # so no volume mounts are needed.
        # the script is processed as root user via cron
        cmd = [
            'docker',
            'run',
            '--rm',
            '-i',
            'jbarlow83/ocrmypdf',
            '--deskew',
            '-',
            '-',
        ]
        logging.info(cmd)
        full_path_ocr = os.path.join(dir_name, filename_ocr)
        with (
            open(full_path, 'rb') as input_file,
            open(full_path_ocr, 'wb') as output_file,
        ):
            proc = subprocess.run(
                cmd,
                stdin=input_file,
                stdout=output_file,
                stderr=subprocess.PIPE,
                check=False,
                text=True,
                errors='ignore',
            )
        logging.info(proc.stderr)
        if proc.returncode != 0:
            # Do not archive an empty/partial result, and leave the original
            # where it is so it can be retried.
            logging.error(
                'OCR failed for %s (exit code %d); file left in place',
                full_path,
                proc.returncode,
            )
            os.remove(full_path_ocr)
            continue
        os.chmod(full_path_ocr, 0o664)
        os.chmod(full_path, 0o664)
        shutil.move(full_path_ocr, full_path_ocr_archive)
        shutil.move(full_path, full_path_archive)
logging.info('Finished.\n')


================================================
FILE: misc/watcher.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2019 Ian Alexander <https://github.com/ianalexander>
# SPDX-FileCopyrightText: 2020 James R Barlow <https://github.com/jbarlow83>
# SPDX-License-Identifier: MIT

"""Watch a directory for new PDFs and OCR them."""

from __future__ import annotations

import datetime as dt
import json
import logging
import shutil
import sys
import time
from enum import Enum
from pathlib import Path
from typing import Annotated, Any

import cyclopts
import pikepdf
from dotenv import load_dotenv
from watchdog.events import PatternMatchingEventHandler
from watchdog.observers import Observer
from watchdog.observers.polling import PollingObserver

import ocrmypdf

# Runs at import time, before the CLI application object below is created.
# (python-dotenv: reads variables from a .env file, if one is found.)
load_dotenv()


# pylint: disable=logging-format-interpolation
# Command line application object for this script.
app = cyclopts.App(name="ocrmypdf-watcher")

# Logger shared by all functions in this script.
log = logging.getLogger('ocrmypdf-watcher')


class LoggingLevelEnum(str, Enum):
    """Names of the standard logging levels, usable as plain strings.

    Mixing in ``str`` makes each member compare equal to its own name.
    """

    # Ordered from most to least verbose.
    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"


def get_output_path(root: Path, basename: str, output_dir_year_month: bool) -> Path:
    """Build the path of the OCR output file for an input named *basename*.

    Args:
        root: Base output directory.
        basename: File name of the input (no directory part); its suffix is
            replaced with ``.pdf``.
        output_dir_year_month: When true, place the file in a
            ``<root>/<year>/<two-digit month>`` subdirectory based on today's
            date, creating that directory if it does not exist.

    Returns:
        The output path. Without year/month folders it is directly below
        *root*.
    """
    assert '/' not in basename, "basename must not contain '/'"
    if not output_dir_year_month:
        return root / Path(basename).with_suffix('.pdf')

    now = dt.datetime.today()
    dated_dir = root / str(now.year) / f'{now.month:02d}'
    if not dated_dir.exists():
        dated_dir.mkdir(parents=True, exist_ok=True)
    return Path(dated_dir) / Path(basename).with_suffix('.pdf')


def wait_for_file_ready(
    file_path: Path, poll_new_file_seconds: int, retries_loading_file: int
) -> bool:
    """Wait until *file_path* can be opened as a PDF.

    Docker sometimes publishes the watchdog event before the file is fully on
    disk, which makes pikepdf fail; this function polls until pikepdf can open
    the file.

    Args:
        file_path: The PDF to check.
        poll_new_file_seconds: Seconds to sleep after each failed attempt.
        retries_loading_file: Number of retries after the first attempt, so
            the file is opened at most ``retries_loading_file + 1`` times.
            A negative value results in no attempt at all.

    Returns:
        True as soon as the file opens successfully, False if every attempt
        failed.
    """
    # range() also bounds the loop for negative retry counts; the previous
    # countdown only stopped on reaching exactly zero.
    for _attempt in range(retries_loading_file + 1):
        try:
            with pikepdf.Pdf.open(file_path) as pdf:
                # Log the page count, not the page list object itself.
                log.debug(f"{file_path} ready with {len(pdf.pages)} pages")
                return True
        except OSError as e:  # includes FileNotFoundError
            log.info(f"File {file_path} is not ready yet")
            log.debug("Exception was", exc_info=e)
        except pikepdf.PdfError as e:
            log.info(f"File {file_path} is not fully written yet")
            log.debug("Exception was", exc_info=e)
        time.sleep(poll_new_file_seconds)

    return False


def execute_ocrmypdf(
    *,
    file_path: Path,
    archive_dir: Path,
    output_dir: Path,
    ocrmypdf_kwargs: dict[str, Any],
    on_success_delete: bool,
    on_success_archive: bool,
    poll_new_file_seconds: int,
    retries_loading_file: int,
    output_dir_year_month: bool,
):
    """Run OCRmyPDF on one newly detected file and post-process the input.

    Args:
        file_path: Input file that triggered the run.
        archive_dir: Directory the input is moved to when archiving.
        output_dir: Base directory for the OCRed output.
        ocrmypdf_kwargs: Extra keyword arguments for ``ocrmypdf.OcrOptions``.
        on_success_delete: Delete the input after a successful run. Takes
            precedence over ``on_success_archive``.
        on_success_archive: Move the input to ``archive_dir`` after a
            successful run.
        poll_new_file_seconds: Seconds between readiness checks of the input.
        retries_loading_file: Extra readiness checks before giving up.
        output_dir_year_month: Put the output in a year/month subdirectory.
    """
    output_path = get_output_path(output_dir, file_path.name, output_dir_year_month)

    log.info("-" * 20)
    log.info(f'New file: {file_path}. Waiting until fully written...')
    if not wait_for_file_ready(file_path, poll_new_file_seconds, retries_loading_file):
        log.info(f"Gave up waiting for {file_path} to become ready")
        return
    log.info(f'Attempting to OCRmyPDF to: {output_path}')

    log.debug(
        f'OCRmyPDF input_file={file_path} output_file={output_path} '
        f'kwargs: {ocrmypdf_kwargs}'
    )
    exit_code = ocrmypdf.ocr(
        ocrmypdf.OcrOptions(
            input_file=file_path,
            output_file=output_path,
            **ocrmypdf_kwargs,
        )
    )

    if exit_code != 0:
        # Never delete or archive the input unless the run succeeded, and make
        # the log distinguishable from a successful run.
        log.warning(
            f'OCR finished with exit code {exit_code}; leaving {file_path} in place'
        )
        return

    if on_success_delete:
        log.info(f'OCR is done. Deleting: {file_path}')
        file_path.unlink()
    elif on_success_archive:
        log.info(f'OCR is done. Archiving {file_path.name} to {archive_dir}')
        shutil.move(file_path, Path(archive_dir) / file_path.name)
    else:
        log.info('OCR is done')


class HandleObserverEvent(PatternMatchingEventHandler):
    """Event handler that starts an OCR run for each newly created file."""

    def __init__(  # noqa: D107
        self,
        patterns=None,
        ignore_patterns=None,
        ignore_directories=False,
        case_sensitive=False,
        settings=None,
    ):
        super().__init__(
            patterns=patterns,
            ignore_patterns=ignore_patterns,
            ignore_directories=ignore_directories,
            case_sensitive=case_sensitive,
        )
        # Keyword arguments handed to execute_ocrmypdf() for every new file.
        self._settings = settings or {}

    def on_any_event(self, event):
        """Run OCR for 'created' events and ignore every other event type."""
        if event.event_type != 'created':
            return
        execute_ocrmypdf(file_path=Path(event.src_path), **self._settings)


@app.default
def main(
    input_dir: Annotated[
        Path,
        cyclopts.Parameter(
            env_var='OCR_INPUT_DIRECTORY',
        ),
    ] = Path('/input'),
    output_dir: Annotated[
        Path,
        cyclopts.Parameter(
            env_var='OCR_OUTPUT_DIRECTORY',
        ),
    ] = Path('/output'),
    archive_dir: Annotated[
        Path,
        cyclopts.Parameter(
            env_var='OCR_ARCHIVE_DIRECTORY',
        ),
    ] = Path('/processed'),
    *,
    output_dir_year_month: Annotated[
        bool,
        cyclopts.Parameter(
            env_var='OCR_OUTPUT_DIRECTORY_YEAR_MONTH',
            help='Create a subdirectory in the output directory for each year/month',
        ),
    ] = False,
    on_success_delete: Annotated[
        bool,
        cyclopts.Parameter(
            env_var='OCR_ON_SUCCESS_DELETE',
            help='Delete the input file after successful OCR',
        ),
    ] = False,
    on_success_archive: Annotated[
        bool,
        cyclopts.Parameter(
            env_var='OCR_ON_SUCCESS_ARCHIVE',
            help='Archive the input file after successful OCR',
        ),
    ] = False,
    deskew: Annotated[
        bool,
        cyclopts.Parameter(
            env_var='OCR_DESKEW',
            help='Deskew the input file before OCR',
        ),
    ] = False,
    ocr_json_settings: Annotated[
        str | None,
        cyclopts.Parameter(
            env_var='OCR_JSON_SETTINGS',
            help='JSON settings to pass to OCRmyPDF (JSON string or file path)',
        ),
    ] = None,
    poll_new_file_seconds: Annotated[
        int,
        cyclopts.Parameter(
            env_var='OCR_POLL_NEW_FILE_SECONDS',
            help='Seconds to wait before polling a new file',
        ),
    ] = 1,
    use_polling: Annotated[
        bool,
        cyclopts.Parameter(
            env_var='OCR_USE_POLLING',
            help='Use polling instead of filesystem events',
        ),
    ] = False,
    retries_loading_file: Annotated[
        int,
        cyclopts.Parameter(
            env_var='OCR_RETRIES_LOADING_FILE',
            help='Number of times to retry loading a file before giving up',
        ),
    ] = 5,
    loglevel: Annotated[
        LoggingLevelEnum,
        cyclopts.Parameter(
            env_var='OCR_LOGLEVEL',
            help='Logging level',
        ),
    ] = LoggingLevelEnum.INFO,
    patterns: Annotated[
        str,
        cyclopts.Parameter(
            env_var='OCR_PATTERNS',
            help='File patterns to watch',
        ),
    ] = '*.pdf,*.PDF',
):
    """Watch a directory and run OCRmyPDF on every new file that appears.

    Args:
        input_dir: Directory that is watched (recursively) for new files.
        output_dir: Directory that receives the OCRed PDFs.
        archive_dir: Directory that input files are moved to when archiving
            after successful OCR is enabled.
    """
    ocrmypdf.configure_logging(
        verbosity=(
            ocrmypdf.Verbosity.default
            if loglevel != LoggingLevelEnum.DEBUG
            else ocrmypdf.Verbosity.debug
        ),
        manage_root_logger=True,
    )
    log.setLevel(loglevel.value)
    log.info(
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {input_dir}\n"
        f"Output Directory: {output_dir}\n"
        f"Output Directory Year & Month: {output_dir_year_month}\n"
        f"Archive Directory: {archive_dir}"
    )
    log.info(
        f"INPUT_DIRECTORY: {input_dir}\n"
        f"OUTPUT_DIRECTORY: {output_dir}\n"
        f"ARCHIVE_DIRECTORY: {archive_dir}\n"
        f"OUTPUT_DIRECTORY_YEAR_MONTH: {output_dir_year_month}\n"
        f"ON_SUCCESS_DELETE: {on_success_delete}\n"
        f"ON_SUCCESS_ARCHIVE: {on_success_archive}\n"
        f"DESKEW: {deskew}\n"
        f"ARGS: {ocr_json_settings}\n"
        f"POLL_NEW_FILE_SECONDS: {poll_new_file_seconds}\n"
        f"RETRIES_LOADING_FILE: {retries_loading_file}\n"
        f"USE_POLLING: {use_polling}\n"
        f"LOGLEVEL: {loglevel.value}"
    )

    # The settings are either inline JSON or the path of a JSON file. Probing
    # an inline JSON string as a path can itself fail (e.g. "file name too
    # long"), which simply means it is not a file.
    settings_path = Path(ocr_json_settings) if ocr_json_settings else None
    try:
        settings_from_file = settings_path is not None and settings_path.is_file()
    except (OSError, ValueError):
        settings_from_file = False

    try:
        if settings_from_file:
            json_settings = json.loads(settings_path.read_text(encoding='utf-8'))
        else:
            json_settings = json.loads(ocr_json_settings or '{}')
    except json.JSONDecodeError as e:
        log.error(f'OCR_JSON_SETTINGS (--ocr-json-settings) is not valid JSON: {e}')
        sys.exit(1)

    # Anything but a JSON object cannot be expanded into keyword arguments.
    if not isinstance(json_settings, dict):
        log.error('OCR_JSON_SETTINGS (--ocr-json-settings) must be a JSON object')
        sys.exit(1)

    if 'input_file' in json_settings or 'output_file' in json_settings:
        log.error(
            'OCR_JSON_SETTINGS (--ocr-json-settings) may not specify input/output file'
        )
        sys.exit(1)

    # Tolerate spaces and stray commas in the list, e.g. "*.pdf, *.PDF".
    watch_patterns = [p.strip() for p in patterns.split(',') if p.strip()]

    handler = HandleObserverEvent(
        patterns=watch_patterns,
        settings={
            'archive_dir': archive_dir,
            'output_dir': output_dir,
            # NOTE: the deskew option always wins over a "deskew" key in the
            # JSON settings because it is the right-hand operand of the merge.
            'ocrmypdf_kwargs': json_settings | {'deskew': deskew},
            'on_success_delete': on_success_delete,
            'on_success_archive': on_success_archive,
            'poll_new_file_seconds': poll_new_file_seconds,
            'retries_loading_file': retries_loading_file,
            'output_dir_year_month': output_dir_year_month,
        },
    )
    observer = PollingObserver() if use_polling else Observer()
    observer.schedule(handler, input_dir, recursive=True)
    observer.start()
    print(f"Watching {input_dir} for new PDFs. Press Ctrl+C to exit.")
    try:
        # The observer works in the background; this thread only waits for Ctrl+C.
        while True:
            time.sleep(30)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()


if __name__ == "__main__":
    app()


================================================
FILE: misc/webservice.py
================================================
#!/usr/bin/env python
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: AGPL-3.0-or-later

"""Run the OCRmyPDF web service."""

from __future__ import annotations

import os
import sys

# Fail early with an actionable message when streamlit is not installed.
# The imported module is not used in this file (hence the noqa), and
# "from None" hides the original ImportError traceback.
try:
    import streamlit  # noqa: F401
except ImportError:
    raise ImportError(
        'You need to install streamlit in the Python environment '
        'to run the web service.\n'
    ) from None

if __name__ == '__main__':
    # Locate the Streamlit app next to this launcher, so the script works from
    # any working directory and not only from the repository root.
    app_script = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), '_webservice.py'
    )
    # Replace this process with `python -m streamlit run <app> [extra args...]`.
    os.execvp(
        sys.executable,
        [
            sys.executable,
            '-m',
            'streamlit',
            'run',
            app_script,
            *sys.argv[1:],
        ],
    )


================================================
FILE: pyproject.toml
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "ocrmypdf"
version = "17.3.0"
description = "OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched"
readme = "README.md"
license = "MPL-2.0"
requires-python = ">=3.11"
dependencies = [
  "deprecation>=2.1.0",
  "fpdf2>=2.8.0",
  "img2pdf>=0.5",
  "packaging>=20",
  "pdfminer.six>=20220319",
  "pi-heif",                # Heif image format - maintainers: if this is removed, it will NOT break
  "pikepdf>=10",
  "Pillow>=10.0.1",
  "pluggy>=1",
  "pydantic>=2.12.5",
  "pypdfium2>=5.0.0",
  "rich>=13",
  "uharfbuzz>=0.53.2",
]
authors = [{ name = "James R. Barlow", email = "james@purplerock.ca" }]
classifiers = [
  "Development Status :: 5 - Production/Stable",
  "Environment :: Console",
  "Intended Audience :: End Users/Desktop",
  "Intended Audience :: Science/Research",
  "Intended Audience :: System Administrators",
  "Operating System :: MacOS",
  "Operating System :: Microsoft :: Windows",
  "Operating System :: POSIX",
  "Operating System :: POSIX :: BSD",
  "Operating System :: POSIX :: Linux",
  "Programming Language :: Python :: 3",
  "Topic :: Scientific/Engineering :: Image Recognition",
  "Topic :: Text Processing :: Indexing",
  "Topic :: Text Processing :: Linguistic",
]
keywords = ["PDF", "OCR", "optical character recognition", "PDF/A", "scanning"]

[project.urls]
Documentation = "https://ocrmypdf.readthedocs.io/"
Source = "https://github.com/ocrmypdf/OCRmyPDF"
Tracker = "https://github.com/ocrmypdf/OCRmyPDF/issues"
Changelog = "https://github.com/ocrmypdf/OCRmyPDF/tree/main/docs/releasenotes"

[project.optional-dependencies]
# User-installable features - use `uv sync --extra <name>` or `pip install ocrmypdf[name]`
watcher = ["watchdog>=1.0.2", "cyclopts>=3", "python-dotenv"]
webservice = ["streamlit>=1.41.0"]

[project.scripts]
ocrmypdf = "ocrmypdf.__main__:run"

[tool.distutils.bdist_wheel]
python-tag = "py311"

[tool.coverage.run]
branch = true
parallel = true
concurrency = ["multiprocessing", "thread"]
sigterm = true

[tool.coverage.paths]
source = ["src/ocrmypdf"]

[tool.coverage.report]
# Regexes for lines to exclude from consideration
exclude_lines = [
  # Have to re-enable the standard pragma
  "pragma: no cover",
  # Don't complain if tests don't hit defensive assertion code:
  "raise AssertionError",
  "raise NotImplementedError",
  # Don't complain if non-runnable code isn't run:
  "if 0:",
  "if False:",
  "if __name__ == .__main__.:",
  "if TYPE_CHECKING:",
]

[tool.pytest.ini_options]
minversion = "6.0"
testpaths = ["tests"]
addopts = "-n auto"
markers = ["slow"]
filterwarnings = [
  "ignore:.*XMLParser.*:DeprecationWarning",
  "ignore:.*ast.NameConstant.*:DeprecationWarning:reportlab",
  "ignore:.*distutils.*:DeprecationWarning:libxmp",
]

[tool.mypy]

[[tool.mypy.overrides]]
module = [
  'pluggy',
  'img2pdf',
  'pdfminer.*',
  'reportlab.*',
  'fitz',
  'libxmp.utils',
]
ignore_missing_imports = true

[tool.ruff]
target-version = "py311"
exclude = ["src/ocrmypdf/_version.py"] # Autogenerated

[tool.ruff.lint]
"select" = [
  "D",   # pydocstyle
  "E",   # pycodestyle
  "W",   # pycodestyle
  "F",   # pyflakes
  "I",   # isort
  "UP",  # pyupgrade
  "SIM", # simplify
  "B",   # flake8-bugbear
  "ICN", # flake8-import-conventions
]
ignore = [
  "B028", # warning with no explicit stacklevel
  # rule is key in dict instead of key in dict.keys(); but pikepdf semantics differ
  "SIM118",
]

[tool.ruff.lint.isort]
known-first-party = ["ocrmypdf"]
required-imports = ["from __future__ import annotations"]

[tool.ruff.lint.flake8-import-conventions]
# Prohibit explicit imports from the 'datetime' module
banned-from = ["datetime"]
# Optionally, suggest an alias for 'import datetime' (e.g., as dt)
extend-aliases = { "datetime" = "dt" }

[tool.ruff.lint.pydocstyle]
convention = "google"

[tool.ruff.lint.per-file-ignores]
"docs/conf.py" = ["D100", "D101", "D105"]
"tests/*.py" = ["D100", "D101", "D102", "D103", "D105", "E501"]
"misc/*.py" = ["D103", "D101", "D102"]
"src/ocrmypdf/builtin_plugins/*.py" = ["D103", "D102", "D105"]

[tool.ruff.format]
quote-style = "preserve"

[dependency-groups]
# Developer-only tools - use `uv sync --group <name>`
dev = ["mypy>=1.13.0", "ipykernel>=6.29.5", "reportlab>=4.4.4"]
test = [
  # Core testing framework
  "coverage[toml]>=6.2",
  "hypothesis>=6.36.0",
  "pytest>=6.2.5",
  "pytest-cov>=3.0.0",
  "pytest-xdist>=2.5.0",
  # Test dependencies
  "python-xmp-toolkit==2.0.1", # also requires apt-get install libexempi3
  "reportlab>=3.6.8",
  # Type stubs for testing
  "types-Pillow",
  "types-humanfriendly",
  # Extended test capabilities (merged from extended_test)
  "pymupdf>=1.24.14",
]
docs = [
  "myst-parser>=4.0.1",
  "sphinx",
  "sphinx-issues",
  "sphinx-reredirects",
  "sphinx-rtd-theme",
  "sphinxcontrib-mermaid",
]
streamlit-dev = ["streamlit>=1.40.2", "streamlit-pdf-viewer>=0.0.19"]


================================================
FILE: scripts/generate_glyphless_font.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Generate the Occulta glyphless font for OCRmyPDF.

Occulta (Latin for "hidden") is a glyphless font designed for invisible text layers
in searchable PDFs. It has proper Unicode cmap coverage using format 13 (many-to-one)
for efficient mapping of all BMP codepoints to a small set of width-specific glyphs.

Features:
- Full BMP coverage (U+0000 to U+FFFF)
- Width-aware glyphs for proper text selection:
  - Zero-width for combining marks and invisible characters
  - Regular width (500 units) for Latin, Greek, Cyrillic, Arabic, Hebrew, etc.
  - Double width (1000 units) for CJK and fullwidth characters
- Uses cmap format 13 (many-to-one) for ~12KB size vs ~780KB with format 12
- Compatible with fpdf2 and other modern PDF libraries

Usage:
    python scripts/generate_glyphless_font.py

Output:
    src/ocrmypdf/data/Occulta.ttf
"""

from __future__ import annotations

import unicodedata
from pathlib import Path

from fontTools.fontBuilder import FontBuilder
from fontTools.ttLib import TTFont
from fontTools.ttLib.tables._c_m_a_p import CmapSubtable
from fontTools.ttLib.tables._g_l_y_f import Glyph

# Output path relative to this script
OUTPUT_PATH = Path(__file__).parent.parent / "src" / "ocrmypdf" / "data" / "Occulta.ttf"

# Font metrics (units per em = 1000)
UNITS_PER_EM = 1000
ASCENT = 800
DESCENT = -200  # negative: below the baseline; create_font() passes abs() as usWinDescent

# Glyph definitions: (name, advance_width, left_side_bearing)
# Every entry becomes one empty (contour-less) glyph in create_font(); the
# list order is also used as the font's glyph order.
GLYPHS = [
    (".notdef", 500, 0),  # Required, used for unmapped characters
    ("space", 500, 0),  # U+0020 SPACE
    ("nbspace", 500, 0),  # U+00A0 NO-BREAK SPACE
    ("blank0", 0, 0),  # Zero-width (combining marks, ZWNJ, ZWJ, BOM)
    ("blank1", 500, 0),  # Regular width (most scripts)
    ("blank2", 1000, 0),  # Double width (CJK, fullwidth)
]

# Explicit zero-width character codepoints
# classify_codepoint() maps these to "blank0" before it consults any Unicode
# property of the character.
ZERO_WIDTH_CHARS = frozenset(
    [
        0x200B,  # ZERO WIDTH SPACE
        0x200C,  # ZERO WIDTH NON-JOINER
        0x200D,  # ZERO WIDTH JOINER
        0xFEFF,  # ZERO WIDTH NO-BREAK SPACE (BOM)
        0x200E,  # LEFT-TO-RIGHT MARK
        0x200F,  # RIGHT-TO-LEFT MARK
        0x202A,  # LEFT-TO-RIGHT EMBEDDING
        0x202B,  # RIGHT-TO-LEFT EMBEDDING
        0x202C,  # POP DIRECTIONAL FORMATTING
        0x202D,  # LEFT-TO-RIGHT OVERRIDE
        0x202E,  # RIGHT-TO-LEFT OVERRIDE
        0x2060,  # WORD JOINER
        0x2061,  # FUNCTION APPLICATION
        0x2062,  # INVISIBLE TIMES
        0x2063,  # INVISIBLE SEPARATOR
        0x2064,  # INVISIBLE PLUS
    ]
)


def classify_codepoint(codepoint: int) -> str:
    """Classify a Unicode codepoint into one of our glyph categories.

    Args:
        codepoint: Unicode codepoint (0x0000 to 0xFFFF)

    Returns:
        Glyph name to map this codepoint to

    Raises:
        ValueError: If ``codepoint`` is outside the range accepted by
            ``chr()``.
    """
    # Codepoints that have a dedicated glyph or an explicit override
    if codepoint == 0x0020:
        return "space"
    if codepoint == 0x00A0:
        return "nbspace"
    if codepoint in ZERO_WIDTH_CHARS:
        return "blank0"

    # Use Unicode properties for the rest. These lookups cannot fail for a
    # single character, so no exception handling is needed here.
    char = chr(codepoint)

    # Combining marks (general categories Mn, Mc, Me) are zero-width
    if unicodedata.category(char).startswith("M"):
        return "blank0"

    # Wide and Fullwidth characters are double-width
    if unicodedata.east_asian_width(char) in ("W", "F"):
        return "blank2"

    # Everything else is regular width
    return "blank1"


def build_cmap() -> dict[int, str]:
    """Map every BMP codepoint (U+0000 to U+FFFF) to the glyph it should use.

    Returns:
        Dictionary keyed by codepoint whose values are glyph names
    """
    mapping: dict[int, str] = {}
    for codepoint in range(0x10000):
        mapping[codepoint] = classify_codepoint(codepoint)
    return mapping


def create_font() -> TTFont:
    """Create the Occulta glyphless font.

    The font contains only the empty glyphs listed in ``GLYPHS`` and a single
    format 13 cmap subtable that maps every BMP codepoint to one of them.

    Returns:
        TTFont object ready to be saved
    """
    # Local import: gives "now" in the epoch the head table expects
    # (seconds since 1904-01-01), unlike time.time() (seconds since 1970).
    from fontTools.misc.timeTools import timestampNow

    glyph_names = [g[0] for g in GLYPHS]

    # Start building the font
    fb = FontBuilder(UNITS_PER_EM, isTTF=True)
    fb.setupGlyphOrder(glyph_names)

    # Create empty (invisible) glyphs
    glyphs = {}
    for name in glyph_names:
        glyph = Glyph()
        glyph.numberOfContours = 0
        glyphs[name] = glyph
    fb.setupGlyf(glyphs)

    # Set up horizontal metrics
    metrics = {name: (width, lsb) for name, width, lsb in GLYPHS}
    fb.setupHorizontalMetrics(metrics)

    # Minimal cmap to satisfy FontBuilder (we'll replace it later)
    fb.setupCharacterMap({0x0020: "space", 0x00A0: "nbspace"})

    # Set up other required tables
    fb.setupHorizontalHeader(ascent=ASCENT, descent=DESCENT)
    fb.setupOS2(
        sTypoAscender=ASCENT,
        sTypoDescender=DESCENT,
        sTypoLineGap=0,
        usWinAscent=UNITS_PER_EM,
        usWinDescent=abs(DESCENT),
        sxHeight=500,
        sCapHeight=700,
    )

    # Use current time for font timestamps
    now = timestampNow()
    fb.setupHead(unitsPerEm=UNITS_PER_EM, created=now, modified=now)
    fb.setupPost()
    fb.setupNameTable(
        {
            "familyName": "Occulta",
            "styleName": "Regular",
            "uniqueFontIdentifier": "OCRmyPDF;Occulta-Regular;2026",
            "fullName": "Occulta Regular",
            "version": "Version 2.0",
            "psName": "Occulta-Regular",
        }
    )

    # Build the font
    font = fb.font

    # Now replace the cmap with format 13 for efficient many-to-one mapping
    char_to_glyph = build_cmap()

    cmap13 = CmapSubtable.newSubtable(13)
    cmap13.platformID = 3  # Windows
    cmap13.platEncID = 10  # Unicode full repertoire
    cmap13.language = 0
    cmap13.cmap = char_to_glyph

    font["cmap"].tables = [cmap13]

    return font


def main() -> None:
    """Build Occulta, write it to OUTPUT_PATH and print a short report."""
    print("Generating Occulta glyphless font...")

    built = create_font()

    # The data directory may not exist yet
    target_dir = OUTPUT_PATH.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    # Write the font file
    built.save(str(OUTPUT_PATH))
    built.close()

    # Tell the user where the file went and how big it is
    byte_count = OUTPUT_PATH.stat().st_size
    print(f"Saved to: {OUTPUT_PATH}")
    print(f"Size: {byte_count:,} bytes")

    # Re-open the saved file and list its cmap subtables as a sanity check
    reloaded = TTFont(str(OUTPUT_PATH))
    for subtable in reloaded["cmap"].tables:
        summary = (
            f"cmap: Platform {subtable.platformID}, "
            f"Encoding {subtable.platEncID}, "
            f"Format {subtable.format}, "
            f"{len(subtable.cmap)} mappings"
        )
        print(summary)
    reloaded.close()

    print("Done!")


if __name__ == "__main__":
    main()


================================================
FILE: snapcraft.yaml
================================================
# SPDX-FileCopyrightText: 2022 Alexander Langanke
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-FileCopyrightText: 2023 林博仁(Buo-ren, Lin) <Buo.Ren.Lin@gmail.com>
# SPDX-License-Identifier: MPL-2.0

name: ocrmypdf
title: OCRmyPDF
base: core24
version: git
summary: OCRmyPDF adds a searchable text layer to scanned PDF files
description: OCRmyPDF packaged for snap
grade: stable
confinement: strict
icon: docs/images/logo-square-256.svg
license: MPL-2.0

platforms:
  amd64:

environment:
  TESSDATA_PREFIX: $SNAP/usr/share/tesseract-ocr/5/tessdata
  GS_LIB: $SNAP/usr/share/ghostscript/10.02.1/Resource/Init
  GS_FONTPATH: $SNAP/usr/share/ghostscript/10.02.1/Resource/Font
  LD_LIBRARY_PATH: $SNAP/usr/lib/x86_64-linux-gnu

apps:
  ocrmypdf:
    command: usr/bin/snapcraft-preload python3 -m ocrmypdf
    plugs:
      - desktop
      - desktop-legacy
      - wayland
      - x11
      - home
      - removable-media

parts:
  snapcraft-preload:
    source: https://github.com/sergiusens/snapcraft-preload.git
    plugin: cmake
    cmake-parameters:
      - -DCMAKE_INSTALL_PREFIX=/usr -DLIBPATH=/usr/lib
    build-packages:
      - on amd64:
          - gcc-multilib
          - g++-multilib
    stage-packages:
      - lib32stdc++6

  jbig2enc:
    plugin: autotools
    source: https://github.com/agl/jbig2enc.git
    source-tag: "0.29"
    build-packages:
      - libleptonica-dev

  ocrmypdf:
    plugin: python
    source: .

    build-packages:
      - python3-pip

    stage-packages:
      - ghostscript
      - icc-profiles-free
      - liblept5
      - libxml2
      - pngquant
      - tesseract-ocr-all
      - unpaper
      - qpdf
      - zlib1g

    python-packages:
      - cffi
      - pdfminer.six
      - pikepdf
      - Pillow
      - pluggy
      - reportlab
      - setuptools
      - tqdm
      - pipe
      - wheel

    override-build: |
      craftctl default
      ln -sf ../usr/lib/libsnapcraft-preload.so $CRAFT_PART_INSTALL/lib/libsnapcraft-preload.so


================================================
FILE: src/ocrmypdf/RELEASE.md
================================================
<!-- SPDX-FileCopyrightText: 2022 James R. Barlow -->
<!-- SPDX-License-Identifier: CC-BY-SA-4.0 -->

# Release checklist

## Patch release

- Check `pytest`

- Update release notes

## Minor release

## Major release

- Run `pre-commit autoupdate`

- Check README.md

- Check pyproject.toml

    - Are classifiers up to date?
    - Is `requires-python` correct?
    - Is it time to drop support for older Pythons?
    - Can we tighten any of the `dependencies` version constraints?

- Search for old version shims we can remove

    - "shim"
    - ` pikepdf.__version__`

- Search for deprecation: search all files for deprec*, etc.

- Check requirements in setup.cfg

- Delete `tests/cache`, do `pytest --runslow`, and update cache.

- Do `pytest --cov-report html`


================================================
FILE: src/ocrmypdf/__init__.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Adds OCR layer to PDFs."""

from __future__ import annotations

from pluggy import HookimplMarker as _HookimplMarker

from ocrmypdf import helpers, hocrtransform, pdfa, pdfinfo
from ocrmypdf._concurrent import Executor
from ocrmypdf._defaults import PROGRAM_NAME
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._options import OcrOptions, TaggedPdfMode
from ocrmypdf._pipelines._common import (
    configure_debug_logging,
)
from ocrmypdf._version import __version__
from ocrmypdf.api import (
    Verbosity,
    configure_logging,
    ocr,
)
from ocrmypdf.exceptions import (
    BadArgsError,
    DpiError,
    EncryptedPdfError,
    ExitCode,
    ExitCodeException,
    InputFileError,
    MissingDependencyError,
    OutputFileAccessError,
    PriorOcrFoundError,
    SubprocessOutputError,
    TesseractConfigError,
    UnsupportedImageFormatError,
)
from ocrmypdf.models.ocr_element import (
    Baseline,
    BoundingBox,
    FontInfo,
    OcrClass,
    OcrElement,
)
from ocrmypdf.pluginspec import OcrEngine, OrientationConfidence

# pluggy hook-implementation marker for the 'ocrmypdf' project name; it is
# re-exported through __all__ below.
hookimpl = _HookimplMarker('ocrmypdf')

__all__ = [
    '__version__',
    'BadArgsError',
    'Baseline',
    'BoundingBox',
    'configure_debug_logging',
    'configure_logging',
    'DpiError',
    'EncryptedPdfError',
    'Executor',
    'ExitCode',
    'ExitCodeException',
    'FontInfo',
    'helpers',
    'hocrtransform',
    'hookimpl',
    'InputFileError',
    'MissingDependencyError',
    'ocr',
    'OcrClass',
    'OcrElement',
    'OcrEngine',
    'OcrOptions',
    'OrientationConfidence',
    'OutputFileAccessError',
    'PageContext',
    'pdfa',
    'PdfContext',
    'pdfinfo',
    'PriorOcrFoundError',
    'PROGRAM_NAME',
    'SubprocessOutputError',
    'TaggedPdfMode',
    'TesseractConfigError',
    'UnsupportedImageFormatError',
    'Verbosity',
]


================================================
FILE: src/ocrmypdf/__main__.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""ocrmypdf command line entrypoint."""

from __future__ import annotations

import logging
import multiprocessing
import os
import signal
import sys
from contextlib import suppress

from ocrmypdf import __version__
from ocrmypdf._pipelines.ocr import run_pipeline_cli
from ocrmypdf._validation import check_options
from ocrmypdf.api import Verbosity, configure_logging
from ocrmypdf.cli import get_options_and_plugins
from ocrmypdf.exceptions import (
    BadArgsError,
    ExitCode,
    InputFileError,
    MissingDependencyError,
)

log = logging.getLogger('ocrmypdf')


def sigbus(*args):
    """Signal handler that turns SIGBUS into an input file error.

    Depending on its configuration pikepdf may read the input through mmap,
    so a SIGBUS can be delivered while the file is being accessed.

    Raises:
        InputFileError: Always.
    """
    message = "Lost access to the input file"
    raise InputFileError(message)


def run(args=None):
    """Parse arguments, set up logging, validate the options and run the pipeline.

    Args:
        args: Passed through to ``get_options_and_plugins``.

    Returns:
        The exit code of the failed option check, or otherwise whatever
        ``run_pipeline_cli`` returns.
    """
    options, plugin_manager = get_options_and_plugins(args=args)

    # Lower our priority when the platform offers and permits it
    with suppress(AttributeError, PermissionError):
        os.nice(5)

    verbosity = options.verbose
    stderr_is_terminal = os.isatty(sys.stderr.fileno())
    if not stderr_is_terminal:
        # No progress bar when stderr is not a terminal
        options.progress_bar = False
    if options.quiet:
        verbosity = Verbosity.quiet
        options.progress_bar = False

    configure_logging(
        verbosity,
        progress_bar_friendly=options.progress_bar,
        manage_root_logger=True,
        plugin_manager=plugin_manager,
    )
    log.debug('ocrmypdf %s', __version__)

    # Each kind of validation failure is logged and mapped to an exit code
    try:
        check_options(options, plugin_manager)
    except ValueError as err:
        log.error(err)
        return ExitCode.bad_args
    except BadArgsError as err:
        log.error(err)
        return err.exit_code
    except MissingDependencyError as err:
        log.error(err)
        return ExitCode.missing_dependency

    # SIGBUS does not exist on every platform
    with suppress(AttributeError, OSError):
        signal.signal(signal.SIGBUS, sigbus)

    return run_pipeline_cli(options=options, plugin_manager=plugin_manager)


if __name__ == '__main__':
    multiprocessing.freeze_support()
    if sys.platform not in ('win32', 'darwin'):
        with suppress(RuntimeError):
            multiprocessing.set_start_method('forkserver')
    sys.exit(run())


================================================
FILE: src/ocrmypdf/_annots.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""OCRmyPDF PDF annotation cleanup."""

from __future__ import annotations

import logging

from pikepdf import Dictionary, Name, NameTree, Pdf

log = logging.getLogger(__name__)


def remove_broken_goto_annotations(pdf: Pdf) -> bool:
    """Disable GoTo links whose named destination does not exist.

    If a PDF contains a GoTo Action that points to a named destination that does not
    exist, Ghostscript PDF/A conversion will fail. In any event, a named destination
    that is not defined is not useful.

    Only the destination entry (``/D``) of the action is deleted; the
    annotation itself is kept and merely becomes non-functional.

    Explicit destinations (arrays) and actions other than GoTo are left
    alone: the former do not refer to a name, and the latter (e.g. remote
    go-to actions) refer to destinations outside this document.

    Args:
        pdf: Opened PDF file.

    Returns:
        bool: True if the file was modified, False if not.
    """
    # Local import: only needed to recognize explicit destination arrays
    from pikepdf import Array

    # Check if there are any named destinations
    if Name.Names not in pdf.Root:
        return False
    if Name.Dests not in pdf.Root[Name.Names]:
        return False

    dests = pdf.Root[Name.Names][Name.Dests]
    if not isinstance(dests, Dictionary):
        return False

    # Create a set of all named destinations
    names = set(NameTree(dests).keys())

    modified = False
    for n, page in enumerate(pdf.pages):
        if Name.Annots not in page:
            continue
        for annot in page[Name.Annots]:
            if not isinstance(annot, Dictionary):
                continue
            if Name.A not in annot or Name.D not in annot[Name.A]:
                continue
            action = annot[Name.A]
            # An action without /S is treated like GoTo, as before.
            if action.get(Name.S, Name.GoTo) != Name.GoTo:
                continue
            destination = action[Name.D]
            if isinstance(destination, Array):
                # Explicit destination ([page /Fit ...]), not a name
                continue
            # We found an annotation that points to a named destination
            named_destination = str(destination)
            if named_destination not in names:
                # If there is no corresponding named destination, remove the
                # destination. Having no destination set is still valid and
                # just makes the link non-functional.
                log.warning(
                    f"Disabling a hyperlink annotation on page {n + 1} to a "
                    "non-existent named destination "
                    f"{named_destination}."
                )
                del action[Name.D]
                modified = True

    return modified


================================================
FILE: src/ocrmypdf/_concurrent.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""OCRmyPDF concurrency abstractions."""

from __future__ import annotations

import threading
from abc import ABC, abstractmethod
from collections.abc import Callable, Iterable
from typing import Any, TypeVar

from ocrmypdf._progressbar import NullProgressBar, ProgressBar

T = TypeVar('T')


def _task_noop(*_args, **_kwargs) -> None:
    """Accept any arguments and do nothing."""
    return None


def _task_finished_noop(_result: Any, pbar: ProgressBar):
    """Ignore the task result and just advance the progress bar."""
    pbar.update()


class Executor(ABC):
    """Base class for objects that run a batch of tasks and report progress."""

    # Held for the duration of every _execute() call; defined on the class,
    # so it is shared by all instances that do not override it.
    pool_lock = threading.Lock()
    pbar_class = NullProgressBar

    def __init__(self, *, pbar_class=None):
        if pbar_class:
            self.pbar_class = pbar_class

    def __call__(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        progress_kwargs: dict,
        worker_initializer: Callable | None = None,
        task: Callable[..., T] | None = None,
        task_arguments: Iterable | None = None,
        task_finished: Callable[[T, ProgressBar], None] | None = None,
    ) -> None:
        """Run ``task`` for each set of arguments, reporting progress.

        Omitted callables are replaced by no-op defaults (the default
        ``task_finished`` only advances the progress bar) and the work is
        then handed to ``_execute`` while ``pool_lock`` is held.

        Args:
            use_threads: If ``False``, the workload is the sort that will benefit from
                running in a multiprocessing context (for example, it uses Python
                heavily, and parallelizing it with threads is not expected to be
                performant).
            max_workers: The maximum number of workers that should be run.
            progress_kwargs: Arguments to set up the progress bar.
            worker_initializer: Called when a worker is initialized, in the worker's
                execution context. If the child workers are processes, it must be
                possible to marshall/pickle the worker initializer.
                ``functools.partial`` can be used to bind parameters.
            task: Called when the worker starts a new task, in the worker's execution
                context. Must be possible to marshall to the worker.
            task_arguments: An iterable that generates a group of parameters for each
                task. This runs in the parent's context, but the parameters must be
                marshallable to the worker. If it is falsy, nothing is run.
            task_finished: Called when a worker finishes a task, in the parent's
                context.
        """
        if not task_arguments:
            return  # Nothing to do!

        with self.pool_lock:
            self._execute(
                use_threads=use_threads,
                max_workers=max_workers,
                progress_kwargs=progress_kwargs,
                worker_initializer=worker_initializer or _task_noop,
                task=task or _task_noop,
                task_arguments=task_arguments,
                task_finished=task_finished or _task_finished_noop,
            )

    @abstractmethod
    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        progress_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):
        """Custom executors should override this method."""


def setup_executor(plugin_manager) -> Executor:
    """Build the executor supplied by the plugin manager.

    The progress bar class is obtained from the plugin manager first and is
    then passed to its executor hook as ``progressbar_class``.
    """
    progressbar_class = plugin_manager.get_progressbar_class()
    executor = plugin_manager.get_executor(progressbar_class=progressbar_class)
    return executor


class SerialExecutor(Executor):
    """Executor that runs every task one after another in the caller.

    No worker processes or threads are created: the current process/thread
    acts as the only worker and handles the tasks in the order they are
    produced. Consequently ``worker_initializer`` is never invoked.
    """

    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        progress_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):  # pylint: disable=unused-argument
        """Run each task inline and hand its result to ``task_finished``."""
        with self.pbar_class(**progress_kwargs) as progress:
            for task_args in task_arguments:
                outcome = task(*task_args)
                task_finished(outcome, progress)


================================================
FILE: src/ocrmypdf/_defaults.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

# Enforce English hegemony
from __future__ import annotations

# Default language code
DEFAULT_LANGUAGE = 'eng'

# Default rotation threshold
DEFAULT_ROTATE_PAGES_THRESHOLD = 14.0

# Name of the program
PROGRAM_NAME = 'OCRmyPDF'


================================================
FILE: src/ocrmypdf/_exec/__init__.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Manage third party executables."""

from __future__ import annotations


================================================
FILE: src/ocrmypdf/_exec/ghostscript.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Interface to Ghostscript executable."""

from __future__ import annotations

import logging
import os
import re
from collections import deque
from os import fspath
from pathlib import Path
from subprocess import PIPE, CalledProcessError

from packaging.version import Version
from PIL import Image, UnidentifiedImageError

from ocrmypdf.exceptions import (
    ColorConversionNeededError,
    InputFileError,
    SubprocessOutputError,
)
from ocrmypdf.helpers import Resolution
from ocrmypdf.pluginspec import GhostscriptRasterDevice
from ocrmypdf.subprocess import get_version, run, run_polling_stderr

# Known color conversion strategy names. generate_pdfa() interpolates the
# caller's choice into Ghostscript's -sColorConversionStrategy option;
# presumably callers validate against this set - confirm at the call sites.
COLOR_CONVERSION_STRATEGIES = frozenset(
    [
        'CMYK',
        'Gray',
        'LeaveColorUnchanged',
        'RGB',
        'UseDeviceIndependentColor',
    ]
)
# Ghostscript executable - gswin32c is not supported
GS = 'gswin64c' if os.name == 'nt' else 'gs'


# Module-level logger; a DuplicateFilter is attached to it below.
log = logging.getLogger(__name__)


class DuplicateFilter(logging.Filter):
    """Filter out duplicate log messages.

    A context window of default 5 messages is used to determine if a message is a
    duplicate. This is because some Ghostscript messages are word wrapped.

    When a non-duplicate message arrives after one or more duplicates were
    dropped, a summary line with the number of suppressed messages is logged
    first, at the level of the most recently suppressed record.

    Args:
        logger: Logger used to emit the "suppressed N repeated lines" summary.
        context_window: Number of recent messages remembered for comparison.
    """

    def __init__(self, logger: logging.Logger, context_window=5):
        # Initialize logging.Filter so its inherited attributes (name, nlen)
        # exist on this instance.
        super().__init__()
        self.window: deque[str] = deque([], maxlen=context_window)
        self.logger = logger
        self.levelno = logging.DEBUG
        self.count = 0

    def filter(self, record):
        """Return False for a message already in the window, True otherwise."""
        if record.msg in self.window:
            self.count += 1
            # Remember the level so the summary is logged at the same level.
            self.levelno = record.levelno
            return False
        else:
            if self.count >= 1:
                rep_msg = f"(suppressed {self.count} repeated lines)"
                # Logging the summary re-enters this filter; resetting the
                # count first keeps that nested call from logging again.
                self.count = 0  # Avoid infinite recursion
                self.logger.log(self.levelno, rep_msg)
                self.window.clear()
            self.window.append(record.msg)
            return True


# Drop repeated messages logged through this module's logger; the filter also
# uses the same logger to report how many lines it suppressed.
log.addFilter(DuplicateFilter(log))


def version() -> Version:
    """Return the version reported by the Ghostscript executable ``GS``."""
    reported = get_version(GS)
    return Version(reported)


def _gs_error_reported(stream) -> bool:
    match = re.search(r'error', stream, flags=re.IGNORECASE)
    return bool(match)


def _gs_devicen_reported(stream) -> bool:
    """Did Ghostscript warn about a DeviceN with inappropriate alternate?

    If so, we need the user to select a color conversion, or the resulting PDF will
    not present correctly in some PDF viewers.
    """
    match = re.search(
        r'DeviceN.*inappropriate alternate',
        stream,
        flags=re.IGNORECASE | re.MULTILINE,
    )
    return bool(match)


def rasterize_pdf(
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    raster_device: GhostscriptRasterDevice,
    raster_dpi: Resolution,
    pageno: int = 1,
    page_dpi: Resolution | None = None,
    rotation: int | None = None,
    filter_vector: bool = False,
    stop_on_error: bool = False,
    use_cropbox: bool = False,
):
    """Rasterize one page of a PDF at resolution raster_dpi in canvas units.

    Ghostscript writes the page image to ``output_file``; the image is then
    reopened with Pillow, optionally resized and rotated, and saved back to
    the same path with ``page_dpi`` as its DPI.

    Args:
        input_file: The PDF file to rasterize.
        output_file: The file to write the rasterized page image to.
        raster_device: The Ghostscript raster device to use to rasterize the PDF.
        raster_dpi: Resolution in dots per inch at which to rasterize page.
        pageno: Page number to rasterize (beginning at page 1).
        page_dpi: Resolution, overriding output image DPI. Defaults to
            ``raster_dpi`` (after rounding) when not given.
        rotation: Cardinal angle, clockwise, to rotate page.
        filter_vector: If True, remove vector graphics objects.
        stop_on_error: If True, stop rasterizing on the first error.
        use_cropbox: If True, rasterize the CropBox instead of MediaBox.
            Default is False (use MediaBox).

    Raises:
        SubprocessOutputError: If Ghostscript exits with a failure status.
        InputFileError: If ``stop_on_error`` is set and Ghostscript's output
            mentions a "recoverable image error".
        UnidentifiedImageError: If the file Ghostscript produced cannot be
            read as an image (an ``OSError`` is converted to this type).
    """
    raster_dpi = raster_dpi.round(6)
    if not page_dpi:
        page_dpi = raster_dpi

    # Ghostscript may fail with very low DPI values (below 10). If the requested
    # DPI is too low, use a minimum of 10 DPI and resize the output afterward.
    MIN_RASTER_DPI = 10
    needs_low_dpi_resize = (
        raster_dpi.x < MIN_RASTER_DPI or raster_dpi.y < MIN_RASTER_DPI
    )
    if needs_low_dpi_resize:
        # Clamp each axis independently; an axis already at or above the
        # minimum keeps its requested value.
        effective_dpi = Resolution(
            max(raster_dpi.x, MIN_RASTER_DPI), max(raster_dpi.y, MIN_RASTER_DPI)
        )
    else:
        effective_dpi = raster_dpi

    args_gs = (
        [
            GS,
            '-dSAFER',
            '-dBATCH',
            '-dNOPAUSE',
            '-dInterpolateControl=-1',
            f'-sDEVICE={raster_device}',
            f'-dFirstPage={pageno}',
            f'-dLastPage={pageno}',
            f'-r{effective_dpi.x:f}x{effective_dpi.y:f}',
        ]
        + (['-dUseCropBox'] if use_cropbox else [])
        + (['-dFILTERVECTOR'] if filter_vector else [])
        + (['-dPDFSTOPONERROR'] if stop_on_error else [])
        + [
            '-o',
            fspath(output_file),
            '-sstdout=%stderr',  # Literal %s, not string interpolation
            '-dAutoRotatePages=/None',  # Probably has no effect on raster
            '-f',
            fspath(input_file),
        ]
    )

    try:
        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
    except CalledProcessError as e:
        log.error(e.stderr.decode(errors='replace'))
        # Remove any partial output so a broken image is not left behind.
        Path(output_file).unlink(missing_ok=True)
        raise SubprocessOutputError("Ghostscript rasterizing failed") from e

    # Ghostscript can exit successfully and still report errors in its output
    # (its stdout is redirected to stderr above), so scan the text as well.
    stderr = p.stderr.decode(errors='replace')
    if _gs_error_reported(stderr):
        log.error(stderr)
        if stop_on_error and "recoverable image error" in stderr:
            Path(output_file).unlink(missing_ok=True)
            raise InputFileError(
                "Ghostscript rasterizing failed. The input file contains errors that "
                "cause PDF viewers to interpret it differently and incorrectly. "
                "Try using --continue-on-soft-render-error and manually inspect the "
                "input and output files to check for visual differences or errors."
            )

    try:
        with Image.open(output_file) as im:
            if needs_low_dpi_resize:
                # Resize to the dimensions that would have resulted from the
                # original low DPI request
                scale_x = raster_dpi.x / effective_dpi.x
                scale_y = raster_dpi.y / effective_dpi.y
                # Never let a dimension collapse to zero pixels.
                new_size = (
                    max(1, int(round(im.width * scale_x))),
                    max(1, int(round(im.height * scale_y))),
                )
                im = im.resize(new_size, Image.Resampling.LANCZOS)
            if rotation is not None:
                log.debug("Rotating output by %i", rotation)
                # rotation is a clockwise angle and Image.ROTATE_* is
                # counterclockwise so this cancels out the rotation
                if rotation == 90:
                    im = im.transpose(Image.Transpose.ROTATE_90)
                elif rotation == 180:
                    im = im.transpose(Image.Transpose.ROTATE_180)
                elif rotation == 270:
                    im = im.transpose(Image.Transpose.ROTATE_270)
                # A quarter turn swaps the axes, so the DPI is swapped too.
                if rotation % 180 == 90:
                    page_dpi = page_dpi.flip_axis()
            # Overwrite Ghostscript's output with the adjusted image.
            im.save(output_file, dpi=page_dpi)
    except UnidentifiedImageError:
        log.error(
            f"Ghostscript (using {raster_device} at {raster_dpi} dpi) produced "
            "an invalid page image file."
        )
        raise
    except OSError as e:
        log.error(
            f"Ghostscript (using {raster_device} at {raster_dpi} dpi) produced "
            "an invalid page image file."
        )
        # Report every unreadable-image failure as the same exception type.
        raise UnidentifiedImageError() from e


class GhostscriptFollower:
    """Drive a progress bar from the lines Ghostscript prints while it runs.

    Instances are callables that receive Ghostscript output one line at a
    time. The progress bar is created lazily, once a "Processing pages" line
    reveals the page count; after that each "Page N" line advances it by one
    step. With a falsy ``progressbar_class`` every line is ignored.
    """

    re_process = re.compile(r"Processing pages \d+ through (\d+).")
    re_page = re.compile(r"Page (\d+)")

    def __init__(self, progressbar_class):
        self.progressbar_class = progressbar_class
        self.progressbar = None
        self.count = 0

    def __enter__(self):
        # Nothing to set up yet: the page total is unknown until the first
        # matching line is delivered to __call__().
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        bar = self.progressbar
        if not bar:
            return False
        return bar.__exit__(exc_type, exc_value, traceback)

    def __call__(self, line):
        if not self.progressbar_class:
            return
        text = line.strip()

        if self.progressbar:
            # Progress bar already running: count finished pages.
            if self.re_page.match(text):
                self.progressbar.update()
            return

        announced = self.re_process.match(text)
        if not announced:
            return
        # The page count is now known, so the progress bar can be created
        # and entered.
        self.count = int(announced.group(1))
        self.progressbar = self.progressbar_class(
            total=self.count, desc="PDF/A conversion", unit='page'
        )
        self.progressbar.__enter__()


def generate_pdfa(
    pdf_pages,
    output_file: os.PathLike,
    *,
    compression: str,
    color_conversion_strategy: str,
    pdf_version: str = '1.5',
    pdfa_part: str = '2',
    progressbar_class=None,
    stop_on_error: bool = False,
):
    """Run Ghostscript's pdfwrite device over the inputs with PDF/A options.

    Args:
        pdf_pages: Input files passed to Ghostscript, in order.
        output_file: Path Ghostscript writes its output to.
        compression: ``'jpeg'`` forces DCT encoding of color and gray images,
            ``'lossless'`` forces Flate encoding, and any other value lets
            Ghostscript choose.
        color_conversion_strategy: Value for ``-sColorConversionStrategy``.
        pdf_version: Value for ``-dCompatibilityLevel``.
        pdfa_part: Value for ``-dPDFA``.
        progressbar_class: Optional progress bar class, driven by
            ``GhostscriptFollower`` from Ghostscript's output.
        stop_on_error: If True, pass ``-dPDFSTOPONERROR``; always disabled
            on Windows.

    Raises:
        SubprocessOutputError: If Ghostscript exits with a failure status.
        ColorConversionNeededError: If Ghostscript warns about a DeviceN
            color space with an inappropriate alternate.
    """
    # Ghostscript's compression is all or nothing. We can either force all images
    # to JPEG, force all to Flate/PNG, or let it decide how to encode the images.
    # In most case it's best to let it decide.
    if compression == 'jpeg':
        forced_filter = '/DCTEncode'
    elif compression == 'lossless':
        forced_filter = '/FlateEncode'
    else:
        forced_filter = None

    if forced_filter is None:
        compression_args = [
            "-dAutoFilterColorImages=true",
            "-dAutoFilterGrayImages=true",
        ]
    else:
        compression_args = [
            "-dAutoFilterColorImages=false",
            f"-dColorImageFilter={forced_filter}",
            "-dAutoFilterGrayImages=false",
            f"-dGrayImageFilter={forced_filter}",
        ]

    if version() == Version('9.56.0'):
        # 9.56.0 breaks our OCR, should be fixed in 9.56.1
        # https://bugs.ghostscript.com/show_bug.cgi?id=705187
        compression_args.append('-dNEWPDF=false')

    if os.name == 'nt':
        # Windows has lots of fatal "permission denied" errors
        stop_on_error = False

    # nb no need to specify ProcessColorModel when ColorConversionStrategy
    # is set; see:
    # https://bugs.ghostscript.com/show_bug.cgi?id=699392
    args_gs = [
        GS,
        "-dBATCH",
        "-dNOPAUSE",
        "-dSAFER",
        "-dCompatibilityLevel=" + str(pdf_version),
        "-sDEVICE=pdfwrite",
        "-dAutoRotatePages=/None",
        f"-sColorConversionStrategy={color_conversion_strategy}",
    ]
    if stop_on_error:
        args_gs.append('-dPDFSTOPONERROR')
    args_gs.extend(compression_args)
    args_gs.extend(
        [
            "-dJPEGQ=95",
            "-dSubsetFonts=false",  # Prevents GS from messing up some encodings
            f"-dPDFA={pdfa_part}",
            "-dPDFACompatibilityPolicy=1",
            "-o",
            fspath(output_file),
            "-sstdout=%stderr",  # Literal %s, not string interpolation
        ]
    )
    args_gs.extend(map(fspath, pdf_pages))  # Stringify Path objs

    try:
        with GhostscriptFollower(progressbar_class) as follower:
            proc = run_polling_stderr(
                args_gs,
                stderr=PIPE,
                check=True,
                text=True,
                encoding='utf-8',
                errors='replace',
                callback=follower,
            )
    except CalledProcessError as e:
        # Ghostscript does not change return code when it fails to create
        # PDF/A - check PDF/A status elsewhere
        log.error(e.stderr)
        raise SubprocessOutputError('Ghostscript PDF/A rendering failed') from e

    gs_output = proc.stderr
    # If there is an error we log the whole output, except for filtering
    # duplicates.
    if _gs_error_reported(gs_output):
        # Ghostscript outputs the pattern **** Error: ....  frequently.
        # Occasionally the error message is spammed many times. The logger's
        # duplicate filter drops the repeats; splitting on **** turns each
        # message into its own log record so the filter can compare them.
        for chunk in gs_output.split('****'):
            log.error(chunk)
    if _gs_devicen_reported(gs_output):
        raise ColorConversionNeededError()


================================================
FILE: src/ocrmypdf/_exec/jbig2enc.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Interface to jbig2 executable."""

from __future__ import annotations

from subprocess import PIPE, CalledProcessError

from packaging.version import Version

from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.subprocess import get_version, run


def version() -> Version:
    """Return the version reported by the ``jbig2`` executable.

    Raises:
        MissingDependencyError: If the version query fails with
            ``CalledProcessError``.
    """
    try:
        reported = get_version('jbig2', regex=r'jbig2enc (\d+(\.\d+)*).*')
    except CalledProcessError as e:
        # TeX Live for Windows provides an incompatible jbig2.EXE which may
        # be on the PATH.
        raise MissingDependencyError('jbig2enc') from e
    return Version(reported)


def available():
    """Return True if ``version()`` succeeds, False if jbig2enc is missing."""
    try:
        version()
    except MissingDependencyError:
        return False
    else:
        return True


def convert_single(cwd, infile, outfile, threshold):
    """Run ``jbig2 --pdf`` on one image, capturing its stdout in ``outfile``.

    The return code is checked only after the output file has been closed.
    Returns the completed process object.
    """
    command = ['jbig2', '--pdf', '-t', str(threshold), infile]
    with open(outfile, 'wb') as encoded_out:
        completed = run(command, cwd=cwd, stdout=encoded_out, stderr=PIPE)
    completed.check_returncode()
    return completed


================================================
FILE: src/ocrmypdf/_exec/pngquant.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Interface to pngquant executable."""

from __future__ import annotations

from pathlib import Path
from subprocess import PIPE

from packaging.version import Version

from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.subprocess import get_version, run


def version() -> Version:
    """Return the version reported by the ``pngquant`` executable."""
    reported = get_version('pngquant', regex=r'(\d+(\.\d+)*).*')
    return Version(reported)


def available():
    """Return True if ``version()`` succeeds, False if pngquant is missing."""
    try:
        version()
    except MissingDependencyError:
        return False
    else:
        return True


def quantize(input_file: Path, output_file: Path, quality_min: int, quality_max: int):
    """Quantize a PNG image using pngquant.

    The image is streamed through pngquant's stdin/stdout. ``output_file`` is
    written only when pngquant exits with status 0; otherwise it is left
    untouched.

    Args:
        input_file: Input PNG image
        output_file: Output PNG image
        quality_min: Minimum quality to use
        quality_max: Maximum quality to use
    """
    command = [
        'pngquant',
        '--force',
        '--skip-if-larger',
        '--quality',
        f'{quality_min}-{quality_max}',
        '--',  # pngquant: stop processing arguments
        '-',  # pngquant: stream input and output
    ]
    with open(input_file, 'rb') as png_in:
        completed = run(command, stdin=png_in, stdout=PIPE, stderr=PIPE, check=False)

    if completed.returncode != 0:
        return
    # input_file could be the same as output_file, so the write is deferred
    # until the input has been fully read and closed
    output_file.write_bytes(completed.stdout)


================================================
FILE: src/ocrmypdf/_exec/tesseract.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Interface to Tesseract executable."""

from __future__ import annotations

import logging
import os
import re
from contextlib import suppress
from enum import IntEnum
from math import pi
from os import fspath
from pathlib import Path
from subprocess import PIPE, STDOUT, CalledProcessError, TimeoutExpired

from packaging.version import Version

from ocrmypdf.exceptions import (
    MissingDependencyError,
    SubprocessOutputError,
    TesseractConfigError,
)
from ocrmypdf.pluginspec import OrientationConfidence
from ocrmypdf.subprocess import get_version, run

log = logging.getLogger(__name__)


def _tesseract_env(omp_thread_limit: int | None) -> dict[str, str] | None:
    """Create environment dict with OMP_THREAD_LIMIT set for Tesseract subprocesses."""
    if omp_thread_limit is None:
        return None
    env = os.environ.copy()
    env['OMP_THREAD_LIMIT'] = str(omp_thread_limit)
    return env


class ThresholdingMethod(IntEnum):
    """Tesseract thresholding methods for image binarization.

    The OCR functions in this module interpolate a member into Tesseract's
    ``-c thresholding_method=...`` option; ``AUTO`` means the option is not
    passed at all.
    """

    AUTO = 0
    OTSU = 0  # Alias for AUTO - uses Tesseract's default (legacy Otsu)
    ADAPTIVE_OTSU = 1
    SAUVOLA = 2


# Legacy dictionary for backward compatibility
# Maps lowercase, hyphenated method names to ThresholdingMethod members.
TESSERACT_THRESHOLDING_METHODS: dict[str, int] = {
    'auto': ThresholdingMethod.AUTO,
    'otsu': ThresholdingMethod.OTSU,
    'adaptive-otsu': ThresholdingMethod.ADAPTIVE_OTSU,
    'sauvola': ThresholdingMethod.SAUVOLA,
}


class TesseractLoggerAdapter(logging.LoggerAdapter):
    """Logger adapter that tags each message as coming from tesseract."""

    def process(self, msg, kwargs):
        """Attach the adapter's ``extra`` and prefix ``msg`` with [tesseract]."""
        kwargs['extra'] = self.extra
        tagged = f'[tesseract] {msg}'
        return tagged, kwargs


# Version-string pattern used by TesseractVersion, which compiles it with
# re.VERBOSE | re.IGNORECASE (so the whitespace and "#" comments inside the
# literal are part of the pattern text, not Python comments). Besides the
# epoch/release/pre/post/dev/local groups it accepts optional date, gitcount
# and gitcommit suffixes.
TESSERACT_VERSION_PATTERN = r"""
    v?
    (?:
        (?:(?P<epoch>[0-9]+)!)?                           # epoch
        (?P<release>[0-9]+(?:\.[0-9]+)*)                  # release segment
        (?P<pre>                                          # pre-release
            [-_\.]?
            (?P<pre_l>(a|b|c|rc|alpha|beta|pre|preview))
            [-_\.]?
            (?P<pre_n>[0-9]+)?
        )?
        (?P<post>                                         # post release
            (?:-(?P<post_n1>[0-9]+))
            |
            (?:
                [-_\.]?
                (?P<post_l>post|rev|r)
                [-_\.]?
                (?P<post_n2>[0-9]+)?
            )
        )?
        (?P<dev>                                          # dev release
            [-_\.]?
            (?P<dev_l>dev)
            [-_\.]?
            (?P<dev_n>[0-9]+)?
        )?
        (?P<date>
            [-_\.]
            (?:20[0-9][0-9] [0-1][0-9] [0-3][0-9])       # yyyy mm dd
        )?
        (?P<gitcount>
            [-_\.]?
            [0-9]+
        )?
        (?P<gitcommit>
            [-_\.]?
            g[0-9a-f]{2,10}
        )?
    )
    (?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
"""


class TesseractVersion(Version):
    """Modify standard packaging.Version regex to support Tesseract idiosyncrasies."""

    # Overrides the base class's ``_regex`` attribute; presumably this is the
    # pattern packaging.Version parses with - confirm against the installed
    # packaging release. The pattern must match the whole string, allowing
    # only surrounding whitespace.
    _regex = re.compile(
        r"^\s*" + TESSERACT_VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE
    )


def version() -> Version:
    """Return the version reported by ``tesseract`` as a TesseractVersion."""
    reported = get_version('tesseract', regex=r'tesseract\s(.+)')
    return TesseractVersion(reported)


def has_thresholding() -> bool:
    """Does Tesseract have -c thresholding method capability?

    True when the installed Tesseract is version 5.0 or newer.
    """
    minimum = Version('5.0')
    return version() >= minimum


def get_languages() -> set[str]:
    """Return the language names printed by ``tesseract --list-langs``.

    The first output line is treated as a header and discarded; every
    remaining line, stripped of surrounding whitespace, is a language.

    Raises:
        MissingDependencyError: If Tesseract exits with a failure status or
            any output line starts with "Error".
    """

    def lang_error(output):
        return (
            "Tesseract failed to report available languages.\n"
            "Output from Tesseract:\n"
            "-----------\n"
        ) + output

    try:
        proc = run(
            ['tesseract', '--list-langs'],
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            logs_errors_to_stdout=True,
            check=True,
        )
    except CalledProcessError as e:
        raise MissingDependencyError(lang_error(e.output)) from e

    output = proc.stdout
    lines = output.splitlines()
    if any(line.startswith('Error') for line in lines):
        raise MissingDependencyError(lang_error(output))
    _header, *languages = lines
    return {lang.strip() for lang in languages}


def tess_base_args(langs: list[str], engine_mode: int | None) -> list[str]:
    """Return the start of a tesseract command line.

    ``-l`` with the languages joined by ``+`` is added when ``langs`` is not
    empty, and ``--oem`` is added when ``engine_mode`` is not None.
    """
    language_opts = ['-l', '+'.join(langs)] if langs else []
    engine_opts = [] if engine_mode is None else ['--oem', str(engine_mode)]
    return ['tesseract', *language_opts, *engine_opts]


def _parse_tesseract_output(binary_output: bytes) -> dict[str, str]:
    def gen():
        for line in binary_output.decode().splitlines():
            line = line.strip()
            parts = line.split(':', maxsplit=2)
            if len(parts) == 2:
                yield parts[0].strip(), parts[1].strip()

    return dict(gen())


def get_orientation(
    input_file: Path,
    engine_mode: int | None,
    timeout: float,
    omp_thread_limit: int | None = None,
) -> OrientationConfidence:
    """Run Tesseract's OSD mode (``--psm 0``) to find the page orientation.

    A timeout, or a failure whose output says the page has too few characters
    or the image is too large, yields angle 0 with confidence 0.

    Raises:
        SubprocessOutputError: If Tesseract fails for any other reason.
    """
    args_tesseract = [
        *tess_base_args(['osd'], engine_mode),
        '--psm',
        '0',
        fspath(input_file),
        'stdout',
    ]

    try:
        proc = run(
            args_tesseract,
            stdout=PIPE,
            stderr=STDOUT,
            timeout=timeout,
            check=True,
            env=_tesseract_env(omp_thread_limit),
        )
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
        # Check both stdout (e.output) and stderr for known non-fatal messages
        combined = (e.output or b'') + (e.stderr or b'')
        nonfatal_markers = (
            b'Too few characters. Skipping this page',
            b'Image too large',
        )
        if any(marker in combined for marker in nonfatal_markers):
            return OrientationConfidence(0, 0)
        raise SubprocessOutputError() from e

    osd = _parse_tesseract_output(proc.stdout)
    return OrientationConfidence(
        angle=int(osd.get('Orientation in degrees', 0)),
        confidence=float(osd.get('Orientation confidence', 0)),
    )


def _is_empty_page_error(exc):
    """Return True if a failed Tesseract run only signals an empty page."""
    output = exc.output
    if b'Empty page!!' in output:  # Tesseract 4.x
        return True
    if exc.returncode != 1:
        return False
    if output == b'':  # Tesseract 5.0-5.4 or so
        return True
    # Tesseract 5.5+
    return output.startswith(b"Error in boxClipToRectangle: box outside rectangle")


def get_deskew(
    input_file: Path,
    languages: list[str],
    engine_mode: int | None,
    timeout: float,
    omp_thread_limit: int | None = None,
) -> float:
    """Gets angle to deskew this page, in degrees.

    Tesseract is run with ``--psm 2`` and its "Deskew angle" value, read as
    radians, is converted to degrees. A timeout or an empty-page failure
    yields 0.0.

    Raises:
        SubprocessOutputError: If Tesseract fails for any other reason.
    """
    args_tesseract = [
        *tess_base_args(languages, engine_mode),
        '--psm',
        '2',
        fspath(input_file),
        'stdout',
    ]

    try:
        proc = run(
            args_tesseract,
            stdout=PIPE,
            stderr=STDOUT,
            timeout=timeout,
            check=True,
            env=_tesseract_env(omp_thread_limit),
        )
    except TimeoutExpired:
        return 0.0
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
        if not _is_empty_page_error(e):
            raise SubprocessOutputError() from e
        # Not enough info for a skew angle
        return 0.0

    report = _parse_tesseract_output(proc.stdout)
    deskew_radians = float(report.get('Deskew angle', 0))
    deskew_degrees = 180 / pi * deskew_radians
    log.debug(f"Deskew angle: {deskew_degrees:.3f}")
    return deskew_degrees


def tesseract_log_output(stream: bytes) -> None:
    """Relay Tesseract's console output to the log, one line at a time.

    Each line is classified by its content: banner lines and known spurious
    messages are dropped, some messages are rewritten into shorter warnings,
    and the rest are logged at error, warning or info level.

    Args:
        stream: Raw bytes captured from Tesseract. Empty or None is ignored.

    Raises:
        TesseractConfigError: If a line reports "parameter not found"; the
            text following "found: " is passed as the problem description.
    """
    tlog = TesseractLoggerAdapter(
        log,
        extra=log.extra if hasattr(log, 'extra') else None,  # type: ignore
    )

    if not stream:
        return
    try:
        text = stream.decode()
    except UnicodeDecodeError:
        # Fall back to dropping undecodable bytes rather than failing.
        text = stream.decode('utf-8', 'ignore')

    for line in text.splitlines():
        lowered = line.lower()
        if line.startswith(
            ("Tesseract Open Source", "Warning in pixReadMem")
        ):
            continue
        elif 'diacritics' in line:
            tlog.warning("lots of diacritics - possibly poor OCR")
        elif line.startswith('OSD: Weak margin'):
            tlog.warning("unsure about page orientation")
        elif 'Error in pixScanForForeground' in line:
            pass  # Appears to be spurious/problem with nonwhite borders
        elif 'Error in boxClipToRectangle' in line:
            pass  # Always appears with pixScanForForeground message
        elif 'parameter not found: ' in lowered:
            tlog.error(line.strip())
            # The check above ignores letter case, so the extraction must too;
            # a case-sensitive split would fail on differently cased output.
            pieces = re.split('found: ', line, maxsplit=1, flags=re.IGNORECASE)
            problem = pieces[1] if len(pieces) > 1 else line.strip()
            raise TesseractConfigError(problem)
        elif 'error' in lowered or 'exception' in lowered:
            tlog.error(line.strip())
        elif 'warning' in lowered:
            tlog.warning(line.strip())
        elif 'read_params_file' in lowered:
            tlog.error(line.strip())
        else:
            tlog.info(line.strip())


def page_timedout(timeout: float) -> None:
    """Log a warning that OCR timed out, unless the timeout is 0."""
    if timeout != 0:
        log.warning("[tesseract] took too long to OCR - skipping")


def _generate_null_hocr(output_hocr: Path, output_text: Path, image: Path) -> None:
    """Produce an empty .hocr file.

    Ensures page is the same size as the input image.
    """
    output_hocr.write_text('', encoding='utf-8')
    output_text.write_text('[skipped page]', encoding='utf-8')


def generate_hocr(
    *,
    input_file: Path,
    output_hocr: Path,
    output_text: Path,
    languages: list[str],
    engine_mode: int,
    tessconfig: list[str],
    timeout: float,
    pagesegmode: int,
    thresholding: ThresholdingMethod,
    user_words,
    user_patterns,
    omp_thread_limit: int | None = None,
) -> None:
    """Generate a hOCR file, which must be converted to PDF.

    Tesseract is asked for both ``hocr`` and ``txt`` outputs. On a timeout,
    or when Tesseract fails because the image is too large or the page is
    empty, placeholder hOCR and text files are written instead.

    Raises:
        SubprocessOutputError: If Tesseract fails for any other reason.
    """
    prefix = output_hocr.with_suffix('')

    args_tesseract = tess_base_args(languages, engine_mode)
    if pagesegmode is not None:
        args_tesseract += ['--psm', str(pagesegmode)]
    if thresholding != ThresholdingMethod.AUTO and has_thresholding():
        args_tesseract += ['-c', f'thresholding_method={thresholding}']
    if user_words:
        args_tesseract += ['--user-words', user_words]
    if user_patterns:
        args_tesseract += ['--user-patterns', user_patterns]

    # Reminder: test suite tesseract test plugins will break after any changes
    # to the number of order parameters here
    args_tesseract += [fspath(input_file), fspath(prefix), 'hocr', 'txt']
    args_tesseract += tessconfig

    try:
        proc = run(
            args_tesseract,
            stdout=PIPE,
            stderr=STDOUT,
            timeout=timeout,
            check=True,
            env=_tesseract_env(omp_thread_limit),
        )
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(timeout)
        _generate_null_hocr(output_hocr, output_text, input_file)
        return
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        skippable = (b'Image too large', b'Empty page!!')
        if any(marker in e.output for marker in skippable):
            _generate_null_hocr(output_hocr, output_text, input_file)
            return
        raise SubprocessOutputError() from e

    tesseract_log_output(proc.stdout)
    # The sidecar text file will get the suffix .txt; rename it to
    # whatever caller wants it named
    with suppress(FileNotFoundError):
        prefix.with_suffix('.txt').replace(output_text)


def use_skip_page(output_pdf: Path, output_text: Path) -> None:
    """Write the outputs that mark a page as skipped.

    The text file receives ``[skipped page]`` and the PDF is written as a
    0 byte file, which indicates a skip.
    """
    skip_marker = '[skipped page]'
    output_text.write_text(skip_marker, encoding='utf-8')
    output_pdf.write_bytes(bytes())


def generate_pdf(
    *,
    input_file: Path,
    output_pdf: Path,
    output_text: Path,
    languages: list[str],
    engine_mode: int,
    tessconfig: list[str],
    timeout: float,
    pagesegmode: int,
    thresholding: ThresholdingMethod,
    user_words,
    user_patterns,
    omp_thread_limit: int | None = None,
) -> None:
    """Generate a PDF using Tesseract's internal PDF generator.

    We specifically request a text-only PDF (``textonly_pdf=1``), which is
    more suitable for combining with the input page. On a timeout, or when
    Tesseract fails because the image is too large or the page is empty,
    skip-page outputs are written instead.

    Raises:
        SubprocessOutputError: If Tesseract fails for any other reason.
    """
    args_tesseract = tess_base_args(languages, engine_mode)
    if pagesegmode is not None:
        args_tesseract += ['--psm', str(pagesegmode)]

    args_tesseract += ['-c', 'textonly_pdf=1']

    if thresholding != ThresholdingMethod.AUTO and has_thresholding():
        args_tesseract += ['-c', f'thresholding_method={thresholding}']
    if user_words:
        args_tesseract += ['--user-words', user_words]
    if user_patterns:
        args_tesseract += ['--user-patterns', user_patterns]

    prefix = output_pdf.parent / Path(output_pdf.stem)

    # Reminder: test suite tesseract test plugins might break after any changes
    # to the number of order parameters here
    args_tesseract += [fspath(input_file), fspath(prefix), 'pdf', 'txt']
    args_tesseract += tessconfig

    try:
        proc = run(
            args_tesseract,
            stdout=PIPE,
            stderr=STDOUT,
            timeout=timeout,
            check=True,
            env=_tesseract_env(omp_thread_limit),
        )
    except TimeoutExpired:
        page_timedout(timeout)
        use_skip_page(output_pdf, output_text)
        return
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        skippable = (b'Image too large', b'Empty page!!')
        if any(marker in e.output for marker in skippable):
            use_skip_page(output_pdf, output_text)
            return
        raise SubprocessOutputError() from e

    # Move the sidecar text into place first, then relay Tesseract's output.
    with suppress(FileNotFoundError):
        prefix.with_suffix('.txt').replace(output_text)
    tesseract_log_output(proc.stdout)


================================================
FILE: src/ocrmypdf/_exec/unpaper.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Interface to unpaper executable."""

from __future__ import annotations

import logging
import os
from collections.abc import Iterator
from contextlib import contextmanager
from decimal import Decimal
from pathlib import Path
from subprocess import PIPE, STDOUT
from tempfile import TemporaryDirectory

from packaging.version import Version
from PIL import Image

from ocrmypdf.exceptions import SubprocessOutputError
from ocrmypdf.subprocess import get_version, run

# unpaper documentation:
# https://github.com/Flameeyes/unpaper/blob/main/doc/basic-concepts.md


# Pixel-count threshold (width * height) for images handed to unpaper; images
# at or above it are rejected with UnpaperImageTooLargeError.
UNPAPER_IMAGE_PIXEL_LIMIT = 256 * 1024 * 1024

# Numeric type accepted for DPI values in this module.
DecFloat = Decimal | float

# Module-level logger.
log = logging.getLogger(__name__)


class UnpaperImageTooLargeError(Exception):
    """Raised when an image is too large to be cleaned with unpaper.

    The offending dimensions are kept on the instance as ``w`` and ``h``; the
    formatted text is available as ``message`` and is also the exception text.
    """

    def __init__(
        self,
        w,
        h,
        message="Image with size {}x{} is too large for cleaning with 'unpaper'.",
    ):
        # ``message`` is a template with two positional slots: width, height.
        formatted = message.format(w, h)
        super().__init__(formatted)
        self.w, self.h = w, h
        self.message = formatted


def version() -> Version:
    """Return the unpaper version as reported by ``get_version``."""
    version_pattern = r'(?m).*?(\d+(\.\d+)(\.\d+)?)'
    version_text = get_version('unpaper', regex=version_pattern)
    return Version(version_text)


@contextmanager
def _setup_unpaper_io(input_file: Path) -> Iterator[tuple[Path, Path, Path]]:
    """Prepare the file paths for a single unpaper invocation.

    Yields:
        ``(input_png, output_pnm, tmppath)`` where ``input_png`` is the input
        file itself, ``output_pnm`` is ``output.pnm`` inside a fresh temporary
        directory, and ``tmppath`` is that directory. The directory is removed
        when the context exits.

    Raises:
        UnpaperImageTooLargeError: If width * height of the input image is at
            or above ``UNPAPER_IMAGE_PIXEL_LIMIT``.
    """
    with Image.open(input_file) as im:
        width, height = im.width, im.height
    if width * height >= UNPAPER_IMAGE_PIXEL_LIMIT:
        raise UnpaperImageTooLargeError(w=width, h=height)

    with TemporaryDirectory(ignore_cleanup_errors=True) as scratch:
        scratch_dir = Path(scratch)
        # The input is passed through unchanged; only the output lives in the
        # scratch directory. unpaper can write .png too, but it seems to write
        # them slowly (adds seconds to the test suite), so PNM is requested.
        yield input_file, scratch_dir / 'output.pnm', scratch_dir


def run_unpaper(
    input_file: Path, output_file: Path, *, dpi: DecFloat, mode_args: list[str]
) -> None:
    """Run unpaper on ``input_file`` and save the result to ``output_file``.

    Args:
        input_file: Image to clean.
        output_file: Where the cleaned image is saved, tagged with ``dpi``.
        dpi: Resolution passed to unpaper (rounded to 6 places) and stored in
            the saved image.
        mode_args: Extra unpaper command line arguments.

    Raises:
        UnpaperImageTooLargeError: Propagated from ``_setup_unpaper_io``.
        SubprocessOutputError: If unpaper's output file cannot be opened or
            re-saved.
    """
    base_command = ['unpaper', '-v', '--dpi', str(round(dpi, 6))]
    command = base_command + mode_args

    with _setup_unpaper_io(input_file) as (source_image, result_pnm, workdir):
        # To prevent any shenanigans from accepting arbitrary parameters in
        # --unpaper-args, we:
        # 1) run with cwd set to a tmpdir with only unpaper's files
        # 2) forbid the use of '/' in arguments, to prevent changing paths
        # 3) append absolute paths for the input and output file
        # This should ensure that a user cannot clobber some other file with
        # their unpaper arguments (whether intentionally or otherwise)
        command += [os.fspath(source_image), os.fspath(result_pnm)]
        run(
            command,
            cwd=workdir,
            check=True,
            close_fds=True,
            stdout=PIPE,  # unpaper cannot send file output to stdout,
            stderr=STDOUT,  # and writes logging output to stdout and stderr
            logs_errors_to_stdout=True,
        )
        try:
            with Image.open(result_pnm) as cleaned:
                cleaned.save(output_file, dpi=(dpi, dpi))
        except OSError as e:
            message = (
                "unpaper: failed to produce the expected output file. "
                + " Called with: "
                + str(command)
            )
            raise SubprocessOutputError(message) from e


def clean(
    input_file: Path,
    output_file: Path,
    *,
    dpi: DecFloat,
    unpaper_args: list[str] | None = None,
) -> Path:
    """Clean an image with unpaper, falling back to the input if it is too large.

    Args:
        input_file: Image to clean.
        output_file: Destination for the cleaned image.
        dpi: Resolution of the image.
        unpaper_args: Arguments to pass to unpaper; when empty or None a
            conservative default set is used.

    Returns:
        ``output_file`` when unpaper ran, or ``input_file`` when the image was
        too large for unpaper (a warning is logged in that case).
    """
    effective_args = unpaper_args or [
        '--layout',
        'none',
        '--mask-scan-size',
        '100',  # don't blank out narrow columns
        '--no-border-align',  # don't align visible content to borders
        '--no-mask-center',  # don't center visible content within page
        '--no-grayfilter',  # don't remove light gray areas
        '--no-blackfilter',  # don't remove solid black areas
        '--no-deskew',  # don't deskew
    ]
    try:
        run_unpaper(input_file, output_file, dpi=dpi, mode_args=effective_args)
    except UnpaperImageTooLargeError as e:
        log.warning(str(e))
        return input_file
    return output_file


================================================
FILE: src/ocrmypdf/_exec/verapdf.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Interface to verapdf executable."""

from __future__ import annotations

import json
import logging
from pathlib import Path
from subprocess import PIPE
from typing import NamedTuple

from packaging.version import Version

from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.subprocess import get_version, run

log = logging.getLogger(__name__)


class ValidationResult(NamedTuple):
    """Result of PDF/A validation, as returned by ``validate``."""

    # True only when verapdf reported zero failed rules.
    valid: bool
    # Number of failed rules; ``validate`` uses -1 when the report is missing,
    # incomplete or could not be parsed.
    failed_rules: int
    # Human-readable summary of the outcome.
    message: str


def version() -> Version:
    """Return the verapdf version as reported by ``get_version``."""
    version_text = get_version('verapdf', regex=r'veraPDF (\d+(\.\d+)*)')
    return Version(version_text)


def available() -> bool:
    """Report whether verapdf can be used.

    Returns:
        False if querying the version raises ``MissingDependencyError`` or
        ``OSError``; True otherwise.
    """
    is_available = True
    try:
        version()
    except (MissingDependencyError, OSError):
        is_available = False
    return is_available


def output_type_to_flavour(output_type: str) -> str:
    """Translate an OCRmyPDF output_type into a verapdf flavour.

    Args:
        output_type: One of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'

    Returns:
        verapdf flavour string like '1b', '2b', '3b'. Any unrecognized
        output_type maps to '2b'.
    """
    fallback = '2b'
    flavours = {
        'pdfa-1': '1b',
        'pdfa-2': '2b',
        'pdfa-3': '3b',
        'pdfa': fallback,
    }
    try:
        return flavours[output_type]
    except KeyError:
        return fallback


def validate(input_file: Path, flavour: str) -> ValidationResult:
    """Validate a PDF against a PDF/A profile.

    Runs ``verapdf --format json --flavour <flavour> <input_file>`` and reads
    the first validation result of the first job in the JSON report.

    Args:
        input_file: Path to PDF file to validate
        flavour: verapdf flavour (1a, 1b, 2a, 2b, 2u, 3a, 3b, 3u)

    Returns:
        ValidationResult with validation status. ``failed_rules`` is -1 when
        the report is missing, incomplete, or cannot be parsed.

    Raises:
        MissingDependencyError: If the verapdf executable cannot be found.
    """
    args = [
        'verapdf',
        '--format',
        'json',
        '--flavour',
        flavour,
        str(input_file),
    ]

    try:
        # check=False: the exit status is not used; the outcome is taken from
        # the JSON report on stdout.
        proc = run(args, stdout=PIPE, stderr=PIPE, check=False)
    except FileNotFoundError as e:
        raise MissingDependencyError('verapdf') from e

    try:
        result = json.loads(proc.stdout)
        jobs = result.get('report', {}).get('jobs', [])
        if not jobs:
            return ValidationResult(False, -1, 'No validation jobs in result')
        validation_results = jobs[0].get('validationResult', [])
        if not validation_results:
            return ValidationResult(False, -1, 'No validation result in output')
        validation_result = validation_results[0]
        details = validation_result.get('details', {})
        failed_rules = details.get('failedRules', 0)

        if failed_rules == 0:
            return ValidationResult(True, 0, 'PDF/A validation passed')
        else:
            return ValidationResult(
                False,
                failed_rules,
                f'PDF/A validation failed with {failed_rules} rule violations',
            )
    except (ValueError, AttributeError, IndexError, KeyError, TypeError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # AttributeError/IndexError/KeyError/TypeError cover output that is
        # valid JSON but not shaped like a verapdf report (e.g. a top-level
        # list, or "report": null), so callers always get a ValidationResult.
        log.debug('Failed to parse verapdf output: %s', e)
        return ValidationResult(False, -1, f'Failed to parse verapdf output: {e}')


================================================
FILE: src/ocrmypdf/_graft.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""For grafting text-only PDF pages onto freeform PDF pages."""

from __future__ import annotations

import logging
from contextlib import suppress
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from ocrmypdf.hocrtransform import OcrElement

from pikepdf import (
    Dictionary,
    Name,
    Operator,
    Page,
    Pdf,
    Stream,
    parse_content_stream,
    unparse_content_stream,
)

from ocrmypdf._jobcontext import PdfContext
from ocrmypdf._options import ProcessingMode
from ocrmypdf._pipeline import VECTOR_PAGE_DPI


class RenderMode(Enum):
    """Controls where the OCR text layer is placed relative to page content.

    ON_TOP: Text layer renders above page content (reserved for future use).
    UNDERNEATH: Text layer renders below page content (current default behavior).
    """

    # OcrGrafter prepends the text layer to the page contents only when the
    # mode is UNDERNEATH; any other mode adds it without prepending.
    ON_TOP = 0
    UNDERNEATH = 1


@dataclass
class Fpdf2PageInfo:
    """Information needed to render and graft an fpdf2 page.

    Recorded by ``OcrGrafter.graft_page`` when an hOCR file is supplied, and
    turned into ``Fpdf2ParsedPage`` by ``OcrGrafter._parse_hocr_pages``.
    """

    pageno: int  # zero-based page number in the base PDF
    hocr_path: Path  # hOCR file holding the OCR output for this page
    dpi: float  # page DPI as a scalar, taken from pdfinfo
    autorotate_correction: int  # orientation correction in degrees
    emplaced_page: bool  # True if the page was replaced by a rasterized image


@dataclass
class Fpdf2ParsedPage:
    """Parsed page data ready for fpdf2 rendering.

    ``OcrGrafter`` passes ``(pageno, ocr_tree, dpi)`` to the fpdf2 renderer
    and uses the remaining fields to compute rotation when grafting.
    """

    pageno: int  # zero-based page number in the base PDF
    ocr_tree: OcrElement  # OCR result tree for this page
    dpi: float  # DPI handed to the renderer for this page
    autorotate_correction: int  # orientation correction in degrees
    emplaced_page: bool  # True if the page was replaced by a rasterized image


# Alias for backward compatibility with plan documentation: Fpdf2DirectPage is
# the same class object as Fpdf2ParsedPage.
Fpdf2DirectPage = Fpdf2ParsedPage


def _compute_text_misalignment(
    content_rotation: int, autorotate_correction: int, emplaced_page: bool
) -> int:
    """Return the rotation that brings the text layer in line with the page.

    Args:
        content_rotation: Original page /Rotate value (degrees).
        autorotate_correction: Rotation applied during rasterization (degrees).
        emplaced_page: Whether the page content was replaced with rasterized image.

    Returns:
        Rotation in degrees, in the range 0-359, to apply to the text layer.
    """
    # An emplaced page holds a new image that is upright after autorotation, so
    # its effective content rotation equals the correction itself.
    effective_content = autorotate_correction if emplaced_page else content_rotation
    return (autorotate_correction - effective_content) % 360


def _compute_page_rotation(
    content_rotation: int, autorotate_correction: int, emplaced_page: bool
) -> int:
    """Return the /Rotate value the page should carry once grafting is done.

    Args:
        content_rotation: Original page /Rotate value (degrees).
        autorotate_correction: Rotation applied during rasterization (degrees).
        emplaced_page: Whether the page content was replaced with rasterized image.

    Returns:
        Final /Rotate value for the page, in the range 0-359.
    """
    # For an emplaced page the correction stands in for the content rotation,
    # which makes the result 0.
    base_rotation = autorotate_correction if emplaced_page else content_rotation
    return (base_rotation - autorotate_correction) % 360


def _build_text_layer_ctm(
    text_width: float,
    text_height: float,
    page_width: float,
    page_height: float,
    page_origin_x: float,
    page_origin_y: float,
    text_rotation: int,
):
    """Compose the matrix that maps the text layer onto the target page.

    The full transform is always computed so that non-zero page origins
    (e.g., JSTOR PDFs with MediaBox like [0, 100, 595, 982]) and small scale
    differences due to DPI rounding are handled.

    Args:
        text_width: Width of text layer mediabox.
        text_height: Height of text layer mediabox.
        page_width: Width of target page mediabox.
        page_height: Height of target page mediabox.
        page_origin_x: X origin of target page mediabox.
        page_origin_y: Y origin of target page mediabox.
        text_rotation: Rotation in degrees (clockwise) to apply to text layer.

    Returns:
        pikepdf.Matrix transformation matrix, or None if identity.
    """
    from pikepdf import Matrix

    # 1. Move the centre of the text layer to the origin.
    center_on_origin = Matrix().translated(-text_width / 2, -text_height / 2)

    # 2. Rotate; the angle is negated because the input is a clockwise angle.
    rotation = Matrix().rotated(-text_rotation % 360)

    # 3. Scale to the page size. A quarter turn exchanges width and height, and
    #    a zero dimension falls back to a scale factor of 1.
    if text_rotation in (90, 270):
        rotated_w, rotated_h = text_height, text_width
    else:
        rotated_w, rotated_h = text_width, text_height
    fit_to_page = Matrix().scaled(
        page_width / rotated_w if rotated_w else 1.0,
        page_height / rotated_h if rotated_h else 1.0,
    )

    # 4. Move to the centre of the page, then offset by the page origin.
    recenter = Matrix().translated(page_width / 2, page_height / 2)
    to_page_origin = Matrix().translated(page_origin_x, page_origin_y)

    ctm = center_on_origin @ rotation @ fit_to_page @ recenter @ to_page_origin

    # An identity result is reported as None.
    return None if ctm == Matrix() else ctm


# Module-level logger.
log = logging.getLogger(__name__)
# NOTE(review): not referenced in the code visible here; presumably an upper
# bound on page replacements - confirm against other users of this module.
MAX_REPLACE_PAGES = 100


def _ensure_dictionary(obj: Dictionary | Stream, name: Name):
    """Return ``obj[name]``, creating an empty Dictionary there if it is absent."""
    if name in obj:
        return obj[name]
    obj[name] = Dictionary({})
    # Read the entry back rather than returning the object that was assigned.
    return obj[name]


def strip_invisible_text(pdf: Pdf, page: Page):
    """Rewrite the page's content stream without its invisible text objects.

    A text object (``BT`` ... ``ET``) is dropped when the text render mode in
    effect at its ``ET`` operator is 3 (invisible text in PDF). The render
    mode is tracked through ``Tr`` and saved/restored on ``q``/``Q``. A text
    object that is still open when the stream ends is not emitted. The page's
    ``Contents`` is replaced by a new stream holding what is kept.

    Args:
        pdf: The Pdf that owns ``page``; the new content stream is created in it.
        page: The page to rewrite in place.
    """
    op_set_mode = Operator('Tr')
    op_save, op_restore = Operator('q'), Operator('Q')
    op_begin_text, op_end_text = Operator('BT'), Operator('ET')

    kept = []  # instructions that make it into the rewritten stream
    pending = []  # instructions of the text object currently being read
    inside_text = False
    current_mode = 0
    saved_modes = []

    for operands, operator in parse_content_stream(page, ''):
        # Track the text render mode across graphics state save/restore.
        if operator == op_set_mode:
            current_mode = operands[0]
        elif operator == op_save:
            saved_modes.append(current_mode)
        elif operator == op_restore:
            # IndexError is raised if stack is empty; try to carry on
            with suppress(IndexError):
                current_mode = saved_modes.pop()

        instruction = (operands, operator)
        if inside_text:
            pending.append(instruction)
            if operator == op_end_text:
                inside_text = False
                if current_mode != 3:
                    kept.extend(pending)
                pending.clear()
        elif operator == op_begin_text:
            inside_text = True
            pending.append(instruction)
        else:
            kept.append(instruction)

    page.Contents = Stream(pdf, unparse_content_stream(kept))


class OcrGrafter:
    """Manages grafting text-only PDFs onto regular PDFs."""

    def __init__(self, context: PdfContext):
        """Open the base PDF named by ``context.origin`` and reset grafting state.

        Args:
            context: Context of the current run; supplies the origin path, the
                page information, the options and the work-folder paths.
        """
        self.context = context
        self.path_base = context.origin
        self.pdfinfo = context.pdfinfo
        self.output_file = context.get_path('graft_layers.pdf')

        self.pdf_base = Pdf.open(self.path_base)

        self.emplacements = 1
        self.render_mode = RenderMode.UNDERNEATH

        # Any renderer other than 'sandwich' goes through the fpdf2 path
        self.use_sandwich_renderer = context.options.pdf_renderer == 'sandwich'

        # For fpdf2: pages are accumulated here and rendered in finalize()
        self.fpdf2_hocr_pages: list[Fpdf2PageInfo] = []
        self.fpdf2_parsed_pages: list[Fpdf2ParsedPage] = []

    def graft_page(
        self,
        *,
        pageno: int,
        image: Path | None,
        ocr_output: Path | None,
        ocr_tree: OcrElement | None,
        autorotate_correction: int,
    ):
        """Graft OCR output onto a page of the base PDF.

        Args:
            pageno: Zero-based page number.
            image: Path to the visible page image PDF, or None if not replacing.
            ocr_output: Path to OCR output file. For fpdf2 renderer this is an
                hOCR file; for sandwich renderer this is a text-only PDF.
            ocr_tree: OCR tree for fpdf2 renderer.
            autorotate_correction: Orientation correction in degrees (0, 90, 180, 270).
        """
        if ocr_output and ocr_tree:
            raise ValueError(
                'Cannot specify both ocr_output and ocr_tree for fpdf2 renderer'
            )
        # Handle image emplacement first
        emplaced_page = False
        content_rotation = self.pdfinfo[pageno].rotation
        path_image = Path(image).resolve() if image else None
        if path_image is not None and path_image != self.path_base:
            # We are updating the old page with a rasterized PDF of the new
            # page (without changing objgen, to preserve references)
            log.debug("Emplacement update")
            with Pdf.open(path_image) as pdf_image:
                self.emplacements += 1
                foreign_image_page = pdf_image.pages[0]
                self.pdf_base.pages.append(foreign_image_page)
                local_image_page = self.pdf_base.pages[-1]
                self.pdf_base.pages[pageno].emplace(
                    local_image_page, retain=(Name.Parent,)
                )
                del self.pdf_base.pages[-1]
            emplaced_page = True

        if self.use_sandwich_renderer:
            # Sandwich renderer: graft pre-rendered PDF immediately
            if ocr_output:
                text_misaligned = _compute_text_misalignment(
                    content_rotation, autorotate_correction, emplaced_page
                )
                self._graft_sandwich_text_layer(
                    pageno=pageno,
                    textpdf=ocr_output,
                    text_rotation=text_misaligned,
                )
                page_rotation = _compute_page_rotation(
                    content_rotation, autorotate_correction, emplaced_page
                )
                self.pdf_base.pages[pageno].Rotate = page_rotation
        else:
            # fpdf2 renderer: accumulate page info for batch rendering.
            # The hOCR coordinates are in the corrected (upright) coordinate system.
            # We store autorotate_correction and emplaced_page to set the final
            # page /Rotate tag after grafting.
            if ocr_tree:
                self.fpdf2_parsed_pages.append(
                    Fpdf2ParsedPage(
                        ocr_tree=ocr_tree,
                        pageno=pageno,
                        autorotate_correction=autorotate_correction,
                        emplaced_page=emplaced_page,
                        dpi=self.pdfinfo[pageno].dpi.to_scalar(),
                    )
                )
            if ocr_output:
                self.fpdf2_hocr_pages.append(
                    Fpdf2PageInfo(
                        hocr_path=ocr_output,
                        pageno=pageno,
                        autorotate_correction=autorotate_correction,
                        emplaced_page=emplaced_page,
                        dpi=self.pdfinfo[pageno].dpi.to_scalar(),
                    )
                )

    def finalize(self):
        """Render any pending fpdf2 pages, save the grafted PDF and close it.

        The base PDF is closed whether or not parsing, rendering or saving
        succeeds, so a failure does not leave the input file open.

        Returns:
            Path of the saved output file.
        """
        try:
            # Can have hocr OR parsed pages OR neither (no OCR), but not both
            assert not (
                self.fpdf2_hocr_pages and self.fpdf2_parsed_pages
            ), "Can't have both hocr and ocrtree pages"

            if self.fpdf2_hocr_pages:
                # Convert the recorded hOCR files into parsed pages first
                self.fpdf2_parsed_pages = self._parse_hocr_pages()

            if self.fpdf2_parsed_pages:
                # Render all pages with fpdf2, then graft
                self._render_and_graft_fpdf2_pages()

            self.pdf_base.save(self.output_file)
        finally:
            self.pdf_base.close()
        return self.output_file

    def _parse_hocr_pages(self):
        """Parse the accumulated hOCR files into ``Fpdf2ParsedPage`` records.

        Zero-length hOCR files are skipped, so the result may contain fewer
        entries than ``self.fpdf2_hocr_pages``.

        Returns:
            List of ``Fpdf2ParsedPage``, in the order the pages were recorded.
        """
        from ocrmypdf.hocrtransform.hocr_parser import HocrParser

        log.info(
            "Parsing %d pages with HocrParser",
            len(self.fpdf2_hocr_pages),
        )

        parsed: list[Fpdf2ParsedPage] = []
        for info in self.fpdf2_hocr_pages:
            # An empty hOCR file carries no OCR result for the page
            if info.hocr_path.stat().st_size == 0:
                continue

            tree = HocrParser(info.hocr_path).parse()
            parsed.append(
                Fpdf2ParsedPage(
                    pageno=info.pageno,
                    ocr_tree=tree,
                    # Use DPI from hOCR (scan_res) which reflects actual
                    # rasterization DPI. Fall back to pdfinfo DPI or
                    # VECTOR_PAGE_DPI for vector-only pages.
                    dpi=tree.dpi or info.dpi or float(VECTOR_PAGE_DPI),
                    autorotate_correction=info.autorotate_correction,
                    emplaced_page=info.emplaced_page,
                )
            )

        return parsed

    def _render_and_graft_fpdf2_pages(self):
        """Render all parsed pages into one PDF and graft each onto the base PDF.

        Page ``idx`` of the rendered file is grafted onto the base page named
        by the ``idx``-th entry of ``self.fpdf2_parsed_pages``, and that base
        page's /Rotate is then updated. The intermediate file is deleted
        unless temporary files are being kept.
        """
        from ocrmypdf.font import MultiFontManager
        from ocrmypdf.fpdf_renderer import Fpdf2MultiPageRenderer

        rendered_path = self.context.get_path('fpdf2_multipage.pdf')
        fonts = MultiFontManager(Path(__file__).parent / "data")

        # The renderer takes (pageno, ocr_tree, dpi) tuples
        renderer = Fpdf2MultiPageRenderer(
            pages_data=[
                (page.pageno, page.ocr_tree, page.dpi)
                for page in self.fpdf2_parsed_pages
            ],
            multi_font_manager=fonts,
            invisible_text=True,
        )
        renderer.render(rendered_path)

        # Graft each rendered page onto its base page
        with Pdf.open(rendered_path) as pdf_text:
            for idx, parsed in enumerate(self.fpdf2_parsed_pages):
                rotation_inputs = (
                    self.pdfinfo[parsed.pageno].rotation,
                    parsed.autorotate_correction,
                    parsed.emplaced_page,
                )
                self._graft_fpdf2_text_layer(
                    parsed.pageno,
                    pdf_text.pages[idx],
                    _compute_text_misalignment(*rotation_inputs),
                )
                self.pdf_base.pages[parsed.pageno].Rotate = _compute_page_rotation(
                    *rotation_inputs
                )

        # Clean up multi-page PDF if not keeping temp files
        if not self.context.options.keep_temporary_files:
            with suppress(FileNotFoundError):
                rendered_path.unlink()

    def _graft_fpdf2_text_layer(self, pageno: int, text_page: Page, text_rotation: int):
        """Graft one rendered text page onto a page of the base PDF.

        The text page's content stream becomes a Form XObject on the base
        page, with the text page's fonts and ExtGState entries copied into the
        XObject's resources. A small content stream that draws the XObject is
        then added to the base page.

        Args:
            pageno: Zero-based page number.
            text_page: The text-only PDF page to graft.
            text_rotation: Rotation to apply to align text with content (degrees).
        """
        from pikepdf import Array

        base_page = self.pdf_base.pages[pageno]
        text_stream_bytes = text_page.Contents.read_bytes()

        # Size of the text page
        text_box = Array([float(x) for x in text_page.mediabox])  # type: ignore[misc]
        text_w = float(text_box[2]) - float(text_box[0])
        text_h = float(text_box[3]) - float(text_box[1])

        # Size of the base page
        base_mediabox = base_page.mediabox
        page_w = float(base_mediabox[2]) - float(base_mediabox[0])
        page_h = float(base_mediabox[3]) - float(base_mediabox[1])

        # Register a Form XObject holding the text content on the base page
        page_resources = _ensure_dictionary(base_page.obj, Name.Resources)
        page_xobjects = _ensure_dictionary(page_resources, Name.XObject)
        xobj_name = Name.random(prefix="OCR-")
        xobj = self.pdf_base.make_stream(text_stream_bytes)
        page_xobjects[xobj_name] = xobj
        xobj.Type = Name.XObject
        xobj.Subtype = Name.Form
        xobj.FormType = 1
        xobj.BBox = base_mediabox

        # text_page belongs to a foreign PDF, so each resource is copied over
        # individually. Fonts are needed for the text; ExtGState (graphics
        # state) is needed for transparency.
        if hasattr(text_page, 'Resources') and text_page.Resources:
            xobj_resources = _ensure_dictionary(xobj, Name.Resources)
            for category in (Name.Font, Name.ExtGState):
                if category not in text_page.Resources:
                    continue
                destination = _ensure_dictionary(xobj_resources, category)
                for res_name, res_obj in text_page.Resources[category].items():
                    destination[res_name] = self.pdf_base.copy_foreign(res_obj)

        # Transformation matrix for rotation and scaling
        ctm = _build_text_layer_ctm(
            text_w,
            text_h,
            page_w,
            page_h,
            float(base_mediabox[0]),
            float(base_mediabox[1]),
            text_rotation,
        )
        if ctm is None:
            draw_commands = b'q\n' + (b'%s Do\n' % xobj_name) + b'\nQ\n'
        else:
            draw_commands = (
                (b'q %s cm\n' % ctm.encode()) + (b'%s Do\n' % xobj_name) + b'Q\n'
            )

        text_layer = Stream(self.pdf_base, draw_commands)

        # In redo mode the old invisible text is removed first
        if self.context.options.mode == ProcessingMode.redo:
            strip_invisible_text(self.pdf_base, base_page)

        # Add the text layer, prepended when it must go underneath the content
        base_page.contents_coalesce()
        base_page.contents_add(
            text_layer, prepend=self.render_mode == RenderMode.UNDERNEATH
        )
        base_page.contents_coalesce()

    def _graft_sandwich_text_layer(
        self,
        *,
        pageno: int,
        textpdf: Path,
        text_rotation: int,
    ):
        """Graft a pre-rendered text-only PDF onto the base PDF.

        This is used by the sandwich renderer which generates PDFs directly
        from Tesseract rather than going through hOCR.

        The first page of ``textpdf`` becomes a Form XObject on the base page
        and a content stream drawing it is added to that page. A zero-length
        ``textpdf`` is ignored, and ``FileNotFoundError``/``PdfError`` raised
        while opening or processing it are swallowed.

        Args:
            pageno: Zero-based page number in the base PDF.
            textpdf: Path to the text-only PDF; only its first page is used.
            text_rotation: Rotation to apply to align text with content (degrees).
        """
        from pikepdf import PdfError

        log.debug("Grafting sandwich text layer")
        # NOTE(review): this stat() is outside the try block below, so a
        # missing file raises FileNotFoundError here instead of being ignored.
        if Path(textpdf).stat().st_size == 0:
            return

        try:
            with Pdf.open(textpdf) as pdf_text:
                pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

                base_page = self.pdf_base.pages[pageno]

                # Get font from the text PDF
                pdf_text_fonts = pdf_text.pages[0].Resources.get(
                    Name.Font, Dictionary()
                )
                font = None
                font_key = None
                # Only these two font names are looked for; the first one
                # found is copied into the base PDF.
                for f in ('/f-0-0', '/F1'):
                    pdf_text_font = pdf_text_fonts.get(f, None)
                    if pdf_text_font is not None:
                        font_key = Name(f)
                        font = self.pdf_base.copy_foreign(pdf_text_font)
                        break

                # Get mediabox dimensions for rotation calculations
                mediabox = pdf_text.pages[0].mediabox
                wt = float(mediabox[2]) - float(mediabox[0])
                ht = float(mediabox[3]) - float(mediabox[1])

                base_mediabox = base_page.mediabox
                wp = float(base_mediabox[2]) - float(base_mediabox[0])
                hp = float(base_mediabox[3]) - float(base_mediabox[1])

                # Build transformation matrix for rotation and scaling
                ctm = _build_text_layer_ctm(
                    wt,
                    ht,
                    wp,
                    hp,
                    float(base_mediabox[0]),
                    float(base_mediabox[1]),
                    text_rotation,
                )
                log.debug("Grafting with ctm %r", ctm)

                # Create Form XObject
                base_resources = _ensure_dictionary(base_page.obj, Name.Resources)
                base_xobjs = _ensure_dictionary(base_resources, Name.XObject)
                text_xobj_name = Name.random(prefix="OCR-")
                xobj = self.pdf_base.make_stream(pdf_text_contents)
                base_xobjs[text_xobj_name] = xobj
                xobj.Type = Name.XObject
                xobj.Subtype = Name.Form
                xobj.FormType = 1
                xobj.BBox = base_mediabox

                # Add font to xobj resources
                if font_key is not None and font is not None:
                    xobj_resources = _ensure_dictionary(xobj, Name.Resources)
                    xobj_fonts = _ensure_dictionary(xobj_resources, Name.Font)
                    if font_key not in xobj_fonts:
                        xobj_fonts[font_key] = font

                # Content stream that draws the XObject, wrapped in q/Q and
                # preceded by the transformation matrix when there is one.
                if ctm is not None:
                    pdf_draw_xobj = (
                        (b'q %s cm\n' % ctm.encode())
                        + (b'%s Do\n' % text_xobj_name)
                        + b'\nQ\n'
                    )
                else:
                    pdf_draw_xobj = b'q\n' + (b'%s Do\n' % text_xobj_name) + b'\nQ\n'
                new_text_layer = Stream(self.pdf_base, pdf_draw_xobj)

                # In redo mode, remove the old invisible text first
                if self.context.options.mode == ProcessingMode.redo:
                    strip_invisible_text(self.pdf_base, base_page)
                # The new layer is prepended when the render mode is UNDERNEATH
                base_page.contents_coalesce()
                base_page.contents_add(
                    new_text_layer, prepend=self.render_mode == RenderMode.UNDERNEATH
                )
                base_page.contents_coalesce()

                # Add font to page resources
                # (an existing page font with the same name is not overwritten)
                if font_key is not None and font is not None:
                    page_resources = _ensure_dictionary(base_page.obj, Name.Resources)
                    page_fonts = _ensure_dictionary(page_resources, Name.Font)
                    if font_key not in page_fonts:
                        page_fonts[font_key] = font
        except (FileNotFoundError, PdfError):
            # PdfError occurs if a 0-length file is written e.g. due to OCR timeout
            pass


================================================
FILE: src/ocrmypdf/_jobcontext.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Defines context objects that are passed to child processes/threads."""

from __future__ import annotations

from collections.abc import Iterator
from pathlib import Path
from typing import TYPE_CHECKING

from ocrmypdf._options import OcrOptions
from ocrmypdf.pdfinfo import PdfInfo
from ocrmypdf.pdfinfo.info import PageInfo

if TYPE_CHECKING:
    from ocrmypdf._plugin_manager import OcrmypdfPluginManager


class PdfContext:
    """Holds the context for a particular run of the pipeline."""

    options: OcrOptions  #: The specified options for processing this PDF.
    origin: Path  #: The filename of the original input file.
    pdfinfo: PdfInfo  #: Detailed data for this PDF.
    plugin_manager: (
        OcrmypdfPluginManager  #: PluginManager for processing the current PDF.
    )

    def __init__(
        self,
        options: OcrOptions,
        work_folder: Path,
        origin: Path,
        pdfinfo: PdfInfo,
        plugin_manager,
    ):
        """Store the objects that describe one run of the pipeline.

        Args:
            options: Options for processing this PDF.
            work_folder: Folder for intermediate files of this run.
            origin: The original input file.
            pdfinfo: Detailed data for this PDF; indexed by page number.
            plugin_manager: Plugin manager for this run.
        """
        self.options, self.work_folder = options, work_folder
        self.origin, self.pdfinfo = origin, pdfinfo
        self.plugin_manager = plugin_manager

    def get_path(self, name: str) -> Path:
        """Generate a ``Path`` for an intermediate file involved in processing.

        The path will be in a temporary folder that is common for all processing
        of this particular PDF.
        """
        return self.work_folder.joinpath(name)

    def get_page_contexts(self) -> Iterator[PageContext]:
        """Get all ``PageContext`` for this PDF."""
        yield from (PageContext(self, n) for n in range(len(self.pdfinfo)))

    def get_page_context_args(self) -> Iterator[tuple[PageContext]]:
        """Get all ``PageContext`` for this PDF packaged in tuple for args-splatting."""
        yield from ((PageContext(self, n),) for n in range(len(self.pdfinfo)))


class PageContext:
    """Per-page processing context.

    Instances must be pickle-able, so they hold only simple data or objects
    that can serialize themselves via ``__getstate__``.

    Note: the ``OcrOptions`` object is not pickled directly; it travels as a
    JSON string and is rebuilt on the receiving side.
    """

    origin: Path  #: The filename of the original input file.
    pageno: int  #: This page number (zero-based).
    pageinfo: PageInfo  #: Information on this page.
    plugin_manager: (
        OcrmypdfPluginManager  #: PluginManager for processing the current PDF.
    )

    def __init__(self, pdf_context: PdfContext, pageno):
        # Copy what the page needs out of the document context rather than
        # keeping a reference to the PdfContext itself.
        self.work_folder = pdf_context.work_folder
        self.origin = pdf_context.origin
        self.options = pdf_context.options
        self.pageno = pageno
        self.pageinfo = pdf_context.pdfinfo[pageno]
        self.plugin_manager = pdf_context.plugin_manager
        # Deliberately empty: no back-reference to the owning PdfContext.
        self._pdf_context = None

    def get_path(self, name: str) -> Path:
        """Return the location of per-page intermediate file ``name``.

        The file lives in the shared work folder and is prefixed with the
        one-based page number, zero-padded to six digits.
        """
        prefixed_name = f"{(self.pageno + 1):06d}_{name}"
        return self.work_folder / prefixed_name

    def __getstate__(self):
        # Keep every attribute except the options object (replaced by its
        # JSON form below) and the PdfContext placeholder.
        excluded = ('options', '_pdf_context')
        state = {
            key: value for key, value in self.__dict__.items() if key not in excluded
        }
        state['options_json'] = self.options.model_dump_json_safe()
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)

        if 'options_json' not in state:
            # No serialized options present; leave whatever ``state`` supplied
            # untouched for compatibility.
            return

        from ocrmypdf._options import OcrOptions

        # Rebuild the options object from its JSON representation.
        self.options = OcrOptions.model_validate_json_safe(state['options_json'])


================================================
FILE: src/ocrmypdf/_logging.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Logging support classes."""

from __future__ import annotations

import logging

from rich.console import Console
from rich.logging import RichHandler


class PageNumberFilter(logging.Filter):
    """Normalize the ``pageno`` attribute of log records for formatting."""

    def filter(self, record):
        """Rewrite ``record.pageno`` into a printable prefix; never drop records.

        A missing/``None`` page number becomes an empty string and an integer
        page number becomes a 5-wide right-aligned field followed by a space.
        Any other value is left as it is.
        """
        current = getattr(record, 'pageno', None)
        if current is None:
            record.pageno = ''
        elif isinstance(current, int):
            record.pageno = f'{current:5d} '
        return True


class RichLoggingHandler(RichHandler):
    """``RichHandler`` preset with level, time and markup rendering disabled."""

    def __init__(self, console: Console, **kwargs):
        # Remaining keyword arguments are forwarded to RichHandler unchanged.
        super().__init__(
            console=console, show_level=False, show_time=False, markup=False, **kwargs
        )


================================================
FILE: src/ocrmypdf/_metadata.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""OCRmyPDF page processing pipeline functions."""

from __future__ import annotations

import datetime as dt
import logging
import os
from pathlib import Path
from typing import Any

from pikepdf import Dictionary, Name, Pdf
from pikepdf import __version__ as PIKEPDF_VERSION
from pikepdf.models.metadata import PdfMetadata, encode_pdf_date

from ocrmypdf._defaults import PROGRAM_NAME
from ocrmypdf._jobcontext import PdfContext
from ocrmypdf._version import __version__ as OCRMYPF_VERSION
from ocrmypdf.languages import iso_639_2_from_3

log = logging.getLogger(__name__)


def get_docinfo(base_pdf: Pdf, context: PdfContext) -> dict[str, str]:
    """Build the DocumentInfo dictionary for the output PDF.

    Values are read from ``base_pdf.docinfo`` (an unreadable or missing entry
    becomes an empty string), then overridden by any non-empty title, author,
    keywords or subject given in the options. Creator, Producer and ModDate
    are always set fresh.
    """
    options = context.options
    copied_keys = ('/Title', '/Author', '/Keywords', '/Subject', '/CreationDate')

    pdfmark: dict[str, str] = {}
    for key in copied_keys:
        try:
            pdfmark[key] = str(base_pdf.docinfo[key])
        except (KeyError, TypeError):
            pdfmark[key] = ''

    # Options given by the user take precedence over the input's values.
    overrides = (
        ('/Title', options.title),
        ('/Author', options.author),
        ('/Keywords', options.keywords),
        ('/Subject', options.subject),
    )
    for key, value in overrides:
        if value:
            pdfmark[key] = value

    ocr_engine = context.plugin_manager.get_ocr_engine(options=options)
    creator_tag = ocr_engine.creator_tag(options)

    pdfmark['/Creator'] = f'{PROGRAM_NAME} {OCRMYPF_VERSION} / {creator_tag}'
    pdfmark['/Producer'] = f'pikepdf {PIKEPDF_VERSION}'
    pdfmark['/ModDate'] = encode_pdf_date(dt.datetime.now(dt.UTC))
    return pdfmark


def report_on_metadata(options, missing):
    """Log which input metadata fields did not make it into the output.

    Args:
        options: Options object; only ``output_type`` is read.
        missing: Collection of metadata keys present in the input but absent
            from the output. Nothing is logged when it is empty.

    For PDF/A output types the loss is reported as a warning (with the field
    list at debug level); otherwise it is reported as an error (with the field
    list at info level).
    """
    if not missing:
        return
    if options.output_type.startswith('pdfa'):
        log.warning(
            "Some input metadata could not be copied because it is not "
            "permitted in PDF/A. You may wish to examine the output "
            "PDF's XMP metadata."
        )
        log.debug("The following metadata fields were not copied: %r", missing)
    else:
        # Note the trailing space: the adjacent literals are concatenated, and
        # without it the sentences run together as "copied.You may ...".
        log.error(
            "Some input metadata could not be copied. "
            "You may wish to examine the output PDF's XMP metadata."
        )
        log.info("The following metadata fields were not copied: %r", missing)


def repair_docinfo_nuls(pdf):
    """Strip NUL bytes from string entries of the PDF's DocumentInfo block.

    A malformed DocumentInfo block is reported with an error log message and
    processing continues.

    Returns:
        True if at least one entry was rewritten, otherwise False.
    """
    changed = False
    try:
        if not isinstance(pdf.docinfo, Dictionary):
            raise TypeError("DocumentInfo is not a dictionary")
        for key, value in pdf.docinfo.items():
            if not isinstance(value, str):
                continue
            raw = bytes(value)
            if b'\x00' in raw:
                pdf.docinfo[key] = raw.replace(b'\x00', b'')
                changed = True
    except TypeError:
        # Besides the explicit raise above, TypeError can surface when
        # dictionary items are of unexpected types.
        log.error("File contains a malformed DocumentInfo block - continuing anyway.")
    return changed


def should_linearize(working_file: Path, context: PdfContext) -> bool:
    """Decide whether ``working_file`` is large enough to linearize.

    Returns True when the file's size in bytes exceeds
    ``options.fast_web_view`` multiplied by one million; for smaller files
    linearization is not worth the effort.
    """
    size_in_bytes = os.path.getsize(working_file)
    threshold_bytes = context.options.fast_web_view * 1_000_000
    return size_in_bytes > threshold_bytes


def _fix_metadata(meta_original: PdfMetadata, meta_pdf: PdfMetadata):
    # If xmp:CreateDate is missing, set it to the modify date to
    # ensure consistency with Ghostscript.
    if 'xmp:CreateDate' not in meta_pdf:
        meta_pdf['xmp:CreateDate'] = meta_pdf.get('xmp:ModifyDate', '')
    if meta_pdf.get('dc:title') == 'Untitled' and ('dc:title' not in meta_original):
        # Ghostscript likes to set title to Untitled if omitted from input.
        # Reverse this, because PDF/A TechNote 0003:Metadata in PDF/A-1
        # and the XMP Spec do not make this recommendation.
        del meta_pdf['dc:title']


def _unset_empty_metadata(meta: PdfMetadata, options):
    """Unset metadata fields that were explicitly set to empty strings.

    If the user explicitly specified an empty string for any of the
    following, they should be unset and not reported as missing in
    the output pdf. Note that some metadata fields use differing names
    between PDF/A and PDF.
    """
    if options.title == '' and 'dc:title' in meta:
        del meta['dc:title']  # PDF/A and PDF
    if options.author == '':
        if 'dc:creator' in meta:
            del meta['dc:creator']  # PDF/A (Not xmp:CreatorTool)
        if 'pdf:Author' in meta:
            del meta['pdf:Author']  # PDF
    if options.subject == '':
        if 'dc:description' in meta:
            del meta['dc:description']  # PDF/A
        if 'dc:subject' in meta:
            del meta['dc:subject']  # PDF
    if options.keywords == '' and 'pdf:Keywords' in meta:
        del meta['pdf:Keywords']  # PDF/A and PDF


def _set_language(pdf: Pdf, languages: list[str]):
    """Set the PDF's ``/Lang`` from the first entry of ``languages``.

    Nothing is changed when ``/Lang`` is already present in the document
    root, when ``languages`` is empty, when its first entry is empty, or
    when that entry has no ISO 639-2 equivalent.
    """
    if Name.Lang in pdf.Root:
        return  # Already set
    if not languages:
        return  # Nothing to set it to
    primary = languages[0]
    iso639_2 = iso_639_2_from_3(primary) if primary else None
    if iso639_2:
        pdf.Root.Lang = iso639_2


class MetadataProgress:
    """Wrap a progress bar so it can be driven by a percent-complete callback.

    The bar is created with a total of 100 and labelled "Linearizing";
    calling the instance with a percentage forwards it to the bar's
    ``update(completed=...)``.
    """

    def __init__(self, progressbar_class, enable: bool = True):
        self.progressbar_class = progressbar_class
        bar_settings = {
            'total': 100,
            'desc': "Linearizing",
            'unit': '%',
            'disable': not enable,
        }
        self.progressbar = self.progressbar_class(**bar_settings)

    def __enter__(self):
        # Enter the wrapped bar but hand back this wrapper to the caller.
        self.progressbar.__enter__()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Delegate entirely, including the exception-suppression result.
        return self.progressbar.__exit__(exc_type, exc_value, traceback)

    def __call__(self, percent: int):
        if self.progressbar_class:
            self.progressbar.update(completed=percent)


def metadata_fixup(
    working_file: Path, context: PdfContext, pdf_save_settings: dict[str, Any]
) -> Path:
    """Fix certain metadata fields whether PDF or PDF/A.

    Override some of Ghostscript's metadata choices.

    Also report on metadata in the input file that was not retained during
    conversion.

    Args:
        working_file: The PDF whose metadata is to be fixed.
        context: Run context; supplies the options, the original input file
            (``context.origin``), the plugin manager and the work folder.
        pdf_save_settings: Extra keyword arguments passed to ``Pdf.save``.

    Returns:
        Path of the newly written ``metafix.pdf`` in the work folder.
    """
    output_file = context.get_path('metafix.pdf')
    options = context.options

    pbar_class = context.plugin_manager.get_progressbar_class()
    # Open the original input (for its metadata), the working file (to be
    # modified and saved), and the progress reporter together so that all are
    # released when the block exits.
    with (
        Pdf.open(context.origin) as original,
        Pdf.open(working_file) as pdf,
        MetadataProgress(pbar_class, options.progress_bar) as pbar,
    ):
        # DocumentInfo values are taken from the original, with option
        # overrides applied by get_docinfo().
        docinfo = get_docinfo(original, context)
        with (
            original.open_metadata(
                set_pikepdf_as_editor=False, update_docinfo=False, strict=False
            ) as meta_original,
            pdf.open_metadata() as meta_pdf,
        ):
            meta_pdf.load_from_docinfo(
                docinfo, delete_missing=False, raise_failure=False
            )
            _fix_metadata(meta_original, meta_pdf)
            # Apply the same "explicitly blanked" removals to both sides so
            # that fields the user cleared are not counted as missing below.
            _unset_empty_metadata(meta_original, options)
            _unset_empty_metadata(meta_pdf, options)
            # Keys present in the input's metadata but absent from the output's.
            meta_missing = set(meta_original.keys()) - set(meta_pdf.keys())
            report_on_metadata(options, meta_missing)

        _set_language(pdf, options.languages)
        # The MetadataProgress instance is callable and serves as the save
        # progress callback.
        pdf.save(output_file, progress=pbar, **pdf_save_settings)

    return output_file


================================================
FILE: src/ocrmypdf/_options.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Internal options model for OCRmyPDF."""

from __future__ import annotations

import json
import logging
import os
import shlex
import unicodedata
from collections.abc import Sequence
from enum import StrEnum
from io import IOBase
from pathlib import Path
from typing import Any, BinaryIO

from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

from ocrmypdf._defaults import DEFAULT_LANGUAGE, DEFAULT_ROTATE_PAGES_THRESHOLD
from ocrmypdf.exceptions import BadArgsError
from ocrmypdf.helpers import monotonic

# Import plugin option models - these will be available after plugins are loaded
# We'll use forward references and handle imports dynamically

log = logging.getLogger(__name__)

# Module-level registry for plugin option models
# This is populated by setup_plugin_infrastructure() after plugins are loaded
_plugin_option_models: dict[str, type] = {}

PathOrIO = BinaryIO | IOBase | Path | str | bytes


class ProcessingMode(StrEnum):
    """OCR processing mode for handling pages with existing text.

    This enum controls how OCRmyPDF handles pages that already contain text:

    - ``default``: Error if text is found (standard OCR behavior)
    - ``force``: Rasterize all content and run OCR regardless of existing text
    - ``skip``: Skip OCR on pages that already have text
    - ``redo``: Re-OCR pages, stripping old invisible text layer

    As a ``StrEnum``, every member is also a ``str`` equal to its value, so
    members can be built from and compared with plain strings.
    """

    default = 'default'
    force = 'force'
    skip = 'skip'
    redo = 'redo'


class TaggedPdfMode(StrEnum):
    """Control behavior when encountering a Tagged PDF.

    Tagged PDFs often indicate documents generated from office applications
    that may not need OCR. This enum controls how OCRmyPDF handles them:

    - ``default``: Error if ProcessingMode is default, otherwise warn
    - ``ignore``: Always warn but continue processing (never error)

    As a ``StrEnum``, every member is also a ``str`` equal to its value, so
    members can be built from and compared with plain strings.
    """

    default = 'default'
    ignore = 'ignore'


def _pages_from_ranges(ranges: str) -> set[int]:
    """Parse a page-range string into a set of zero-based page numbers.

    ``ranges`` is a comma-separated list whose items are either a single
    one-based page number (``"3"``) or an inclusive one-based range
    (``"2-5"``). Spaces and empty items are ignored.

    Raises:
        BadArgsError: If a range is empty or not numeric, if no pages were
            recognized at all, or if any page number is less than 1.
        ValueError: If a single-page item is not an integer.
    """
    selected: list[int] = []
    for token in ranges.replace(' ', '').split(','):
        if not token:
            continue
        bounds = token.split('-')
        if len(bounds) != 2:
            # Not of the form "a-b": treat the whole item as one page number.
            selected.append(int(token) - 1)
            continue
        first, last = bounds
        try:
            span = list(range(int(first) - 1, int(last)))
            if not span:
                raise BadArgsError(f"invalid page subrange '{first}-{last}'") from None
            selected.extend(span)
        except ValueError:
            raise BadArgsError(f"invalid page subrange '{token}'") from None

    if not selected:
        raise BadArgsError(
            f"The string of page ranges '{ranges}' did not contain any recognizable "
            f"page ranges."
        )

    if not monotonic(selected):
        log.warning(
            "List of pages to process contains duplicate pages, or pages that are "
            "out of order"
        )
    if any(number < 0 for number in selected):
        raise BadArgsError("pages refers to a page number less than 1")

    log.debug("OCRing only these pages: %s", selected)
    return set(selected)


class OcrOptions(BaseModel):
    """Internal options model that can masquerade as argparse.Namespace.

    This model provides proper typing and validation while maintaining
    compatibility with existing code that expects argparse.Namespace behavior.

    Besides the declared fields, the model offers:

    - read-only aliases for the legacy ``force_ocr``/``skip_text``/``redo_ocr``
      booleans, derived from ``mode``;
    - JSON round-tripping via ``model_dump_json_safe`` and
      ``model_validate_json_safe``;
    - lazy, cached access to registered plugin option namespaces such as
      ``options.tesseract`` through ``__getattr__``.
    """

    # I/O options
    input_file: PathOrIO
    output_file: PathOrIO
    sidecar: PathOrIO | None = None
    output_folder: Path | None = None
    work_folder: Path | None = None

    # Core OCR options
    languages: list[str] = Field(default_factory=lambda: [DEFAULT_LANGUAGE])
    output_type: str = 'auto'
    mode: ProcessingMode = ProcessingMode.default

    # Backward compatibility properties for force_ocr, skip_text, redo_ocr
    @property
    def force_ocr(self) -> bool:
        """Backward compatibility alias for mode == ProcessingMode.force."""
        return self.mode == ProcessingMode.force

    @property
    def skip_text(self) -> bool:
        """Backward compatibility alias for mode == ProcessingMode.skip."""
        return self.mode == ProcessingMode.skip

    @property
    def redo_ocr(self) -> bool:
        """Backward compatibility alias for mode == ProcessingMode.redo."""
        return self.mode == ProcessingMode.redo

    # Job control
    jobs: int | None = None
    use_threads: bool = True
    progress_bar: bool = True
    quiet: bool = False
    verbose: int = 0
    keep_temporary_files: bool = False

    # Image processing
    image_dpi: int | None = None
    deskew: bool = False
    clean: bool = False
    clean_final: bool = False
    rotate_pages: bool = False
    remove_background: bool = False
    remove_vectors: bool = False
    oversample: int = 0
    unpaper_args: list[str] | None = None

    # OCR behavior
    skip_big: float | None = None
    pages: str | set[int] | None = None  # Can be string or set after validation
    invalidate_digital_signatures: bool = False
    tagged_pdf_mode: TaggedPdfMode = TaggedPdfMode.default

    # Metadata
    title: str | None = None
    author: str | None = None
    subject: str | None = None
    keywords: str | None = None

    # Optimization (Optimize/JBIG2 options are also accessible via
    # options.optimize.<field>)
    optimize: int = 1
    jpg_quality: int | None = None
    png_quality: int | None = None
    jbig2_threshold: float = 0.85

    # Compatibility alias for plugins that expect jpeg_quality
    @property
    def jpeg_quality(self):
        """Compatibility alias for jpg_quality."""
        return self.jpg_quality

    @jpeg_quality.setter
    def jpeg_quality(self, value):
        """Compatibility alias for jpg_quality."""
        self.jpg_quality = value

    # Output behavior
    no_overwrite: bool = False

    # Advanced options
    max_image_mpixels: float = 250.0
    pdf_renderer: str = 'auto'
    ocr_engine: str = 'auto'
    rasterizer: str = 'auto'
    rotate_pages_threshold: float = DEFAULT_ROTATE_PAGES_THRESHOLD
    user_words: os.PathLike | None = None
    user_patterns: os.PathLike | None = None
    fast_web_view: float = 1.0
    continue_on_soft_render_error: bool | None = None

    # Tesseract options - also accessible via options.tesseract.<field>
    tesseract_config: list[str] = []
    tesseract_pagesegmode: int | None = None
    tesseract_oem: int | None = None
    tesseract_thresholding: int | None = None
    tesseract_timeout: float | None = None
    tesseract_non_ocr_timeout: float | None = None
    tesseract_downsample_above: int = 32767
    tesseract_downsample_large_images: bool | None = None

    # Ghostscript options - also accessible via options.ghostscript.<field>
    pdfa_image_compression: str | None = None
    color_conversion_strategy: str = "LeaveColorUnchanged"

    # Plugin system
    plugins: Sequence[Path | str] | None = None

    # Store any extra attributes (for plugins and dynamic options)
    extra_attrs: dict[str, Any] = Field(
        default_factory=dict, exclude=True, alias='_extra_attrs'
    )

    @field_validator('languages')
    @classmethod
    def validate_languages(cls, v):
        """Ensure languages list is not empty."""
        if not v:
            return [DEFAULT_LANGUAGE]
        return v

    @field_validator('output_type')
    @classmethod
    def validate_output_type(cls, v):
        """Validate output type is one of the allowed values."""
        valid_types = {'auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'}
        if v not in valid_types:
            raise ValueError(f"output_type must be one of {valid_types}")
        return v

    @field_validator('pdf_renderer')
    @classmethod
    def validate_pdf_renderer(cls, v):
        """Validate PDF renderer is one of the allowed values."""
        valid_renderers = {'auto', 'sandwich', 'fpdf2'}
        # Legacy hocr/hocrdebug are accepted but redirected to fpdf2
        legacy_renderers = {'hocr', 'hocrdebug'}
        all_accepted = valid_renderers | legacy_renderers
        if v not in all_accepted:
            raise ValueError(f"pdf_renderer must be one of {all_accepted}")
        return v

    @field_validator('rasterizer')
    @classmethod
    def validate_rasterizer(cls, v):
        """Validate rasterizer is one of the allowed values."""
        valid_rasterizers = {'auto', 'ghostscript', 'pypdfium'}
        if v not in valid_rasterizers:
            raise ValueError(f"rasterizer must be one of {valid_rasterizers}")
        return v

    @field_validator('clean_final')
    @classmethod
    def validate_clean_final(cls, v, info):
        """If clean_final is True, also set clean to True."""
        if v and hasattr(info, 'data') and 'clean' in info.data:
            info.data['clean'] = True
        return v

    @field_validator('jobs')
    @classmethod
    def validate_jobs(cls, v):
        """Validate jobs is a reasonable number."""
        if v is not None and (v < 0 or v > 256):
            raise ValueError("jobs must be between 0 and 256")
        return v

    @field_validator('verbose')
    @classmethod
    def validate_verbose(cls, v):
        """Validate verbose level."""
        if v < 0 or v > 2:
            raise ValueError("verbose must be between 0 and 2")
        return v

    @field_validator('oversample')
    @classmethod
    def validate_oversample(cls, v):
        """Validate oversample DPI."""
        if v < 0 or v > 5000:
            raise ValueError("oversample must be between 0 and 5000")
        return v

    @field_validator('max_image_mpixels')
    @classmethod
    def validate_max_image_mpixels(cls, v):
        """Validate max image megapixels."""
        if v < 0:
            raise ValueError("max_image_mpixels must be non-negative")
        return v

    @field_validator('rotate_pages_threshold')
    @classmethod
    def validate_rotate_pages_threshold(cls, v):
        """Validate rotate pages threshold."""
        if v < 0 or v > 1000:
            raise ValueError("rotate_pages_threshold must be between 0 and 1000")
        return v

    @field_validator('title', 'author', 'keywords', 'subject')
    @classmethod
    def validate_metadata_unicode(cls, v):
        """Validate metadata strings don't contain unsupported Unicode characters."""
        if v is None:
            return v

        for char in v:
            if unicodedata.category(char) == 'Co' or ord(char) >= 0x10000:
                hexchar = hex(ord(char))[2:].upper()
                raise ValueError(
                    f"Metadata string contains unsupported Unicode character: "
                    f"{char} (U+{hexchar})"
                )
        return v

    @field_validator('pages')
    @classmethod
    def validate_pages_format(cls, v):
        """Convert page ranges string to set of page numbers."""
        if v is None:
            return v
        if isinstance(v, set):
            return v  # Already processed

        # Convert string ranges to set of page numbers
        return _pages_from_ranges(v)

    @field_validator('unpaper_args', mode='before')
    @classmethod
    def validate_unpaper_args(cls, v):
        """Normalize unpaper_args from string to list and validate security."""
        if v is None:
            return v
        if isinstance(v, str):
            v = shlex.split(v)
        if isinstance(v, list):
            if any(('/' in arg or arg == '.' or arg == '..') for arg in v):
                raise ValueError('No filenames allowed in --unpaper-args')
            return v
        raise ValueError(f'unpaper_args must be a string or list, got {type(v)}')

    @model_validator(mode='before')
    @classmethod
    def handle_special_cases(cls, data):
        """Handle special cases for API compatibility and legacy options.

        For dict input this supplies a placeholder ``output_file`` when only
        ``output_folder`` is given, and folds the legacy ``force_ocr``,
        ``skip_text`` and ``redo_ocr`` booleans into ``mode``. Non-dict input
        is returned untouched.

        Raises:
            ValueError: If more than one legacy flag is set, or if a legacy
                flag contradicts an explicitly given non-default ``mode``.
        """
        if isinstance(data, dict):
            # Work on a shallow copy: the keys popped and set below must not
            # leak into a dict that is still owned by the caller.
            data = dict(data)

            # For hOCR API, output_file might not be present
            if 'output_folder' in data and 'output_file' not in data:
                data['output_file'] = '/dev/null'  # Placeholder

            # Convert legacy boolean options (force_ocr, skip_text, redo_ocr) to mode
            force = data.pop('force_ocr', None)
            skip = data.pop('skip_text', None)
            redo = data.pop('redo_ocr', None)

            # Each legacy option with its equivalent mode and its real
            # command line spelling (note: skip is --skip-text, not --skip-ocr).
            legacy_options = [
                (force, ProcessingMode.force, '--force-ocr'),
                (skip, ProcessingMode.skip, '--skip-text'),
                (redo, ProcessingMode.redo, '--redo-ocr'),
            ]
            legacy_true = [
                (mode, flag) for val, mode, flag in legacy_options if val
            ]
            legacy_count = len(legacy_true)

            # Get current mode value (may be string or enum)
            current_mode = data.get('mode', ProcessingMode.default)
            if isinstance(current_mode, str):
                current_mode = ProcessingMode(current_mode)
            mode_is_set = current_mode != ProcessingMode.default

            if legacy_count > 1:
                raise ValueError(
                    "Choose only one of --force-ocr, --skip-text, --redo-ocr."
                )

            if legacy_count == 1:
                expected_mode, legacy_flag = legacy_true[0]
                if mode_is_set and current_mode != expected_mode:
                    raise ValueError(
                        f"Conflicting options: --mode {current_mode.value} "
                        f"cannot be used with {legacy_flag} or similar legacy flag."
                    )
                # Set mode from legacy option
                data['mode'] = expected_mode

        return data

    @model_validator(mode='after')
    def validate_redo_ocr_options(self):
        """Validate options compatible with redo mode."""
        if self.mode == ProcessingMode.redo and (
            self.deskew or self.clean_final or self.remove_background
        ):
            raise ValueError(
                "--redo-ocr (or --mode redo) is not currently compatible with "
                "--deskew, --clean-final, and --remove-background"
            )
        return self

    @model_validator(mode='after')
    def validate_output_type_compatibility(self):
        """Validate output type is compatible with output file."""
        if self.output_type == 'none' and str(self.output_file) not in (
            os.devnull,
            '-',
        ):
            raise ValueError(
                "Since you specified `--output-type none`, the output file "
                f"{self.output_file} cannot be produced. Set the output file to "
                f"`-` to suppress this message."
            )
        return self

    @property
    def lossless_reconstruction(self):
        """Determine lossless_reconstruction based on other options."""
        lossless = not any(
            [
                self.deskew,
                self.clean_final,
                self.mode == ProcessingMode.force,
                self.remove_background,
            ]
        )
        return lossless

    def model_dump_json_safe(self) -> str:
        """Serialize to JSON with special handling for non-serializable types.

        Paths are tagged so they can be restored as ``Path`` objects, stream
        objects are replaced by a placeholder, and sets are written as lists
        (JSON has no set type). Fields whose value is ``None`` are omitted.
        Entries of ``extra_attrs`` are included under ``_extra_attrs``, except
        cached plugin option instances.
        """
        # Create a copy of the model data for serialization
        data = self.model_dump()

        # Handle special types that don't serialize to JSON directly
        def _serialize_value(value):
            if isinstance(value, Path):
                return {'__type__': 'Path', 'value': str(value)}
            elif (
                isinstance(value, BinaryIO | IOBase)
                or hasattr(value, 'read')
                or hasattr(value, 'write')
            ):
                # Stream object - replace with placeholder
                return {'__type__': 'Stream', 'value': 'stream'}
            elif hasattr(value, '__class__') and 'Iterator' in value.__class__.__name__:
                # Handle Pydantic serialization iterators
                return {'__type__': 'Stream', 'value': 'stream'}
            elif isinstance(value, property):
                # Handle property objects that shouldn't be serialized
                return None
            elif isinstance(value, set | frozenset):
                # json.dumps() rejects sets (e.g. the validated ``pages``
                # field); emit a list, sorted when possible for stable output.
                try:
                    items = sorted(value)
                except TypeError:
                    items = list(value)
                return [_serialize_value(item) for item in items]
            elif isinstance(value, list | tuple):
                return [_serialize_value(item) for item in value]
            elif isinstance(value, dict):
                return {k: _serialize_value(v) for k, v in value.items()}
            else:
                return value

        # Process all fields
        serializable_data = {}
        for key, value in data.items():
            serialized_value = _serialize_value(value)
            if serialized_value is not None:  # Skip None values from properties
                serializable_data[key] = serialized_value

        # Add extra_attrs, excluding plugin cache entries (they'll be recreated lazily)
        if self.extra_attrs:
            filtered_extra = {
                k: v
                for k, v in self.extra_attrs.items()
                if not k.startswith('_plugin_cache_')
            }
            if filtered_extra:
                serializable_data['_extra_attrs'] = _serialize_value(filtered_extra)

        return json.dumps(serializable_data)

    @classmethod
    def model_validate_json_safe(cls, json_str: str) -> OcrOptions:
        """Reconstruct from JSON with special handling for non-serializable types.

        This is the inverse of ``model_dump_json_safe``: tagged paths become
        ``Path`` objects again, while stream placeholders stay plain strings
        because the original stream cannot be recovered.
        """
        data = json.loads(json_str)

        # Handle special types during deserialization
        def _deserialize_value(value):
            if isinstance(value, dict) and '__type__' in value:
                if value['__type__'] == 'Path':
                    return Path(value['value'])
                elif value['__type__'] == 'Stream':
                    # For streams, we'll use a placeholder string
                    return value['value']
                else:
                    return value['value']
            elif isinstance(value, list):
                return [_deserialize_value(item) for item in value]
            elif isinstance(value, dict):
                return {k: _deserialize_value(v) for k, v in value.items()}
            else:
                return value

        # Process all fields
        deserialized_data = {}
        extra_attrs = {}

        for key, value in data.items():
            if key == '_extra_attrs':
                extra_attrs = _deserialize_value(value)
            else:
                deserialized_data[key] = _deserialize_value(value)

        # Create instance
        instance = cls(**deserialized_data)
        instance.extra_attrs = extra_attrs

        return instance

    model_config = ConfigDict(
        extra="forbid",  # Force use of extra_attrs for unknown fields
        arbitrary_types_allowed=True,  # Allow BinaryIO, Path, etc.
        validate_assignment=True,  # Validate on attribute assignment
    )

    @classmethod
    def register_plugin_models(cls, models: dict[str, type]) -> None:
        """Register plugin option model classes for nested access.

        Args:
            models: Dictionary mapping namespace to model class
        """
        global _plugin_option_models
        _plugin_option_models.update(models)

    def _get_plugin_options(self, namespace: str) -> Any:
        """Get or create a plugin options instance for the given namespace.

        This method creates plugin option instances lazily from flat field values.
        The instance is cached in ``extra_attrs`` on first use, so later
        changes to the flat fields are not reflected in an already cached
        instance.

        Args:
            namespace: The plugin namespace (e.g., 'tesseract', 'optimize')

        Returns:
            An instance of the plugin's option model.

        Raises:
            AttributeError: If the namespace has not been registered.
        """
        # Use extra_attrs to cache plugin option instances
        cache_key = f'_plugin_cache_{namespace}'
        if cache_key in self.extra_attrs:
            return self.extra_attrs[cache_key]

        if namespace not in _plugin_option_models:
            raise AttributeError(
                f"Plugin namespace '{namespace}' is not registered. "
                f"Ensure setup_plugin_infrastructure() was called."
            )

        model_class = _plugin_option_models[namespace]

        def _convert_value(value):
            """Convert value to be compatible with plugin model fields."""
            if isinstance(value, os.PathLike):
                return os.fspath(value)
            return value

        # Build kwargs from flat fields
        kwargs = {}
        for field_name in model_class.model_fields:
            # Try namespace_field pattern first (e.g., tesseract_timeout)
            flat_name = f"{namespace}_{field_name}"
            if flat_name in OcrOptions.model_fields:
                value = getattr(self, flat_name)
                if value is not None:
                    kwargs[field_name] = _convert_value(value)
            # Also check direct field name (for fields like jbig2_lossy)
            elif field_name in OcrOptions.model_fields:
                value = getattr(self, field_name)
                if value is not None:
                    kwargs[field_name] = _convert_value(value)
            # Check for special mappings
            elif namespace == 'optimize' and field_name == 'level':
                # 'optimize' field maps to 'level' in OptimizeOptions
                if 'optimize' in OcrOptions.model_fields:
                    value = self.optimize
                    if value is not None:
                        kwargs[field_name] = _convert_value(value)
            elif namespace == 'optimize' and field_name == 'jpeg_quality':
                # jpg_quality maps to jpeg_quality
                if 'jpg_quality' in OcrOptions.model_fields:
                    value = self.jpg_quality
                    if value is not None:
                        kwargs[field_name] = _convert_value(value)

        # Create and cache the plugin options instance
        instance = model_class(**kwargs)
        self.extra_attrs[cache_key] = instance
        return instance

    def __getattr__(self, name: str) -> Any:
        """Support dynamic access to plugin option namespaces.

        This allows accessing plugin options like:
            options.tesseract.timeout
            options.optimize.level

        Plugin models must be registered via register_plugin_models() for
        namespace access to work. Built-in plugins register their models
        during initialization.

        Args:
            name: Attribute name

        Returns:
            Plugin options instance if name is a registered namespace, or the
            matching ``extra_attrs`` entry; otherwise raises AttributeError
        """
        # Check if this is a plugin namespace
        if name.startswith('_'):
            # Private attributes should not trigger plugin lookup
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{name}'"
            )

        # Try to get plugin options for this namespace
        if name in _plugin_option_models:
            return self._get_plugin_options(name)

        # Check extra_attrs
        if 'extra_attrs' in self.__dict__ and name in self.extra_attrs:
            return self.extra_attrs[name]

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{name}'"
        )


================================================
FILE: src/ocrmypdf/_pipeline.py
================================================
# SPDX-FileCopyrightText: 2018-2022 James R. Barlow
# SPDX-FileCopyrightText: 2019 Martin Wind
# SPDX-License-Identifier: MPL-2.0

"""OCRmyPDF page processing pipeline functions."""

from __future__ import annotations

import logging
import os
import re
import sys
from collections.abc import Iterable, Iterator, Sequence
from contextlib import suppress
from io import BytesIO
from pathlib import Path
from shutil import copyfileobj
from typing import TYPE_CHECKING, Any, BinaryIO, TypeVar, cast

if TYPE_CHECKING:
    from ocrmypdf.hocrtransform import OcrElement

import img2pdf
import pikepdf
from PIL import Image, ImageColor, ImageDraw

from ocrmypdf._concurrent import Executor
from ocrmypdf._exec import unpaper
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._metadata import repair_docinfo_nuls
from ocrmypdf._options import OcrOptions, ProcessingMode, TaggedPdfMode
from ocrmypdf.exceptions import (
    DigitalSignatureError,
    DpiError,
    EncryptedPdfError,
    InputFileError,
    PriorOcrFoundError,
    TaggedPDFError,
    UnsupportedImageFormatError,
)
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution, safe_symlink
from ocrmypdf.pdfa import (
    file_claims_pdfa,
    generate_pdfa_ps,
    speculative_pdfa_conversion,
)
from ocrmypdf.pdfinfo import Colorspace, Encoding, FloatRect, PageInfo, PdfInfo
from ocrmypdf.pluginspec import GhostscriptRasterDevice, OrientationConfidence

# pi_heif is an optional dependency. If it cannot be imported, substitute a
# no-op with the same name so the registration call below is harmless.
# NOTE(review): presumably this registers a HEIF/HEIC opener with Pillow --
# confirm against pi_heif's documentation.
try:
    from pi_heif import register_heif_opener
except ImportError:

    def register_heif_opener():
        pass


T = TypeVar("T")
log = logging.getLogger(__name__)

# DPI used for pages that have vector or text content, and as the fallback
# when a page's image DPI is zero or unknown (see get_page_square_dpi and
# get_canvas_square_dpi).
VECTOR_PAGE_DPI = 400


# Executed once at import time; a no-op when pi_heif is unavailable.
register_heif_opener()


def triage_image_file(input_file: Path, output_file: Path, options: OcrOptions) -> None:
    """Triage the input image file.

    If the input file is an image, check its resolution and convert it to PDF.

    Args:
        input_file: The path to the input file.
        output_file: The path to the output file.
        options: An object containing the options passed to the OCRmyPDF command.

    Raises:
        UnsupportedImageFormatError: If the input file cannot be opened as an
            image, has an alpha channel, is CMYK without an ICC profile, or
            cannot be converted by img2pdf.
        DpiError: If the input image has no resolution (DPI) in its metadata or if the
            resolution is not credible.
    """
    log.info("Input file is not a PDF, checking if it is an image...")
    try:
        im = Image.open(input_file)
    except OSError as e:
        # Recover the original filename
        log.error(str(e).replace(str(input_file), str(options.input_file)))
        if not input_file.exists():
            log.error("Input file does not exist: %s", input_file)
        if input_file.is_dir():
            log.error("Input file is a directory: %s", input_file)
        if input_file.is_file():
            log.error("Input file is a file: %s", input_file)
            # Only stat a path known to be a regular file, and tolerate it
            # vanishing in between: a failing stat() here would otherwise
            # replace the UnsupportedImageFormatError raised below with a
            # FileNotFoundError.
            with suppress(OSError):
                if input_file.stat().st_size == 0:
                    log.error("Input file is empty: %s", input_file)
        raise UnsupportedImageFormatError() from e

    with im:
        log.info("Input file is an image")
        if 'dpi' in im.info:
            # NOTE(review): tuples compare lexicographically, so the x
            # resolution decides this test unless it equals 96 exactly.
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)", *im.size)
                log.info("Image resolution: (%d, %d)", *im.info['dpi'])
                raise DpiError(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible.  Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi."
                )
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)", *im.size)
            raise DpiError(
                "Input file is an image, but has no resolution (DPI) "
                "in its metadata.  Estimate the resolution at which "
                "image was scanned and specify it using --image-dpi."
            )

        if im.mode in ('RGBA', 'LA'):
            raise UnsupportedImageFormatError(
                "The input image has an alpha channel. Remove the alpha "
                "channel first."
            )

        # Pillow exposes an embedded profile under the 'icc_profile' key; an
        # absent or empty value means there is no usable profile.
        if not im.info.get('icc_profile'):
            if im.mode == 'RGB':
                log.info("Input image has no ICC profile, assuming sRGB")
            elif im.mode == 'CMYK':
                raise UnsupportedImageFormatError(
                    "Input CMYK image has no ICC profile, not usable"
                )

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            # A user-supplied DPI overrides whatever the image metadata says
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                Resolution(options.image_dpi, options.image_dpi)
            )
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                os.fspath(input_file),
                layout_fun=layout_fun,
                outputstream=outf,
                **IMG2PDF_KWARGS,
            )
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        raise UnsupportedImageFormatError() from e


def _pdf_guess_version(input_file: Path, search_window=1024) -> str:
    """Look for a ``%PDF-x.y`` signature near the start of a file.

    Only the first ``search_window`` bytes are examined, so this is not robust
    enough to deal with appended files.

    Returns the version (for example ``'1.7'``), or an empty string if no
    signature was found, indicating the file is probably not a PDF.
    """
    with open(input_file, 'rb') as stream:
        head = stream.read(search_window)
    found = re.search(rb'%PDF-(\d\.\d)', head)
    return found.group(1).decode('ascii') if found else ''


def triage(
    original_filename: str, input_file: Path, output_file: Path, options: OcrOptions
) -> Path:
    """Triage the input file. We can handle PDFs and images.

    A file with a PDF version signature is opened with pikepdf and re-saved to
    ``output_file``. Anything else is handed to triage_image_file().

    Args:
        original_filename: The name the user supplied, used in error messages
            in place of the path in ``input_file``.
        input_file: The path of the file to inspect.
        output_file: The path where the resulting PDF is written.
        options: An object containing the options passed to the OCRmyPDF command.

    Returns:
        ``output_file``.

    Raises:
        EncryptedPdfError: If pikepdf reports a password error for the PDF.
        InputFileError: If pikepdf cannot read the PDF for another reason, or
            an OSError occurs while handling it.
    """
    try:
        if _pdf_guess_version(input_file):
            if options.image_dpi:
                log.warning(
                    "Argument --image-dpi is being ignored because the "
                    "input file is a PDF, not an image."
                )
            try:
                with pikepdf.open(input_file) as pdf:
                    pdf.save(output_file)
            except pikepdf.PasswordError as e:
                # Must come before PdfError: PasswordError derives from it, so
                # the broader handler would otherwise swallow it and report an
                # encrypted file as a generic input error.
                raise EncryptedPdfError() from e
            except pikepdf.PdfError as e:
                raise InputFileError() from e
            return output_file
    except OSError as e:
        log.debug(f"Temporary file was at: {input_file}")
        # Show the user the name they supplied rather than the working path
        msg = str(e).replace(str(input_file), original_filename)
        raise InputFileError(msg) from e

    triage_image_file(input_file, output_file, options)
    return output_file


def get_pdfinfo(
    input_file,
    *,
    executor: Executor,
    detailed_analysis: bool = False,
    progbar: bool = False,
    max_workers: int | None = None,
    use_threads: bool = True,
    check_pages=None,
) -> PdfInfo:
    """Build a PdfInfo for ``input_file``, translating pikepdf errors.

    All keyword arguments are forwarded to the PdfInfo constructor unchanged.

    Raises:
        EncryptedPdfError: If pikepdf raises PasswordError.
        InputFileError: If pikepdf raises any other PdfError.
    """
    analysis_kwargs = {
        'detailed_analysis': detailed_analysis,
        'progbar': progbar,
        'max_workers': max_workers,
        'use_threads': use_threads,
        'check_pages': check_pages,
        'executor': executor,
    }
    try:
        info = PdfInfo(input_file, **analysis_kwargs)
    except pikepdf.PasswordError as e:
        # Handled before the broader PdfError
        raise EncryptedPdfError() from e
    except pikepdf.PdfError as e:
        raise InputFileError() from e
    return info


def validate_pdfinfo_options(context: PdfContext) -> None:
    """Check the analyzed PDF against the selected options.

    Emits warnings for conditions that are allowed to proceed and raises for
    those that are not, then lets plugins run their own validation.

    Raises:
        InputFileError: If the PDF needs rendering (dynamic XFA form), or has
            a fillable form while the mode is redo.
        DigitalSignatureError: If the PDF is signed and invalidating
            signatures was not requested.
        TaggedPDFError: If the PDF is tagged while both the tagged PDF mode
            and the processing mode are at their defaults.
    """
    pdfinfo, options = context.pdfinfo, context.options

    if pdfinfo.needs_rendering:
        raise InputFileError(
            "This PDF contains dynamic XFA forms created by Adobe LiveCycle "
            "Designer and can only be read by Adobe Acrobat or Adobe Reader."
        )

    if pdfinfo.has_signature:
        if not options.invalidate_digital_signatures:
            raise DigitalSignatureError()
        log.warning("All digital signatures will be invalidated")

    if pdfinfo.has_acroform:
        if options.mode == ProcessingMode.redo:
            raise InputFileError(
                "This PDF has a user fillable form. --redo-ocr (or --mode redo) "
                "is not currently possible on such files."
            )
        log.warning(
            "This PDF has a fillable form. "
            "Chances are it is a pure digital "
            "document that does not need OCR."
        )
        if options.mode != ProcessingMode.force:
            log.info(
                "Use the option --force-ocr (or --mode force) to produce an "
                "image of the form and all filled form fields. The output PDF "
                "will be 'flattened' and will no longer be fillable."
            )

    if pdfinfo.is_tagged:
        log.warning(
            "This PDF is marked as a Tagged PDF. This often indicates "
            "that the PDF was generated from an office document and does "
            "not need OCR. PDF pages processed by OCRmyPDF may not be "
            "tagged correctly."
        )
        # Only refuse when the user expressed no preference either way
        refuse_tagged = (
            options.tagged_pdf_mode == TaggedPdfMode.default
            and options.mode == ProcessingMode.default
        )
        if refuse_tagged:
            log.info("Use --tagged-pdf-mode ignore to ignore Tagged PDFs.")
            raise TaggedPDFError()

    context.plugin_manager.validate(pdfinfo=pdfinfo, options=options)


def _vector_page_dpi(pageinfo: PageInfo) -> int:
    """Return VECTOR_PAGE_DPI for pages with vector or text content, else 0."""
    if pageinfo.has_vector or pageinfo.has_text:
        return VECTOR_PAGE_DPI
    return 0


def get_page_square_dpi(
    page_context: PageContext, image_dpi: Resolution | None = None
) -> Resolution:
    """Get the DPI when we require xres == yres, scaled to physical units.

    Page DPI includes UserUnit scaling. When ``image_dpi`` is not given (or is
    falsy), the DPI recorded in the page info is used.

    Returns:
        A Resolution whose x and y are both the largest of: the scaled x and y
        resolutions (VECTOR_PAGE_DPI when zero), the vector page DPI, and the
        oversample option.
    """
    pageinfo = page_context.pageinfo
    source_dpi = image_dpi if image_dpi else pageinfo.dpi
    # A missing or zero UserUnit means no scaling
    scale = float(pageinfo.userunit) or 1.0
    candidates = (
        ((source_dpi.x or 0.0) * scale) or VECTOR_PAGE_DPI,
        ((source_dpi.y or 0.0) * scale) or VECTOR_PAGE_DPI,
        _vector_page_dpi(pageinfo),
        page_context.options.oversample or 0.0,
    )
    square = float(max(candidates))
    return Resolution(square, square)


def get_canvas_square_dpi(
    page_context: PageContext, image_dpi: Resolution | None = None
) -> Resolution:
    """Get the DPI when we require xres == yres, in Postscript units.

    Canvas DPI is independent of PDF UserUnit scaling, which is
    used to describe situations where the PDF user space is not 1:1 with
    the physical units of the page. When ``image_dpi`` is not given (or is
    falsy), the DPI recorded in the page info is used.

    Returns:
        A Resolution whose x and y are both the largest of: the x and y
        resolutions (VECTOR_PAGE_DPI when zero), the vector page DPI, and the
        oversample option.
    """
    pageinfo = page_context.pageinfo
    source_dpi = image_dpi if image_dpi else pageinfo.dpi
    candidates = (
        source_dpi.x or VECTOR_PAGE_DPI,
        source_dpi.y or VECTOR_PAGE_DPI,
        _vector_page_dpi(pageinfo),
        page_context.options.oversample or 0.0,
    )
    square = float(max(candidates))
    return Resolution(square, square)


def _text_page_needs_ocr(pageinfo: PageInfo, options: OcrOptions) -> bool:
    """Decide whether to OCR a page that already contains text."""
    mode = options.mode
    if mode == ProcessingMode.default:
        raise PriorOcrFoundError(
            "page already has text! - aborting (use --force-ocr or --mode force "
            "to force OCR; see also help for --skip-text, --redo-ocr, and --mode)"
        )
    if mode == ProcessingMode.force:
        log.info("page already has text! - rasterizing text and running OCR anyway")
        return True
    if mode == ProcessingMode.redo:
        if pageinfo.has_corrupt_text:
            log.warning(
                "some text on this page cannot be mapped to characters: "
                "consider using --force-ocr (or --mode force) instead"
            )
        else:
            log.info("redoing OCR")
        return True
    if mode == ProcessingMode.skip:
        log.info("skipping all processing on this page")
        return False
    # Any other mode leaves the page eligible for OCR
    return True


def _imageless_page_needs_ocr(options: OcrOptions) -> bool:
    """Decide whether to OCR a page that has neither images nor text."""
    forced = options.mode == ProcessingMode.force
    if not forced:
        log.info(
            "page has no images - "
            "skipping all processing on this page to avoid losing detail. "
            "Use --force-ocr (or --mode force) if you wish to perform OCR on "
            "pages that have vector content."
        )
        return False

    if options.oversample:
        # The user really wants to reprocess this file
        log.info(
            "page has no images - "
            f"rasterizing at {options.oversample} DPI because "
            "--force-ocr --oversample (or --mode force --oversample) was specified"
        )
    else:
        # Warn the user they might not want to do this
        log.warning(
            "page has no images - "
            "all vector content will be "
            f"rasterized at {VECTOR_PAGE_DPI} DPI, losing some resolution and "
            "likely increasing file size. Use --oversample to adjust the "
            "DPI."
        )
    return True


def is_ocr_required(page_context: PageContext) -> bool:
    """Check if the page needs to be OCR'd.

    The decision depends on the --pages selection, whether the page already
    has text, whether it has images, the processing mode, and --skip-big.

    Returns:
        True if the page should be OCR'd, False if it should be skipped.

    Raises:
        PriorOcrFoundError: If the page already has text and the processing
            mode is the default.
    """
    pageinfo = page_context.pageinfo
    options = page_context.options

    if options.pages and pageinfo.pageno not in options.pages:
        log.debug(f"skipped {pageinfo.pageno} as requested by --pages {options.pages}")
        ocr_required = False
    elif pageinfo.has_text:
        ocr_required = _text_page_needs_ocr(pageinfo, options)
    elif not pageinfo.images and not options.lossless_reconstruction:
        # We found a page with no images and no text. That means it may
        # have vector art that the user wants to OCR. If we determined
        # lossless reconstruction is not possible then we have to rasterize
        # the image. So if OCR is being forced, take that to mean YES, go
        # ahead and rasterize. If not forced, then pretend there's no text
        # on the page at all so we don't lose anything.
        # This could be made smarter by explicitly searching for vector art.
        ocr_required = _imageless_page_needs_ocr(options)
    else:
        ocr_required = True

    # --skip-big only matters for pages that would otherwise be OCR'd and
    # that actually contain images.
    if not (ocr_required and options.skip_big and pageinfo.images):
        return ocr_required

    pixel_count = pageinfo.width_pixels * pageinfo.height_pixels
    if pixel_count > (options.skip_big * 1_000_000):
        log.warning(
            "page too big, skipping OCR "
            f"({(pixel_count / 1_000_000):.1f} MPixels > "
            f"{options.skip_big:.1f} MPixels --skip-big)"
        )
        ocr_required = False
    return ocr_required


def rasterize_preview(input_file: Path, page_context: PageContext) -> Path:
    """Generate a lower quality preview image.

    The page is rendered as a grayscale JPEG with both the canvas and page
    resolutions limited to 300 DPI via ``Resolution.take_min``.

    Returns:
        The path of the preview image.
    """
    preview_path = page_context.get_path('rasterize_preview.jpg')

    canvas_limit = Resolution(300.0, 300.0)
    canvas_dpi = canvas_limit.take_min([get_canvas_square_dpi(page_context)])
    page_limit = Resolution(300.0, 300.0)
    page_dpi = page_limit.take_min([get_page_square_dpi(page_context)])

    options = page_context.options
    render_args = {
        'input_file': input_file,
        'output_file': preview_path,
        'raster_device': GhostscriptRasterDevice.JPEGGRAY,
        'raster_dpi': canvas_dpi,
        # pageinfo.pageno is offset by one for the rasterizer
        'pageno': page_context.pageinfo.pageno + 1,
        'page_dpi': page_dpi,
        'rotation': 0,
        'filter_vector': False,
        'stop_on_soft_error': not options.continue_on_soft_render_error,
        'options': options,
        'use_cropbox': False,
    }
    page_context.plugin_manager.rasterize_pdf_page(**render_args)
    return preview_path


def describe_rotation(
    page_context: PageContext, orient_conf: OrientationConfidence, correction: int
) -> str:
    """Describe the page rotation we are going to perform (or not perform).

    Args:
        page_context: Supplies the page's existing rotation and the
            ``rotate_pages_threshold`` option.
        orient_conf: The detected orientation angle and its confidence.
        correction: The rotation, in degrees, that would be applied.

    Returns:
        A one-line, human-readable summary suitable for logging. Angles other
        than 0, 90, 180 or 270 are shown as ``?`` rather than raising.
    """
    direction = {0: '⇧', 90: '⇨', 180: '⇩', 270: '⇦'}
    turns = {0: ' ', 90: '⬏', 180: '↻', 270: '⬑'}

    existing_rotation = page_context.pageinfo.rotation
    confident = orient_conf.confidence >= page_context.options.rotate_pages_threshold
    if confident:
        if correction != 0:
            # This text is only logged, so an unexpected angle must not raise
            # KeyError; fall back to '?' exactly as the direction lookups do.
            action = 'will rotate ' + turns.get(correction, '?')
        else:
            action = 'rotation appears correct'
    else:
        action = "confidence too low to rotate" if correction != 0 else "no change"

    facing = ''
    if existing_rotation != 0:
        facing = f"with existing rotation {direction.get(existing_rotation, '?')}, "
    facing += f"page is facing {direction.get(orient_conf.angle, '?')}"

    return f"{facing}, confidence {orient_conf.confidence:.2f} - {action}"


def get_orientation_correction(preview: Path, page_context: PageContext) -> int:
    """Work out orientation correction for each page.

    We ask Ghostscript to draw a preview page, which will rasterize with the
    current /Rotate applied, and then ask OCR which way the page is
    oriented. If the value of /Rotate is correct (e.g., a user already
    manually fixed rotation), then OCR will say the page is pointing
    up and the correction is zero. Otherwise, the orientation found by
    OCR represents the clockwise rotation, or the counterclockwise
    correction to rotation.

    When we draw the real page for OCR, we rotate it by the CCW correction,
    which points it (hopefully) upright. _graft.py takes care of orienting
    the image and text layers.

    Returns:
        The detected angle modulo 360 when the confidence reaches the
        ``rotate_pages_threshold`` option and the angle is nonzero,
        otherwise 0.
    """
    options = page_context.options
    engine = page_context.plugin_manager.get_ocr_engine(options=options)
    orientation = engine.get_orientation(preview, options)

    correction = orientation.angle % 360
    log.info(describe_rotation(page_context, orientation, correction))

    confident = orientation.confidence >= options.rotate_pages_threshold
    return correction if (confident and correction != 0) else 0


def calculate_image_dpi(page_context: PageContext) -> Resolution:
    """Calculate the DPI for the page image.

    If the page has a DPI profile whose average-to-max ratio is below 0.8,
    the profile's weighted DPI is used for both axes. Otherwise the DPI
    recorded in the page info is returned unchanged.
    """
    pageinfo = page_context.pageinfo
    profile = pageinfo.page_dpi_profile()
    uneven = bool(profile) and profile.average_to_max_dpi_ratio < 0.8
    if uneven:
        return Resolution(profile.weighted_dpi, profile.weighted_dpi)
    return pageinfo.dpi


def calculate_raster_dpi(page_context: PageContext):
    """Calculate the DPI for rasterization.

    Returns:
        A ``(canvas_dpi, page_dpi)`` tuple of square resolutions. A warning is
        logged when the page's DPI profile has an average-to-max ratio
        below 0.8.
    """
    # Produce the page image with square resolution or else deskew and OCR
    # will not work properly.
    source_dpi = calculate_image_dpi(page_context)
    profile = page_context.pageinfo.page_dpi_profile()
    canvas_dpi = get_canvas_square_dpi(page_context, source_dpi)
    page_dpi = get_page_square_dpi(page_context, source_dpi)
    resolutions = (canvas_dpi, page_dpi)

    discrepancy = bool(profile) and profile.average_to_max_dpi_ratio < 0.8
    if not discrepancy:
        return resolutions

    log.warning(
        "Weighted average image DPI is %0.1f, max DPI is %0.1f. "
        "The discrepancy may indicate a high detail region on this page, "
        "but could also indicate a problem with the input PDF file. "
        "Page image will be rendered at %0.1f DPI.",
        profile.weighted_dpi,
        profile.max_dpi,
        canvas_dpi.to_scalar(),
    )
    return resolutions


def rasterize(
    input_file: Path,
    page_context: PageContext,
    correction: int = 0,
    output_tag: str = '',
    remove_vectors: bool | None = None,
) -> Path:
    """Rasterize a PDF page to a PNG image.

    The raster device is chosen from the images on the page: the device
    latest in the PNGMONO, PNGGRAY, PNG256, PNG16M sequence that any image
    (or vector content) calls for is used.

    Args:
        input_file: The input PDF file path.
        page_context: The page context object.
        correction: The orientation correction angle. Defaults to 0.
        output_tag: The output tag. Defaults to ''.
        remove_vectors: Whether to remove vectors. Defaults to None, which means
            the value from the page context options will be used. If the value
            is True or False, it will override the page context options.

    Returns:
        Path: The output PNG file path.
    """
    # Position in this list is the ranking: a later device wins over an
    # earlier one when several are requested.
    device_ranking = [
        GhostscriptRasterDevice.PNGMONO,
        GhostscriptRasterDevice.PNGGRAY,
        GhostscriptRasterDevice.PNG256,
        GhostscriptRasterDevice.PNG16M,
    ]

    options = page_context.options
    if remove_vectors is None:
        remove_vectors = options.remove_vectors

    output_file = page_context.get_path(f'rasterize{output_tag}.png')
    pageinfo = page_context.pageinfo

    # Start from the lowest device and collect everything the page asks for
    requested = [GhostscriptRasterDevice.PNGMONO]
    for image in pageinfo.images:
        # ignore masks, and images that are not more than 1 bit per component
        if image.type_ != 'image' or not (image.bpc > 1):
            continue
        if image.color == Colorspace.index:
            requested.append(GhostscriptRasterDevice.PNG256)
        elif image.color == Colorspace.gray:
            requested.append(GhostscriptRasterDevice.PNGGRAY)
        else:
            requested.append(GhostscriptRasterDevice.PNG16M)

    if pageinfo.has_vector:
        log.debug(f"Page has vector content, using {GhostscriptRasterDevice.PNG16M}")
        requested.append(GhostscriptRasterDevice.PNG16M)

    device = max(requested, key=device_ranking.index)

    log.debug(
        f"Rasterize with {device}, rotation {correction}, mediabox {pageinfo.mediabox}"
    )

    canvas_dpi, page_dpi = calculate_raster_dpi(page_context)

    render_args = {
        'input_file': input_file,
        'output_file': output_file,
        'raster_device': device,
        'raster_dpi': canvas_dpi,
        'page_dpi': page_dpi,
        # pageinfo.pageno is offset by one for the rasterizer
        'pageno': pageinfo.pageno + 1,
        'rotation': correction,
        'filter_vector': remove_vectors,
        'stop_on_soft_error': not options.continue_on_soft_render_error,
        'options': options,
        'use_cropbox': False,
    }
    page_context.plugin_manager.rasterize_pdf_page(**render_args)
    return output_file


def preprocess_remove_background(input_file: Path, page_context: PageContext) -> Path:
    """Remove the background from the input image (temporarily disabled).

    Returns:
        ``input_file`` unchanged when no image on the page has more than one
        bit per component.

    Raises:
        NotImplementedError: If any image on the page has more than one bit
            per component.
    """
    needs_removal = False
    for image in page_context.pageinfo.images:
        if image.bpc > 1:
            needs_removal = True
            break

    if not needs_removal:
        log.info("background removal skipped on mono page")
        return input_file

    # Previous implementation, kept for reference:
    # output_file = page_context.get_path('pp_rm_bg.png')
    # leptonica.remove_background(input_file, output_file)
    # return output_file
    raise NotImplementedError("--remove-background is temporarily not implemented")


def preprocess_deskew(input_file: Path, page_context: PageContext) -> Path:
    """Deskew the input image by the angle the OCR engine reports.

    Args:
        input_file: The input image file to deskew.
        page_context: The context of the page being processed.

    Returns:
        Path: The path to the deskewed image file.
    """
    options = page_context.options
    output_file = page_context.get_path('pp_deskew.png')
    dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context))

    engine = page_context.plugin_manager.get_ocr_engine(options=options)
    angle_degrees = engine.get_deskew(input_file, options)

    with Image.open(input_file) as im:
        # Fill the corners exposed by the rotation with white
        background = ImageColor.getcolor('white', mode=im.mode)  # type: ignore
        # According to Pillow docs, .rotate() will automatically use Image.NEAREST
        # resampling if image is mode '1' or 'P'
        deskewed = im.rotate(
            angle_degrees,
            resample=Image.Resampling.BICUBIC,
            fillcolor=background,
        )
        deskewed.save(output_file, dpi=dpi)

    return output_file


def preprocess_clean(input_file: Path, page_context: PageContext) -> Path:
    """Clean the input image using unpaper.

    Returns:
        Whatever ``unpaper.clean`` returns for the cleaned image.
    """
    cleaned_path = page_context.get_path('pp_clean.png')
    square_dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context))
    cleaned = unpaper.clean(
        input_file,
        cleaned_path,
        dpi=square_dpi.to_scalar(),
        unpaper_args=page_context.options.unpaper_args,
    )
    return cleaned


def create_ocr_image(image: Path, page_context: PageContext) -> Path:
    """Create the image we send for OCR.

    Might not be the same as the display image depending on preprocessing.
    This image will never be shown to the user.

    Unless OCR is being forced, existing text areas are painted white so they
    are not recognized again. Plugins may then replace the image through the
    ``filter_ocr_image`` hook.

    Returns:
        The path of the image to pass to the OCR engine.
    """
    output_file = page_context.get_path('ocr.png')
    options = page_context.options
    with Image.open(image) as im:
        log.debug('resolution %r', im.info['dpi'])

        # Do not mask text areas when forcing OCR, because we need to OCR
        # all text areas
        masking = options.mode != ProcessingMode.force
        if masking:
            # None: exclude both visible and invisible text from OCR.
            # True (redo mode): mask visible text, but not invisible text.
            visible = True if options.mode == ProcessingMode.redo else None

            painter = ImageDraw.ImageDraw(im)
            textareas = page_context.pageinfo.get_textareas(
                visible=visible, corrupt=None
            )
            for textarea in textareas:
                # Calculate resolution based on the image size and page dimensions
                # without regard whatever resolution is in pageinfo (may differ or
                # be None)
                corners = [float(v) for v in textarea]
                per_point = tuple(float(coord) / 72.0 for coord in im.info['dpi'])
                # Text areas are measured from the bottom of the page, image
                # rows from the top, hence the flip against im.height.
                left = corners[0] * per_point[0]
                upper = im.height - corners[3] * per_point[1]
                right = corners[2] * per_point[0]
                lower = im.height - corners[1] * per_point[1]
                pixcoords = (left, upper, right, lower)
                log.debug('blanking %r', pixcoords)
                painter.rectangle(pixcoords, fill='white')

        replacement = page_context.plugin_manager.filter_ocr_image(
            page=page_context, image=im
        )
        if replacement is not None:
            im = replacement

        # Pillow requires integer DPI
        dpi = tuple(round(coord) for coord in im.info['dpi'])
        im.save(output_file, dpi=dpi)
    return output_file


def ocr_engine_hocr(input_file: Path, page_context: PageContext) -> tuple[Path, Path]:
    """Run the OCR engine and generate hOCR output.

    Returns:
        A tuple of (path to the hOCR file, path to the text file).
    """
    options = page_context.options
    outputs = (
        page_context.get_path('ocr_hocr.hocr'),
        page_context.get_path('ocr_hocr.txt'),
    )
    hocr_path, text_path = outputs

    engine = page_context.plugin_manager.get_ocr_engine(options=options)
    engine.generate_hocr(
        input_file=input_file,
        output_hocr=hocr_path,
        output_text=text_path,
        options=options,
    )
    return outputs


def ocr_engine_direct(
    input_file: Path, page_context: PageContext
) -> tuple[OcrElement, Path]:
    """Run the OCR engine and return OcrElement tree directly.

    This is the modern path for OCR engines that support the generate_ocr() API.
    It bypasses hOCR file generation for better performance and richer data.

    Args:
        input_file: The image file to OCR.
        page_context: The page context with options and path utilities.

    Returns:
        A tuple of (OcrElement tree, path to text sidecar file).
    """
    options = page_context.options
    sidecar_path = page_context.get_path('ocr_direct.txt')

    engine = page_context.plugin_manager.get_ocr_engine(options=options)
    result = engine.generate_ocr(
        input_file=input_file,
        options=options,
        page_number=page_context.pageno,
    )
    tree, recognized_text = result

    # Write text sidecar file
    sidecar_path.write_text(recognized_text, encoding='utf-8')

    return tree, sidecar_path


def should_visible_page_image_use_jpg(pageinfo: PageInfo) -> bool:
    """Decide whether the visible page image may be saved as a JPEG.

    If all images were JPEGs originally (including FlateDecode+DCTDecode),
    permit a JPEG as output. A page without images never qualifies.

    Args:
        pageinfo: The PageInfo object containing information about the page.

    Returns:
        A boolean indicating whether the visible page image should be saved as a JPEG.
    """
    if not pageinfo.images:
        return False
    jpeg_encodings = (Encoding.jpeg, Encoding.flate_jpeg)
    return all(im.enc in jpeg_encodings for im in pageinfo.images)


def create_visible_page_jpg(image: Path, page_context: PageContext) -> Path:
    """Create a visible page image in JPEG format.

    This is intended to be used when all images on the page were originally JPEGs.

    Returns:
        The path of the JPEG that was written.
    """
    output_file = page_context.get_path('visible.jpg')
    with Image.open(image) as im:
        # At this point the image should be a .png, but deskew, unpaper
        # might have removed the DPI information. In this case, fall back to
        # square DPI used to rasterize. When the preview image was
        # rasterized, it was also converted to square resolution, which is
        # what we want to give to the OCR engine, so keep it square.
        try:
            embedded_dpi = im.info['dpi']
        except KeyError:
            # Fallback to page-implied DPI
            dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context))
        else:
            dpi = Resolution(*embedded_dpi)

        # Pillow requires integer DPI
        im.save(output_file, format='JPEG', dpi=dpi.to_int())
    return output_file


def create_pdf_page_from_image(
    image: Path, page_context: PageContext, orientation_correction: int
) -> Path:
    """Create a PDF page from a page image.

    Args:
        image: The page image to wrap in a single page PDF.
        page_context: The context of the page being processed.
        orientation_correction: The rotation correction, in degrees, that was
            applied when the page image was rasterized.

    Returns:
        The path returned by the ``filter_pdf_page`` plugin hook.
    """
    # We rasterize a square DPI version of each page because most image
    # processing tools don't support rectangular DPI. Use the square DPI as it
    # accurately describes the image. It would be possible to resample the image
    # at this stage back to non-square DPI to more closely resemble the input,
    # except that the hocr renderer does not understand non-square DPI. The
    # sandwich renderer would be fine.
    output_file = page_context.get_path('visible.pdf')

    pageinfo = page_context.pageinfo
    width_pt = 72.0 * float(pageinfo.width_inches)
    height_pt = 72.0 * float(pageinfo.height_inches)

    # A net quarter turn means width and height trade places
    effective_rotation = (pageinfo.rotation - orientation_correction) % 360
    swap_axis = effective_rotation % 180 == 90
    pagesize = (height_pt, width_pt) if swap_axis else (width_pt, height_pt)

    # Create a new single page PDF to hold
    pdf_buffer = BytesIO()
    with open(image, 'rb') as imfile:
        log.debug('convert')
        img2pdf.convert(
            imfile,
            layout_fun=img2pdf.get_layout_fun(pagesize),
            outputstream=pdf_buffer,
            engine=img2pdf.Engine.pikepdf,
            rotation=img2pdf.Rotation.ifvalid,
        )
        log.debug('convert done')

    # img2pdf does not generate boxes correctly, so we fix them
    pdf_buffer.seek(0)
    fix_pagepdf_boxes(pdf_buffer, output_file, page_context, swap_axis=swap_axis)

    return page_context.plugin_manager.filter_pdf_page(
        page=page_context, image_filename=image, output_pdf=output_file
    )


def ocr_engine_textonly_pdf(
    input_image: Path, page_context: PageContext
) -> tuple[Path, Path]:
    """Run the OCR engine and generate a text-only PDF (will look blank).

    Returns:
        A tuple of (path to the text-only PDF, path to the text file).
    """
    options = page_context.options
    outputs = (
        page_context.get_path('ocr_tess.pdf'),
        page_context.get_path('ocr_tess.txt'),
    )
    pdf_path, text_path = outputs

    engine = page_context.plugin_manager.get_ocr_engine(options=options)
    engine.generate_pdf(
        input_file=input_image,
        output_pdf=pdf_path,
        output_text=text_path,
        options=options,
    )
    return outputs


def _offset_rect(rect: tuple[float, float, float, float], offset: tuple[float, float]):
    """Translate a rectangle: x coordinates by offset[0], y by offset[1]."""
    dx = offset[0]
    dy = offset[1]
    x0, y0 = rect[0] + dx, rect[1] + dy
    x1, y1 = rect[2] + dx, rect[3] + dy
    return (x0, y0, x1, y1)


def _adjust_pagebox(
    page: pikepdf.Page,
    media_box: FloatRect,
    name: pikepdf.Name,
    target_box: FloatRect,
    offset: tuple[float, float],
    swap_axis: bool,
):
    """Set one page box on ``page``, shifted by ``offset``.

    Nothing is written when ``target_box`` equals ``media_box``. Otherwise the
    target box is translated by ``offset`` and, if ``swap_axis`` is set, its
    x and y coordinates are exchanged before it is stored under ``name``.
    """
    if media_box == target_box:
        return

    dx, dy = offset[0], offset[1]
    shifted = (
        target_box[0] + dx,
        target_box[1] + dy,
        target_box[2] + dx,
        target_box[3] + dy,
    )
    if swap_axis:
        shifted = (shifted[1], shifted[0], shifted[3], shifted[2])
    page[name] = shifted
    # Logs the box as given, before the offset and swap were applied
    log.debug(f"{str(name)} = {target_box}")


def fix_pagepdf_boxes(
    infile: Path | BinaryIO,
    out_file: Path,
    page_context: PageContext,
    swap_axis: bool = False,
) -> Path:
    """Fix the bounding boxes in a single page PDF.

    The single page PDF (``infile``) is created with a normal MediaBox whose
    lower left corner is at (0, 0). ``page_context.pageinfo.mediabox`` holds
    the original file's mediabox, which may have a different origin. The other
    boxes (CropBox, TrimBox, ArtBox, BleedBox) are shifted so that they have
    the same effect they had on the original page.

    When correcting page rotation, a correctly rotated single page PDF is
    created rather than setting ``page.Rotate`` on an incorrectly rotated one.
    For rotations of 90 or 270 degrees, call this function with ``swap_axis``
    to swap the X and Y coordinates of all the boxes.

    Degenerate cases, where the boxes overlap or express invalid rectangles,
    are not resolved. The boxes are merely passed through with a
    transformation equivalent to the change made by constructing a new page
    image.

    Args:
        infile: The single page PDF to read.
        out_file: Where the corrected PDF is saved.
        page_context: Context whose ``pageinfo`` supplies the original boxes.
        swap_axis: Whether to exchange X and Y coordinates of the boxes.

    Returns:
        ``out_file``.
    """
    box_names = ('CropBox', 'TrimBox', 'ArtBox', 'BleedBox')

    with pikepdf.open(infile) as pdf:
        for page in pdf.pages:
            pageinfo = page_context.pageinfo
            reference_box = pageinfo.mediabox
            log.debug(
                f"initial mediabox={page.MediaBox} and pageinfo "
                f"mediabox={reference_box}"
            )
            # Shift that moves the original mediabox origin to (0, 0).
            shift = (-reference_box[0], -reference_box[1])
            if swap_axis:
                reference_box = (
                    reference_box[1],
                    reference_box[0],
                    reference_box[3],
                    reference_box[2],
                )
            for box_name in box_names:
                _adjust_pagebox(
                    page,
                    reference_box,
                    pikepdf.Name(f"/{box_name}"),
                    getattr(pageinfo, box_name.lower()),
                    shift,
                    swap_axis,
                )

        pdf.save(out_file)
    return out_file


def generate_postscript_stub(context: PdfContext) -> Path:
    """Write the PDF/A PostScript stub into the context's work area.

    Args:
        context: The PDF context that provides the path for ``pdfa.ps``.

    Returns:
        Path: The path that was passed to ``generate_pdfa_ps``.
    """
    stub_path = context.get_path('pdfa.ps')
    generate_pdfa_ps(stub_path)
    return stub_path


def convert_to_pdfa(input_pdf: Path, input_ps_stub: Path, context: PdfContext) -> Path:
    """Convert the given PDF to PDF/A through the ``generate_pdfa`` plugin hook.

    Args:
        input_pdf: The input PDF file path (presumably not PDF/A).
        input_ps_stub: The input PostScript file path, containing instructions
            for the PDF/A generator to use.
        context: The PDF context.

    Returns:
        Path of the generated ``pdfa.pdf`` file.
    """
    options = context.options
    input_pdfinfo = context.pdfinfo
    fix_docinfo_file = context.get_path('fix_docinfo.pdf')
    output_file = context.get_path('pdfa.pdf')

    # If the DocumentInfo record contains NUL characters, Ghostscript will
    # produce XMP metadata which contains invalid XML entities (&#0;).
    # NULs in DocumentInfo seem to be common since older Acrobats included them.
    # pikepdf can deal with this, but we make the world a better place by
    # stamping them out as soon as possible.
    with pikepdf.open(input_pdf) as pdf_file:
        if not repair_docinfo_nuls(pdf_file):
            safe_symlink(input_pdf, fix_docinfo_file)
        else:
            pdf_file.save(fix_docinfo_file)

    # Only an explicit 'pdfa-N' output type selects part N; plain 'pdfa' and
    # anything that is not a PDF/A type use part 2.
    output_type = options.output_type
    if output_type.startswith('pdfa') and output_type != 'pdfa':
        pdfa_part = output_type.split('-')[-1]
    else:
        pdfa_part = '2'

    if options.progress_bar:
        progressbar_class = context.plugin_manager.get_progressbar_class()
    else:
        progressbar_class = None

    context.plugin_manager.generate_pdfa(
        pdf_version=input_pdfinfo.min_version,
        pdf_pages=[fix_docinfo_file],
        pdfmark=input_ps_stub,
        output_file=output_file,
        context=context,
        pdfa_part=pdfa_part,
        progressbar_class=progressbar_class,
        stop_on_soft_error=not options.continue_on_soft_render_error,
    )

    return output_file


def try_speculative_pdfa(input_pdf: Path, context: PdfContext) -> Path | None:
    """Attempt a fast PDF/A conversion and validate it with verapdf.

    PDF/A structures are added by ``speculative_pdfa_conversion`` and the
    result is validated with verapdf. ``None`` signals that the caller should
    use another route (Ghostscript); it is returned when a specific
    ``pdfa_image_compression`` was requested, when verapdf is unavailable,
    when validation fails, or when any exception occurs during the attempt.

    Args:
        input_pdf: Path to the PDF to convert
        context: The PDF context

    Returns:
        Path to valid PDF/A file, or None if speculative conversion failed
    """
    from ocrmypdf._exec import verapdf

    options = context.options

    # A user-selected image compression can only be applied by Ghostscript,
    # so the speculative route is skipped in that case.
    gs_opts = getattr(options, 'ghostscript', None)
    if gs_opts is None:
        compression = 'auto'
    else:
        compression = getattr(gs_opts, 'pdfa_image_compression', 'auto')
    if compression != 'auto':
        log.debug(
            'Skipping speculative PDF/A: --pdfa-image-compression=%s requires '
            'Ghostscript',
            compression,
        )
        return None

    if not verapdf.available():
        log.debug('verapdf not available, skipping speculative PDF/A conversion')
        return None
    output_file = context.get_path('speculative_pdfa.pdf')

    try:
        speculative_pdfa_conversion(input_pdf, output_file, options.output_type)

        flavour = verapdf.output_type_to_flavour(options.output_type)
        result = verapdf.validate(output_file, flavour)

        if not result.valid:
            log.debug(
                'Speculative PDF/A validation failed (%d rule violations), '
                'falling back to Ghostscript',
                result.failed_rules,
            )
            return None

        log.info('Speculative PDF/A conversion succeeded - skipping Ghostscript')
        return output_file

    except Exception as e:
        # Best effort: any failure here just means the fallback is used.
        log.debug('Speculative PDF/A conversion failed: %s', e)
        return None


def try_auto_pdfa(input_pdf: Path, context: PdfContext) -> tuple[Path, str]:
    """Best-effort PDF/A for the 'auto' output type.

    No Ghostscript conversion is attempted here:

    1. With verapdf available, the speculative conversion is tried; if it
       returns no file the input is passed through as a regular PDF.
    2. Without verapdf, the input is passed through as PDF/A when
       ``_is_safe_pdfa`` considers that safe.
    3. Otherwise the input is passed through as a regular PDF.

    Args:
        input_pdf: Path to the PDF to convert
        context: The PDF context

    Returns:
        Tuple of (output_path, actual_output_type) where actual_output_type
        is 'pdfa' if PDF/A was achieved, 'pdf' otherwise
    """
    from ocrmypdf._exec import verapdf

    if not verapdf.available():
        # No validator: only claim PDF/A when that is known to be safe.
        if _is_safe_pdfa(input_pdf, context.options):
            log.info('Auto mode: passing through as PDF/A (input already compliant)')
            return (input_pdf, 'pdfa')

        log.info(
            'Auto mode: no verapdf available and input is not PDF/A, outputting PDF'
        )
        return (input_pdf, 'pdf')

    converted = try_speculative_pdfa(input_pdf, context)
    if converted is None:
        # Any None result is reported as a validation failure.
        log.info(
            'Auto mode: speculative PDF/A validation failed, outputting regular PDF'
        )
        return (input_pdf, 'pdf')
    return (converted, 'pdfa')


def _is_safe_pdfa(input_pdf: Path, options) -> bool:
    """Decide whether a file may be treated as PDF/A without validation.

    Two situations are accepted, on the reasoning that our modifications
    do not break PDF/A compliance:

    1. The input already claims PDF/A (OCR text was only grafted onto it).
    2. Force mode was used (the entire PDF was rewritten from scratch).

    Args:
        input_pdf: Path to the PDF to check
        options: OCR options

    Returns:
        True if file can safely be considered PDF/A
    """
    claims_pdfa = file_claims_pdfa(input_pdf)['pass']
    return True if claims_pdfa else options.mode == ProcessingMode.force


def should_linearize(working_file: Path, context: PdfContext) -> bool:
    """Tell whether ``working_file`` is large enough to be worth linearizing.

    The file size in bytes is compared against ``options.fast_web_view``
    multiplied by 1,000,000; only strictly larger files qualify.
    """
    size_in_bytes = os.path.getsize(working_file)
    threshold_in_bytes = context.options.fast_web_view * 1_000_000
    return size_in_bytes > threshold_in_bytes


def get_pdf_save_settings(output_type: str) -> dict[str, Any]:
    """Build the keyword arguments for ``pikepdf.Pdf.save`` for an output type.

    Essentially, features that are incompatible with a given PDF/A
    specification are not used.

    Args:
        output_type: The requested output type; only ``'pdfa-1'`` is treated
            specially.

    Returns:
        A new dict of save settings.
    """
    settings: dict[str, Any] = dict(preserve_pdfa=True, compress_streams=True)
    if output_type != 'pdfa-1':
        settings['object_stream_mode'] = pikepdf.ObjectStreamMode.generate
        return settings

    # Trigger recompression to ensure object streams are removed, because
    # Acrobat complains about them in PDF/A-1b validation.
    settings['stream_decode_level'] = pikepdf.StreamDecodeLevel.generalized
    settings['object_stream_mode'] = pikepdf.ObjectStreamMode.disable
    return settings


def _file_size_ratio(
    input_file: Path, output_file: Path
) -> tuple[float | None, float | None]:
    """Calculate ratio of input to output file sizes and percentage savings.

    Args:
        input_file (Path): The path to the input file.
        output_file (Path): The path to the output file.

    Returns:
        tuple[float | None, float | None]: A tuple containing the file size
        ratio and the percentage savings achieved by the output file size
        compared to the input file size.
    """
    input_size = input_file.stat().st_size
    output_size = output_file.stat().st_size
    if output_size == 0:
        return None, None
    ratio = input_size / output_size
    savings = 1 - output_size / input_size
    return ratio, savings


def optimize_pdf(
    input_file: Path, context: PdfContext, executor: Executor
) -> tuple[Path, Sequence[str]]:
    """Run the ``optimize_pdf`` plugin hook and log the size changes.

    Args:
        input_file: PDF to optimize.
        context: The PDF context; supplies the plugin manager, the output
            path and the original input file (``context.origin``).
        executor: Executor handed to the plugin hook.

    Returns:
        The ``(output_pdf, messages)`` pair returned by the plugin hook.
    """
    output_file = context.get_path('optimize.pdf')
    linearize = should_linearize(input_file, context)
    output_pdf, messages = context.plugin_manager.optimize_pdf(
        input_pdf=input_file,
        output_pdf=output_file,
        context=context,
        executor=executor,
        linearize=linearize,
    )

    # Size change caused by this step alone.
    step_ratio, step_savings = _file_size_ratio(input_file, output_file)
    if step_ratio:
        log.info(
            f"Image optimization ratio: {step_ratio:.2f} "
            f"savings: {(step_savings):.1%}"
        )

    # Size change relative to the original input file.
    total_ratio, total_savings = _file_size_ratio(context.origin, output_file)
    if total_ratio:
        log.info(
            f"Total file size ratio: {total_ratio:.2f} "
            f"savings: {(total_savings):.1%}"
        )
    return output_pdf, messages


def enumerate_compress_ranges(
    iterable: Iterable[T],
) -> Iterator[tuple[tuple[int, int], T | None]]:
    """Enumerate elements with 1-based index ranges, merging runs of gaps.

    Every truthy element is yielded on its own with a range of length one.
    A run of consecutive falsy elements is collapsed into a single item
    whose range spans the whole run.

    Args:
        iterable: An iterable of elements to enumerate.

    Yields:
        A tuple ``((first, last), element)`` with inclusive 1-based indices.
        If the element is None, the range represents a skipped range of
        indices.
    """
    gap_start = None
    position = None
    for position, element in enumerate(iterable, start=1):
        if not element:
            # Remember where the current run of gaps began.
            if gap_start is None:
                gap_start = position
            continue
        if gap_start is not None:
            yield (gap_start, position - 1), None
            gap_start = None
        yield (position, position), element

    # A run of gaps that reaches the end of the iterable is still pending.
    if gap_start is not None:
        yield (gap_start, position), None


def merge_sidecars(txt_files: Iterable[Path | None], context: PdfContext) -> Path:
    """Concatenate the per-page sidecar text files into one file.

    Pages are separated by a form feed. Pages without a sidecar file are
    represented by a ``[OCR skipped on page(s) ...]`` placeholder, with
    consecutive skipped pages merged into one placeholder.

    Args:
        txt_files: Per-page sidecar paths in page order; falsy entries mark
            pages without OCR text.
        context: The PDF context that provides the output path.

    Returns:
        Path of the merged ``sidecar.txt`` file.
    """
    output_file = context.get_path('sidecar.txt')
    with open(output_file, 'w', encoding="utf-8") as stream:
        for (first, last), txt_file in enumerate_compress_ranges(txt_files):
            if first != 1:
                stream.write('\f')  # Form feed between pages for all pages after first
            if not txt_file:
                pages = f"{first}" if first == last else f"{first}-{last}"
                stream.write(f'[OCR skipped on page(s) {pages}]')
                continue
            page_text = txt_file.read_text(encoding="utf-8")
            # Some versions of Tesseract add a form feed at the end and
            # others don't. Remove it if it exists, since we add one manually.
            stream.write(page_text.removesuffix('\f'))
    return output_file


def copy_final(
    input_file: Path, output_file: str | Path | BinaryIO, original_file: Path | None
) -> None:
    """Copy the final temporary file to the output destination.

    The destination may be ``'-'`` (standard output), an object that has a
    ``writable`` attribute (treated as a binary stream), or a filesystem path.

    Args:
        input_file (Path): The intermediate input file to copy.
        output_file (str | Path | BinaryIO): The output file to copy to.
        original_file: The original file to copy attributes from. Not used
            by this function's body.

    Returns:
        None
    """
    log.debug('%s -> %s', input_file, output_file)
    with input_file.open('rb') as source:
        if output_file == '-':
            copyfileobj(source, sys.stdout.buffer)  # type: ignore[misc]
            sys.stdout.flush()
            return

        if hasattr(output_file, 'writable'):
            sink = cast(BinaryIO, output_file)
            copyfileobj(source, sink)  # type: ignore[misc]
            # Not every stream-like object offers flush().
            with suppress(AttributeError):
                sink.flush()
            return

        # At this point we overwrite the output_file specified by the user
        # use copyfileobj because then we use open() to create the file and
        # get the appropriate umask, ownership, etc.
        with open(output_file, 'w+b') as sink:
            copyfileobj(source, sink)


================================================
FILE: src/ocrmypdf/_pipelines/__init__.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0


from __future__ import annotations


================================================
FILE: src/ocrmypdf/_pipelines/_common.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import json
import logging
import logging.handlers
import os
import shutil
import sys
import threading
from collections.abc import Callable, Sequence
from concurrent.futures.process import BrokenProcessPool
from concurrent.futures.thread import BrokenThreadPool
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, NamedTuple, cast

if TYPE_CHECKING:
    from ocrmypdf.hocrtransform import OcrElement

import PIL
import PIL.Image
from pikepdf import Pdf

from ocrmypdf._annots import remove_broken_goto_annotations
from ocrmypdf._concurrent import Executor, setup_executor
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._logging import PageNumberFilter
from ocrmypdf._metadata import metadata_fixup
from ocrmypdf._options import OcrOptions
from ocrmypdf._pipeline import (
    convert_to_pdfa,
    create_ocr_image,
    create_pdf_page_from_image,
    create_visible_page_jpg,
    generate_postscript_stub,
    get_orientation_correction,
    get_pdf_save_settings,
    get_pdfinfo,
    optimize_pdf,
    preprocess_clean,
    preprocess_deskew,
    preprocess_remove_background,
    rasterize,
    rasterize_preview,
    should_linearize,
    should_visible_page_image_use_jpg,
    try_auto_pdfa,
    try_speculative_pdfa,
)
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf._validation import (
    report_output_file_size,
)
from ocrmypdf.exceptions import ExitCode, ExitCodeException
from ocrmypdf.helpers import (
    check_pdf,
    pikepdf_enable_mmap,
    running_in_docker,
    running_in_snap,
    samefile,
)
from ocrmypdf.pdfa import file_claims_pdfa
from ocrmypdf.pdfinfo import PdfInfo

# Module-level logger.
log = logging.getLogger(__name__)
# Thread-local storage for the page number a thread is currently processing.
# The attribute is initialised to None only for the thread that runs this
# module-level code.
tls = threading.local()
tls.pageno = None


def _set_logging_tls(tls):
    """Inject current page number (when available) into log records."""
    old_factory = logging.getLogRecordFactory()

    def wrapper(*args, **kwargs):
        record = old_factory(*args, **kwargs)
        if hasattr(tls, 'pageno'):
            record.pageno = tls.pageno
        return record

    logging.setLogRecordFactory(wrapper)


# Import-time side effect: wrap the global log record factory so that records
# carry the page number stored on ``tls``.
_set_logging_tls(tls)


def set_thread_pageno(pageno: int | None) -> None:
    """Set page number (1-based) that the current thread is processing.

    The value is stored on the module's thread-local ``tls`` object.

    Args:
        pageno: The page number to record, or None.
    """
    tls.pageno = pageno


class PageResult(NamedTuple):
    """Result when a page is finished processing.

    Only ``pageno`` is required. Every other field has a default: ``None``
    for the optional paths and the OCR tree, ``0`` for the orientation
    correction.
    """

    pageno: int
    """Page number, 0-based."""

    pdf_page_from_image: Path | None = None
    """Single page PDF from image."""

    ocr: Path | None = None
    """Single page OCR PDF."""

    text: Path | None = None
    """Single page text file."""

    orientation_correction: int = 0
    """Orientation correction in degrees."""

    ocr_tree: OcrElement | None = None
    """Direct OcrElement tree (when using generate_ocr() API)."""


class HOCRResultEncoder(json.JSONEncoder):
    """JSON encoder that writes ``Path`` objects as ``{'Path': '<string>'}``."""

    def default(self, obj):
        """Encode ``Path`` values; defer everything else to the base class."""
        if not isinstance(obj, Path):
            return super().default(obj)
        return {'Path': str(obj)}


class HOCRResultDecoder(json.JSONDecoder):
    """JSON decoder that turns objects containing a ``'Path'`` key into ``Path``."""

    def __init__(self, *args, **kwargs):
        """Create the decoder, always using ``dict_to_object`` as object hook."""
        kwargs.update(object_hook=self.dict_to_object)
        super().__init__(*args, **kwargs)

    def dict_to_object(self, d):
        """Return ``Path(d['Path'])`` when the key is present, else ``d``."""
        return Path(d['Path']) if 'Path' in d else d


@dataclass
class HOCRResult:
    """Result when hOCR is finished processing.

    Instances can be converted to and from JSON with ``to_json`` and
    ``from_json``; ``Path`` values are handled by ``HOCRResultEncoder`` and
    ``HOCRResultDecoder``.
    """

    pageno: int
    """Page number, 0-based."""

    pdf_page_from_image: Path | None = None
    """Single page PDF from image."""

    hocr: Path | None = None
    """Single page hOCR file."""

    textpdf: Path | None = None
    """hOCR file after conversion to PDF."""

    orientation_correction: int = 0
    """Orientation correction in degrees."""

    ocr_tree: OcrElement | None = None
    """Direct OcrElement tree (when using generate_ocr() API)."""

    @classmethod
    def from_json(cls, json_str: str) -> HOCRResult:
        """Create an instance from a JSON string.

        The decoded JSON object's keys are passed to the constructor as
        keyword arguments.
        """
        field_values = json.loads(json_str, cls=HOCRResultDecoder)
        return cls(**field_values)

    def to_json(self) -> str:
        """Serialize the instance's attributes to a JSON string."""
        return json.dumps(vars(self), cls=HOCRResultEncoder)


def configure_debug_logging(
    log_filename: Path, prefix: str = ''
) -> tuple[logging.FileHandler, Callable[[], None]]:
    """Attach a DEBUG-level file handler to a logger.

    The file is opened lazily (``delay=True``).

    Args:
        log_filename: Where to put the log file.
        prefix: The logging domain prefix that should be sent to the log.

    Returns:
        The log handler, and a function that detaches and closes it.
    """
    handler = logging.FileHandler(log_filename, delay=True)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter(
            '[%(asctime)s] - %(name)s - %(levelname)7s -%(pageno)s %(message)s'
        )
    )
    handler.addFilter(PageNumberFilter())
    logging.getLogger(prefix).addHandler(handler)

    def remover():
        # Failing to tear down the log must not abort the caller; report it.
        try:
            logging.getLogger(prefix).removeHandler(handler)
            handler.close()
        except OSError as e:
            print(e, file=sys.stderr)

    return handler, remover


def worker_init(max_pixels: int | None) -> None:
    """Initialize a worker thread or process.

    Args:
        max_pixels: Value assigned to ``PIL.Image.MAX_IMAGE_PIXELS`` in the
            worker; it is stored as given, including ``None``.
    """
    # In Windows, child process will not inherit our change to this value in
    # the parent process, so ensure workers get it set. Not needed when running
    # threaded, but harmless to set again.
    PIL.Image.MAX_IMAGE_PIXELS = max_pixels
    # Apply the same pikepdf mmap helper that setup_pipeline() calls.
    pikepdf_enable_mmap()


@contextmanager
def manage_debug_log_handler(
    *,
    options: OcrOptions,
    work_folder: Path,
):
    """Context manager that keeps a ``debug.log`` in the work folder.

    The log is only set up when temporary files are kept or verbosity is at
    least 1, and never while ``PYTEST_CURRENT_TEST`` is set in the
    environment. On exit the handler is removed again if one was installed.

    Args:
        options: The OCR options (``keep_temporary_files`` and ``verbose``
            are consulted).
        work_folder: Folder in which ``debug.log`` is created.
    """
    remover = None
    wants_debug_log = options.keep_temporary_files or options.verbose >= 1
    if wants_debug_log and not os.environ.get('PYTEST_CURRENT_TEST', ''):
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        _handler, remover = configure_debug_logging(
            work_folder / "debug.log", prefix=""
        )  # pragma: no cover
    try:
        yield
    finally:
        if remover is not None:
            remover()


def _print_temp_folder_location(work_folder: Path):
    """Tell the user on stderr where the retained work folder is.

    When running inside Docker or Snap, an extra line explains that the
    files are inside the container.
    """
    lines = [f"Temporary working files retained at:\n{work_folder}"]

    container_note = None
    if running_in_docker():  # pragma: no cover
        container_note = (
            "OCRmyPDF is running in a Docker container, "
            "so the files will be inside the container."
        )
    elif running_in_snap():  # pragma: no cover
        container_note = (
            "OCRmyPDF is running in a Snap container, "
            "so the files will be inside the container."
        )
    if container_note is not None:
        lines.append(container_note)

    print('\n'.join(lines), file=sys.stderr)


@contextmanager
def manage_work_folder(*, work_folder: Path, retain: bool, print_location: bool):
    """Context manager that disposes of the work folder on exit.

    Args:
        work_folder: The folder to manage; it is also the yielded value.
        retain: Keep the folder instead of deleting it.
        print_location: When the folder is retained, print where it is.
    """
    try:
        yield work_folder
    finally:
        if not retain:
            # Best-effort removal; errors while deleting are ignored.
            shutil.rmtree(work_folder, ignore_errors=True)
        elif print_location:
            _print_temp_folder_location(work_folder)


def cli_exception_handler(
    fn: Callable[[OcrOptions, OcrmypdfPluginManager], ExitCode],
    options: OcrOptions,
    plugin_manager: OcrmypdfPluginManager,
) -> ExitCode:
    """Convert exceptions into command line error messages and exit codes.

    When known exceptions are raised, the exception message is logged and the
    matching exit code is returned. When unknown exceptions are raised, the
    exception traceback is logged and ``ExitCode.other_error`` is returned.
    No exception derived from ``Exception`` escapes this function.

    Args:
        fn: The pipeline entry point; called as ``fn(options, plugin_manager)``.
        options: The OCR options; ``options.verbose`` selects whether a
            traceback is logged for known exceptions.
        plugin_manager: The plugin manager passed through to ``fn``.

    Returns:
        The exit code returned by ``fn``, or the exit code that corresponds
        to the exception it raised.
    """
    try:
        # We cannot use a generator and yield here, as would be the usual pattern
        # for exception handling context managers, because we need to return an exit
        # code.
        return fn(options, plugin_manager)
    except KeyboardInterrupt:
        if options.verbose >= 1:
            log.exception("KeyboardInterrupt")
        else:
            log.error("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except ExitCodeException as e:
        if options.verbose >= 1:
            log.exception("ExitCodeException")
        elif str(e):
            log.error("%s: %s", type(e).__name__, str(e))
        else:
            log.error(type(e).__name__)
        return e.exit_code
    except ValueError as e:
        # Convert Pydantic validation errors to BadArgsError for proper exit code
        message = str(e).lower()
        if "validation error" in message or "value error" in message:
            if options.verbose >= 1:
                log.exception("Validation error")
            else:
                log.error("Invalid argument: %s", str(e))
            return ExitCode.bad_args
        # Any other ValueError is an unexpected error. It has to be handled
        # right here: an exception re-raised from this handler would not be
        # caught by the ``except Exception`` clause below, because sibling
        # except clauses only guard the ``try`` body.
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    except PIL.Image.DecompressionBombError:
        log.exception(
            "A decompression bomb error was encountered while executing the "
            "pipeline. Use the argument --max-image-mpixels to raise the maximum "
            "image pixel limit."
        )
        return ExitCode.other_error
    except (
        BrokenProcessPool,
        BrokenThreadPool,
    ):
        log.exception(
            "A worker process was terminated unexpectedly. This is known to occur if "
            "processing your file takes all available swap space and RAM. It may "
            "help to try again with a smaller number of jobs, using the --jobs "
            "argument."
        )
        return ExitCode.child_process_error
    except Exception:  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error


def setup_pipeline(
    options: OcrOptions,
    plugin_manager: OcrmypdfPluginManager,
) -> Executor:
    """Apply process-wide settings and create the executor for the pipeline.

    Args:
        options: The OCR options; ``max_image_mpixels`` sets PIL's pixel limit.
        plugin_manager: The plugin manager used to set up the executor.

    Returns:
        The executor returned by ``setup_executor``.
    """
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    # Note: OcrOptions is immutable, so we can't modify options.jobs directly
    # The jobs field should already be set correctly during OcrOptions creation

    # Apply PIL max image pixels side effect; a limit of zero means no limit.
    pixel_limit = int(options.max_image_mpixels * 1_000_000)
    PIL.Image.MAX_IMAGE_PIXELS = (
        pixel_limit if pixel_limit != 0 else None  # type: ignore
    )

    pikepdf_enable_mmap()
    return setup_executor(plugin_manager)


def do_get_pdfinfo(pdf_path: Path, executor: Executor, options) -> PdfInfo:
    """Call ``get_pdfinfo`` with arguments derived from ``options``.

    Args:
        pdf_path: The PDF to examine.
        executor: Executor passed on to ``get_pdfinfo``.
        options: The OCR options; ``pages``, ``redo_ocr``, ``progress_bar``,
            ``jobs`` and ``use_threads`` are read.

    Returns:
        The ``PdfInfo`` produced by ``get_pdfinfo``.
    """
    # options.pages might still be a range string that needs conversion.
    pages_to_check = options.pages
    if isinstance(pages_to_check, str):
        from ocrmypdf._options import _pages_from_ranges

        pages_to_check = _pages_from_ranges(pages_to_check)

    pdfinfo_kwargs = dict(
        executor=executor,
        detailed_analysis=options.redo_ocr,
        progbar=options.progress_bar,
        max_workers=options.jobs,
        use_threads=options.use_threads,
        check_pages=pages_to_check,
    )
    return get_pdfinfo(pdf_path, **pdfinfo_kwargs)


def preprocess(
    page_context: PageContext,
    image: Path,
    remove_background: bool,
    deskew: bool,
    clean: bool,
) -> Path:
    """Run the selected preprocessing steps on an image, in a fixed order.

    The order is: background removal, deskew, clean. Each enabled step
    receives the output of the previous one.

    Args:
        page_context: Context of the page being processed.
        image: The image to start from.
        remove_background: Whether to run background removal.
        deskew: Whether to run deskewing.
        clean: Whether to run cleaning.

    Returns:
        The path produced by the last enabled step, or ``image`` itself when
        no step is enabled.
    """
    current = image
    if remove_background:
        current = preprocess_remove_background(current, page_context)
    if deskew:
        current = preprocess_deskew(current, page_context)
    if clean:
        current = preprocess_clean(current, page_context)
    return current


def make_intermediate_images(
    page_context: PageContext, orientation_correction: int
) -> tuple[Path, Path | None]:
    """Create intermediate and preprocessed images for OCR.

    Args:
        page_context: Context of the page being processed.
        orientation_correction: Correction passed to ``rasterize``.

    Returns:
        A tuple ``(ocr_image, preprocess_out)``. ``ocr_image`` is the image
        prepared for OCR. ``preprocess_out`` is the preprocessed image of the
        rasterization that kept vectors; it stays ``None`` when
        ``lossless_reconstruction`` is set and any of ``clean``,
        ``clean_final`` or ``remove_vectors`` is enabled.
    """
    options = page_context.options

    ocr_image = preprocess_out = None
    # Base rasterization of the page, vectors included.
    rasterize_out = rasterize(
        page_context.origin,
        page_context,
        correction=orientation_correction,
        remove_vectors=False,
    )

    if not any([options.clean, options.clean_final, options.remove_vectors]):
        # Simple case: one preprocessed image serves both purposes.
        ocr_image = preprocess_out = preprocess(
            page_context,
            rasterize_out,
            options.remove_background,
            options.deskew,
            clean=False,
        )
    else:
        if not options.lossless_reconstruction:
            # This image is cleaned only if clean_final is set.
            preprocess_out = preprocess(
                page_context,
                rasterize_out,
                options.remove_background,
                options.deskew,
                clean=options.clean_final,
            )
        if options.remove_vectors:
            # Second rasterization without vectors, used only for OCR.
            rasterize_ocr_out = rasterize(
                page_context.origin,
                page_context,
                correction=orientation_correction,
                remove_vectors=True,
                output_tag='_ocr',
            )
        else:
            rasterize_ocr_out = rasterize_out

        if (
            preprocess_out
            and rasterize_ocr_out == rasterize_out
            and options.clean == options.clean_final
        ):
            # Optimization: image for OCR is identical to presentation image
            ocr_image = preprocess_out
        else:
            # The OCR image is cleaned according to options.clean.
            ocr_image = preprocess(
                page_context,
                rasterize_ocr_out,
                options.remove_background,
                options.deskew,
                clean=options.clean,
            )
    return ocr_image, preprocess_out


def process_page(page_context: PageContext) -> tuple[Path, Path | None, int]:
    """Produce the OCR image, the visible page PDF and the orientation.

    Args:
        page_context: Context of the page being processed.

    Returns:
        A tuple ``(ocr_image, pdf_page_from_image, orientation_correction)``.
        ``pdf_page_from_image`` is ``None`` when ``lossless_reconstruction``
        is enabled. ``orientation_correction`` is 0 unless ``rotate_pages``
        is enabled.
    """
    options = page_context.options

    orientation_correction = 0
    if options.rotate_pages:
        # A preview rasterization is used to detect the page orientation.
        preview = rasterize_preview(page_context.origin, page_context)
        orientation_correction = get_orientation_correction(preview, page_context)

    ocr_image, preprocess_out = make_intermediate_images(
        page_context, orientation_correction
    )
    ocr_image_out = create_ocr_image(ocr_image, page_context)

    if options.lossless_reconstruction:
        # No page image is needed in this mode.
        return ocr_image_out, None, orientation_correction

    assert preprocess_out
    visible_image = preprocess_out
    if should_visible_page_image_use_jpg(page_context.pageinfo):
        visible_image = create_visible_page_jpg(visible_image, page_context)

    filtered_image = page_context.plugin_manager.filter_page_image(
        page=page_context, image_filename=visible_image
    )
    if filtered_image is not None:  # None if no hook is present
        visible_image = filtered_image

    pdf_page_from_image_out = create_pdf_page_from_image(
        visible_image, page_context, orientation_correction
    )
    return ocr_image_out, pdf_page_from_image_out, orientation_correction


def postprocess(
    pdf_file: Path, context: PdfContext, executor: Executor
) -> tuple[Path, Sequence[str]]:
    """Postprocess the PDF file.

    Steps, in order: remove broken GoTo annotations, optional PDF/A
    conversion depending on ``options.output_type``, metadata fixup, and
    optimization.

    Args:
        pdf_file: The PDF to postprocess.
        context: The PDF context.
        executor: Executor passed on to the optimizer.

    Returns:
        The result of ``optimize_pdf``: the output path and its messages.
    """
    with Pdf.open(pdf_file) as pdf:
        fix_annots = context.get_path('fix_annots.pdf')
        annots_removed = remove_broken_goto_annotations(pdf)
        if annots_removed:
            pdf.save(fix_annots)
    # Only switch to the repaired copy if something was actually changed.
    pdf_out = fix_annots if annots_removed else pdf_file

    output_type = context.options.output_type
    if output_type == 'auto':
        # Best effort PDF/A - never uses Ghostscript
        pdf_out, actual_type = try_auto_pdfa(pdf_out, context)
        # Store actual output type for reporting
        context.options.extra_attrs['_actual_output_type'] = actual_type
    elif output_type.startswith('pdfa'):
        # Required PDF/A - uses Ghostscript as fallback
        speculative_result = try_speculative_pdfa(pdf_out, context)
        if speculative_result is None:
            # Fall back to Ghostscript conversion
            ps_stub_out = generate_postscript_stub(context)
            pdf_out = convert_to_pdfa(pdf_out, ps_stub_out, context)
        else:
            pdf_out = speculative_result

    optimizing = context.plugin_manager.is_optimization_enabled(context=context)
    save_settings = get_pdf_save_settings(context.options.output_type)
    # Linearization happens here only when the optimizer will not run.
    save_settings['linearize'] = not optimizing and should_linearize(pdf_out, context)

    pdf_out = metadata_fixup(pdf_out, context, pdf_save_settings=save_settings)
    return optimize_pdf(pdf_out, context, executor)


def report_output_pdf(options, start_input_file, optimize_messages) -> ExitCode:
    """Log a summary of the produced output and compute the exit code.

    Nothing is validated when the output goes to stdout (``'-'``), to a
    writable stream, or to the null device. For a regular output file the
    PDF/A claim is reported according to ``options.output_type``, the file
    is checked with ``check_pdf`` and the file size report is emitted.

    Args:
        options: Options object; ``output_file``, ``output_type`` and
            ``extra_attrs`` are read.
        start_input_file: Forwarded to ``report_output_file_size``.
        optimize_messages: Forwarded to ``report_output_file_size``.

    Returns:
        ``ExitCode.pdfa_conversion_failed`` when PDF/A was requested but the
        file does not claim it, ``ExitCode.invalid_output_pdf`` when
        ``check_pdf`` rejects the file, otherwise ``ExitCode.ok``.
    """
    destination = options.output_file

    if destination == '-':
        log.info("Output sent to stdout")
        return ExitCode.ok
    if hasattr(destination, 'writable') and destination.writable():
        log.info("Output written to stream")
        return ExitCode.ok
    if samefile(destination, Path(os.devnull)):
        # Say nothing when sending to dev null
        return ExitCode.ok

    requested_type = options.output_type
    if requested_type == 'auto':
        # For 'auto' mode, check what we actually produced
        achieved_type = options.extra_attrs.get('_actual_output_type', 'pdf')
        claim = file_claims_pdfa(destination)
        if not claim['pass']:
            # Regular PDF - this is expected for auto mode fallback
            log.info("Output file is a PDF (auto mode)")
        elif achieved_type == 'pdfa':
            log.info(
                "Output file is a %s (auto mode achieved PDF/A)",
                claim['conformance'],
            )
        else:
            # Unexpectedly got PDF/A
            log.info("Output file is a %s", claim['conformance'])
    elif requested_type.startswith('pdfa'):
        claim = file_claims_pdfa(destination)
        if not claim['pass']:
            log.warning(
                "Output file is a valid PDF, but conversion to PDF/A did not "
                "succeed (issue: %s)",
                claim['conformance'],
            )
            return ExitCode.pdfa_conversion_failed
        log.info("Output file is a %s (as expected)", claim['conformance'])

    if not check_pdf(destination):
        log.warning('Output file: The generated PDF is INVALID')
        return ExitCode.invalid_output_pdf

    report_output_file_size(
        options, start_input_file, destination, optimize_messages
    )
    return ExitCode.ok


================================================
FILE: src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py
================================================
# SPDX-FileCopyrightText: 2019-2023 James R. Barlow
# SPDX-FileCopyrightText: 2019 Martin Wind
# SPDX-License-Identifier: MPL-2.0

"""Implements the concurrent and page synchronous parts of the pipeline."""

from __future__ import annotations

import logging
import logging.handlers
from collections.abc import Sequence
from functools import partial

import PIL

from ocrmypdf._concurrent import Executor
from ocrmypdf._graft import OcrGrafter
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._options import OcrOptions
from ocrmypdf._pipeline import copy_final
from ocrmypdf._pipelines._common import (
    HOCRResult,
    do_get_pdfinfo,
    manage_work_folder,
    postprocess,
    report_output_pdf,
    set_thread_pageno,
    setup_pipeline,
    worker_init,
)
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.helpers import available_cpu_count

log = logging.getLogger(__name__)


def _exec_hocrtransform_sync(page_context: PageContext) -> HOCRResult:
    """Restore the saved per-page hOCR result, if one exists.

    Returns:
        The ``HOCRResult`` deserialized from the page's ``hocr.json`` with
        ``textpdf`` pointing at the page's ``ocr_hocr.hocr`` path, or a
        result carrying only the page number when ``hocr.json`` is missing.
    """
    saved_state = page_context.get_path('hocr.json')
    if saved_state.exists():
        restored = HOCRResult.from_json(saved_state.read_text())
        # hOCR path is passed directly to the grafting phase where fpdf2
        # renders it
        restored.textpdf = page_context.get_path('ocr_hocr.hocr')
        return restored

    # No hOCR file, so no OCR was performed on this page.
    return HOCRResult(pageno=page_context.pageno)


def exec_hocr_to_ocr_pdf(context: PdfContext, executor: Executor) -> Sequence[str]:
    """Graft stored hOCR results onto the PDF and write the final output.

    Every page is handed to ``_exec_hocrtransform_sync`` through the
    executor; each finished page is grafted by the completion callback.
    Unless ``options.output_type`` is ``'none'``, the grafted PDF is then
    postprocessed and copied to ``options.output_file``.

    Args:
        context: Job context for the PDF being processed.
        executor: Executor used for the per-page tasks and postprocessing.

    Returns:
        The messages returned by ``postprocess``, or an empty list when no
        output is requested.
    """
    options = context.options
    page_count = len(context.pdfinfo)
    worker_count = min(page_count, options.jobs or available_cpu_count())
    if worker_count > 1:
        log.info("Continue processing %d pages concurrently", worker_count)

    grafter = OcrGrafter(context)

    def on_page_ready(result: HOCRResult, pbar: ProgressBar):
        """Graft one page's text layer; tick the progress bar before and after."""
        try:
            set_thread_pageno(result.pageno + 1)
            pbar.update()
            grafter.graft_page(
                pageno=result.pageno,
                image=result.pdf_page_from_image,
                ocr_output=result.textpdf,
                ocr_tree=result.ocr_tree,
                autorotate_correction=result.orientation_correction,
            )
            pbar.update()
        finally:
            set_thread_pageno(None)

    # Two updates are issued per page, so the total is doubled and scaled by 0.5
    progress_settings = {
        'total': 2 * page_count,
        'desc': 'Grafting hOCR to PDF',
        'unit': 'page',
        'unit_scale': 0.5,
        'disable': not options.progress_bar,
    }
    executor(
        use_threads=options.use_threads,
        max_workers=worker_count,
        progress_kwargs=progress_settings,
        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=_exec_hocrtransform_sync,
        task_arguments=context.get_page_context_args(),
        task_finished=on_page_ready,
    )

    grafted_pdf = grafter.finalize()
    if options.output_type == 'none':
        return []

    # PDF/A and metadata
    log.info("Postprocessing...")
    final_pdf, messages = postprocess(grafted_pdf, context, executor)

    # Copy PDF file to destination (we don't know the input PDF file name)
    copy_final(final_pdf, options.output_file, None)
    return messages


def run_hocr_to_ocr_pdf_pipeline(
    options: OcrOptions,
    *,
    plugin_manager: OcrmypdfPluginManager,
) -> ExitCode:
    """Turn the hOCR data in ``options.work_folder`` into the final output PDF.

    The work folder is retained and is expected to contain ``origin.pdf``,
    which is the file this pipeline reads.

    Args:
        options: The OCR options for this run.
        plugin_manager: Plugin manager used to set up the pipeline and
            check the options.

    Returns:
        The exit code computed by ``report_output_pdf``.
    """
    folder_manager = manage_work_folder(
        work_folder=options.work_folder, retain=True, print_location=False
    )
    with folder_manager as work_folder:
        executor = setup_pipeline(options, plugin_manager)
        origin_pdf = work_folder / 'origin.pdf'

        # Gather pdfinfo and create context
        context = PdfContext(
            options,
            work_folder,
            origin_pdf,
            do_get_pdfinfo(origin_pdf, executor, options),
            plugin_manager,
        )
        plugin_manager.check_options(options=options)

        messages = exec_hocr_to_ocr_pdf(context, executor)
        return report_output_pdf(options, origin_pdf, messages)


================================================
FILE: src/ocrmypdf/_pipelines/ocr.py
================================================
# SPDX-FileCopyrightText: 2019-2023 James R. Barlow
# SPDX-FileCopyrightText: 2019 Martin Wind
# SPDX-License-Identifier: MPL-2.0

"""Implements the concurrent and page synchronous parts of the pipeline."""

from __future__ import annotations

import logging
import logging.handlers
from collections.abc import Sequence
from functools import partial
from pathlib import Path
from tempfile import mkdtemp

import PIL

from ocrmypdf._concurrent import Executor
from ocrmypdf._graft import OcrGrafter
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._options import OcrOptions
from ocrmypdf._pipeline import (
    copy_final,
    is_ocr_required,
    merge_sidecars,
    ocr_engine_direct,
    ocr_engine_hocr,
    ocr_engine_textonly_pdf,
    triage,
    validate_pdfinfo_options,
)
from ocrmypdf._pipelines._common import (
    PageResult,
    cli_exception_handler,
    do_get_pdfinfo,
    manage_debug_log_handler,
    manage_work_folder,
    postprocess,
    process_page,
    report_output_pdf,
    set_thread_pageno,
    setup_pipeline,
    worker_init,
)
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf._validation import (
    check_requested_output_file,
    create_input_file,
)
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.helpers import available_cpu_count
from ocrmypdf.models.ocr_element import OcrElement

log = logging.getLogger(__name__)


def _image_to_ocr_text(
    page_context: PageContext, ocr_image_out: Path
) -> tuple[Path | None, Path, OcrElement | None]:
    """Run the OCR engine on a page image using the configured PDF renderer.

    Args:
        page_context: Context of the page being processed.
        ocr_image_out: Image to run OCR on.

    Returns:
        A ``(ocr_out, text_out, ocr_tree)`` tuple. When the engine supports
        ``generate_ocr``, ``ocr_out`` is ``None`` and ``ocr_tree`` is set;
        in all other cases ``ocr_tree`` is ``None``.

    Raises:
        NotImplementedError: If ``options.pdf_renderer`` is not one of
            ``'auto'``, ``'fpdf2'`` or ``'sandwich'``.
    """
    options = page_context.options
    renderer = options.pdf_renderer

    if renderer == 'sandwich':
        ocr_out, text_out = ocr_engine_textonly_pdf(ocr_image_out, page_context)
        return ocr_out, text_out, None

    if renderer not in ('auto', 'fpdf2'):
        raise NotImplementedError(f"pdf_renderer {renderer}")

    # fpdf2 is the default renderer (auto resolves to fpdf2).
    # Use generate_ocr() if the engine supports it, otherwise use hOCR path
    engine = page_context.plugin_manager.get_ocr_engine(options=options)
    if engine and engine.supports_generate_ocr():
        ocr_tree, text_out = ocr_engine_direct(ocr_image_out, page_context)
        return None, text_out, ocr_tree

    ocr_out, text_out = ocr_engine_hocr(ocr_image_out, page_context)
    return ocr_out, text_out, None


def _exec_page_sync(page_context: PageContext) -> PageResult:
    """Process one page: prepare its images, run OCR and collect the results.

    Returns:
        A ``PageResult`` for the page. When ``is_ocr_required`` says no OCR
        is needed, only the page number is filled in.
    """
    set_thread_pageno(page_context.pageno + 1)

    if not is_ocr_required(page_context):
        return PageResult(pageno=page_context.pageno)

    ocr_image, page_pdf, rotation_fix = process_page(page_context)
    ocr_out, text_out, ocr_tree = _image_to_ocr_text(page_context, ocr_image)

    return PageResult(
        pageno=page_context.pageno,
        pdf_page_from_image=page_pdf,
        ocr=ocr_out,
        text=text_out,
        orientation_correction=rotation_fix,
        ocr_tree=ocr_tree,
    )


def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]:
    """Run the per-page OCR tasks through the executor and assemble the output.

    Each finished page has its sidecar text recorded and its OCR layer
    grafted by the completion callback. Afterwards the sidecar file is
    written if requested, and, unless ``options.output_type`` is ``'none'``,
    the grafted PDF is postprocessed and copied to ``options.output_file``.

    Args:
        context: Job context for the PDF being processed.
        executor: Executor used for the per-page tasks and postprocessing.

    Returns:
        The messages returned by ``postprocess``, or an empty list when no
        output is requested.
    """
    options = context.options
    page_count = len(context.pdfinfo)
    worker_count = min(page_count, options.jobs or available_cpu_count())
    if worker_count > 1:
        log.info("Starting processing with %d workers concurrently", worker_count)

    # One slot per page, filled in as pages complete
    sidecars: list[Path | None] = [None] * page_count
    grafter = OcrGrafter(context)

    def on_page_done(result: PageResult, pbar: ProgressBar):
        """Record the page's sidecar text and graft its OCR layer."""
        try:
            set_thread_pageno(result.pageno + 1)
            sidecars[result.pageno] = result.text
            pbar.update(0.5)
            grafter.graft_page(
                pageno=result.pageno,
                image=result.pdf_page_from_image,
                ocr_output=result.ocr,
                ocr_tree=result.ocr_tree,
                autorotate_correction=result.orientation_correction,
            )
            pbar.update(0.5)
        finally:
            set_thread_pageno(None)

    if options.ocr_engine == 'none':
        description = 'Image processing'
    else:
        description = 'OCR'
    progress_settings = {
        'total': page_count,
        'desc': description,
        'unit': 'page',
        'disable': not options.progress_bar,
    }
    executor(
        use_threads=options.use_threads,
        max_workers=worker_count,
        progress_kwargs=progress_settings,
        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=_exec_page_sync,
        task_arguments=context.get_page_context_args(),
        task_finished=on_page_done,
    )

    # Output sidecar text
    if options.sidecar:
        merged_text = merge_sidecars(sidecars, context)
        # Copy text file to destination
        copy_final(merged_text, options.sidecar, options.input_file)

    # Merge layers to one single pdf
    grafted_pdf = grafter.finalize()
    if options.output_type == 'none':
        return []

    # PDF/A and metadata
    log.info("Postprocessing...")
    final_pdf, messages = postprocess(grafted_pdf, context, executor)

    # Copy PDF file to destination
    copy_final(final_pdf, options.output_file, options.input_file)
    return messages


def _run_pipeline(
    options: OcrOptions,
    plugin_manager: OcrmypdfPluginManager,
) -> ExitCode:
    """Run the complete OCR pipeline inside a fresh temporary work folder.

    The work folder is created with ``mkdtemp`` and is retained (and its
    location printed) only when ``options.keep_temporary_files`` is set.

    Args:
        options: The OCR options for this run.
        plugin_manager: Plugin manager used throughout the pipeline.

    Returns:
        The exit code computed by ``report_output_pdf``.
    """
    temp_root = Path(mkdtemp(prefix="ocrmypdf.io."))
    keep_temp = options.keep_temporary_files

    with (
        manage_work_folder(
            work_folder=temp_root, retain=keep_temp, print_location=keep_temp
        ) as work_folder,
        manage_debug_log_handler(options=options, work_folder=work_folder),
    ):
        executor = setup_pipeline(options, plugin_manager)
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)

        # Triage image or pdf
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )

        # Gather pdfinfo and create context
        context = PdfContext(
            options,
            work_folder,
            origin_pdf,
            do_get_pdfinfo(origin_pdf, executor, options),
            plugin_manager,
        )

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)

        # Execute the pipeline, then report on what was written
        messages = exec_concurrent(context, executor)
        return report_output_pdf(options, start_input_file, messages)


def run_pipeline_cli(
    options: OcrOptions,
    *,
    plugin_manager: OcrmypdfPluginManager,
) -> ExitCode:
    """Run the OCR pipeline wrapped in the command line exception handler.

    Args:
        options: The parsed OCR options.
        plugin_manager: The plugin manager to use. This is a required
            keyword-only argument.

    Returns:
        The exit code returned by ``cli_exception_handler``.
    """
    exit_code = cli_exception_handler(_run_pipeline, options, plugin_manager)
    return exit_code


def run_pipeline(
    options: OcrOptions,
    *,
    plugin_manager: OcrmypdfPluginManager,
) -> ExitCode:
    """Run the OCR pipeline directly, letting exceptions propagate to the caller.

    Args:
        options: The parsed OCR options.
        plugin_manager: The plugin manager to use. This is a required
            keyword-only argument.

    Returns:
        The exit code returned by the pipeline.
    """
    exit_code = _run_pipeline(options, plugin_manager)
    return exit_code


================================================
FILE: src/ocrmypdf/_pipelines/pdf_to_hocr.py
================================================
# SPDX-FileCopyrightText: 2019-2023 James R. Barlow
# SPDX-FileCopyrightText: 2019 Martin Wind
# SPDX-License-Identifier: MPL-2.0

"""Implements the concurrent and page synchronous parts of the pipeline."""

from __future__ import annotations

import logging
import logging.handlers
import shutil
from functools import partial

import PIL

from ocrmypdf._concurrent import Executor
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._options import OcrOptions
from ocrmypdf._pipeline import (
    is_ocr_required,
    ocr_engine_hocr,
    validate_pdfinfo_options,
)
from ocrmypdf._pipelines._common import (
    HOCRResult,
    do_get_pdfinfo,
    manage_work_folder,
    process_page,
    set_thread_pageno,
    setup_pipeline,
    worker_init,
)
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf.helpers import available_cpu_count

log = logging.getLogger(__name__)


def _exec_page_hocr_sync(page_context: PageContext) -> HOCRResult:
    """Produce hOCR for one page and persist the result as ``hocr.json``.

    Returns:
        The ``HOCRResult`` for the page. When ``is_ocr_required`` says no
        OCR is needed, only the page number is filled in and nothing is
        written.
    """
    set_thread_pageno(page_context.pageno + 1)

    if not is_ocr_required(page_context):
        return HOCRResult(pageno=page_context.pageno)

    ocr_image, page_pdf, rotation_fix = process_page(page_context)
    hocr_out, _text_out = ocr_engine_hocr(ocr_image, page_context)

    page_result = HOCRResult(
        pageno=page_context.pageno,
        pdf_page_from_image=page_pdf,
        hocr=hocr_out,
        orientation_correction=rotation_fix,
    )
    # Serialized next to the page's other work files so a later run can
    # pick it up again
    state_file = page_context.get_path('hocr.json')
    state_file.write_text(page_result.to_json())
    return page_result


def exec_pdf_to_hocr(context: PdfContext, executor: Executor) -> None:
    """Run ``_exec_page_hocr_sync`` for every page through the executor.

    Args:
        context: Job context for the PDF being processed.
        executor: Executor that schedules the per-page tasks.
    """
    options = context.options
    page_count = len(context.pdfinfo)
    worker_count = min(page_count, options.jobs or available_cpu_count())
    if worker_count > 1:
        log.info("Starting processing with %d workers concurrently", worker_count)

    progress_settings = {
        'total': 2 * page_count,
        'desc': 'hOCR',
        'unit': 'page',
        'unit_scale': 0.5,
        'disable': not options.progress_bar,
    }
    executor(
        use_threads=options.use_threads,
        max_workers=worker_count,
        progress_kwargs=progress_settings,
        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=_exec_page_hocr_sync,
        task_arguments=context.get_page_context_args(),
    )


def run_hocr_pipeline(
    options: OcrOptions,
    *,
    plugin_manager: OcrmypdfPluginManager,
) -> None:
    """Run the pipeline that writes hOCR output into ``options.output_folder``.

    The output folder doubles as the (retained) work folder. The input file
    is copied there as ``origin.pdf``; page information is gathered from
    that copy, while the context is created with ``options.input_file``.

    Args:
        options: The OCR options for this run.
        plugin_manager: Plugin manager used throughout the pipeline.

    Raises:
        ValueError: If ``options.output_folder`` is ``None``.
    """
    target_folder = options.output_folder
    if target_folder is None:
        raise ValueError("output_folder must be specified for hOCR pipeline")

    folder_manager = manage_work_folder(
        work_folder=target_folder, retain=True, print_location=False
    )
    with folder_manager as work_folder:
        executor = setup_pipeline(options, plugin_manager)
        origin_pdf = work_folder / 'origin.pdf'
        shutil.copy2(options.input_file, origin_pdf)

        # Gather pdfinfo and create context
        page_info = do_get_pdfinfo(origin_pdf, executor, options)
        context = PdfContext(
            options, work_folder, options.input_file, page_info, plugin_manager
        )

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)
        exec_pdf_to_hocr(context, executor)


================================================
FILE: src/ocrmypdf/_plugin_manager.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Plugin manager using pluggy with type-safe interface."""

from __future__ import annotations

import importlib
import importlib.util
import pkgutil
import sys
from argparse import ArgumentParser
from collections.abc import Sequence
from logging import Handler
from pathlib import Path
from typing import TYPE_CHECKING

import pluggy
from pydantic import BaseModel

import ocrmypdf.builtin_plugins
from ocrmypdf import Executor, PdfContext, pluginspec
from ocrmypdf._options import OcrOptions
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.helpers import Resolution
from ocrmypdf.pluginspec import OcrEngine

if TYPE_CHECKING:
    from PIL import Image

    from ocrmypdf._jobcontext import PageContext
    from ocrmypdf.pdfinfo import PdfInfo


class OcrmypdfPluginManager:
    """Type-safe wrapper around pluggy.PluginManager.

    Capable of reconstructing itself in child workers via pickle.

    This class provides type-safe methods for all hooks defined in pluginspec.py,
    removing the need for unsafe `hook.method_name()` calls.
    """

    def __init__(
        self,
        *args,
        plugins: Sequence[str | Path],
        builtins: bool = True,
        **kwargs,
    ):
        """Create the underlying pluggy manager and register all plugins.

        Args:
            *args: Positional arguments forwarded to ``pluggy.PluginManager``.
            plugins: Plugins to load, given as file paths (``Path`` objects
                or strings ending in ``.py``) or dotted module names.
            builtins: Whether to register the modules found in
                ``ocrmypdf.builtin_plugins``.
            **kwargs: Keyword arguments forwarded to ``pluggy.PluginManager``.
        """
        # The constructor arguments are kept so the manager can be rebuilt
        # after unpickling (see __getstate__/__setstate__).
        self._init_args = args
        self._init_kwargs = kwargs
        self._plugins = plugins
        self._builtins = builtins
        self._pm = pluggy.PluginManager(*args, **kwargs)
        self._setup_plugins()

    @property
    def pluggy(self) -> pluggy.PluginManager:
        """Access the underlying pluggy.PluginManager for advanced use cases.

        This is useful for plugins that need to call methods like set_blocked()
        in their initialize hook.
        """
        return self._pm

    def __getstate__(self):
        """Return only the constructor arguments; the pluggy manager is not pickled."""
        return {
            'init_args': self._init_args,
            'plugins': self._plugins,
            'builtins': self._builtins,
            'init_kwargs': self._init_kwargs,
        }

    def __setstate__(self, state):
        """Rebuild the manager by re-running ``__init__`` with the saved arguments."""
        self.__init__(
            *state['init_args'],
            plugins=state['plugins'],
            builtins=state['builtins'],
            **state['init_kwargs'],
        )

    @staticmethod
    def _import_plugin_file(location: str | Path):
        """Import a plugin module from a file path and return the module.

        The module is named after the file's stem and is inserted into
        ``sys.modules`` before it is executed.

        Args:
            location: Path of the Python source file to import.

        Returns:
            The imported module object.

        Raises:
            ImportError: If no import spec or loader can be determined for
                ``location`` (for example, the file suffix is not one that
                Python can import).
        """
        module_name = Path(location).stem
        spec = importlib.util.spec_from_file_location(module_name, location)
        # spec_from_file_location returns None when no loader matches the
        # file; fail with a clear message instead of an AttributeError later.
        if spec is None or spec.loader is None:
            raise ImportError(
                f"Cannot load plugin '{location}': not an importable Python file",
                name=module_name,
                path=str(location),
            )
        module = importlib.util.module_from_spec(spec)
        sys.modules[module_name] = module
        spec.loader.exec_module(module)
        return module

    def _setup_plugins(self):
        """Register hook specs, then builtin, entry point and user plugins, in that order."""
        self._pm.add_hookspecs(pluginspec)

        # 1. Register builtins
        if self._builtins:
            # Sort by module name explicitly so ordering never depends on
            # comparing the finder objects inside ModuleInfo.
            builtin_infos = sorted(
                pkgutil.iter_modules(ocrmypdf.builtin_plugins.__path__),
                key=lambda info: info.name,
            )
            for info in builtin_infos:
                module = importlib.import_module(
                    f'ocrmypdf.builtin_plugins.{info.name}'
                )
                self._pm.register(module)

        # 2. Register setuptools plugins
        self._pm.load_setuptools_entrypoints('ocrmypdf')

        # 3. Register plugins specified on command line
        for name in self._plugins:
            if isinstance(name, Path) or name.endswith('.py'):
                # Import by filename
                module = self._import_plugin_file(name)
            else:
                # Import by dotted module name
                module = importlib.import_module(name)
            self._pm.register(module)

    # =========================================================================
    # Type-safe hook methods
    # =========================================================================

    # --- firstresult hooks ---

    def get_logging_console(self) -> Handler | None:
        """Returns a custom logging handler for progress bar compatibility."""
        return self._pm.hook.get_logging_console()

    def get_executor(self, *, progressbar_class: type[ProgressBar]) -> Executor | None:
        """Returns an executor for parallel processing."""
        return self._pm.hook.get_executor(progressbar_class=progressbar_class)

    def get_progressbar_class(self) -> type[ProgressBar] | None:
        """Returns a progress bar class."""
        return self._pm.hook.get_progressbar_class()

    def rasterize_pdf_page(
        self,
        *,
        input_file: Path,
        output_file: Path,
        raster_device: str,
        raster_dpi: Resolution,
        pageno: int,
        page_dpi: Resolution | None,
        rotation: int | None,
        filter_vector: bool,
        stop_on_soft_error: bool,
        options: OcrOptions | None,
        use_cropbox: bool,
    ) -> Path | None:
        """Rasterize one page of a PDF at specified resolution.

        All arguments are forwarded unchanged to the ``rasterize_pdf_page``
        hook, whose result is returned.
        """
        return self._pm.hook.rasterize_pdf_page(
            input_file=input_file,
            output_file=output_file,
            raster_device=raster_device,
            raster_dpi=raster_dpi,
            pageno=pageno,
            page_dpi=page_dpi,
            rotation=rotation,
            filter_vector=filter_vector,
            stop_on_soft_error=stop_on_soft_error,
            options=options,
            use_cropbox=use_cropbox,
        )

    def filter_ocr_image(
        self, *, page: PageContext, image: Image.Image
    ) -> Image.Image | None:
        """Filter the image before it is sent to OCR."""
        return self._pm.hook.filter_ocr_image(page=page, image=image)

    def filter_page_image(
        self, *, page: PageContext, image_filename: Path
    ) -> Path | None:
        """Filter the whole page image before it is inserted into the PDF."""
        return self._pm.hook.filter_page_image(page=page, image_filename=image_filename)

    def filter_pdf_page(
        self, *, page: PageContext, image_filename: Path, output_pdf: Path
    ) -> Path:
        """Convert a filtered whole page image into a PDF.

        Returns:
            ``output_pdf``, as returned by the hook.

        Raises:
            ValueError: If the hook returns ``None`` or anything other than
                ``output_pdf``.
        """
        result = self._pm.hook.filter_pdf_page(
            page=page, image_filename=image_filename, output_pdf=output_pdf
        )
        if result is None:
            raise ValueError('No PDF produced')
        if result != output_pdf:
            raise ValueError('filter_pdf_page must return output_pdf')
        return result

    def get_ocr_engine(self, *, options: OcrOptions | None = None) -> OcrEngine:
        """Returns an OcrEngine to use for processing.

        Args:
            options: OcrOptions to pass to the hook for engine selection.

        Raises:
            ValueError: If the hook returns ``None``.
        """
        result = self._pm.hook.get_ocr_engine(options=options)
        if result is None:
            raise ValueError('No OCR engine selected')
        return result

    def generate_pdfa(
        self,
        *,
        pdf_pages: list[Path],
        pdfmark: Path,
        output_file: Path,
        context: PdfContext,
        pdf_version: str,
        pdfa_part: str,
        progressbar_class: type[ProgressBar] | None,
        stop_on_soft_error: bool,
    ) -> Path | None:
        """Generate a PDF/A file.

        All arguments are forwarded unchanged to the ``generate_pdfa`` hook,
        whose result is returned.
        """
        return self._pm.hook.generate_pdfa(
            pdf_pages=pdf_pages,
            pdfmark=pdfmark,
            output_file=output_file,
            context=context,
            pdf_version=pdf_version,
            pdfa_part=pdfa_part,
            progressbar_class=progressbar_class,
            stop_on_soft_error=stop_on_soft_error,
        )

    def optimize_pdf(
        self,
        *,
        input_pdf: Path,
        output_pdf: Path,
        context: PdfContext,
        executor: Executor,
        linearize: bool,
    ) -> tuple[Path, Sequence[str]]:
        """Optimize a PDF after OCR processing.

        Returns:
            The hook's result, or ``(input_pdf, [])`` when the hook returns
            ``None``.
        """
        result = self._pm.hook.optimize_pdf(
            input_pdf=input_pdf,
            output_pdf=output_pdf,
            context=context,
            executor=executor,
            linearize=linearize,
        )
        if result is None:
            return input_pdf, []
        return result

    def is_optimization_enabled(self, *, context: PdfContext) -> bool | None:
        """Returns whether optimization is enabled for given context."""
        return self._pm.hook.is_optimization_enabled(context=context)

    # --- non-firstresult hooks ---

    def initialize(self, *, plugin_manager: pluggy.PluginManager) -> list[None]:
        """Called when plugins are first loaded.

        Args:
            plugin_manager: The underlying pluggy.PluginManager, allowing
                plugins to call methods like set_blocked().
        """
        return self._pm.hook.initialize(plugin_manager=plugin_manager)

    def add_options(self, *, parser: ArgumentParser) -> list[None]:
        """Allows plugins to add command line and API arguments."""
        return self._pm.hook.add_options(parser=parser)

    def register_options(self) -> list[dict[str, type[BaseModel]]]:
        """Returns plugin option models keyed by namespace."""
        return self._pm.hook.register_options()

    def check_options(self, *, options: OcrOptions) -> list[None]:
        """Called to validate options after parsing."""
        return self._pm.hook.check_options(options=options)

    def validate(self, *, pdfinfo: PdfInfo, options: OcrOptions) -> list[None]:
        """Called to validate options and pdfinfo after PDF is loaded."""
        return self._pm.hook.validate(pdfinfo=pdfinfo, options=options)


def get_plugin_manager(
    plugins: Sequence[str | Path] | None = None, builtins=True
) -> OcrmypdfPluginManager:
    """Create an ``OcrmypdfPluginManager`` for the ``ocrmypdf`` project.

    Args:
        plugins: Plugins to load; ``None`` is treated as an empty list.
        builtins: Passed through as the manager's ``builtins`` flag.

    Returns:
        The newly constructed plugin manager.
    """
    requested_plugins = [] if plugins is None else plugins
    manager = OcrmypdfPluginManager(
        project_name='ocrmypdf',
        plugins=requested_plugins,
        builtins=builtins,
    )
    return manager


__all__ = ['OcrmypdfPluginManager', 'get_plugin_manager']


================================================
FILE: src/ocrmypdf/_plugin_registry.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Plugin option registry for dynamic model composition."""

from __future__ import annotations

import logging

from pydantic import BaseModel

log = logging.getLogger(__name__)


class PluginOptionRegistry:
    """Registry for plugin option models.

    This registry collects option models from plugins during initialization.
    Plugin options can be accessed via nested namespaces on OcrOptions
    (e.g., options.tesseract.timeout) or via flat field names for backward
    compatibility (e.g., options.tesseract_timeout).
    """

    def __init__(self):
        """Start with no registered option models."""
        # Maps namespace -> option model class
        self._option_models: dict[str, type[BaseModel]] = {}

    def register_option_model(
        self, namespace: str, model_class: type[BaseModel]
    ) -> None:
        """Register a plugin's option model.

        Registering a namespace that already exists logs a warning and
        replaces the previous model.

        Args:
            namespace: The namespace for the plugin options (e.g., 'tesseract')
            model_class: The Pydantic model class for the plugin options
        """
        # Log arguments are passed lazily so the message is only formatted
        # when a handler actually emits the record.
        if namespace in self._option_models:
            log.warning(
                "Plugin option namespace '%s' already registered, overriding",
                namespace,
            )

        self._option_models[namespace] = model_class

        log.debug(
            "Registered plugin option model for namespace '%s': %s",
            namespace,
            model_class.__name__,
        )

    def get_registered_models(self) -> dict[str, type[BaseModel]]:
        """Get all registered plugin option models.

        Returns:
            A shallow copy of the registry mapping, so callers cannot modify
            the registry through it.
        """
        return self._option_models.copy()


================================================
FILE: src/ocrmypdf/_progressbar.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Defines progress bar API."""

from __future__ import annotations

from typing import Protocol

from rich.console import Console
from rich.progress import (
    BarColumn,
    MofNCompleteColumn,
    Progress,
    TaskProgressColumn,
    TextColumn,
    TimeRemainingColumn,
)
from rich.table import Column


class ProgressBar(Protocol):
    """The protocol that OCRmyPDF expects progress bar classes to be compatible with.

    In practice this could be used for any type of monitoring, not just a progress
    bar.

    Calling the class should return a new progress bar object, which is activated
    with ``__enter__`` and terminated with ``__exit__``. An update method is called
    whenever the progress bar is updated. Progress bar objects will not be reused;
    a new one will be created for each group of tasks.

    The progress bar is held in the main process/thread and not updated by child
    process/threads. When a child notifies the parent of completed work, the
    parent updates the progress bar.

    Progress bars should never write to ``sys.stdout``, or they will corrupt the
    output if OCRmyPDF writes a PDF to standard output.

    Note:
        The type of events that OCRmyPDF reports to a progress bar may change in
        minor releases.

    Args:
        total (int | float | None):
            The total number of work units expected. If ``None``, the total is unknown.
            For example, if you are processing pages, this might be the number of pages,
            or if you are measuring overall progress in percent, this might be 100.
        desc (str | None):
            A brief description of the current step (e.g. "Scanning contents",
            "OCR", "PDF/A conversion"). OCRmyPDF updates this before each major step.
        unit (str | None):
            A short label for the type of work being tracked
            (e.g. "page", "%", "image").
        disable (bool):
            If ``True``, progress updates are suppressed (no output).
            Defaults to ``False``.
        **kwargs:
            Future or extra parameters that OCRmyPDF might pass. Implementations
            should accept and ignore unrecognized keywords gracefully.

    Example:
        A simple plugin implementation could look like this:

        .. code-block:: python

            from ocrmypdf.pluginspec import ProgressBar
            from ocrmypdf import hookimpl

            class ConsoleProgressBar(ProgressBar):
                def __init__(self, *, total=None, desc=None, unit=None, disable=False,
                             **kwargs):
                    self.total = total
                    self.desc = desc
                    self.unit = unit
                    self.disable = disable
                    self.current = 0

                def __enter__(self):
                    if not self.disable:
                        print(f"Starting {self.desc or 'an OCR task'} "
                              f"(total={self.total} {self.unit})"
                        )
                    return self

                def __exit__(self, exc_type, exc_value, traceback):
                    if not self.disable:
                        if exc_type is None:
                            print("Completed successfully.")
                        else:
                            print(f"Task ended with error: {exc_value}")
                    return False  # Let OCRmyPDF raise any exceptions

                def update(self, n=1, *, completed=None):
                    if completed is not None:
                        # If 'completed' is given, set self.current
                        # but let's just read it to show usage
                        print(f"Absolute completion reported: {completed}")
                    # Otherwise, we increment by 'n'
                    self.current += n
                    if not self.disable:
                        if self.total:
                            percent = (self.current / self.total) * 100
                            print(
                                f"{self.desc}: {self.current}"
                                f"/{self.total} ({percent:.1f}%)"
                            )
                        else:
                            print(f"{self.desc}: {self.current} units done")

            @hookimpl
            def get_progressbar_class():
                return ConsoleProgressBar

    """

    def __init__(
        self,
        *,
        total: int | float | None,
        desc: str | None,
        unit: str | None,
        disable: bool = False,
        **kwargs,
    ):
        """Initialize a progress bar.

        This is called once before any work is done. OCRmyPDF supplies the total
        number of units (or None if unknown), a description of the work, and the
        type of units. The ``disable`` parameter can be used to turn off progress
        reporting. Unrecognized keyword arguments should be ignored.

        Args:
            total (int | float | None):
                The total amount of work. If ``None``, the total is unknown.
            desc (str | None):
                A description of the current task. May change for different stages.
            unit (str | None):
                A short label for the unit of work.
            disable (bool):
                If ``True``, no output or logging should be displayed.
            **kwargs:
                Extra parameters that may be passed by OCRmyPDF in future versions.
        """

    def __enter__(self):
        """Enter a progress bar context."""

    def __exit__(self, *args):
        """Exit a progress bar context."""

    def update(self, n: float = 1, *, completed: float | None = None):
        """Increment the progress bar by ``n`` units, or set an absolute completion.

        OCRmyPDF calls this method repeatedly while processing pages or other tasks.
        If your total is known and you track it, you might do something like:

        .. code-block:: python

            self.current += n
            percent = (self.current / total) * 100

        The ``completed`` argument can indicate an absolute position, which is
        particularly helpful if you're tracking a percentage of work (e.g., 0 to 100)
        and want precise updates. In contrast, the incremental parameter ``n`` is
        often more useful for page-based increments.

        Args:
            n (float, optional):
                The amount to increment the progress by. Defaults to 1. May be
                fractional if OCRmyPDF performs partial steps. If you are tracking
                pages, this is typically how many pages have been processed in the
                most recent step.
            completed (float | None, optional):
                The absolute amount of work completed so far. This can override or
                supplement the simple increment logic. It's particularly useful
                for percentage-based tracking (e.g., when ``total`` is 100).
        """


class NullProgressBar:
    """A do-nothing progress bar.

    Accepts the same calls as a real progress bar but produces no output
    and keeps no state.
    """

    def __init__(self, **kwargs):
        """Accept and discard any keyword arguments."""

    def __enter__(self):
        """Return this object as the context value."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Leave the context; returning False never suppresses an exception."""
        return False

    def update(self, _arg=None, *, completed=None):
        """Ignore the update and return ``None``."""
        return None


class RichProgressBar:
    """Progress bar rendered with ``rich.progress.Progress``.

    A single task is created at construction time; ``update`` advances that
    task or sets its absolute completion.
    """

    def __init__(
        self,
        *,
        console: Console,
        desc: str,
        total: float | None = None,
        unit: str | None = None,
        unit_scale: float | None = 1.0,
        disable: bool = False,
        **kwargs,
    ):
        """Build the rich ``Progress`` display and register its only task.

        Args:
            console: The rich console the progress display is drawn on.
            desc: Description shown in the first column.
            total: Expected amount of work, or ``None`` if unknown.
            unit: Passed through to the task as its ``unit`` field.
            unit_scale: Multiplier applied to ``total``; also used as the
                step size when ``update`` is called with ``n=None``.
            disable: Forwarded to ``Progress`` as its ``disable`` flag.
            **kwargs: Forwarded unchanged to ``Progress``.
        """
        self._entered = False

        description_column = TextColumn(
            "[progress.description]{task.description}",
            table_column=Column(min_width=20),
        )
        self.progress = Progress(
            description_column,
            BarColumn(),
            TaskProgressColumn(),
            MofNCompleteColumn(),
            TimeRemainingColumn(),
            console=console,
            auto_refresh=True,
            redirect_stderr=True,
            redirect_stdout=False,
            disable=disable,
            **kwargs,
        )
        self.unit_scale = unit_scale

        # The task total is only known when both the total and scale are given.
        if total is None or self.unit_scale is None:
            scaled_total = None
        else:
            scaled_total = total * self.unit_scale
        self.progress_bar = self.progress.add_task(
            desc, total=scaled_total, unit=unit
        )

    def __enter__(self):
        """Start the live display and mark the bar as usable."""
        self.progress.start()
        self._entered = True
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Refresh once more, stop the display, and let exceptions propagate."""
        self.progress.refresh()
        self.progress.stop()
        return False

    def update(self, n=1, *, completed=None):
        """Advance the task by ``n``, or set its absolute ``completed`` value.

        When ``completed`` is given it takes precedence and ``n`` is ignored.
        When ``n`` is ``None`` the step size falls back to ``unit_scale``.
        """
        assert self._entered, "Progress bar must be entered before updating"
        if completed is not None:
            self.progress.update(self.progress_bar, completed=completed)
            return
        step = n if n is not None else self.unit_scale
        self.progress.update(self.progress_bar, advance=step)


================================================
FILE: src/ocrmypdf/_validation.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Validate a work order from API or command line."""

from __future__ import annotations

import logging
import os
import sys
from collections.abc import Sequence
from pathlib import Path
from shutil import copyfileobj

import pikepdf

from ocrmypdf._defaults import DEFAULT_ROTATE_PAGES_THRESHOLD
from ocrmypdf._exec import unpaper
from ocrmypdf._options import OcrOptions
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf.exceptions import (
    BadArgsError,
    InputFileError,
    MissingDependencyError,
    OutputFileAccessError,
)
from ocrmypdf.helpers import (
    is_file_writable,
    running_in_docker,
    running_in_snap,
    safe_symlink,
)
from ocrmypdf.subprocess import check_external_program

log = logging.getLogger(__name__)


def check_platform() -> None:
    """Log a warning when running under a 32-bit Python interpreter.

    The interpreter is treated as 32-bit when ``sys.maxsize`` does not
    exceed ``2**32``. Nothing is raised; the warning is advisory only.
    """
    has_wide_address_space = sys.maxsize > 2**32
    if has_wide_address_space:
        return
    log.warning(  # pragma: no cover
        "You are running OCRmyPDF in a 32-bit (x86) Python interpreter. "
        "This is not supported. 32-bit does not have enough address space "
        "to process large files. "
        "Please use a 64-bit (x86-64) version of Python."
    )


def check_options_languages(
    options: OcrOptions, ocr_engine_languages: list[str]
) -> None:
    """Validate the requested OCR languages against the engine's language list.

    Languages reserved for Tesseract's internal use are rejected first, so
    that error is reported even if the language data happens to be missing.
    If the engine reports no languages at all, the availability check is
    skipped.

    Args:
        options: Options object; only ``options.languages`` is read.
        ocr_engine_languages: Languages the OCR engine reports as available.

    Raises:
        BadArgsError: If a reserved language ('equ' or 'osd') was requested.
        MissingDependencyError: If a requested language is not among
            ``ocr_engine_languages``.
    """
    requested = set(options.languages)

    # Check for blocked languages first, before checking if they're installed
    DENIED_LANGUAGES = {'equ', 'osd'}
    blocked = DENIED_LANGUAGES & requested
    if blocked:
        # Sorted so the message is stable across runs (set order is not).
        raise BadArgsError(
            "The following languages are for Tesseract's internal use and "
            "should not be issued explicitly: "
            f"{', '.join(sorted(blocked))}\n"
            "Remove them from the -l/--language argument."
        )

    if not ocr_engine_languages:
        return

    missing_languages = requested - set(ocr_engine_languages)
    if missing_languages:
        # Sorted for the same reason: deterministic, easy-to-scan output.
        lang_text = '\n'.join(sorted(missing_languages))
        msg = (
            "OCR engine does not have language data for the following "
            "requested languages: \n"
            f"{lang_text}\n"
            "Please install the appropriate language data for your OCR engine.\n"
            "\n"
            "See the online documentation for instructions:\n"
            "    https://ocrmypdf.readthedocs.io/en/latest/languages.html\n"
            "\n"
            "Note: most languages are identified by a 3-letter ISO 639-2 Code.\n"
            "For example, English is 'eng', German is 'deu', and Spanish is 'spa'.\n"
            "Simplified Chinese is 'chi_sim' and Traditional Chinese is 'chi_tra'."
            "\n"
        )
        raise MissingDependencyError(msg)


def check_options_sidecar(options: OcrOptions) -> None:
    """Resolve and validate the sidecar text file location.

    When ``options.sidecar`` is the sentinel ``'\\0'`` (presumably meaning the
    sidecar was requested without a filename -- confirm against the CLI
    parser), the sidecar path is derived by appending ``.txt`` to the output
    file name and stored back on ``options.sidecar``.

    Args:
        options: Options object; ``sidecar``, ``input_file`` and
            ``output_file`` are read, and ``sidecar`` may be rewritten.

    Raises:
        BadArgsError: If a sidecar name cannot be derived because the output
            is stdout or the null device, or if the sidecar would be the
            same file as the input or output.
    """
    if options.sidecar == '\0':
        output_file = options.output_file
        # The output may be an os.PathLike (e.g. pathlib.Path); such objects
        # never compare equal to '-' or os.devnull and do not support
        # ``+ '.txt'``, so reduce them to their string form first.
        if isinstance(output_file, os.PathLike):
            output_file = os.fspath(output_file)
        if output_file == '-':
            raise BadArgsError("--sidecar filename needed when output file is stdout.")
        elif output_file == os.devnull:
            raise BadArgsError(
                "--sidecar filename needed when output file is /dev/null or NUL."
            )
        options.sidecar = output_file + '.txt'
    if options.sidecar == options.input_file or options.sidecar == options.output_file:
        raise BadArgsError(
            "--sidecar file must be different from the input and output files"
        )


def check_options_preprocessing(options: OcrOptions) -> None:
    """Validate image preprocessing options and their dependencies.

    ``clean_final`` implies ``clean``, so ``options.clean`` is switched on
    when ``clean_final`` is set. When cleaning is requested, the external
    ``unpaper`` program is verified via ``check_external_program``.

    Raises:
        BadArgsError: If ``unpaper_args`` is given without cleaning enabled,
            or a non-default ``rotate_pages_threshold`` is given without
            ``rotate_pages``.
    """
    if options.clean_final:
        options.clean = True

    if options.unpaper_args and not options.clean:
        raise BadArgsError("--clean is required for --unpaper-args")

    threshold_customized = (
        options.rotate_pages_threshold != DEFAULT_ROTATE_PAGES_THRESHOLD
    )
    if threshold_customized and not options.rotate_pages:
        raise BadArgsError("--rotate-pages is required for --rotate-pages-threshold")

    if not options.clean:
        return

    # Cleaning is delegated to unpaper, so it must be installed and recent enough.
    check_external_program(
        program='unpaper',
        package='unpaper',
        version_checker=unpaper.version,
        need_version='6.1',
        required_for="--clean, --clean-final",
    )


def _check_plugin_invariant_options(options: OcrOptions) -> None:
    """Run the checks that do not involve the plugin manager.

    The platform check runs first, followed by the sidecar and
    preprocessing option checks, in that order.
    """
    check_platform()
    for option_check in (check_options_sidecar, check_options_preprocessing):
        option_check(options)


def _check_plugin_options(
    options: OcrOptions, plugin_manager: OcrmypdfPluginManager
) -> None:
    """Run the checks that need the plugin manager.

    Order matters: plugins validate their own options first, then the
    selected OCR engine's language list is compared with the requested
    languages, and finally the validation coordinator runs.
    """
    # Plugins get the first chance to reject options.
    plugin_manager.check_options(options=options)

    # Ask the selected OCR engine which languages it supports.
    engine = plugin_manager.get_ocr_engine(options=options)
    available_languages = engine.languages(options)
    check_options_languages(options, available_languages)

    # Imported here rather than at module level, as in the original code.
    from ocrmypdf._validation_coordinator import ValidationCoordinator

    ValidationCoordinator(plugin_manager).validate_all_options(options)


def check_options(options: OcrOptions, plugin_manager: OcrmypdfPluginManager) -> None:
    """Validate ``options`` for correctness and mutual consistency.

    Validation happens in two phases:

    * plugin-independent checks (platform, sidecar, preprocessing), then
    * plugin-dependent checks (plugin option hooks, OCR engine languages,
      and the validation coordinator).

    Args:
        options: The options to validate. Some checks may adjust fields.
        plugin_manager: Plugin manager used for the plugin-dependent phase.
    """
    _check_plugin_invariant_options(options)
    _check_plugin_options(options, plugin_manager=plugin_manager)


def create_input_file(options: OcrOptions, work_folder: Path) -> tuple[Path, str]:
    """Make the input available inside ``work_folder``.

    Three kinds of input are handled, based on ``options.input_file``:

    * ``'-'``: standard input is copied to ``work_folder / 'stdin'``.
    * an object with a ``readable`` attribute: the stream is copied to
      ``work_folder / 'stream'``.
    * anything else: treated as a filesystem path and linked to
      ``work_folder / 'origin'`` via ``safe_symlink``.

    Args:
        options: Options object; only ``options.input_file`` is read.
        work_folder: Directory in which the working copy or link is created.

    Returns:
        A tuple of the path inside ``work_folder`` and a display name for
        the origin: ``"stdin"``, ``"stream"``, or the original path.

    Raises:
        InputFileError: If the input stream is not readable, or the input
            file does not exist.
    """
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "stdin"
    elif hasattr(options.input_file, 'readable'):
        if not options.input_file.readable():
            raise InputFileError("Input file stream is not readable")
        log.info('reading file from input stream')
        target = work_folder / 'stream'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(options.input_file, stream_buffer)
        return target, "stream"
    else:
        try:
            target = work_folder / 'origin'
            safe_symlink(options.input_file, target)
            return target, os.fspath(options.input_file)
        except FileNotFoundError as e:
            msg = f"File not found - {options.input_file}"
            # Sandboxed installs commonly hit this; add a hint for the user.
            if running_in_docker():  # pragma: no cover
                msg += (
                    "\nDocker cannot access your working directory unless you "
                    "explicitly share it with the Docker container and set up "
                    "permissions correctly.\n"
                    "You may find it easier to use stdin/stdout:"
                    "\n"
                    "\tdocker run -i --rm jbarlow83/ocrmypdf - - <input.pdf >output.pdf"
                    "\n"
                )
            elif running_in_snap():  # pragma: no cover
                msg += (
                    "\nSnap applications cannot access files outside of "
                    "your home directory unless you explicitly allow it. "
                    "You may find it easier to use stdin/stdout:"
                    "\n"
                    "\tsnap run ocrmypdf - - <input.pdf >output.pdf"
                    "\n"
                )
            raise InputFileError(msg) from e


def check_requested_output_file(options: OcrOptions) -> None:
    """Verify that the requested output destination can be written.

    ``options.output_file`` may be ``'-'`` (stdout), an object exposing
    ``writable``, or a filesystem location.

    Raises:
        BadArgsError: If output is stdout but stdout is a terminal.
        OutputFileAccessError: If the stream or file is not writable, or if
            ``options.no_overwrite`` is set and the output file already exists.
    """
    destination = options.output_file

    if destination == '-':
        if sys.stdout.isatty():
            raise BadArgsError(
                "Output was set to stdout '-' but it looks like stdout "
                "is connected to a terminal.  Please redirect stdout to a "
                "file."
            )
    elif hasattr(destination, 'writable'):
        if not destination.writable():
            raise OutputFileAccessError("Output stream is not writable")
    elif not is_file_writable(destination):
        raise OutputFileAccessError(
            f"Output file location ({options.output_file}) is not a writable file."
        )

    # The overwrite guard only makes sense for real filesystem paths.
    if not options.no_overwrite:
        return
    if hasattr(destination, 'writable'):
        return
    if destination != '-' and Path(str(destination)).exists():
        raise OutputFileAccessError(
            f"Output file already exists: {options.output_file}\n"
            "To overwrite it, omit the --no-overwrite / -n option."
        )


def report_output_file_size(
    options: OcrOptions,
    input_file: Path,
    output_file: Path,
    optimize_messages: Sequence[str] | None = None,
    file_overhead: int = 4000,
    page_overhead: int = 3000,
) -> None:
    """Warn when the output file is much larger than the input file.

    Nothing is reported when either file is missing, when the input is
    smaller than 25000 bytes, or when the output is less than 1.35 times the
    input size after allowing for an estimated overhead.

    Args:
        options: Options used for the run; inspected to suggest likely causes.
        input_file: Path of the original input.
        output_file: Path of the produced output PDF.
        optimize_messages: Extra explanations to append to the list of causes.
        file_overhead: Allowance, in bytes, added once per file.
        page_overhead: Allowance, in bytes, added once per output page.
    """
    if optimize_messages is None:
        optimize_messages = []
    try:
        output_size = Path(output_file).stat().st_size
        input_size = Path(input_file).stat().st_size
    except FileNotFoundError:
        return  # Outputting to stream or something

    # Small inputs are never reported. Bail out before parsing the output
    # PDF and before dividing by input_size, which may be zero.
    if input_size < 25000:
        return

    with pikepdf.open(output_file) as p:
        # Overhead constants obtained by estimating amount of data added by OCR
        # PDF/A conversion, and possible XMP metadata addition, with compression
        reasonable_overhead = file_overhead + page_overhead * len(p.pages)
    ratio = output_size / input_size
    reasonable_ratio = output_size / (input_size + reasonable_overhead)
    if reasonable_ratio < 1.35:
        return  # Seems fine

    # A tuple, not a set, so reasons are always listed in the same order.
    image_preproc = (
        'deskew',
        'clean_final',
        'remove_background',
        'oversample',
    )
    reasons = [
        f"--{arg.replace('_', '-')} was issued, causing transcoding."
        for arg in image_preproc
        if getattr(options, arg, False)
    ]
    # Check force_ocr via the backward-compatible property
    if options.force_ocr:
        reasons.append("--force-ocr (or --mode force) was issued, causing transcoding.")

    reasons.extend(optimize_messages)

    if options.output_type.startswith('pdfa'):
        reasons.append("PDF/A conversion was enabled. (Try `--output-type pdf`.)")
    if options.plugins:
        reasons.append("Plugins were used.")

    if reasons:
        explanation = "Possible reasons for this include:\n" + '\n'.join(reasons) + "\n"
    else:
        explanation = "No reason for this increase is known.  Please report this issue."

    log.warning(
        f"The output file size is {ratio:.2f}× larger than the input file.\n"
        f"{explanation}"
    )


================================================
FILE: src/ocrmypdf/_validation_coordinator.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Validation coordinator for plugin options and cross-cutting concerns."""

from __future__ import annotations

import logging
import os
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    import pluggy

    from ocrmypdf._options import OcrOptions

log = logging.getLogger(__name__)


class ValidationCoordinator:
    """Runs option checks that span plugin options and core options."""

    def __init__(self, plugin_manager: pluggy.PluginManager):
        self.plugin_manager = plugin_manager
        # The option registry is read from the plugin manager if it has one.
        self.registry = getattr(plugin_manager, '_option_registry', None)

    def validate_all_options(self, options: OcrOptions) -> None:
        """Validate ``options`` in two passes.

        The plugin-context checks run first, then the cross-cutting checks.
        Per-field validation is not repeated here.

        Args:
            options: The options to validate
        """
        self._validate_plugin_contexts(options)
        self._validate_cross_cutting_concerns(options)

    def _validate_plugin_contexts(self, options: OcrOptions) -> None:
        """Run the Tesseract checks, then the optimizer checks."""
        # The checks are called directly here rather than through the
        # registered plugin models.
        plugin_checks = (
            self._validate_tesseract_options,
            self._validate_optimize_options,
        )
        for plugin_check in plugin_checks:
            plugin_check(options)

    def _validate_tesseract_options(self, options: OcrOptions) -> None:
        """Warn about Tesseract settings that are ineffective or risky."""
        tess = options.tesseract

        # Page segmentation modes 0 and 2 trigger the "disable OCR" warning.
        if tess.pagesegmode in (0, 2):
            log.warning(
                "The tesseract-pagesegmode you selected will disable OCR. "
                "This may cause processing to fail."
            )

        # A threshold other than 32767 only matters when downsampling is on.
        threshold_changed = tess.downsample_above != 32767
        if threshold_changed and not tess.downsample_large_images:
            log.warning(
                "The --tesseract-downsample-above argument will have no effect unless "
                "--tesseract-downsample-large-images is also given."
            )

        # Note: blocked languages (equ, osd) are checked earlier in
        # check_options_languages() to ensure the check runs before
        # the missing language check.

    def _validate_optimize_options(self, options: OcrOptions) -> None:
        """Warn when quality settings are given but optimization is off."""
        if options.optimize != 0:
            return
        png_requested = options.png_quality and options.png_quality > 0
        jpeg_requested = options.jpeg_quality and options.jpeg_quality > 0
        if png_requested or jpeg_requested:
            log.warning(
                "The arguments --png-quality and --jpeg-quality "
                "will be ignored because --optimize=0."
            )

    def _validate_cross_cutting_concerns(self, options: OcrOptions) -> None:
        """Check option combinations that involve more than one component.

        Raises:
            ValueError: If redo mode is combined with an incompatible image
                preprocessing option, or output type 'none' is combined with
                a real output file.
        """
        from ocrmypdf._options import ProcessingMode

        # Removed renderer names are redirected before the remaining checks.
        self._handle_deprecated_pdf_renderer(options)

        # Note: Mutual exclusivity of force_ocr/skip_text/redo_ocr is now enforced
        # by the ProcessingMode enum - only one mode can be active at a time.

        # Redo mode cannot be combined with these preprocessing options.
        if options.mode == ProcessingMode.redo:
            if options.deskew or options.clean_final or options.remove_background:
                raise ValueError(
                    "--redo-ocr (or --mode redo) is not currently compatible with "
                    "--deskew, --clean-final, and --remove-background"
                )

        # With output type 'none' the destination must be a discard target.
        discard_targets = (os.devnull, '-')
        if options.output_type == 'none':
            if str(options.output_file) not in discard_targets:
                raise ValueError(
                    "Since you specified `--output-type none`, the output file "
                    f"{options.output_file} cannot be produced. Set the output file to "
                    "`-` to suppress this message."
                )

        # PDF/A image compression only applies to PDF/A output types.
        compression = options.ghostscript.pdfa_image_compression
        if (
            compression
            and compression != 'auto'
            and not options.output_type.startswith('pdfa')
        ):
            log.warning(
                "--pdfa-image-compression argument only applies when "
                "--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
            )

    def _handle_deprecated_pdf_renderer(self, options: OcrOptions) -> None:
        """Replace the removed 'hocr'/'hocrdebug' renderers with 'fpdf2'."""
        if options.pdf_renderer not in ('hocr', 'hocrdebug'):
            return
        log.info(
            "The '%s' PDF renderer has been removed. Using 'fpdf2' instead, "
            "which provides full international language support, proper RTL "
            "rendering, and improved text positioning.",
            options.pdf_renderer,
        )
        # Set the attribute through object.__setattr__, as the original does.
        object.__setattr__(options, 'pdf_renderer', 'fpdf2')


================================================
FILE: src/ocrmypdf/_version.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
__version__ = "17.3.0"


================================================
FILE: src/ocrmypdf/api.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Python API for OCRmyPDF.

This module provides the main Python API for OCRmyPDF, allowing you to perform
OCR operations programmatically without using the command line interface.

Main Functions:
    ocr(): The primary function for OCR processing. Takes an input PDF or image
        file and produces an OCR'd PDF with searchable text.

    configure_logging(): Set up logging to match the command line interface
        behavior, with support for progress bars and colored output.

Experimental Functions:
    _pdf_to_hocr(): Extract text from PDF pages and save as hOCR files for
        manual editing before final PDF generation.

    _hocr_to_ocr_pdf(): Convert hOCR files back to a searchable PDF after
        manual text corrections.

The API maintains thread safety through internal locking since OCRmyPDF uses
global state for plugins. Only one OCR operation can run per Python process
at a time. For parallel processing, use multiple Python processes.

Example:
    import ocrmypdf

    # Configure logging (optional)
    ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)

    # Perform OCR
    ocrmypdf.ocr('input.pdf', 'output.pdf', language='eng')

For detailed parameter documentation, see the ocr() function docstring and
the equivalent command line parameters in the OCRmyPDF documentation.
"""

from __future__ import annotations

import logging
import os
import sys
import threading
from collections.abc import Iterable, Sequence
from enum import IntEnum
from io import IOBase
from pathlib import Path
from typing import BinaryIO, overload
from warnings import warn

from ocrmypdf._logging import PageNumberFilter
from ocrmypdf._options import OcrOptions
from ocrmypdf._pipelines.hocr_to_ocr_pdf import run_hocr_to_ocr_pdf_pipeline
from ocrmypdf._pipelines.ocr import run_pipeline, run_pipeline_cli
from ocrmypdf._pipelines.pdf_to_hocr import run_hocr_pipeline
from ocrmypdf._plugin_manager import OcrmypdfPluginManager, get_plugin_manager
from ocrmypdf._validation import check_options
from ocrmypdf.cli import ArgumentParser, get_parser
from ocrmypdf.exceptions import ExitCode

StrPath = Path | str | bytes
PathOrIO = BinaryIO | StrPath

# Installing plugins affects the global state of the Python interpreter,
# so we need to use a lock to prevent multiple threads from installing
# plugins at the same time.
_api_lock = threading.Lock()


def setup_plugin_infrastructure(
    plugins: Sequence[Path | str] | None = None,
    plugin_manager: OcrmypdfPluginManager | None = None,
) -> OcrmypdfPluginManager:
    """Prepare a plugin manager that is ready for use.

    The steps are:

    1. Use the supplied plugin manager, or build one from ``plugins``.
    2. Call the manager's ``initialize`` hook.
    3. Collect option models from the ``register_options`` hook into a fresh
       ``PluginOptionRegistry`` and register them with ``OcrOptions``.
    4. Attach the registry to the manager as ``_option_registry``.

    Args:
        plugins: Plugin paths/names to load. A single ``str`` or ``Path`` is
            accepted and treated as a one-element list.
        plugin_manager: An existing plugin manager to initialize instead of
            creating a new one.

    Returns:
        The initialized plugin manager.

    Raises:
        ValueError: If both ``plugins`` and ``plugin_manager`` are given.
    """
    if plugins and plugin_manager:
        raise ValueError("plugins= and plugin_manager are mutually exclusive")

    # Normalize ``plugins`` to a list.
    if not plugins:
        plugins = []
    elif isinstance(plugins, (str, Path)):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    if not plugin_manager:
        plugin_manager = get_plugin_manager(plugins)

    # The initialize hook receives the underlying pluggy manager.
    plugin_manager.initialize(plugin_manager=plugin_manager.pluggy)

    from ocrmypdf._plugin_registry import PluginOptionRegistry

    registry = PluginOptionRegistry()
    all_plugin_models: dict[str, type] = {}

    # Each hook result is a namespace -> model mapping, or a falsy value
    # (such as None) that is skipped.
    for contributed in plugin_manager.register_options():
        if not contributed:
            continue
        for namespace, model_class in contributed.items():
            registry.register_option_model(namespace, model_class)
            all_plugin_models[namespace] = model_class

    # Expose the models on OcrOptions for nested access.
    OcrOptions.register_plugin_models(all_plugin_models)

    # Keep the registry on the manager so later stages can find it.
    plugin_manager._option_registry = registry

    return plugin_manager


class Verbosity(IntEnum):
    """Verbosity level for configure_logging.

    Members are integers ordered from least to most verbose, so they can be
    compared numerically (``quiet`` < ``default`` < ``debug`` < ``debug_all``).
    """

    # pylint: disable=invalid-name
    quiet = -1  #: Suppress most messages
    default = 0  #: Default level of logging
    debug = 1  #: Output ocrmypdf debug messages
    debug_all = 2  #: More detailed debugging from ocrmypdf and dependent modules


def configure_logging(
    verbosity: Verbosity,
    *,
    progress_bar_friendly: bool = True,
    manage_root_logger: bool = False,
    plugin_manager: OcrmypdfPluginManager | None = None,
) -> logging.Logger:
    """Set up logging.

    Before calling :func:`ocrmypdf.ocr()`, you can use this function to
    configure logging if you want ocrmypdf's output to look like the ocrmypdf
    command line interface. It will register log handlers, log filters, and
    formatters, configure color logging to standard error, and adjust the log
    levels of third party libraries. Details of this are fine-tuned and subject
    to change. The ``verbosity`` argument is equivalent to the argument
    ``--verbose`` and applies those settings. If you have a wrapper
    script for ocrmypdf and you want it to be very similar to ocrmypdf, use this
    function; if you are using ocrmypdf as part of an application that manages
    its own logging, you probably do not want this function.

    If this function is not called, ocrmypdf will not configure logging, and it
    is up to the caller of ``ocrmypdf.ocr()`` to set up logging as it wishes using
    the Python standard library's logging module. If this function is called,
    the caller may of course make further adjustments to logging.

    Regardless of whether this function is called, ocrmypdf will perform all of
    its logging under the ``"ocrmypdf"`` logging namespace. In addition,
    ocrmypdf imports pdfminer, which logs under ``"pdfminer"``. A library user
    may wish to configure both; note that pdfminer is extremely chatty at the
    log level ``logging.INFO``.

    This function does not set up the ``debug.log`` log file that the command
    line interface does at certain verbosity levels. Applications should configure
    their own debug logging.

    A handler is attached on every call. When the fallback standard error
    handler is used, calling this function repeatedly therefore attaches
    several handlers to the same logger.

    Args:
        verbosity: Verbosity level.
        progress_bar_friendly: If True (the default), install a custom log handler
            that is compatible with progress bars and colored output. The custom
            handler is only requested when ``plugin_manager`` is also given.
        manage_root_logger: Configure the process's root logger.
        plugin_manager: The plugin manager, used for obtaining the custom log handler.

    Returns:
        The toplevel logger for ocrmypdf (or the root logger, if we are managing it).
    """
    # The empty name selects the root logger.
    prefix = '' if manage_root_logger else 'ocrmypdf'

    # The logger itself passes everything; filtering by level is done on the
    # console handler below.
    log = logging.getLogger(prefix)
    log.setLevel(logging.DEBUG)

    console = None
    if plugin_manager and progress_bar_friendly:
        console = plugin_manager.get_logging_console()

    # Fall back to plain standard error when no plugin supplied a handler.
    if not console:
        console = logging.StreamHandler(stream=sys.stderr)

    if verbosity < 0:
        console.setLevel(logging.ERROR)
    elif verbosity >= 1:
        console.setLevel(logging.DEBUG)
    else:
        console.setLevel(logging.INFO)

    # The formats below reference %(pageno)s, so the filter must be installed
    # on the same handler.
    console.addFilter(PageNumberFilter())

    if verbosity >= 2:
        fmt = '%(levelname)7s %(name)s -%(pageno)s %(message)s'
    else:
        fmt = '%(pageno)s%(message)s'

    console.setFormatter(logging.Formatter(fmt=fmt))
    log.addHandler(console)

    # Quieten chatty third party loggers unless full debugging was requested.
    if verbosity <= 1:
        pdfminer_log = logging.getLogger('pdfminer')
        pdfminer_log.setLevel(logging.ERROR)
        pil_log = logging.getLogger('PIL')
        pil_log.setLevel(logging.INFO)
        fonttools_log = logging.getLogger('fontTools')
        fonttools_log.setLevel(logging.ERROR)

    if manage_root_logger:
        # Route warnings.warn() messages through the logging system.
        logging.captureWarnings(True)

    return log


def _check_no_conflicting_ocr_params(
    locals_dict: dict,
    kwargs: dict,
    excluded: set[str] | None = None,
) -> None:
    """Check that no individual OCR parameters conflict with OcrOptions.

    When a user passes an OcrOptions object, they should not also pass
    individual OCR parameters (except plugins/plugin_manager which are
    handled separately).

    Args:
        locals_dict: The locals() dict from the calling function.
        kwargs: The **kwargs dict from the calling function.
        excluded: Parameter names to exclude from conflict checking.

    Raises:
        ValueError: If conflicting parameters are found.
    """
    if excluded is None:
        excluded = set()

    # Parameters that are allowed alongside OcrOptions
    allowed_with_options = {
        'input_file_or_options',
        'options',  # The OcrOptions object itself after assignment
        'plugins',
        'plugin_manager',
        'kwargs',
    } | excluded

    # Check all locals that are OCR parameters (not None and not allowed)
    conflicts = [
        name
        for name, value in locals_dict.items()
        if value is not None and name not in allowed_with_options
    ]

    # Check kwargs
    conflicts.extend(kwargs.keys())

    if conflicts:
        raise ValueError(
            f"When passing OcrOptions as the first argument, do not pass "
            f"additional OCR parameters. Conflicting parameters: "
            f"{', '.join(sorted(conflicts))}. "
            f"Set these values in OcrOptions instead."
        )


def _remap_language_to_languages(options_kwargs: dict) -> None:
    """Map the public API 'language' parameter to OcrOptions 'languages' field.

    The public API uses 'language' (matching CLI --language) but OcrOptions
    uses 'languages' (plural). This also coerces a bare string to a list
    and splits '+'-separated language codes (e.g. 'eng+deu' -> ['eng', 'deu'])
    to match the CLI behavior.
    """
    if 'language' in options_kwargs and 'languages' not in options_kwargs:
        lang = options_kwargs.pop('language')
        if lang is None:
            return
        if isinstance(lang, str):
            lang = lang.split('+')
        else:
            # Flatten any '+'-separated entries in the list
            expanded: list[str] = []
            for item in lang:
                if isinstance(item, str) and '+' in item:
                    expanded.extend(item.split('+'))
                else:
                    expanded.append(item)
            lang = expanded
        options_kwargs['languages'] = lang
    elif 'language' in options_kwargs:
        del options_kwargs['language']


def create_options(
    *, input_file: PathOrIO, output_file: PathOrIO, parser: ArgumentParser, **kwargs
) -> OcrOptions:
    """Construct an options object from the input/output files and keyword arguments.

    Keyword arguments whose value is None are dropped so that OcrOptions
    applies its own defaults. Keyword arguments that are neither OcrOptions
    fields, legacy mode flags, nor known extras are not passed to the
    OcrOptions constructor; they are stored in ``options.extra_attrs`` instead.

    Args:
        input_file: Input file path or file object.
        output_file: Output file path or file object.
        parser: ArgumentParser object (kept for compatibility,
            may be used for plugin validation). It is not used by this
            function.
        **kwargs: Keyword arguments. The API-style ``language`` key is mapped
            to the ``languages`` field.

    Returns:
        OcrOptions: An options object containing the parsed arguments.

    Raises:
        TypeError: If OcrOptions could not be constructed from the arguments
            (the original exception is chained as the cause).
    """
    # Work on a copy so the caller's kwargs are not modified
    options_kwargs = kwargs.copy()

    # Map API parameter 'language' to OcrOptions field 'languages'
    _remap_language_to_languages(options_kwargs)

    # Set input and output files
    options_kwargs['input_file'] = input_file
    options_kwargs['output_file'] = output_file

    # A stream passed as 'sidecar' needs no special handling here: it is
    # forwarded to OcrOptions unchanged like any other value.

    # Remove None values to let OcrOptions use its defaults
    options_kwargs = {k: v for k, v in options_kwargs.items() if v is not None}

    # Keys that go straight to the OcrOptions constructor
    ocr_fields = set(OcrOptions.model_fields.keys())
    # Legacy mode flags are handled by OcrOptions model validator
    legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'}
    # Known extra attributes that should be preserved
    known_extra = {'progress_bar', 'plugins'}
    constructor_keys = ocr_fields | legacy_mode_flags | known_extra

    # Everything else is set aside and stored in extra_attrs afterwards
    extra_attrs = {
        key: options_kwargs.pop(key)
        for key in list(options_kwargs)
        if key not in constructor_keys
    }

    # Create OcrOptions directly
    try:
        options = OcrOptions(**options_kwargs)
        # Add any extra attributes
        if extra_attrs:
            options.extra_attrs.update(extra_attrs)
        return options
    except Exception as e:
        # If direct construction fails, provide a helpful error message
        raise TypeError(f"Failed to create OcrOptions: {e}") from e


# Typing overload for the new-style call: an OcrOptions object is the only
# positional argument (positional-only), and only the plugin selection
# arguments may be given alongside it, as keywords.
@overload
def ocr(
    options: OcrOptions,
    /,
    *,
    plugins: Iterable[Path | str] | None = None,
    plugin_manager: OcrmypdfPluginManager | None = None,
) -> ExitCode: ...


# Typing overload for the old-style call: the input and output files come
# first, and every individual setting is a keyword-only argument defaulting
# to None.
@overload
def ocr(
    input_file_or_options: PathOrIO,
    output_file: PathOrIO,
    *,
    language: Iterable[str] | None = None,
    image_dpi: int | None = None,
    output_type: str | None = None,
    sidecar: PathOrIO | None = None,
    jobs: int | None = None,
    use_threads: bool | None = None,
    title: str | None = None,
    author: str | None = None,
    subject: str | None = None,
    keywords: str | None = None,
    rotate_pages: bool | None = None,
    remove_background: bool | None = None,
    deskew: bool | None = None,
    clean: bool | None = None,
    clean_final: bool | None = None,
    unpaper_args: str | None = None,
    oversample: int | None = None,
    remove_vectors: bool | None = None,
    mode: str | None = None,
    force_ocr: bool | None = None,
    skip_text: bool | None = None,
    redo_ocr: bool | None = None,
    skip_big: float | None = None,
    optimize: int | None = None,
    jpg_quality: int | None = None,
    png_quality: int | None = None,
    jbig2_lossy: bool | None = None,
    jbig2_page_group_size: int | None = None,
    jbig2_threshold: float | None = None,
    pages: str | None = None,
    max_image_mpixels: float | None = None,
    tesseract_config: Iterable[str] | None = None,
    tesseract_pagesegmode: int | None = None,
    tesseract_oem: int | None = None,
    tesseract_thresholding: int | None = None,
    pdf_renderer: str | None = None,
    rasterizer: str | None = None,
    tesseract_timeout: float | None = None,
    tesseract_non_ocr_timeout: float | None = None,
    tesseract_downsample_above: int | None = None,
    tesseract_downsample_large_images: bool | None = None,
    rotate_pages_threshold: float | None = None,
    pdfa_image_compression: str | None = None,
    color_conversion_strategy: str | None = None,
    user_words: os.PathLike | None = None,
    user_patterns: os.PathLike | None = None,
    fast_web_view: float | None = None,
    continue_on_soft_render_error: bool | None = None,
    invalidate_digital_signatures: bool | None = None,
    tagged_pdf_mode: str | None = None,
    no_overwrite: bool | None = None,
    plugins: Iterable[Path | str] | None = None,
    plugin_manager: OcrmypdfPluginManager | None = None,
    keep_temporary_files: bool | None = None,
    progress_bar: bool | None = None,
    **kwargs,
) -> ExitCode: ...


def ocr(  # noqa: D417
    input_file_or_options: PathOrIO | OcrOptions,
    output_file: PathOrIO | None = None,
    *,
    language: Iterable[str] | None = None,
    image_dpi: int | None = None,
    output_type: str | None = None,
    sidecar: PathOrIO | None = None,
    jobs: int | None = None,
    use_threads: bool | None = None,
    title: str | None = None,
    author: str | None = None,
    subject: str | None = None,
    keywords: str | None = None,
    rotate_pages: bool | None = None,
    remove_background: bool | None = None,
    deskew: bool | None = None,
    clean: bool | None = None,
    clean_final: bool | None = None,
    unpaper_args: str | None = None,
    oversample: int | None = None,
    remove_vectors: bool | None = None,
    mode: str | None = None,
    force_ocr: bool | None = None,  # Legacy, use mode='force' instead
    skip_text: bool | None = None,  # Legacy, use mode='skip' instead
    redo_ocr: bool | None = None,  # Legacy, use mode='redo' instead
    skip_big: float | None = None,
    optimize: int | None = None,
    jpg_quality: int | None = None,
    png_quality: int | None = None,
    jbig2_lossy: bool | None = None,  # Deprecated, ignored
    jbig2_page_group_size: int | None = None,  # Deprecated, ignored
    jbig2_threshold: float | None = None,
    pages: str | None = None,
    max_image_mpixels: float | None = None,
    tesseract_config: Iterable[str] | None = None,
    tesseract_pagesegmode: int | None = None,
    tesseract_oem: int | None = None,
    tesseract_thresholding: int | None = None,
    pdf_renderer: str | None = None,
    rasterizer: str | None = None,
    tesseract_timeout: float | None = None,
    tesseract_non_ocr_timeout: float | None = None,
    tesseract_downsample_above: int | None = None,
    tesseract_downsample_large_images: bool | None = None,
    rotate_pages_threshold: float | None = None,
    pdfa_image_compression: str | None = None,
    color_conversion_strategy: str | None = None,
    user_words: os.PathLike | None = None,
    user_patterns: os.PathLike | None = None,
    fast_web_view: float | None = None,
    continue_on_soft_render_error: bool | None = None,
    invalidate_digital_signatures: bool | None = None,
    tagged_pdf_mode: str | None = None,
    no_overwrite: bool | None = None,
    plugins: Iterable[Path | str] | None = None,
    plugin_manager: OcrmypdfPluginManager | None = None,
    keep_temporary_files: bool | None = None,
    progress_bar: bool | None = None,
    **kwargs,
) -> ExitCode:
    """Run OCRmyPDF on one PDF or image.

    This function supports two calling conventions:

    **New style (recommended):**
        >>> from ocrmypdf import ocr
        >>> from ocrmypdf._options import OcrOptions
        >>> options = OcrOptions(
        ...     input_file="input.pdf",
        ...     output_file="output.pdf",
        ...     languages=["eng"],
        ... )
        >>> ocr(options)

    **Old style:**
        >>> ocr("input.pdf", "output.pdf", language=["eng"])

    For most arguments, see documentation for the equivalent command line parameter.

    This API takes a threading lock, because OCRmyPDF uses global state in particular
    for the plugin system. The jobs parameter will be used to create a pool of
    worker threads or processes at different times, subject to change. A Python
    process can only run one OCRmyPDF task at a time.

    To run parallel instances of OCRmyPDF, use separate Python processes to scale
    horizontally. Generally speaking you should set jobs=sqrt(cpu_count) and run
    sqrt(cpu_count) processes as a starting point. If you have files with a high page
    count, run fewer processes and more jobs per process. If you have a lot of short
    files, run more processes and fewer jobs per process.

    A few specific arguments are discussed here:

    Args:
        input_file_or_options: Either an OcrOptions object containing all settings,
            or a path/stream for the input file (old-style API).
        output_file: Output file path or stream. Required when using old-style API
            with input_file as first argument. Must be None when passing OcrOptions.
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        plugins: List of plugin paths to load. Can be passed alongside OcrOptions.
        plugin_manager: Pre-configured plugin manager. Can be passed alongside
            OcrOptions.

        For input_file (old-style API): If a :class:`pathlib.Path`, ``str`` or
            ``bytes``, this is interpreted as file system path to the input file.
            If the object appears to be a readable stream (with methods such as
            ``.read()`` and ``.seek()``), the object will be read in its entirety
            and saved to a temporary file. If ``input_file`` is ``"-"``, standard
            input will be read.

        For output_file (old-style API): If a :class:`pathlib.Path`, ``str`` or
            ``bytes``, this is interpreted as file system path to the output file.
            If the object appears to be a writable stream (with methods such as
            ``.write()`` and ``.seek()``), the output will be written to this
            stream. If ``output_file`` is ``"-"``, the output will be written to
            ``sys.stdout`` (provided that standard output does not seem to be a
            terminal device). When a stream is used as output, whether via a
            writable object or ``"-"``, some final validation steps are not
            performed (we do not read back the stream after it is written).

    Raises:
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.
        ValueError: If OcrOptions is passed along with other OCR parameters, or if
            both plugins and plugin_manager are provided.
        TypeError: If output_file is missing when using the old-style API.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    # Detect calling convention: OcrOptions object vs individual parameters
    if isinstance(input_file_or_options, OcrOptions):
        # New-style API: OcrOptions passed directly
        options = input_file_or_options

        # Check for conflicting parameters
        # (all should be None except plugins/plugin_manager).
        # locals() is inspected by name, so no local other than 'options' may
        # be assigned before this call.
        _check_no_conflicting_ocr_params(locals(), kwargs)

        # plugins and plugin_manager can still be passed alongside OcrOptions
        if plugins and plugin_manager:
            raise ValueError("plugins= and plugin_manager are mutually exclusive")

        # Use plugins from OcrOptions if not explicitly passed
        # NOTE(review): if plugin_manager is given and options.plugins is
        # non-empty, setup_plugin_infrastructure() below raises the "mutually
        # exclusive" ValueError even though plugins= was not passed. Confirm
        # whether that is intended.
        if plugins is None:
            plugins = options.plugins or []

        if isinstance(plugins, str | Path):
            plugins = [plugins]
        else:
            plugins = list(plugins) if plugins else []

        # Run the pipeline with the OcrOptions
        with _api_lock:
            plugin_manager = setup_plugin_infrastructure(
                plugins=plugins, plugin_manager=plugin_manager
            )

            parser = get_parser()
            plugin_manager.add_options(parser=parser)

            check_options(options, plugin_manager)
            return run_pipeline(options=options, plugin_manager=plugin_manager)

    else:
        # Old-style API: positional arguments
        input_file = input_file_or_options

        if output_file is None:
            raise TypeError(
                "ocr() missing required argument: 'output_file'. "
                "Either pass output_file as the second argument, or pass "
                "an OcrOptions object as the first argument."
            )

        if plugins and plugin_manager:
            raise ValueError("plugins= and plugin_manager are mutually exclusive")

        if not plugins:
            plugins = []
        elif isinstance(plugins, str | Path):
            plugins = [plugins]
        else:
            plugins = list(plugins)

        # No new variable names should be assigned until these two steps are run
        # (the settings are harvested from locals() by name; the normalized
        # 'plugins' list is deliberately part of the harvest).
        create_options_kwargs = {
            k: v
            for k, v in locals().items()
            if k
            not in {
                'input_file_or_options',
                'input_file',
                'output_file',
                'kwargs',
                'plugin_manager',
            }
        }
        create_options_kwargs.update(kwargs)

        with _api_lock:
            # Set up plugin infrastructure with proper initialization
            plugin_manager = setup_plugin_infrastructure(
                plugins=plugins, plugin_manager=plugin_manager
            )

            # Get parser and let plugins add their options
            parser = get_parser()
            plugin_manager.add_options(parser=parser)

            if 'verbose' in kwargs:
                warn(
                    "ocrmypdf.ocr(verbose=) is ignored. "
                    "Use ocrmypdf.configure_logging()."
                )

            # Warn about deprecated jbig2 options and remove from kwargs
            if jbig2_lossy:
                warn(
                    "jbig2_lossy is deprecated and will be ignored. "
                    "Lossy JBIG2 has been removed due to character substitution risks."
                )
                create_options_kwargs.pop('jbig2_lossy', None)
            if jbig2_page_group_size:
                warn("jbig2_page_group_size is deprecated and will be ignored.")
                create_options_kwargs.pop('jbig2_page_group_size', None)

            options = create_options(
                input_file=input_file,
                output_file=output_file,
                parser=parser,
                **create_options_kwargs,
            )
            check_options(options, plugin_manager)
            return run_pipeline(options=options, plugin_manager=plugin_manager)


def _pdf_to_hocr(  # noqa: D417
    input_pdf: Path,
    output_folder: Path,
    *,
    language: Iterable[str] | None = None,
    image_dpi: int | None = None,
    jobs: int | None = None,
    use_threads: bool | None = None,
    title: str | None = None,
    author: str | None = None,
    subject: str | None = None,
    keywords: str | None = None,
    rotate_pages: bool | None = None,
    remove_background: bool | None = None,
    deskew: bool | None = None,
    clean: bool | None = None,
    clean_final: bool | None = None,
    unpaper_args: str | None = None,
    oversample: int | None = None,
    remove_vectors: bool | None = None,
    mode: str | None = None,
    force_ocr: bool | None = None,  # Legacy, use mode='force' instead
    skip_text: bool | None = None,  # Legacy, use mode='skip' instead
    redo_ocr: bool | None = None,  # Legacy, use mode='redo' instead
    skip_big: float | None = None,
    pages: str | None = None,
    max_image_mpixels: float | None = None,
    tesseract_config: Iterable[str] | None = None,
    tesseract_pagesegmode: int | None = None,
    tesseract_oem: int | None = None,
    tesseract_thresholding: int | None = None,
    tesseract_timeout: float | None = None,
    tesseract_non_ocr_timeout: float | None = None,
    tesseract_downsample_above: int | None = None,
    tesseract_downsample_large_images: bool | None = None,
    rotate_pages_threshold: float | None = None,
    rasterizer: str | None = None,
    user_words: os.PathLike | None = None,
    user_patterns: os.PathLike | None = None,
    continue_on_soft_render_error: bool | None = None,
    invalidate_digital_signatures: bool | None = None,
    plugin_manager=None,
    plugins: Sequence[Path | str] | None = None,
    keep_temporary_files: bool | None = None,
    **kwargs,
):
    """Partially run OCRmyPDF and produces an output folder containing hOCR files.

    Given a PDF file, this function will run OCRmyPDF up to the point where
    the PDF is rasterized to images, OCRed, and the hOCR files are produced,
    all of which are saved to the output folder. This is useful for applications
    that want to provide an interface for users to edit the text before
    rendering the final PDF.

    Use :func:`hocr_to_ocr_pdf` to produce the final PDF.

    For arguments not explicitly documented here, see documentation for the
    equivalent command line parameter.

    This API is **experimental** and subject to change.

    Args:
        input_pdf: Input PDF file path.
        output_folder: Output folder path.
        **kwargs: Keyword arguments. Keys that are not OcrOptions fields are
            stored in the options' ``extra_attrs``.

    Returns:
        Whatever ``run_hocr_pipeline`` returns.

    Raises:
        ValueError: If both plugins and plugin_manager are provided.
        TypeError: If OcrOptions could not be constructed from the arguments.
    """
    if plugins and plugin_manager:
        raise ValueError("plugins= and plugin_manager are mutually exclusive")

    if not plugins:
        plugins = []
    elif isinstance(plugins, str | Path):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    # Prepare kwargs for direct OcrOptions construction
    options_kwargs = kwargs.copy()

    # Set input file and handle special output_folder case
    options_kwargs['input_file'] = input_pdf
    options_kwargs['output_file'] = '/dev/null'  # Placeholder for hOCR pipeline

    # Add all the function parameters. locals() also contains this function's
    # own working dict, so 'options_kwargs' must be excluded by name; otherwise
    # the dict would be stored inside itself and later leak into extra_attrs.
    # No other local may be assigned before this loop for the same reason.
    for param_name, param_value in locals().items():
        if (
            param_name
            not in {
                'input_pdf',
                'output_folder',
                'kwargs',
                'plugin_manager',
                'plugins',
                'options_kwargs',
            }
            and param_value is not None
        ):
            options_kwargs[param_name] = param_value

    # Map API parameter 'language' to OcrOptions field 'languages'
    _remap_language_to_languages(options_kwargs)

    # Handle plugins
    if plugins:
        options_kwargs['plugins'] = plugins

    # Remove None values to let OcrOptions use its defaults
    options_kwargs = {k: v for k, v in options_kwargs.items() if v is not None}

    # Add output_folder to options_kwargs since it's now a proper field
    options_kwargs['output_folder'] = output_folder

    # Remove any kwargs that aren't OcrOptions fields and store in extra_attrs
    extra_attrs = {}
    ocr_fields = set(OcrOptions.model_fields.keys())
    # Legacy mode flags are handled by OcrOptions model validator
    legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'}
    known_extra = {'progress_bar', 'plugins'}

    for key in list(options_kwargs.keys()):
        if key in ocr_fields or key in legacy_mode_flags or key in known_extra:
            continue
        extra_attrs[key] = options_kwargs.pop(key)

    with _api_lock:
        # Set up plugin infrastructure with proper initialization
        plugin_manager = setup_plugin_infrastructure(
            plugins=plugins, plugin_manager=plugin_manager
        )

        plugin_manager.add_options(parser=get_parser())

        # Create OcrOptions directly
        try:
            options = OcrOptions(**options_kwargs)
            # Add any extra attributes
            if extra_attrs:
                options.extra_attrs.update(extra_attrs)
        except Exception as e:
            raise TypeError(
                f"Failed to create OcrOptions for hOCR pipeline: {e}"
            ) from e

        return run_hocr_pipeline(options=options, plugin_manager=plugin_manager)


def _hocr_to_ocr_pdf(  # noqa: D417
    work_folder: Path,
    output_file: Path,
    *,
    jobs: int | None = None,
    use_threads: bool | None = None,
    optimize: int | None = None,
    jpg_quality: int | None = None,
    png_quality: int | None = None,
    jbig2_lossy: bool | None = None,  # Deprecated, ignored
    jbig2_page_group_size: int | None = None,  # Deprecated, ignored
    jbig2_threshold: float | None = None,
    pdfa_image_compression: str | None = None,
    color_conversion_strategy: str | None = None,
    fast_web_view: float | None = None,
    plugin_manager=None,
    plugins: Sequence[Path | str] | None = None,
    **kwargs,
):
    """Run OCRmyPDF on a work folder and produce an output PDF.

    After running :func:`pdf_to_hocr`, this function will run OCRmyPDF on the work
    folder to produce an output PDF. This function consolidates any changes made
    to the hOCR files in the work folder and produces a final PDF.

    For arguments not explicitly documented here, see documentation for the
    equivalent command line parameter.

    This API is **experimental** and subject to change.

    Args:
        work_folder: Work folder path, as generated by :func:`pdf_to_hocr`.
        output_file: Output PDF file path.
        **kwargs: Keyword arguments. Keys that are not OcrOptions fields are
            stored in the options' ``extra_attrs``.

    Returns:
        Whatever ``run_hocr_to_ocr_pdf_pipeline`` returns.

    Raises:
        ValueError: If both plugins and plugin_manager are provided.
        TypeError: If OcrOptions could not be constructed from the arguments.
    """
    if plugins and plugin_manager:
        raise ValueError("plugins= and plugin_manager are mutually exclusive")

    if not plugins:
        plugins = []
    elif isinstance(plugins, str | Path):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    # Prepare kwargs for direct OcrOptions construction
    options_kwargs = kwargs.copy()

    # Set output file and handle special work_folder case
    options_kwargs['input_file'] = '/dev/null'  # Placeholder for hOCR to PDF pipeline
    options_kwargs['output_file'] = output_file

    # Add all the function parameters. locals() also contains this function's
    # own working dict, so 'options_kwargs' must be excluded by name; otherwise
    # the dict would be stored inside itself and later leak into extra_attrs.
    # No other local may be assigned before this loop for the same reason.
    for param_name, param_value in locals().items():
        if (
            param_name
            not in {
                'work_folder',
                'output_file',
                'kwargs',
                'plugin_manager',
                'plugins',
                'options_kwargs',
            }
            and param_value is not None
        ):
            options_kwargs[param_name] = param_value

    # Handle plugins
    if plugins:
        options_kwargs['plugins'] = plugins

    # Remove None values to let OcrOptions use its defaults
    options_kwargs = {k: v for k, v in options_kwargs.items() if v is not None}

    # Warn about deprecated jbig2 options and remove from kwargs
    if jbig2_lossy:
        warn(
            "jbig2_lossy is deprecated and will be ignored. "
            "Lossy JBIG2 has been removed due to character substitution risks."
        )
        options_kwargs.pop('jbig2_lossy', None)
    if jbig2_page_group_size:
        warn("jbig2_page_group_size is deprecated and will be ignored.")
        options_kwargs.pop('jbig2_page_group_size', None)

    # Add work_folder to options_kwargs since it's now a proper field
    options_kwargs['work_folder'] = work_folder

    # Remove any kwargs that aren't OcrOptions fields and store in extra_attrs
    extra_attrs = {}
    ocr_fields = set(OcrOptions.model_fields.keys())
    # Legacy mode flags are handled by OcrOptions model validator
    legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'}
    known_extra = {'progress_bar', 'plugins'}

    for key in list(options_kwargs.keys()):
        if key in ocr_fields or key in legacy_mode_flags or key in known_extra:
            continue
        extra_attrs[key] = options_kwargs.pop(key)

    with _api_lock:
        # Set up plugin infrastructure with proper initialization
        plugin_manager = setup_plugin_infrastructure(
            plugins=plugins, plugin_manager=plugin_manager
        )

        plugin_manager.add_options(parser=get_parser())

        # Create OcrOptions directly
        try:
            options = OcrOptions(**options_kwargs)
            # Add any extra attributes
            if extra_attrs:
                options.extra_attrs.update(extra_attrs)
        except Exception as e:
            raise TypeError(
                f"Failed to create OcrOptions for hOCR to PDF pipeline: {e}"
            ) from e

        return run_hocr_to_ocr_pdf_pipeline(
            options=options, plugin_manager=plugin_manager
        )


# Names exported by ``from <this module> import *``. The underscore-prefixed
# helpers _pdf_to_hocr and _hocr_to_ocr_pdf are not listed here.
__all__ = [
    'PageNumberFilter',
    'Verbosity',
    'check_options',
    'configure_logging',
    'create_options',
    'get_parser',
    'get_plugin_manager',
    'ocr',
    'run_pipeline',
    'run_pipeline_cli',
    'setup_plugin_infrastructure',
]


================================================
FILE: src/ocrmypdf/builtin_plugins/__init__.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Plugins in this package are automatically loaded by ocrmypdf."""

from __future__ import annotations


================================================
FILE: src/ocrmypdf/builtin_plugins/concurrency.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""OCRmyPDF's multiprocessing/multithreading abstraction layer."""

from __future__ import annotations

import logging
import logging.handlers
import multiprocessing
import multiprocessing.queues
import os
import queue
import signal
import sys
import threading
from collections.abc import Callable, Iterable
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from contextlib import suppress
from typing import TYPE_CHECKING

from rich.console import Console as RichConsole

from ocrmypdf import Executor, hookimpl
from ocrmypdf._logging import RichLoggingHandler
from ocrmypdf._progressbar import RichProgressBar
from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import remove_all_log_handlers

if TYPE_CHECKING:
    from typing import TypeAlias

    Queue: TypeAlias = multiprocessing.queues.Queue | queue.Queue
    UserInit: TypeAlias = Callable[[], None]
    WorkerInit: TypeAlias = Callable[[Queue, UserInit, int], None]

FuturesExecutorClass = type[ThreadPoolExecutor] | type[ProcessPoolExecutor]


def log_listener(q: Queue):
    """Replay log records received on ``q`` through the local logging system.

    Each item taken from the queue is dispatched to the logger named by the
    record. A ``None`` item is the stop sentinel and ends the listener.

    This runs as a thread rather than a process, for simplicity. Only one
    process should write to sys.stderr (or whichever sink is in use), so if
    this ever becomes a process, the main application must be pointed at it.

    See:
    https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
    """
    while True:
        try:
            item = q.get()
            if item is None:
                # Sentinel from the executor: no more records will arrive.
                return
            logging.getLogger(item.name).handle(item)
        except Exception:  # pylint: disable=broad-except
            # A bad record must not kill the listener; report and keep going.
            import traceback  # pylint: disable=import-outside-toplevel

            print("Logging problem", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)


def process_sigbus(*args):
    """Signal handler for SIGBUS in a worker: surface it as an InputFileError."""
    del args  # the handler arguments are not needed
    raise InputFileError("A worker process lost access to an input file")


def process_init(q: Queue, user_init: UserInit, loglevel) -> None:
    """Prepare a freshly started process pool worker.

    Sets up signal handling, replaces the root logger's handlers with a single
    queue handler feeding ``q``, sets the root level to ``loglevel`` and then
    runs the caller-supplied ``user_init``.
    """
    # SIGINT is ignored here; the parent process will kill us gracefully.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    # Route SIGBUS to process_sigbus so the parent can abort somewhat
    # gracefully. Windows and Cygwin do not have SIGBUS, hence the suppress.
    with suppress(AttributeError):
        signal.signal(signal.SIGBUS, process_sigbus)

    # Drop whatever handlers were inherited from the parent and forward every
    # record to the parent through the queue instead.
    root_logger = logging.getLogger()
    remove_all_log_handlers(root_logger)
    forwarder = logging.handlers.QueueHandler(q)
    root_logger.setLevel(loglevel)
    root_logger.addHandler(forwarder)

    user_init()


def thread_init(q: Queue, user_init: UserInit, loglevel) -> None:
    """Prepare a freshly started thread pool worker.

    ``q`` and ``loglevel`` are accepted only because the initializer signature
    requires them; a thread does not use either.
    """
    del q, loglevel
    # Block SIGBUS in this thread so that the main thread deals with it.
    # AttributeError is suppressed for platforms lacking these signal APIs.
    with suppress(AttributeError):
        signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGBUS})

    user_init()


def setup_executor(use_threads: bool) -> tuple[Queue, Executor, WorkerInit]:
    """Choose the log queue, pool class and worker initializer to use.

    Returns a ``(log_queue, executor_class, initializer)`` triple: the
    thread-based set when ``use_threads`` is true or when process semaphores
    are unavailable, and the process-based set otherwise.
    """
    if not use_threads:
        # Some execution environments like AWS Lambda and Termux do not
        # support semaphores. Probe for support and fall back to threads
        # when the import fails.
        try:
            # pylint: disable=import-outside-toplevel
            from multiprocessing.synchronize import SemLock
        except ImportError:
            use_threads = True
        else:
            del SemLock

    if use_threads:
        return queue.Queue(-1), ThreadPoolExecutor, thread_init
    return multiprocessing.Queue(-1), ProcessPoolExecutor, process_init


class StandardExecutor(Executor):
    """Standard OCRmyPDF concurrent task executor."""

    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        progress_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):
        """Run ``task`` for every argument tuple using a worker pool.

        Args:
            use_threads: Request a thread pool instead of a process pool.
            max_workers: Pool size passed to the executor class.
            progress_kwargs: Keyword arguments used to build the progress bar.
            worker_initializer: Callable run once in each worker after the
                pool's own initialization.
            task: Callable submitted once per item of ``task_arguments``; each
                item is unpacked as the positional arguments.
            task_arguments: Iterable of argument tuples for ``task``.
            task_finished: Called in this thread as
                ``task_finished(result, pbar)`` for each completed task.

        Raises:
            Whatever ``task``, ``task_finished`` or pool setup raises. On
            error the pool is shut down without waiting for other workers
            (except when running under pytest).
        """
        log_queue, executor_class, initializer = setup_executor(use_threads)

        # Regardless of whether we use_threads for worker processes, the log_listener
        # must be a thread. Make sure we create the listener after the worker pool,
        # so that it does not get forked into the workers.
        # If use_threads is False, we are currently guilty of creating a thread before
        # forking on Linux, which is not recommended. However, we take a big
        # performance hit in pdfinfo if we can't fork. Long term solution is to
        # replace most of this with an asyncio implementation, and probably to
        # migrate some of pdfinfo into C++ or Rust.
        listener = threading.Thread(target=log_listener, args=(log_queue,))
        listener.start()

        # The listener is a non-daemon thread blocked on the queue. It must
        # always receive the stop sentinel, otherwise it would keep the
        # interpreter alive forever. This flag records whether the inner
        # ``finally`` already queued it.
        listener_stopped = False
        try:
            with (
                self.pbar_class(**progress_kwargs) as pbar,
                executor_class(
                    max_workers=max_workers,
                    initializer=initializer,
                    initargs=(
                        log_queue,
                        worker_initializer,
                        logging.getLogger("").level,
                    ),
                ) as executor,
            ):
                futures = [executor.submit(task, *args) for args in task_arguments]
                try:
                    for future in as_completed(futures):
                        result = future.result()
                        task_finished(result, pbar)
                except KeyboardInterrupt:
                    # Terminate pool so we exit instantly
                    executor.shutdown(wait=False, cancel_futures=True)
                    raise
                except Exception:
                    if not os.environ.get("PYTEST_CURRENT_TEST", ""):
                        # Normally we shutdown without waiting for other child
                        # workers on error, because there is no point in waiting
                        # for them. Their results will be discarded. But if the
                        # condition above is True, then we are running in pytest,
                        # and we want everything to exit as cleanly as possible so
                        # that we get good error messages.
                        executor.shutdown(wait=False, cancel_futures=True)
                    raise
                finally:
                    # Terminate log listener
                    log_queue.put_nowait(None)
                    listener_stopped = True
        finally:
            # Reached without the sentinel only when setup failed before the
            # inner try block (progress bar or pool construction, or building
            # the list of futures). Stop the listener in that case too.
            if not listener_stopped:
                log_queue.put_nowait(None)

        # When the above succeeds, wait for the listener thread to exit. (If
        # an exception occurs, we don't try to join, in case it deadlocks.)
        listener.join()


@hookimpl
def get_executor(progressbar_class):
    """Build the default executor, using ``progressbar_class`` for progress."""
    executor = StandardExecutor(pbar_class=progressbar_class)
    return executor


# Single Rich console bound to stderr. The default progress bar and the default
# logging handler in this module are both constructed with this same console.
RICH_CONSOLE = RichConsole(stderr=True)


@hookimpl
def get_progressbar_class():
    """Provide the default progress bar factory.

    The returned callable builds a RichProgressBar from the caller's
    arguments and always attaches the module's shared console.
    """

    def partial_RichProgressBar(*args, **kwargs):
        return RichProgressBar(*args, console=RICH_CONSOLE, **kwargs)

    return partial_RichProgressBar


@hookimpl
def get_logging_console():
    """Provide the default console log handler, bound to the shared console."""
    handler = RichLoggingHandler(console=RICH_CONSOLE)
    return handler


================================================
FILE: src/ocrmypdf/builtin_plugins/default_filters.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""OCRmyPDF automatically installs these filters as plugins."""

from __future__ import annotations

from ocrmypdf import hookimpl


@hookimpl
def filter_pdf_page(page, image_filename, output_pdf):  # pylint: disable=unused-argument
    """Default page filter: return ``output_pdf`` as given, without changes.

    ``page`` and ``image_filename`` are part of the hook signature but are not
    used by this implementation.
    """
    return output_pdf


================================================
FILE: src/ocrmypdf/builtin_plugins/ghostscript.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Built-in plugin to implement PDF page rasterization and PDF/A production."""

from __future__ import annotations

import logging
from enum import StrEnum
from pathlib import Path
from typing import Annotated

from packaging.version import Version
from pikepdf import Name, Pdf, Stream
from pydantic import BaseModel, Field

from ocrmypdf import hookimpl
from ocrmypdf._exec import ghostscript
from ocrmypdf._options import ProcessingMode
from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.subprocess import check_external_program

log = logging.getLogger(__name__)

# Currently all blacklisted versions are lower than 9.55, so none need to
# be added here. If a future version is blacklisted, add it here.
BLACKLISTED_GS_VERSIONS: frozenset[Version] = frozenset()


class ColorConversionStrategy(StrEnum):
    """Ghostscript color conversion strategies.

    The member values are the exact strings offered as choices for the
    ``--color-conversion-strategy`` command line argument.
    """

    CMYK = 'CMYK'
    GRAY = 'Gray'
    # Default used by both the option model and the command line argument.
    LEAVE_COLOR_UNCHANGED = 'LeaveColorUnchanged'
    RGB = 'RGB'
    USE_DEVICE_INDEPENDENT_COLOR = 'UseDeviceIndependentColor'


class PdfaImageCompression(StrEnum):
    """PDF/A image compression methods.

    The member values are the exact strings offered as choices for the
    ``--pdfa-image-compression`` command line argument.
    """

    # Let OCRmyPDF decide (the default).
    AUTO = 'auto'
    # Change all grayscale and color images to JPEG compression.
    JPEG = 'jpeg'
    # Use PNG-style lossless compression for all images.
    LOSSLESS = 'lossless'


class GhostscriptOptions(BaseModel):
    """Settings model for the Ghostscript plugin."""

    color_conversion_strategy: Annotated[
        ColorConversionStrategy,
        Field(description="Ghostscript color conversion strategy"),
    ] = ColorConversionStrategy.LEAVE_COLOR_UNCHANGED
    pdfa_image_compression: Annotated[
        PdfaImageCompression,
        Field(description="PDF/A image compression method"),
    ] = PdfaImageCompression.AUTO

    @classmethod
    def add_arguments_to_parser(cls, parser, namespace: str = 'ghostscript'):
        """Register the Ghostscript command line arguments on ``parser``.

        Args:
            parser: The argument parser that receives the new argument group.
            namespace: Namespace prefix for argument names. It is accepted but
                not used for ghostscript, for backward compatibility.
        """
        strategy_choices = [member.value for member in ColorConversionStrategy]
        compression_choices = [member.value for member in PdfaImageCompression]
        compression_help = (
            "Specify how to compress images in the output PDF/A. 'auto' lets "
            "OCRmyPDF decide.  'jpeg' changes all grayscale and color images to "
            "JPEG compression.  'lossless' uses PNG-style lossless compression "
            "for all images.  Monochrome images are always compressed using a "
            "lossless codec.  Compression settings "
            "are applied to all pages, including those for which OCR was "
            "skipped.  Not supported for --output-type=pdf ; that setting "
            "preserves the original compression of all images."
        )

        group = parser.add_argument_group(
            "Ghostscript", "Advanced control of Ghostscript"
        )
        group.add_argument(
            '--color-conversion-strategy',
            action='store',
            type=str,
            choices=strategy_choices,
            default=ColorConversionStrategy.LEAVE_COLOR_UNCHANGED.value,
            help="Set Ghostscript color conversion strategy",
        )
        group.add_argument(
            '--pdfa-image-compression',
            choices=compression_choices,
            default=PdfaImageCompression.AUTO.value,
            help=compression_help,
        )


@hookimpl
def register_options():
    """Map the 'ghostscript' key to the GhostscriptOptions model."""
    return dict(ghostscript=GhostscriptOptions)


@hookimpl
def add_options(parser):
    """Add the Ghostscript command line arguments to ``parser``."""
    # The option model owns the argument definitions.
    GhostscriptOptions.add_arguments_to_parser(parser=parser)


@hookimpl
def check_options(options):
    """Validate the Ghostscript-related options and the installed Ghostscript.

    Raises:
        MissingDependencyError: The installed Ghostscript version is
            blacklisted, or is in the 10.0.0 - 10.02.0 range while the
            processing mode is skip or redo.
        ValueError: The color conversion strategy is not one Ghostscript
            supports.
    """
    # Ghostscript is only required for the pdfa* output types (not 'auto' or
    # 'pdf'); 'auto' uses best-effort PDF/A without a Ghostscript fallback.
    needs_ghostscript = options.output_type.startswith('pdfa')
    if needs_ghostscript:
        check_external_program(
            program='gs',
            package='ghostscript',
            version_checker=ghostscript.version,
            need_version='9.54',  # RHEL 9's version; Ubuntu 22.04 has 9.55
        )
        installed = ghostscript.version()
        if installed in BLACKLISTED_GS_VERSIONS:
            raise MissingDependencyError(
                f"Ghostscript {installed} contains serious regressions and is not "
                "supported. Please upgrade to a newer version."
            )

        in_broken_range = Version('10.0.0') <= installed < Version('10.02.1')
        if in_broken_range and options.mode in (
            ProcessingMode.skip,
            ProcessingMode.redo,
        ):
            raise MissingDependencyError(
                f"Ghostscript 10.0.0 through 10.02.0 (your version: {installed}) "
                "contain serious regressions that corrupt PDFs with existing text, "
                "such as those processed using --skip-text or --redo-ocr "
                "(or --mode skip/redo). Please upgrade to a newer version, or use "
                "--output-type pdf to avoid Ghostscript, or use --force-ocr "
                "(or --mode force) to discard existing text."
            )

        if installed >= Version('10.6.0'):
            log.warning(
                "Ghostscript 10.6.x contains JPEG encoding errors that may corrupt "
                "images. OCRmyPDF will attempt to mitigate, but this version is "
                "strongly not recommended. Please upgrade to a newer version. "
                "As of 2025-12, 10.6.0 is the latest version of Ghostscript."
            )

        # A bare 'pdfa' request is rewritten to 'pdfa-2'.
        if options.output_type == 'pdfa':
            options.output_type = 'pdfa-2'

    gs_options = options.ghostscript
    strategy = gs_options.color_conversion_strategy
    if strategy not in ghostscript.COLOR_CONVERSION_STRATEGIES:
        raise ValueError(f"Invalid color conversion strategy: {strategy}")

    pdfa_capable_types = ('auto', 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3')
    if (
        gs_options.pdfa_image_compression != 'auto'
        and options.output_type not in pdfa_capable_types
    ):
        log.warning(
            "--pdfa-image-compression argument only applies when "
            "--output-type is 'auto' or one of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'"
        )


@hookimpl
def rasterize_pdf_page(
    input_file,
    output_file,
    raster_device,
    raster_dpi,
    pageno,
    page_dpi,
    rotation,
    filter_vector,
    stop_on_soft_error,
    options,
    use_cropbox,
):
    """Render one PDF page to an image with Ghostscript.

    Returns ``output_file`` after rendering, or ``None`` without rendering
    when the options explicitly select the 'pypdfium' rasterizer.
    """
    if options is not None:
        if options.rasterizer == 'pypdfium':
            # Let pypdfium handle it (it will error in check_options if
            # unavailable).
            return None

    render_settings = dict(
        raster_device=raster_device,
        raster_dpi=raster_dpi,
        pageno=pageno,
        page_dpi=page_dpi,
        rotation=rotation,
        filter_vector=filter_vector,
        stop_on_error=stop_on_soft_error,
        use_cropbox=use_cropbox,
    )
    ghostscript.rasterize_pdf(input_file, output_file, **render_settings)
    return output_file


def _collect_dctdecode_images(pdf: Pdf) -> dict[tuple, list[tuple[Stream, bytes]]]:
    """Gather every DCTDecode (JPEG) image reachable from the pages of ``pdf``.

    The result maps an image signature to a list of ``(stream, raw_bytes)``
    pairs. A signature is the tuple
    ``(Width, Height, Filter, BitsPerComponent, ColorSpace)``.
    Form XObjects are searched recursively, up to a fixed nesting depth.
    """
    found: dict[tuple, list[tuple[Stream, bytes]]] = {}

    def colorspace_token(image):
        """Reduce the image's colorspace entry to something hashable."""
        colorspace = image.get(Name.ColorSpace)
        if colorspace is None:
            return None
        if isinstance(colorspace, Name):
            return str(colorspace)
        # Array colorspaces such as [/ICCBased ...] are keyed by first element.
        try:
            if len(colorspace) > 0:
                return str(colorspace[0])
            return str(colorspace)
        except (TypeError, KeyError):
            return str(colorspace)

    def walk(xobjects, depth=0):
        """Scan one XObject dictionary, descending into Form XObjects."""
        if xobjects is None:
            return
        if depth > 10:
            log.warning("Recursion depth exceeded in _collect_dctdecode_images")
            return
        for key in xobjects.keys():
            obj = xobjects[key]
            if obj is None:
                continue
            subtype = obj.get(Name.Subtype)
            if subtype == Name.Image:
                # Only images whose filter is exactly DCTDecode are recorded.
                filt = obj.get(Name.Filter)
                if filt == Name.DCTDecode:
                    sig = (
                        int(obj.get(Name.Width, 0)),
                        int(obj.get(Name.Height, 0)),
                        str(filt),
                        int(obj.get(Name.BitsPerComponent, 0)),
                        colorspace_token(obj),
                    )
                    raw_bytes = obj.read_raw_bytes()
                    found.setdefault(sig, []).append((obj, raw_bytes))
            elif subtype == Name.Form:
                if Name.Resources in obj:
                    nested_resources = obj[Name.Resources]
                    if Name.XObject in nested_resources:
                        walk(nested_resources[Name.XObject], depth=depth + 1)

    for page in pdf.pages:
        if Name.Resources in page:
            page_resources = page[Name.Resources]
            if Name.XObject in page_resources:
                walk(page_resources[Name.XObject])

    return found


def _repair_gs106_jpeg_corruption(
    input_pdf_path: Path,
    output_pdf_path: Path,
) -> bool:
    """Undo the JPEG truncation introduced by Ghostscript 10.6.

    Ghostscript 10.6 has a bug that truncates JPEG data by 1-15 bytes. An
    output image counts as damaged when an input image with the same signature
    starts with exactly the output's bytes and is 1-15 bytes longer. Damaged
    images get the original bytes written back, and the output PDF is saved
    in place if anything changed.

    Returns True if any repairs were made.
    """
    repairs = 0

    with (
        Pdf.open(input_pdf_path) as original_pdf,
        Pdf.open(output_pdf_path, allow_overwriting_input=True) as produced_pdf,
    ):
        originals = _collect_dctdecode_images(original_pdf)
        produced = _collect_dctdecode_images(produced_pdf)

        for sig, produced_entries in produced.items():
            candidates = originals.get(sig)
            if candidates is None:
                # No input image shares this signature; nothing to compare.
                continue

            for stream, damaged in produced_entries:
                damaged_len = len(damaged)
                for _source_stream, pristine in candidates:
                    missing = len(pristine) - damaged_len
                    if missing < 1 or missing > 15:
                        continue
                    # The output must be an exact prefix of the input.
                    if pristine[:damaged_len] != damaged:
                        continue

                    if repairs == 0:
                        # Announce the problem once, on the first repair.
                        log.error(
                            "Ghostscript 10.6 JPEG corruption detected. "
                            "Repairing damaged images from original PDF."
                        )
                    log.warning(
                        f"Replacing corrupt JPEG image "
                        f"({sig[0]}x{sig[1]}, {missing} bytes truncated)"
                    )

                    stream.write(pristine, filter=Name.DCTDecode)
                    repairs += 1
                    break  # first match wins; go on to the next output image

        if repairs > 0:
            produced_pdf.save(output_pdf_path)
            log.info(f"Repaired {repairs} JPEG image(s) corrupted by Ghostscript")

    return repairs > 0


@hookimpl
def generate_pdfa(
    pdf_pages,
    pdfmark,
    output_file,
    context,
    pdf_version,
    pdfa_part,
    progressbar_class,
    stop_on_soft_error,
):
    """Generate a PDF/A from the list of PDF pages and PDF/A metadata.

    Args:
        pdf_pages: Input PDF files; ``pdfmark`` is placed ahead of them when
            they are handed to Ghostscript.
        pdfmark: The PDF/A metadata file, passed as the first input.
        output_file: Path that Ghostscript writes the PDF/A to.
        context: Supplies ``options.ghostscript`` (image compression and
            color conversion strategy).
        pdf_version: Forwarded to Ghostscript unchanged.
        pdfa_part: Forwarded to Ghostscript unchanged.
        progressbar_class: Forwarded to Ghostscript unchanged.
        stop_on_soft_error: Forwarded as Ghostscript's ``stop_on_error``.

    Returns:
        ``output_file``.
    """
    gs_options = context.options.ghostscript

    ghostscript.generate_pdfa(
        pdf_pages=[pdfmark, *pdf_pages],
        output_file=output_file,
        compression=gs_options.pdfa_image_compression,
        color_conversion_strategy=gs_options.color_conversion_strategy,
        pdf_version=pdf_version,
        pdfa_part=pdfa_part,
        progressbar_class=progressbar_class,
        stop_on_error=stop_on_soft_error,
    )

    # Repair JPEG corruption caused by Ghostscript 10.6.x. The repair compares
    # the output against a single input file, so it only runs when exactly one
    # input PDF was given.
    gs_version = ghostscript.version()
    if gs_version >= Version('10.6.0') and len(pdf_pages) == 1:
        input_pdf = Path(pdf_pages[0])
        _repair_gs106_jpeg_corruption(input_pdf, Path(output_file))

    return output_file


================================================
FILE: src/ocrmypdf/builtin_plugins/null_ocr.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Built-in plugin implementing a null OCR engine (no OCR).

This plugin provides an OCR engine that produces no text output. It is useful
when users want OCRmyPDF's image processing, PDF/A conversion, or optimization
features without performing actual OCR.

Usage:
    ocrmypdf --ocr-engine none input.pdf output.pdf
"""

from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING

from PIL import Image

from ocrmypdf import hookimpl
from ocrmypdf.hocrtransform import BoundingBox, OcrClass, OcrElement
from ocrmypdf.pluginspec import OcrEngine, OrientationConfidence

if TYPE_CHECKING:
    from ocrmypdf._options import OcrOptions


class NullOcrEngine(OcrEngine):
    """OCR engine stand-in that never recognizes any text.

    Select it to get OCRmyPDF's image processing, PDF/A conversion or
    optimization without running real OCR.
    """

    @staticmethod
    def version() -> str:
        """Report the engine version, which is the fixed string "none"."""
        return "none"

    @staticmethod
    def creator_tag(options: OcrOptions) -> str:
        """Give the creator tag recorded in PDF metadata."""
        return "OCRmyPDF (no OCR)"

    def __str__(self) -> str:
        """Give the engine's human-readable name."""
        return "No OCR engine"

    @staticmethod
    def languages(options: OcrOptions) -> set[str]:
        """Report the supported languages: always an empty set."""
        return set()

    @staticmethod
    def get_orientation(input_file: Path, options: OcrOptions) -> OrientationConfidence:
        """Report a zero angle with zero confidence (no rotation detected)."""
        return OrientationConfidence(angle=0, confidence=0.0)

    @staticmethod
    def get_deskew(input_file: Path, options: OcrOptions) -> float:
        """Report a deskew angle of zero."""
        return 0.0

    @staticmethod
    def supports_generate_ocr() -> bool:
        """Report that the generate_ocr() API is supported."""
        return True

    @staticmethod
    def generate_ocr(
        input_file: Path,
        options: OcrOptions,
        page_number: int = 0,
    ) -> tuple[OcrElement, str]:
        """Produce an empty OCR result sized to the input image.

        Args:
            input_file: The image file; only its size and DPI are read.
            options: OCR options (ignored).
            page_number: Page number stored in the returned page element.

        Returns:
            A tuple of (page element without content, empty string).
        """
        with Image.open(input_file) as img:
            width, height = img.size
            dpi_info = img.info.get('dpi', (72, 72))
            # The DPI entry may be a tuple or a single value; when it is a
            # tuple, use its first component.
            if isinstance(dpi_info, tuple):
                dpi = dpi_info[0]
            else:
                dpi = dpi_info

        full_page = BoundingBox(left=0, top=0, right=width, bottom=height)
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=full_page,
            dpi=float(dpi),
            page_number=page_number,
        )
        return page, ""

    @staticmethod
    def generate_hocr(
        input_file: Path,
        output_hocr: Path,
        output_text: Path,
        options: OcrOptions,
    ) -> None:
        """Write an hOCR file with one empty page, and an empty text file.

        The page's bounding box covers the whole input image.
        """
        with Image.open(input_file) as img:
            width, height = img.size

        hocr_content = f'''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
    <title>OCRmyPDF - No OCR</title>
    <meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
    <meta name='ocr-system' content='OCRmyPDF null engine'/>
</head>
<body>
    <div class='ocr_page' title='bbox 0 0 {width} {height}'>
    </div>
</body>
</html>
'''
        output_hocr.write_text(hocr_content, encoding='utf-8')
        output_text.write_text('', encoding='utf-8')

    @staticmethod
    def generate_pdf(
        input_file: Path,
        output_pdf: Path,
        output_text: Path,
        options: OcrOptions,
    ) -> None:
        """Always raise: this engine cannot generate PDFs directly.

        Use pdf_renderer='fpdf2' instead of 'sandwich'.

        Raises:
            NotImplementedError: On every call.
        """
        message = (
            "NullOcrEngine cannot generate PDFs directly. "
            "Use --pdf-renderer fpdf2 instead of sandwich mode."
        )
        raise NotImplementedError(message)


@hookimpl
def get_ocr_engine(options):
    """Supply a NullOcrEngine when --ocr-engine none is selected.

    Returns ``None`` when ``options`` names any other engine (a missing
    ``ocr_engine`` attribute counts as 'auto'). When ``options`` is ``None``
    the null engine is returned.
    """
    if options is None:
        return NullOcrEngine()

    selected = getattr(options, 'ocr_engine', 'auto')
    if selected != 'none':
        return None
    return NullOcrEngine()


================================================
FILE: src/ocrmypdf/builtin_plugins/optimize.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Built-in plugin to implement PDF page optimization."""

from __future__ import annotations

import argparse
import logging
from collections.abc import Sequence
from pathlib import Path
from typing import Annotated

from pydantic import BaseModel, Field, model_validator

from ocrmypdf import Executor, PdfContext, hookimpl
from ocrmypdf._exec import jbig2enc, pngquant
from ocrmypdf._pipeline import get_pdf_save_settings
from ocrmypdf.cli import numeric
from ocrmypdf.optimize import optimize
from ocrmypdf.subprocess import check_external_program

log = logging.getLogger(__name__)


class OptimizeOptions(BaseModel):
    """Options specific to PDF optimization."""

    level: Annotated[
        int,
        Field(
            ge=0,
            le=3,
            description="Optimization level (0=none, 1=safe, 2=lossy, 3=aggressive)",
        ),
    ] = 1
    jpeg_quality: Annotated[
        int, Field(ge=0, le=100, description="JPEG quality level for optimization")
    ] = 0
    png_quality: Annotated[
        int, Field(ge=0, le=100, description="PNG quality level for optimization")
    ] = 0
    jbig2_threshold: Annotated[
        float,
        Field(ge=0.4, le=0.9, description="JBIG2 symbol classification threshold"),
    ] = 0.85

    @classmethod
    def add_arguments_to_parser(cls, parser, namespace: str = 'optimize'):
        """Add optimization-specific arguments to the argument parser.

        Args:
            parser: The argument parser to add arguments to
            namespace: The namespace prefix for argument names
                (not used for optimize for backward compatibility)
        """
        optimizing = parser.add_argument_group(
            "Optimization options", "Control how the PDF is optimized after OCR"
        )
        optimizing.add_argument(
            '-O',
            '--optimize',
            type=int,
            choices=range(0, 4),
            default=1,
            # The fragments are concatenated, so each one needs its own
            # trailing separator. --jbig2-lossy is deprecated and ignored, so
            # the help no longer points users at it.
            help=(
                "Control how PDF is optimized after processing: "
                "0 - do not optimize; "
                "1 - do safe, lossless optimizations (default); "
                "2 - do lossy JPEG and JPEG2000 optimizations; "
                "3 - do more aggressive lossy JPEG and JPEG2000 optimizations."
            ),
        )
        optimizing.add_argument(
            '--jpeg-quality',
            type=numeric(int, 0, 100),
            default=0,
            metavar='Q',
            help=(
                "Adjust JPEG quality level for JPEG optimization. "
                "100 is best quality and largest output size; "
                "1 is lowest quality and smallest output; "
                "0 uses the default."
            ),
        )
        optimizing.add_argument(
            '--jpg-quality',
            type=numeric(int, 0, 100),
            default=0,
            metavar='Q',
            dest='jpeg_quality',
            help=argparse.SUPPRESS,  # Alias for --jpeg-quality
        )
        optimizing.add_argument(
            '--png-quality',
            type=numeric(int, 0, 100),
            default=0,
            metavar='Q',
            help=(
                "Adjust PNG quality level to use when quantizing PNGs. "
                "Values have same meaning as with --jpeg-quality"
            ),
        )
        # Deprecated arguments - kept for backward compatibility, emit warnings
        optimizing.add_argument(
            '--jbig2-lossy',
            action='store_true',
            help=argparse.SUPPRESS,  # Deprecated, hidden from help
        )
        optimizing.add_argument(
            '--jbig2-page-group-size',
            type=numeric(int, 1, 10000),
            default=0,
            metavar='N',
            help=argparse.SUPPRESS,  # Deprecated, hidden from help
        )
        optimizing.add_argument(
            '--jbig2-threshold',
            type=numeric(float, 0.4, 0.9),
            default=0.85,
            metavar='T',
            help=(
                "Adjust JBIG2 symbol code classification threshold "
                "(default 0.85), range 0.4 to 0.9."
            ),
        )

    @model_validator(mode='after')
    def validate_optimization_consistency(self):
        """Warn when quality settings are given but optimization is disabled.

        Returns:
            The model instance, unchanged.
        """
        if self.level == 0 and (self.png_quality > 0 or self.jpeg_quality > 0):
            log.warning(
                "The arguments --png-quality and --jpeg-quality "
                "will be ignored because --optimize=0."
            )
        return self

    def validate_with_context(
        self, external_programs_available: dict[str, bool]
    ) -> None:
        """Validate options that require external context.

        Only logs warnings; a program missing from the mapping is treated as
        unavailable.

        Args:
            external_programs_available: Dict of program name -> availability
        """
        if self.level >= 2:
            if not external_programs_available.get('pngquant', False):
                log.warning(
                    "pngquant is not available, so PNG optimization will be limited"
                )
            if not external_programs_available.get('jbig2enc', False):
                log.warning(
                    "jbig2enc is not available, so JBIG2 optimization will be limited"
                )


@hookimpl
def register_options():
    """Return the optimization option model, keyed by ``'optimize'``."""
    option_models = {'optimize': OptimizeOptions}
    return option_models


@hookimpl
def add_options(parser):
    """Add the optimization command line arguments to ``parser``."""
    # The option model owns the CLI definitions; delegate to it.
    add_arguments = OptimizeOptions.add_arguments_to_parser
    add_arguments(parser)


@hookimpl
def check_options(options):
    """Warn about deprecated flags and check external optimizer programs.

    The deprecated ``--jbig2-lossy`` and ``--jbig2-page-group-size`` options
    only produce warnings. The external programs are checked only when
    ``options.optimize`` is 2 or higher.
    """
    lossy_requested = getattr(options, 'jbig2_lossy', False)
    if lossy_requested:
        log.warning(
            "The --jbig2-lossy option is deprecated and will be ignored. "
            "Lossy JBIG2 compression has been removed due to risks of "
            "character substitution errors."
        )

    page_group_size = getattr(options, 'jbig2_page_group_size', 0)
    if page_group_size not in (0, None):
        log.warning(
            "The --jbig2-page-group-size option is deprecated and will be ignored."
        )

    if options.optimize < 2:
        return

    check_external_program(
        program='pngquant',
        package='pngquant',
        version_checker=pngquant.version,
        need_version='2.12.2',
        required_for='--optimize {2,3}',
    )

    # Although we use JBIG2 for optimize=1, don't nag about it unless the
    # user is asking for more optimization
    check_external_program(
        program='jbig2',
        package='jbig2enc',
        version_checker=jbig2enc.version,
        need_version='0.28',
        required_for='--optimize {2,3}',
        recommended=True,
    )


@hookimpl
def optimize_pdf(
    input_pdf: Path,
    output_pdf: Path,
    context: PdfContext,
    executor: Executor,
    linearize: bool,
) -> tuple[Path, Sequence[str]]:
    """Run the optimizer and collect user-facing messages about the outcome.

    Returns:
        A tuple of the path returned by ``optimize`` and a list of messages:
        either a note that optimization was disabled, or one note for each
        optional image optimizer that is not available.
    """
    extra_settings = get_pdf_save_settings(context.options.output_type)
    save_settings = dict(linearize=linearize, **extra_settings)
    result_path = optimize(input_pdf, output_pdf, context, save_settings, executor)

    if context.options.optimize == 0:
        return result_path, ["Optimization was disabled."]

    availability = {
        'jbig2': jbig2enc.available(),
        'pngquant': pngquant.available(),
    }
    messages = [
        f"The optional dependency '{name}' was not found, so some image "
        f"optimizations could not be attempted."
        for name, found in availability.items()
        if not found
    ]
    return result_path, messages


@hookimpl
def is_optimization_enabled(context: PdfContext) -> bool:
    """Report whether the configured optimization level is anything but 0."""
    optimize_level = context.options.optimize
    return optimize_level != 0


================================================
FILE: src/ocrmypdf/builtin_plugins/pypdfium.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Built-in plugin to implement PDF page rasterization using pypdfium2."""

from __future__ import annotations

import logging
import threading
from contextlib import closing
from pathlib import Path
from typing import TYPE_CHECKING, Literal

if TYPE_CHECKING:
    import pypdfium2 as pdfium
else:
    try:
        import pypdfium2 as pdfium
    except ImportError:
        pdfium = None
from PIL import Image

from ocrmypdf import hookimpl
from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.helpers import Resolution

log = logging.getLogger(__name__)

# pypdfium2/PDFium is not thread-safe. All calls to the library must be serialized.
# See: https://pypdfium2.readthedocs.io/en/stable/python_api.html#incompatibility-with-threading
# When using process-based parallelism (use_threads=False), each process has its own
# pdfium instance, so locking is not needed across processes.
_pdfium_lock = threading.Lock()


@hookimpl
def check_options(options):
    """Fail early if pypdfium was explicitly requested but is not importable.

    Raises:
        MissingDependencyError: If ``options.rasterizer`` is ``'pypdfium'``
            and the pypdfium2 import failed.
    """
    explicitly_requested = options.rasterizer == 'pypdfium'
    if not explicitly_requested or pdfium is not None:
        return
    raise MissingDependencyError(
        "The --rasterizer pypdfium option requires the pypdfium2 package. "
        "Install it with: pip install pypdfium2"
    )


def _open_pdf_document(input_file: Path):
    """Open ``input_file`` as a pypdfium2 ``PdfDocument``.

    Callers are expected to have verified that pypdfium2 was imported.
    """
    assert pdfium is not None, "pypdfium2 must be available to call this function"
    document = pdfium.PdfDocument(input_file)
    return document


def _calculate_mediabox_crop(page) -> tuple[float, float, float, float]:
    """Calculate crop values to expand rendering from CropBox to MediaBox.

    By default pypdfium2 renders to the CropBox. To render the full MediaBox,
    we need negative crop values to expand the rendering area.

    Returns:
        Tuple of (left, bottom, right, top) crop values. Negative values
        expand the rendering area beyond the CropBox to the MediaBox.
    """
    mediabox = page.get_mediabox()  # (left, bottom, right, top)
    cropbox = page.get_cropbox()  # (left, bottom, right, top), defaults to mediabox

    # Calculate how much to expand from cropbox to mediabox
    # Negative values = expand, positive = shrink
    return (
        mediabox[0] - cropbox[0],  # Expand left
        mediabox[1] - cropbox[1],  # Expand bottom
        cropbox[2] - mediabox[2],  # Expand right
        cropbox[3] - mediabox[3],  # Expand top
    )


def _render_page_to_bitmap(
    page: pdfium.PdfPage,
    raster_device: str,
    raster_dpi: Resolution,
    rotation: int | None,
    use_cropbox: bool,
) -> tuple[pdfium.PdfBitmap, int, int]:
    """Render a PDF page to a bitmap."""
    # Round DPI to match Ghostscript's precision
    raster_dpi = raster_dpi.round(6)

    # Get page dimensions BEFORE applying rotation
    page_width_pts, page_height_pts = page.get_size()

    # Calculate expected output dimensions using separate x/y DPI
    expected_width = int(round(page_width_pts * raster_dpi.x / 72.0))
    expected_height = int(round(page_height_pts * raster_dpi.y / 72.0))

    # Calculate the scale factor based on DPI
    # pypdfium2 uses points (72 DPI) as base unit
    scale = raster_dpi.to_scalar() / 72.0

    # Apply rotation if specified
    if rotation:
        # pypdfium2 rotation is in degrees, same as our input
        # we track rotation in CCW, and pypdfium2 expects CW, so negate
        page.set_rotation(-rotation % 360)
        # When rotation is 90 or 270, dimensions are swapped in output
        if rotation % 180 == 90:
            expected_width, expected_height = expected_height, expected_width

    # Render the page to a bitmap
    # The scale parameter controls the resolution
    # Render in grayscale for mono and gray devices (better input for 1-bit conversion)
    grayscale = raster_device.lower() in ('pngmono', 'pnggray', 'jpeggray')

    # Calculate crop to render the appropriate box
    # Default (use_cropbox=False) renders MediaBox for consistency with Ghostscript
    crop = (0, 0, 0, 0) if use_cropbox else _calculate_mediabox_crop(page)

    bitmap = page.render(
        scale=scale,
        rotation=0,  # We already set rotation on the page
        crop=crop,
        may_draw_forms=True,
        draw_annots=True,
        grayscale=grayscale,
        # Note: pypdfium2 doesn't have a direct equivalent to filter_vector
        # This would require more complex implementation if needed
    )
    return bitmap, expected_width, expected_height


def _process_image_for_output(
    pil_image: Image.Image,
    raster_device: str,
    raster_dpi: Resolution,
    page_dpi: Resolution | None,
    stop_on_soft_error: bool,
    expected_width: int | None = None,
    expected_height: int | None = None,
) -> tuple[Image.Image, Literal['PNG', 'TIFF', 'JPEG']]:
    """Process PIL image for output format and set DPI metadata."""
    # Correct dimensions if slightly off (within 2 pixels tolerance)
    if expected_width and expected_height:
        actual_width, actual_height = pil_image.width, pil_image.height
        width_diff = abs(actual_width - expected_width)
        height_diff = abs(actual_height - expected_height)

        # Only resize if off by small amount (1-2 pixels)
        if (width_diff <= 2 or height_diff <= 2) and (
            width_diff > 0 or height_diff > 0
        ):
            log.debug(
                f"Adjusting rendered dimensions from "
                f"{actual_width}x{actual_height} to expected "
                f"{expected_width}x{expected_height}"
            )
            pil_image = pil_image.resize(
                (expected_width, expected_height), Image.Resampling.LANCZOS
            )

    # Set the DPI metadata if page_dpi is specified
    if page_dpi:
        # PIL expects DPI as a tuple
        dpi_tuple = (float(page_dpi.x), float(page_dpi.y))
        pil_image.info['dpi'] = dpi_tuple
    else:
        # Use the raster DPI
        dpi_tuple = (float(raster_dpi.x), float(raster_dpi.y))
        pil_image.info['dpi'] = dpi_tuple

    # Convert image mode to match raster_device
    # This ensures pypdfium output matches Ghostscript's native device output
    raster_device_lower = raster_device.lower()

    if raster_device_lower == 'pngmono':
        # Convert to 1-bit black and white (matches Ghostscript pngmono device)
        if pil_image.mode != '1':
            if pil_image.mode not in ('L', '1'):
                pil_image = pil_image.convert('L')
            pil_image = pil_image.convert('1')
    elif raster_device_lower in ('pnggray', 'jpeggray'):
        # Convert to 8-bit grayscale
        if pil_image.mode not in ('L', '1'):
            pil_image = pil_image.convert('L')
    elif raster_device_lower == 'png256':
        # Convert to 8-bit indexed color (256 colors)
        if pil_image.mode != 'P':
            if pil_image.mode not in ('RGB', 'RGBA'):
                pil_image = pil_image.convert('RGB')
            pil_image = pil_image.quantize(colors=256)
    elif raster_device_lower in ('png16m', 'jpeg'):
        # Convert to RGB
        if pil_image.mode == 'RGBA':
            background = Image.new('RGB', pil_image.size, (255, 255, 255))
            background.paste(pil_image, mask=pil_image.split()[-1])
            pil_image = background
        elif pil_image.mode not in ('RGB',):
            pil_image = pil_image.convert('RGB')
    # pngalpha: keep RGBA as-is

    # Determine output format based on raster_device
    png_devices = ('png', 'pngmono', 'pnggray', 'png256', 'png16m', 'pngalpha')
    if raster_device_lower in png_devices:
        format_name = 'PNG'
    elif raster_device_lower in ('jpeg', 'jpeggray', 'jpg'):
        format_name = 'JPEG'
    elif raster_device_lower in ('tiff', 'tif'):
        format_name = 'TIFF'
    else:
        # Default to PNG for unknown formats
        format_name = 'PNG'
        if stop_on_soft_error:
            raise ValueError(f"Unsupported raster device: {raster_device}")
        else:
            log.warning(f"Unsupported raster device {raster_device}, using PNG")

    return pil_image, format_name


def _save_image(pil_image: Image.Image, output_file: Path, format_name: str) -> None:
    """Save PIL image to file with appropriate DPI metadata."""
    save_kwargs = {}
    if (
        format_name in ('PNG', 'TIFF')
        and 'dpi' in pil_image.info
        or format_name == 'JPEG'
        and 'dpi' in pil_image.info
    ):
        save_kwargs['dpi'] = pil_image.info['dpi']

    pil_image.save(output_file, format=format_name, **save_kwargs)


@hookimpl
def rasterize_pdf_page(
    input_file: Path,
    output_file: Path,
    raster_device: str,
    raster_dpi: Resolution,
    pageno: int,
    page_dpi: Resolution | None,
    rotation: int | None,
    filter_vector: bool,
    stop_on_soft_error: bool,
    options,
    use_cropbox: bool,
) -> Path | None:
    """Rasterize a single page of a PDF file using pypdfium2.

    ``pageno`` is 1-based. ``filter_vector`` is accepted but not used by this
    implementation.

    Returns:
        ``output_file`` after the image has been written, or None if
        pypdfium2 is not available or the user selected the Ghostscript
        rasterizer, allowing Ghostscript to be used instead.
    """
    ghostscript_requested = (
        options is not None and options.rasterizer == 'ghostscript'
    )
    if ghostscript_requested or pdfium is None:
        # Decline so that Ghostscript handles the page.
        return None

    # All pypdfium2 calls happen while holding the module lock; the document,
    # page and bitmap are closed in reverse order before it is released.
    with _pdfium_lock:
        with closing(_open_pdf_document(input_file)) as pdf:
            with closing(pdf[pageno - 1]) as page:
                bitmap, expected_width, expected_height = _render_page_to_bitmap(
                    page, raster_device, raster_dpi, rotation, use_cropbox
                )
                with closing(bitmap):
                    pil_image = bitmap.to_pil()

    # Process and save image outside the lock (PIL operations are thread-safe)
    processed_image, format_name = _process_image_for_output(
        pil_image,
        raster_device,
        raster_dpi,
        page_dpi,
        stop_on_soft_error,
        expected_width,
        expected_height,
    )
    _save_image(processed_image, output_file, format_name)

    return output_file


================================================
FILE: src/ocrmypdf/builtin_plugins/tesseract_ocr.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Built-in plugin to implement OCR using Tesseract."""

from __future__ import annotations

import argparse
import logging
import os
from typing import Annotated

from PIL import Image
from pydantic import BaseModel, Field, field_validator, model_validator

from ocrmypdf import hookimpl
from ocrmypdf._exec import tesseract
from ocrmypdf._exec.tesseract import ThresholdingMethod
from ocrmypdf._jobcontext import PageContext
from ocrmypdf.cli import numeric
from ocrmypdf.exceptions import BadArgsError, MissingDependencyError
from ocrmypdf.helpers import available_cpu_count, clamp
from ocrmypdf.imageops import calculate_downsample, downsample_image
from ocrmypdf.pluginspec import OcrEngine
from ocrmypdf.subprocess import check_external_program

log = logging.getLogger(__name__)


def _thresholding_method_converter(value: str) -> ThresholdingMethod:
    """Translate a command line string into a ThresholdingMethod member.

    The lookup is case-insensitive.

    Args:
        value: Name of the method: auto, otsu, adaptive-otsu or sauvola.

    Returns:
        The matching ThresholdingMethod enum value.

    Raises:
        argparse.ArgumentTypeError: If value is not a valid thresholding method
    """
    methods_by_name = {
        'auto': ThresholdingMethod.AUTO,
        'otsu': ThresholdingMethod.OTSU,
        'adaptive-otsu': ThresholdingMethod.ADAPTIVE_OTSU,
        'sauvola': ThresholdingMethod.SAUVOLA,
    }
    method = methods_by_name.get(value.lower())
    if method is not None:
        return method

    valid = ', '.join(methods_by_name)
    raise argparse.ArgumentTypeError(
        f"Invalid thresholding method '{value}'. Must be one of: {valid}"
    )


class TesseractOptions(BaseModel):
    """Options specific to Tesseract OCR engine.

    Each field carries its own description and range constraint. The class
    also defines the matching command line arguments and emits warnings for
    option combinations that are valid but probably not what the user wants.
    """

    config: Annotated[
        list[str], Field(description="Additional Tesseract configuration files")
    ] = []
    pagesegmode: Annotated[
        int | None,
        Field(ge=0, le=13, description="Set Tesseract page segmentation mode"),
    ] = None
    oem: Annotated[
        int | None, Field(ge=0, le=3, description="Set Tesseract OCR engine mode")
    ] = None
    thresholding: Annotated[
        ThresholdingMethod,
        Field(description="Set Tesseract input image thresholding mode"),
    ] = ThresholdingMethod.AUTO
    timeout: Annotated[
        float, Field(ge=0, description="Timeout for OCR operations in seconds")
    ] = 180.0
    non_ocr_timeout: Annotated[
        float, Field(ge=0, description="Timeout for non-OCR operations in seconds")
    ] = 180.0
    downsample_large_images: Annotated[
        bool, Field(description="Downsample large images before OCR")
    ] = True
    # 32767 is also the default, so any other value means the user changed it.
    downsample_above: Annotated[
        int,
        Field(
            ge=100,
            le=32767,
            description="Downsample images larger than this pixel size",
        ),
    ] = 32767
    user_words: Annotated[
        str | None, Field(description="Path to Tesseract user words file")
    ] = None
    user_patterns: Annotated[
        str | None, Field(description="Path to Tesseract user patterns file")
    ] = None
    # Not a user option: excluded from serialization via exclude=True.
    omp_thread_limit: Annotated[
        int | None,
        Field(
            description="Calculated OMP_THREAD_LIMIT for Tesseract subprocesses",
            exclude=True,
        ),
    ] = None

    @classmethod
    def add_arguments_to_parser(cls, parser, namespace: str = 'tesseract'):
        """Add Tesseract-specific arguments to the argument parser.

        Most options are named ``--{namespace}-...`` with a matching
        ``{namespace}_...`` destination; ``--user-words`` and
        ``--user-patterns`` are not prefixed.

        Args:
            parser: The argument parser to add arguments to
            namespace: The namespace prefix for argument names
        """
        tess = parser.add_argument_group(
            "Tesseract", "Advanced control of Tesseract OCR"
        )

        tess.add_argument(
            f'--{namespace}-config',
            action='append',
            metavar='CFG',
            default=[],
            dest=f'{namespace}_config',
            help="Additional Tesseract configuration files -- see documentation.",
        )

        tess.add_argument(
            f'--{namespace}-pagesegmode',
            action='store',
            type=int,
            metavar='PSM',
            choices=range(0, 14),
            dest=f'{namespace}_pagesegmode',
            help="Set Tesseract page segmentation mode (see tesseract --help).",
        )

        tess.add_argument(
            f'--{namespace}-oem',
            action='store',
            type=int,
            metavar='MODE',
            choices=range(0, 4),
            dest=f'{namespace}_oem',
            help=(
                "Set Tesseract 4+ OCR engine mode: "
                "0 - original Tesseract only; "
                "1 - neural nets LSTM only; "
                "2 - Tesseract + LSTM; "
                "3 - default."
            ),
        )

        tess.add_argument(
            f'--{namespace}-thresholding',
            action='store',
            type=_thresholding_method_converter,
            default='auto',
            dest=f'{namespace}_thresholding',
            help=(
                "Set Tesseract 5.0+ input image thresholding mode. This may improve "
                "OCR results on low quality images or those that contain high "
                "contrast color. Options: auto, otsu, adaptive-otsu, sauvola. "
                "auto/otsu is the Tesseract default (legacy Otsu); adaptive-otsu "
                "is an improved Otsu algorithm with improved sort for background "
                "color changes; sauvola is based on local standard deviation."
            ),
        )

        tess.add_argument(
            f'--{namespace}-timeout',
            default=180.0,
            type=numeric(float, 0),
            metavar='SECONDS',
            dest=f'{namespace}_timeout',
            help=(
                "Give up on OCR after the timeout, but copy the preprocessed page "
                "into the final output. This timeout is only used when using Tesseract "
                "for OCR. When Tesseract is used for other operations such as "
                "deskewing and orientation, the timeout is controlled by "
                f"--{namespace}-non-ocr-timeout."
            ),
        )

        tess.add_argument(
            f'--{namespace}-non-ocr-timeout',
            default=180.0,
            type=numeric(float, 0),
            metavar='SECONDS',
            dest=f'{namespace}_non_ocr_timeout',
            help=(
                "Give up on non-OCR operations such as deskewing and orientation "
                f"after timeout. This is a separate timeout from --{namespace}-timeout "
                "because these operations are not as expensive as OCR."
            ),
        )

        tess.add_argument(
            f'--{namespace}-downsample-large-images',
            action=argparse.BooleanOptionalAction,
            default=True,
            dest=f'{namespace}_downsample_large_images',
            help=(
                "Downsample large images before OCR. Tesseract has "
                "an upper limit on the size images it will support."
                " If this argument is given, OCRmyPDF will "
                "downsample large images to fit Tesseract. This "
                "may reduce OCR quality, on large images the most"
                " desirable text is usually larger. If this "
                "parameter is not supplied, Tesseract will error "
                "out and produce no OCR on the page in question. "
                "This argument should be used with a high value "
                f"of --{namespace}-timeout to ensure Tesseract "
                "has enough to time."
            ),
        )

        tess.add_argument(
            f'--{namespace}-downsample-above',
            action='store',
            type=numeric(int, 100, 32767),
            default=32767,
            dest=f'{namespace}_downsample_above',
            help=(
                "Downsample images larger than this size pixel size (either dimension) "
                f"before OCR. --{namespace}-downsample-large-images downsamples when "
                "an image exceeds Tesseract's internal limits. This argument causes "
                "downsampling to occur when an image exceeds the given size. This may "
                "reduce OCR quality, but on large images the most desirable text is "
                "usually larger."
            ),
        )

        tess.add_argument(
            '--user-words',
            metavar='FILE',
            dest='user_words',
            help="Specify the location of the Tesseract user words file. This is a "
            "list of words Tesseract should consider while performing OCR in "
            "addition to its standard language dictionaries. This can improve "
            "OCR quality especially for specialized and technical documents.",
        )
        tess.add_argument(
            '--user-patterns',
            metavar='FILE',
            dest='user_patterns',
            help="Specify the location of the Tesseract user patterns file.",
        )

    @field_validator('timeout', 'non_ocr_timeout')
    @classmethod
    def validate_timeout_reasonable(cls, v):
        """Warn (without rejecting) when a timeout exceeds one hour."""
        if v > 3600:  # 1 hour
            log.warning(f"Timeout of {v} seconds is very long and may cause issues")
        return v

    @field_validator('pagesegmode')
    @classmethod
    def validate_pagesegmode_warning(cls, v):
        """Warn (without rejecting) about segmentation modes 0 and 2."""
        if v in (0, 2):
            log.warning(
                "The tesseract-pagesegmode you selected will disable OCR. "
                "This may cause processing to fail."
            )
        return v

    @model_validator(mode='after')
    def validate_downsample_consistency(self):
        """Warn when a downsample size is set while downsampling is disabled."""
        if self.downsample_above != 32767 and not self.downsample_large_images:
            log.warning(
                "The --tesseract-downsample-above argument will have no effect unless "
                "--tesseract-downsample-large-images is also given."
            )
        return self

    def validate_with_context(self, languages: list[str]) -> None:
        """Validate options that require external context.

        Args:
            languages: List of languages being used for OCR

        Raises:
            BadArgsError: If ``languages`` contains a language reserved for
                Tesseract's internal use (``equ`` or ``osd``).
        """
        reserved_languages = {'equ', 'osd'}
        # Sort so the error message lists offenders in a stable order; plain
        # set iteration order for strings can vary between runs.
        offending = sorted(reserved_languages & set(languages))
        if offending:
            raise BadArgsError(
                "The following languages are for Tesseract's internal use "
                "and should not be issued explicitly: "
                f"{', '.join(offending)}\n"
                "Remove them from the -l/--language argument."
            )


@hookimpl
def register_options():
    """Return the Tesseract option model, keyed by ``'tesseract'``."""
    option_models = {'tesseract': TesseractOptions}
    return option_models


@hookimpl
def add_options(parser):
    """Add the Tesseract command line arguments to ``parser``."""
    # The option model defines every Tesseract argument; delegate to it.
    add_arguments = TesseractOptions.add_arguments_to_parser
    add_arguments(parser)


@hookimpl
def check_options(options):
    """Check external dependencies and version compatibility for Tesseract.

    Raises:
        MissingDependencyError: If the installed Tesseract is version 5.4.0,
            which is rejected explicitly.
    """
    check_external_program(
        program='tesseract',
        package={'linux': 'tesseract-ocr'},
        version_checker=tesseract.version,
        need_version='4.1.1',  # Ubuntu 22.04 version (also 20.04)
        version_parser=tesseract.TesseractVersion,
    )
    tess_version = tesseract.version()
    if tess_version == tesseract.TesseractVersion('5.4.0'):
        raise MissingDependencyError(
            "Tesseract 5.4.0 is not supported due to regressions in this version. "
            "Please upgrade to a newer or supported older version."
        )

    # Check version-specific feature compatibility
    thresholding_changed = options.tesseract.thresholding != ThresholdingMethod.AUTO
    if not tesseract.has_thresholding() and thresholding_changed:
        # Name the option exactly as it is registered on the command line
        # (--tesseract-thresholding) so users can find it.
        log.warning(
            "The installed version of Tesseract does not support changes to its "
            "thresholding method. The --tesseract-thresholding argument will be "
            "ignored."
        )


@hookimpl
def validate(pdfinfo, options):
    """Choose Tesseract's OpenMP thread limit and re-check downsample options.

    The chosen limit is stored in ``options.tesseract.omp_thread_limit``.

    Args:
        pdfinfo: Sized collection with one entry per page of the input file.
        options: Options object; ``options.tesseract`` is updated in place.
    """
    # Tesseract 4.x can be multithreaded, and we also run multiple workers. We want
    # to manage how many threads it uses to avoid creating more total threads than
    # cores. Performance testing shows we're better off
    # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we
    # get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the
    # input file is small, then we allow Tesseract to use threads, subject to the
    # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers.
    # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system.
    env_limit = os.environ.get('OMP_THREAD_LIMIT', '')
    # isdecimal() matches what int() can parse; isnumeric() also accepts
    # characters such as superscripts and fractions that int() rejects.
    if env_limit.isdecimal():
        tess_threads = int(env_limit)
    else:
        jobs = options.jobs or available_cpu_count()
        # Guard against an empty pdfinfo so the division cannot fail.
        page_count = max(len(pdfinfo), 1)
        tess_threads = clamp(jobs // page_count, 1, 3)
    # Store the thread limit in options - it will be passed to subprocess env
    options.tesseract.omp_thread_limit = tess_threads
    log.debug("Using Tesseract OpenMP thread limit %d", tess_threads)

    if (
        options.tesseract.downsample_above != 32767
        and not options.tesseract.downsample_large_images
    ):
        log.warning(
            "The --tesseract-downsample-above argument will have no effect unless "
            "--tesseract-downsample-large-images is also given."
        )


@hookimpl
def filter_ocr_image(page: PageContext, image: Image.Image) -> Image.Image:
    """Shrink an image, if needed, so that Tesseract can process it.

    Tesseract cannot handle images with more than 32767 pixels in either axis,
    or more than 2**31 bytes. When downsampling is enabled the image is
    resized to fit within the configured limit, capped at those values.

    Returns:
        The downsampled image, or the original image when Tesseract options
        are absent or downsampling is disabled.
    """
    options = page.options
    if getattr(options, 'tesseract', None) is None:
        return image

    tess_options = options.tesseract
    max_side = min(tess_options.downsample_above, 32767)
    if not tess_options.downsample_large_images:
        return image

    target_size = calculate_downsample(
        image, max_size=(max_side, max_side), max_bytes=(2**31) - 1
    )
    return downsample_image(image, target_size)


class TesseractOcrEngine(OcrEngine):
    """OCR engine that delegates all work to the Tesseract wrapper module."""

    @staticmethod
    def version():
        """Return the installed Tesseract version as a string."""
        installed = tesseract.version()
        return str(installed)

    @staticmethod
    def _determine_renderer(options):
        """Resolve the PDF renderer name; ``'auto'`` maps to ``'fpdf2'``."""
        return 'fpdf2' if options.pdf_renderer == 'auto' else options.pdf_renderer

    @staticmethod
    def creator_tag(options):
        """Build the creator string for the renderer selected in ``options``."""
        renderer = TesseractOcrEngine._determine_renderer(options)
        version = TesseractOcrEngine.version()
        if renderer == 'hocr':
            return f"OCRmyPDF hOCR + Tesseract OCR {version}"
        if renderer == 'fpdf2':
            return f"OCRmyPDF fpdf2 + Tesseract OCR {version}"
        if renderer == "sandwich":
            return f"Tesseract OCR + PDF {version}"
        return f"Tesseract OCR {version}"

    def __str__(self):
        """Return the engine name together with the Tesseract version."""
        version = TesseractOcrEngine.version()
        return f"Tesseract OCR {version}"

    @staticmethod
    def languages(options):
        """Return the languages reported by the Tesseract wrapper."""
        return tesseract.get_languages()

    @staticmethod
    def get_orientation(input_file, options):
        """Ask Tesseract for the page orientation of ``input_file``."""
        tess_options = options.tesseract
        return tesseract.get_orientation(
            input_file,
            engine_mode=tess_options.oem,
            timeout=tess_options.non_ocr_timeout,
            omp_thread_limit=tess_options.omp_thread_limit,
        )

    @staticmethod
    def get_deskew(input_file, options) -> float:
        """Ask Tesseract for the deskew angle of ``input_file``."""
        return tesseract.get_deskew(
            input_file,
            languages=options.languages,
            engine_mode=options.tesseract.oem,
            timeout=options.tesseract.non_ocr_timeout,
            omp_thread_limit=options.tesseract.omp_thread_limit,
        )

    @staticmethod
    def _ocr_arguments(options):
        """Collect the keyword arguments shared by hOCR and PDF generation."""
        return dict(
            languages=options.languages,
            engine_mode=options.tesseract.oem,
            tessconfig=options.tesseract.config,
            timeout=options.tesseract.timeout,
            pagesegmode=options.tesseract.pagesegmode,
            thresholding=options.tesseract.thresholding,
            user_words=options.tesseract.user_words,
            user_patterns=options.tesseract.user_patterns,
            omp_thread_limit=options.tesseract.omp_thread_limit,
        )

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Run OCR on ``input_file``, producing hOCR and text output files."""
        tesseract.generate_hocr(
            input_file=input_file,
            output_hocr=output_hocr,
            output_text=output_text,
            **TesseractOcrEngine._ocr_arguments(options),
        )

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Run OCR on ``input_file``, producing PDF and text output files."""
        tesseract.generate_pdf(
            input_file=input_file,
            output_pdf=output_pdf,
            output_text=output_text,
            **TesseractOcrEngine._ocr_arguments(options),
        )


@hookimpl
def get_ocr_engine(options):
    """Return a TesseractOcrEngine when Tesseract is selected or is the default.

    Returns:
        A new engine when ``options`` is None or its ``ocr_engine`` is
        ``'auto'``, ``'tesseract'`` or unset; otherwise None.
    """
    if options is None:
        return TesseractOcrEngine()

    requested_engine = getattr(options, 'ocr_engine', 'auto')
    # Tesseract is selected if explicitly requested or if 'auto'
    if requested_engine in ('auto', 'tesseract'):
        return TesseractOcrEngine()
    return None


================================================
FILE: src/ocrmypdf/cli.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Command line interface customization and validation."""

from __future__ import annotations

import argparse
from argparse import ArgumentParser
from collections.abc import Callable, Mapping
from typing import Any, TypeVar

from ocrmypdf._defaults import DEFAULT_ROTATE_PAGES_THRESHOLD
from ocrmypdf._defaults import PROGRAM_NAME as _PROGRAM_NAME
from ocrmypdf._options import OcrOptions, ProcessingMode, TaggedPdfMode
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf._version import __version__ as _VERSION

T = TypeVar('T', int, float)


def numeric(basetype: Callable[[Any], T], min_: T | None = None, max_: T | None = None):
    """Validator for numeric command line parameters.

    Stipulates that the value must be of type basetype (typically int or float), and
    optionally, within the range [min_, max_].

    Args:
        basetype: Callable that converts the command line string to a number.
        min_: Inclusive lower bound, or None for no lower bound.
        max_: Inclusive upper bound, or None for no upper bound.

    Returns:
        A callable suitable for argparse's ``type=`` parameter. It raises
        ``argparse.ArgumentTypeError`` when the converted value is outside the
        range; conversion errors raised by ``basetype`` propagate unchanged.
    """
    min_ = basetype(min_) if min_ is not None else None
    max_ = basetype(max_) if max_ is not None else None

    def _numeric(s: str) -> T:
        value = basetype(s)
        # Phrased as "not >=" / "not <=" rather than "<" / ">" so that a float
        # NaN, for which every comparison is False, is rejected whenever a
        # bound is configured instead of slipping through the range check.
        below = min_ is not None and not value >= min_
        above = max_ is not None and not value <= max_
        if below or above:
            raise argparse.ArgumentTypeError(
                f"{s!r} not in valid range {(min_, max_)!r}"
            )
        return value

    # argparse uses the callable's __name__ in its "invalid ... value" messages
    _numeric.__name__ = basetype.__name__
    return _numeric


def str_to_int(mapping: Mapping[str, int]):
    """Build an argparse ``type=`` callable that translates names to integers.

    The returned callable looks its argument up in ``mapping`` and raises
    ``argparse.ArgumentTypeError`` listing the valid names when it is absent.
    """

    def _str_to_int(s: str) -> int:
        try:
            number = mapping[s]
        except KeyError:
            choices = ', '.join(mapping)
            message = f"{s!r} must be one of: {choices}"
            raise argparse.ArgumentTypeError(message) from None
        return number

    return _str_to_int


class LanguageSetAction(argparse.Action):
    """Manages a list of languages.

    Every occurrence of the option contributes either a single language or
    several languages joined with ``+`` (for example ``eng+deu``). All names
    are collected, in command line order, into one list on the namespace.
    """

    def __init__(self, option_strings, dest, default=None, **kwargs):
        """Initialize the action, using an empty list when no default is given."""
        if default is None:
            default = []
        super().__init__(option_strings, dest, default=default, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        """Add the language(s) in ``values`` to the list stored on ``namespace``."""
        current = getattr(namespace, self.dest, None)
        # Work on a copy: argparse places the action's default object itself on
        # the namespace, so appending in place would mutate the shared default
        # and leak languages into later parse_args() calls on the same parser.
        languages = list(current) if current is not None else []
        if isinstance(values, str) and '+' in values:
            languages.extend(values.split('+'))
        else:
            languages.append(values)
        setattr(namespace, self.dest, languages)


def get_parser():
    """Get the main CLI parser.

    Builds and returns a new ``ArgumentParser`` on every call. The parser
    defines the positional input/output arguments followed by option groups
    for job control, metadata, image preprocessing, OCR, advanced settings and
    debugging. Arguments may also be read from a file using the ``@`` prefix.
    """
    parser = ArgumentParser(
        prog=_PROGRAM_NAME,
        allow_abbrev=True,
        fromfile_prefix_chars='@',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""\
Generates a searchable PDF or PDF/A from a regular PDF.

OCRmyPDF rasterizes each page of the input PDF, optionally corrects page
rotation and performs image processing, runs the Tesseract OCR engine on the
image, and then creates a PDF from the OCR information.
""",
        epilog="""\
OCRmyPDF attempts to keep the output file at about the same size.  If a file
contains losslessly compressed images, the images in the output file will be
losslessly compressed as well.

PDF is a page description file that attempts to preserve a layout exactly.
A PDF can contain vector objects (such as text or lines) and raster objects
(images).  A page might have multiple images.  OCRmyPDF is prepared to deal
with the wide variety of PDFs that exist in the wild.

When a PDF page contains text, OCRmyPDF assumes that the page has already
been OCRed or is a "born digital" page that should not be OCRed.  The default
behavior is to exit in this case without producing a file.  You can use the
option --skip-text to ignore pages with text, or --force-ocr to rasterize
all objects on the page and produce an image-only PDF as output.

    ocrmypdf --skip-text file_with_some_text_pages.pdf output.pdf

    ocrmypdf --force-ocr word_document.pdf output.pdf

If you are concerned about long-term archiving of PDFs, use the default option
--output-type pdfa which converts the PDF to a standardized PDF/A-2b.  This
removes some features from the PDF such as Javascript or forms. If you want to
minimize the number of changes made to your PDF, use --output-type pdf.

If OCRmyPDF is given an image file as input, it will attempt to convert the
image to a PDF before processing.  For more control over the conversion of
images to PDF, use the Python package img2pdf or other image to PDF software.

For example, this command uses img2pdf to convert all .png files beginning
with the 'page' prefix to a PDF, fitting each image on A4-sized paper, and
sending the result to OCRmyPDF through a pipe.

    img2pdf --pagesize A4 page*.png | ocrmypdf - myfile.pdf

Online documentation is located at:
    https://ocrmypdf.readthedocs.io/en/latest/introduction.html

""",
    )

    parser.add_argument(
        'input_file',
        metavar="input_pdf_or_image",
        help="PDF file containing the images to be OCRed (or '-' to read from "
        "standard input)",
    )
    parser.add_argument(
        'output_file',
        metavar="output_pdf",
        help="Output searchable PDF file (or '-' to write to standard output). "
        "Existing files will be overwritten (use --no-overwrite to prevent this). "
        "If same as input file, the input file will be updated only if "
        "processing is successful.",
    )
    parser.add_argument(
        '-l',
        '--language',
        dest='languages',
        action=LanguageSetAction,
        help="Language(s) of the file to be OCRed (see tesseract --list-langs for "
        "all language packs installed in your system). Use -l eng+deu for "
        "multiple languages.",
    )
    parser.add_argument(
        '--image-dpi',
        metavar='DPI',
        type=int,
        help="When the input file is an image, not a PDF, use this DPI instead "
        "of the DPI claimed by the input file. If the input does not claim a "
        "sensible DPI, this option will be required.",
    )
    parser.add_argument(
        '--output-type',
        choices=['auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'],
        default='auto',
        help="Choose output type. 'auto' (default) produces best-effort PDF/A "
        "without requiring Ghostscript - uses verapdf validation when available, "
        "otherwise passes through as PDF/A if safe (input already PDF/A or "
        "force-ocr was used), or falls back to regular PDF. 'pdfa' creates a "
        "PDF/A-2b compliant file for long term archiving (requires Ghostscript "
        "as fallback). 'pdf' minimizes changes to the input file. 'pdfa-1' "
        "creates a PDF/A-1b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' "
        "creates a PDF/A-3b file. 'none' will produce no output, which may be "
        "helpful if only the --sidecar is desired.",
    )

    # Use null string '\0' as sentinel to indicate the user supplied no argument,
    # since that is the only invalid character for filepaths on all platforms
    # bool('\0') is True in Python
    parser.add_argument(
        '--sidecar',
        nargs='?',
        const='\0',
        default=None,
        metavar='FILE',
        help="Generate sidecar text files that contain the same text recognized "
        "by Tesseract. This may be useful for building an OCR text database. "
        "If FILE is omitted, the sidecar file will be named {output_file}.txt; "
        "the next argument must NOT be the name of the input PDF. "
        "If FILE is set to '-', the sidecar is written to stdout (a "
        "convenient way to preview OCR quality). The output file and sidecar "
        "may not both use stdout at the same time.",
    )

    parser.add_argument(
        '-n',
        '--no-overwrite',
        action='store_true',
        default=False,
        help="If the output file already exists, exit with an error instead of "
        "overwriting it.",
    )

    parser.add_argument(
        '--version',
        action='version',
        version=_VERSION,
        help="Print program version and exit",
    )

    jobcontrol = parser.add_argument_group("Job control options")
    jobcontrol.add_argument(
        '-j',
        '--jobs',
        metavar='N',
        type=numeric(int, 0, 256),
        help="Use up to N CPU cores simultaneously (default: use all).",
    )
    jobcontrol.add_argument(
        '-q', '--quiet', action='store_true', help="Suppress INFO messages"
    )
    jobcontrol.add_argument(
        '-v',
        '--verbose',
        type=numeric(int, 0, 2),
        default=0,
        const=1,
        nargs='?',
        help="Print more verbose messages for each additional verbose level. Use "
        "`-v 1` typically for much more detailed logging. Higher numbers "
        "are probably only useful in debugging.",
    )
    # The following three options are hidden from --help (help=SUPPRESS)
    jobcontrol.add_argument(
        '--no-progress-bar',
        action='store_false',
        dest='progress_bar',
        help=argparse.SUPPRESS,
    )
    jobcontrol.add_argument(
        '--use-threads', action='store_true', default=True, help=argparse.SUPPRESS
    )
    jobcontrol.add_argument(
        '--no-use-threads',
        action='store_false',
        dest='use_threads',
        help=argparse.SUPPRESS,
    )

    metadata = parser.add_argument_group(
        "Metadata options",
        "Set output PDF/A metadata (default: copy input document's metadata)",
    )
    metadata.add_argument(
        '--title', type=str, help="Set document title (place multiple words in quotes)"
    )
    metadata.add_argument('--author', type=str, help="Set document author")
    metadata.add_argument(
        '--subject', type=str, help="Set document subject description"
    )
    metadata.add_argument('--keywords', type=str, help="Set document keywords")

    preprocessing = parser.add_argument_group(
        "Image preprocessing options",
        "Options to improve the quality of the final PDF and OCR",
    )
    preprocessing.add_argument(
        '-r',
        '--rotate-pages',
        action='store_true',
        help="Automatically rotate pages based on detected text orientation",
    )
    preprocessing.add_argument(
        '--remove-background',
        action='store_true',
        help="Attempt to remove background from gray or color pages, setting it "
        "to white",
    )
    preprocessing.add_argument(
        '-d',
        '--deskew',
        action='store_true',
        help="Deskew each page before performing OCR",
    )
    preprocessing.add_argument(
        '-c',
        '--clean',
        action='store_true',
        help="Clean pages from scanning artifacts before performing OCR, and send "
        "the cleaned page to OCR, but do not include the cleaned page in "
        "the output",
    )
    preprocessing.add_argument(
        '-i',
        '--clean-final',
        action='store_true',
        help="Clean page as above, and incorporate the cleaned image in the final "
        "PDF.  Might remove desired content.",
    )
    preprocessing.add_argument(
        '--unpaper-args',
        type=str,
        default=None,
        help="A quoted string of arguments to pass to unpaper. Requires --clean. "
        "Example: --unpaper-args '--layout double'.",
    )
    preprocessing.add_argument(
        '--oversample',
        metavar='DPI',
        type=numeric(int, 0, 5000),
        default=0,
        help="Oversample images to at least the specified DPI, to improve OCR "
        "results slightly",
    )
    preprocessing.add_argument(
        '--remove-vectors',
        action='store_true',
        help="EXPERIMENTAL. Mask out any vector objects in the PDF so that they "
        "will not be included in OCR. This can eliminate false characters.",
    )

    ocrsettings = parser.add_argument_group("OCR options", "Control how OCR is applied")
    ocrsettings.add_argument(
        '-m',
        '--mode',
        choices=[mode.value for mode in ProcessingMode],
        default=ProcessingMode.default.value,
        help="Processing mode for pages with existing text. "
        "'default' errors if text is found. "
        "'force' rasterizes all content and runs OCR (same as --force-ocr). "
        "'skip' skips pages with existing text (same as --skip-text). "
        "'redo' re-OCRs pages, replacing old invisible text (same as --redo-ocr).",
    )
    # Legacy flags for backward compatibility - these set the mode internally
    ocrsettings.add_argument(
        '-f',
        '--force-ocr',
        action='store_true',
        help="Rasterize any text or vector objects on each page, apply OCR, and "
        "save the rastered output (this rewrites the PDF). "
        "Equivalent to --mode force.",
    )
    ocrsettings.add_argument(
        '-s',
        '--skip-text',
        action='store_true',
        help="Skip OCR on any pages that already contain text, but include the "
        "page in final output; useful for PDFs that contain a mix of "
        "images, text pages, and/or previously OCRed pages. "
        "Equivalent to --mode skip.",
    )
    ocrsettings.add_argument(
        '--redo-ocr',
        action='store_true',
        help="Attempt to detect and remove the hidden OCR layer from files that "
        "were previously OCRed with OCRmyPDF or another program. Apply OCR "
        "to text found in raster images. Existing visible text objects will "
        "not be changed. If there is no existing OCR, OCR will be added. "
        "Equivalent to --mode redo.",
    )
    ocrsettings.add_argument(
        '--skip-big',
        type=numeric(float, 0, 5000),
        metavar='MPixels',
        help="Skip OCR on pages larger than the specified amount of megapixels, "
        "but include skipped pages in final output",
    )
    ocrsettings.add_argument(
        '--invalidate-digital-signatures',
        action='store_true',
        help="Normally, OCRmyPDF will refuse to OCR a PDF that has a digital "
        "signature. This option allows OCR to proceed, but the digital signature "
        "will be invalidated.",
    )
    ocrsettings.add_argument(
        '--tagged-pdf-mode',
        choices=[mode.value for mode in TaggedPdfMode],
        default=TaggedPdfMode.default.value,
        help="Control behavior when a Tagged PDF is encountered. "
        "'default' errors if --mode is default, otherwise warns. "
        "'ignore' always warns but continues processing.",
    )

    advanced = parser.add_argument_group(
        "Advanced", "Advanced options to control OCRmyPDF"
    )
    advanced.add_argument(
        '--pages',
        type=str,
        help=(
            "Limit OCR to the specified pages (ranges or comma separated), "
            "skipping others"
        ),
    )
    advanced.add_argument(
        '--max-image-mpixels',
        action='store',
        type=numeric(float, 0),
        metavar='MPixels',
        help="Set maximum number of megapixels to unpack before treating an image as a "
        "decompression bomb",
        default=250.0,
    )
    advanced.add_argument(
        '--pdf-renderer',
        choices=['auto', 'hocr', 'sandwich', 'hocrdebug', 'fpdf2'],
        default='auto',
        help="Choose OCR PDF renderer. 'auto' (recommended) uses fpdf2, which "
        "provides full international language support including RTL scripts, "
        "proper text positioning, and invisible text that becomes visible when "
        "selected. 'sandwich' renders text as a background layer. Legacy 'hocr' "
        "and 'hocrdebug' options are deprecated and will use fpdf2.",
    )
    advanced.add_argument(
        '--ocr-engine',
        choices=['auto', 'tesseract', 'none'],
        default='auto',
        help="OCR engine to use. 'auto' (default) selects the best available engine. "
        "'tesseract' uses Tesseract OCR. "
        "'none' skips OCR entirely, useful for PDF/A conversion or image processing "
        "without text recognition.",
    )
    advanced.add_argument(
        '--rasterizer',
        choices=['auto', 'ghostscript', 'pypdfium'],
        default='auto',
        help="Choose PDF page rasterizer. 'auto' prefers pypdfium when available, "
        "falling back to Ghostscript. 'pypdfium' is faster but requires the "
        "pypdfium2 package. 'ghostscript' uses the traditional Ghostscript rasterizer.",
    )
    advanced.add_argument(
        '--rotate-pages-threshold',
        default=DEFAULT_ROTATE_PAGES_THRESHOLD,
        type=numeric(float, 0, 1000),
        metavar='CONFIDENCE',
        help="Only rotate pages when confidence is above this value (arbitrary "
        "units reported by tesseract)",
    )
    advanced.add_argument(
        '--fast-web-view',
        type=numeric(float, 0),
        default=1.0,
        metavar="MEGABYTES",
        help="If the size of file is more than this threshold (in MB), then "
        "linearize the PDF for fast web viewing. This allows the PDF to be "
        "displayed before it is fully downloaded in web browsers, but increases "
        "the space required slightly. By default we skip this for small files "
        "which do not benefit. If the threshold is 0 it will be applied to all "
        "files. Set the threshold very high to disable.",
    )
    advanced.add_argument(
        '--continue-on-soft-render-error',
        action='store_true',
        help="Continue processing pages after a recoverable PDF rendering error. "
        "A recoverable error is one that does not prevent the page from being "
        "rendered, but may result in visual differences compared to the input "
        "file. Missing fonts are a typical source of these errors.",
    )
    advanced.add_argument(
        '--plugin',
        dest='plugins',
        action='append',
        default=[],
        help="Name of plugin to import. Argument may be issued multiple times to "
        "import multiple plugins. Plugins may be specified as module names in "
        "Python syntax, provided they are installed in the same Python (virtual) "
        "environment as ocrmypdf; or you may give the path to the Python file that "
        "contains the plugin. Plugins must conform to the specification in the "
        "OCRmyPDF documentation.",
    )

    debugging = parser.add_argument_group(
        "Debugging", "Arguments to help with troubleshooting and debugging"
    )
    debugging.add_argument(
        '-k',
        '--keep-temporary-files',
        action='store_true',
        help="Keep temporary files (helpful for debugging)",
    )
    return parser


# Minimal parser that recognizes only --plugin. get_options_and_plugins() runs it
# with parse_known_args() as a first pass over the command line, so the named
# plugins can be loaded before the full parser is built. add_help=False keeps
# this parser from defining -h/--help itself, and allow_abbrev=False means only
# the exact spelling --plugin is matched.
plugins_only_parser = ArgumentParser(
    prog=_PROGRAM_NAME, fromfile_prefix_chars='@', add_help=False, allow_abbrev=False
)
plugins_only_parser.add_argument(
    '--plugin',
    dest='plugins',
    action='append',
    default=[],
    help="Name of plugin to import.",
)


def namespace_to_options(ns) -> OcrOptions:
    """Build an OcrOptions instance from an argparse.Namespace.

    All CLI-specific knowledge of how parsed arguments map onto the internal
    options model lives here. Namespace attributes that are model fields (or
    legacy mode flags) are passed to the OcrOptions constructor; everything
    else is attached afterwards as ``extra_attrs``.
    """
    # Legacy boolean flags that map to mode - handled by OcrOptions model validator
    legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'}

    known_fields = {}
    extra_attrs = {}
    for key, value in vars(ns).items():
        recognized = key in OcrOptions.model_fields or key in legacy_mode_flags
        bucket = known_fields if recognized else extra_attrs
        bucket[key] = value

    # hOCR API: an output folder is given instead of an output file
    if 'output_folder' in extra_attrs:
        known_fields.setdefault('output_file', '/dev/null')  # Placeholder

    # input_file may be missing (e.g., in _hocr_to_ocr_pdf)
    if 'work_folder' in extra_attrs:
        known_fields.setdefault('input_file', '/dev/null')  # Placeholder

    options = OcrOptions(**known_fields)
    options.extra_attrs = extra_attrs
    return options


def get_options_and_plugins(
    args=None,
) -> tuple[OcrOptions, OcrmypdfPluginManager]:
    """Turn command line arguments into OcrOptions plus a plugin manager.

    Main entry point for CLI argument processing. Parsing happens in two
    passes: the first extracts only ``--plugin`` arguments so the plugins can
    be set up and register their own options on the main parser; the second
    parses the complete command line with that parser.

    Args:
        args: Command line arguments. If None, uses sys.argv.

    Returns:
        Tuple of (OcrOptions, PluginManager)
    """
    # Import here to avoid circular imports
    from ocrmypdf.api import setup_plugin_infrastructure

    # Pass 1: plugins only; all other arguments are ignored for now
    plugin_ns, _ = plugins_only_parser.parse_known_args(args=args)
    plugin_manager = setup_plugin_infrastructure(plugins=plugin_ns.plugins)

    # Pass 2: full parser, extended with whatever options the plugins add
    full_parser = get_parser()
    plugin_manager.add_options(parser=full_parser)
    parsed = full_parser.parse_args(args=args)

    return namespace_to_options(parsed), plugin_manager


================================================
FILE: src/ocrmypdf/data/__init__.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Data files used to generate certain PDFs."""

from __future__ import annotations


================================================
FILE: src/ocrmypdf/exceptions.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""OCRmyPDF's exceptions."""

from __future__ import annotations

from enum import IntEnum
from textwrap import dedent


class ExitCode(IntEnum):
    """OCRmyPDF's exit codes.

    The exception classes in this module carry one of these members in their
    ``exit_code`` attribute; the trailing comments name the exceptions defined
    here that use each code.
    """

    # pylint: disable=invalid-name
    ok = 0
    bad_args = 1  # BadArgsError
    input_file = 2  # InputFileError, DpiError, UnsupportedImageFormatError
    missing_dependency = 3  # MissingDependencyError
    invalid_output_pdf = 4
    file_access_error = 5  # OutputFileAccessError
    already_done_ocr = 6  # PriorOcrFoundError
    child_process_error = 7  # SubprocessOutputError
    encrypted_pdf = 8  # EncryptedPdfError
    invalid_config = 9  # TesseractConfigError
    pdfa_conversion_failed = 10
    other_error = 15  # default for ExitCodeException
    ctrl_c = 130


class ExitCodeException(Exception):
    """Base class for errors that should end the program with an exit code.

    Subclasses set ``exit_code`` and may set ``message``, a ``str.format``
    template that receives the exception's normal string form as its single
    positional argument.
    """

    exit_code = ExitCode.other_error
    message = ""

    def __str__(self):
        """Return ``message`` formatted with the base text, or the base text."""
        detail = super().__str__()  # Don't do str(super())
        return self.message.format(detail) if self.message else detail


class BadArgsError(ExitCodeException):
    """Invalid arguments on the command line or API.

    Carries ``ExitCode.bad_args``.
    """

    exit_code = ExitCode.bad_args


class MissingDependencyError(ExitCodeException):
    """A third-party dependency is missing.

    Carries ``ExitCode.missing_dependency``.
    """

    exit_code = ExitCode.missing_dependency


class UnsupportedImageFormatError(ExitCodeException):
    """The image format is not supported.

    Carries ``ExitCode.input_file``, the same code as InputFileError.
    """

    exit_code = ExitCode.input_file


class DpiError(ExitCodeException):
    """Missing information about input image DPI.

    Carries ``ExitCode.input_file``, the same code as InputFileError.
    """

    exit_code = ExitCode.input_file


class OutputFileAccessError(ExitCodeException):
    """Cannot access the intended output file path.

    Carries ``ExitCode.file_access_error``.
    """

    exit_code = ExitCode.file_access_error


class PriorOcrFoundError(ExitCodeException):
    """This file already has OCR.

    Carries ``ExitCode.already_done_ocr``.
    """

    exit_code = ExitCode.already_done_ocr


class InputFileError(ExitCodeException):
    """Something is wrong with the input file.

    Carries ``ExitCode.input_file``. DigitalSignatureError and TaggedPDFError
    derive from this class and inherit that exit code.
    """

    exit_code = ExitCode.input_file


class SubprocessOutputError(ExitCodeException):
    """A subprocess returned an unexpected error.

    Carries ``ExitCode.child_process_error``.
    """

    exit_code = ExitCode.child_process_error


class EncryptedPdfError(ExitCodeException):
    """Input PDF is encrypted.

    Carries ``ExitCode.encrypted_pdf``. ``message`` has no ``{}`` placeholder,
    so ``str()`` of this exception is always the fixed text below, whatever
    arguments the exception was constructed with.
    """

    exit_code = ExitCode.encrypted_pdf
    message = dedent(
        """\
        Input PDF is encrypted. The encryption must be removed to
        perform OCR.

        For information about this PDF's security use
            qpdf --show-encryption infilename

        You can remove the encryption using
            qpdf --decrypt [--password=[password]] infilename
        """
    )


class TesseractConfigError(ExitCodeException):
    """Tesseract config can't be parsed.

    Carries ``ExitCode.invalid_config``. ``message`` has no ``{}`` placeholder,
    so constructor arguments do not appear in ``str()`` of this exception.
    """

    exit_code = ExitCode.invalid_config
    message = "Error occurred while parsing a Tesseract configuration file"


class DigitalSignatureError(InputFileError):
    """PDF has a digital signature.

    Inherits ``ExitCode.input_file`` from InputFileError and replaces the
    string form of the exception with the fixed explanation below.
    """

    message = dedent(
        """\
        Input PDF has a digital signature. OCR would alter the document,
        invalidating the signature.
        """
    )


class TaggedPDFError(InputFileError):
    """PDF is tagged.

    Inherits ``ExitCode.input_file`` from InputFileError and replaces the
    string form of the exception with the fixed explanation below.
    """

    message = dedent(
        """\
        This PDF is marked as a Tagged PDF. This often indicates
        that the PDF was generated from an office document and does
        not need OCR. Use --force-ocr, --skip-text or --redo-ocr to
        override this error.
        """
    )


class ColorConversionNeededError(BadArgsError):
    """PDF needs color conversion.

    Inherits ``ExitCode.bad_args`` from BadArgsError and replaces the string
    form of the exception with the fixed explanation below.
    """

    message = dedent(
        """\
        The input PDF has an unusual color space. Use
        --color-conversion-strategy to convert to a common color space
        such as RGB, or use --output-type pdf to skip PDF/A conversion
        and retain the original color space.
        """
    )


================================================
FILE: src/ocrmypdf/extra_plugins/__init__.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
#
# SPDX-License-Identifier: MPL-2.0

"""Extra plugins. These are not automatically inserted when ocrmypdf is run."""


================================================
FILE: src/ocrmypdf/extra_plugins/semfree.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Semaphore-free alternate executor.

There are two popular environments that do not fully support the standard Python
multiprocessing module: AWS Lambda, and Termux (a terminal emulator for Android).

This alternate executor divvies up work among worker processes before processing,
rather than having each worker consume work from a shared queue when they finish
their task. This means workers have no need to coordinate with each other. Each
worker communicates only with the main process.

This is not without drawbacks. If the tasks are not "even" in size, which cannot
be guaranteed, some workers may end up with too much work while others are idle.
It is less efficient than the standard implementation, so not the default.

This module is deprecated and will be removed in a future release. The standard
executor will fall back to threads in these environments.
"""

from __future__ import annotations

import logging
import logging.handlers
import signal
import warnings
from collections.abc import Callable, Iterable, Iterator
from contextlib import suppress
from enum import Enum, auto
from itertools import islice, repeat, takewhile, zip_longest
from multiprocessing import Pipe, Process
from multiprocessing.connection import Connection, wait

from ocrmypdf import Executor, hookimpl
from ocrmypdf._concurrent import NullProgressBar
from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import remove_all_log_handlers

# Module-level call: the deprecation warning is issued when this module is
# imported, before any executor is created.
warnings.warn(
    "semfree.py is deprecated and will be removed in a future release.",
    DeprecationWarning,
)


class MessageType(Enum):
    """Implement basic IPC messaging.

    Workers send ``(MessageType, payload)`` tuples to the parent process. Log
    records are tagged with the plain string ``'log'`` rather than a member of
    this enum.
    """

    # payload: the exception raised by a task
    exception = auto()  # pylint: disable=invalid-name
    # payload: the value returned by a task
    result = auto()  # pylint: disable=invalid-name
    # payload: None; the worker has stopped processing tasks
    complete = auto()  # pylint: disable=invalid-name


def split_every(n: int, iterable: Iterable) -> Iterator:
    """Split iterable into consecutive lists of at most n items.

    Every list has exactly n items except possibly the last, which holds the
    remainder. An empty iterable produces no lists. The result is lazy: items
    are only pulled from the iterable as the groups are consumed.

    >>> list(split_every(4, range(10)))
    [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
    >>> list(split_every(3, range(10)))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

    https://stackoverflow.com/a/22919323
    """
    iterator = iter(iterable)
    # Keep slicing n items off the shared iterator; the first empty slice
    # (falsy list) means the input is exhausted and ends the sequence.
    return takewhile(bool, (list(islice(iterator, n)) for _ in repeat(None)))


def process_sigbus(*args):
    """Handle SIGBUS signal at the worker level.

    process_loop installs this as the SIGBUS handler. The handler arguments
    are ignored; InputFileError is always raised.
    """
    raise InputFileError("A worker process lost access to an input file")


class ConnectionLogHandler(logging.handlers.QueueHandler):
    """Handler used by child processes to forward log messages to parent.

    Instead of placing records on a queue, each record is sent over a
    multiprocessing ``Connection`` as a ``('log', record)`` tuple.
    """

    def __init__(self, conn: Connection) -> None:
        """Initialize the handler with the connection used to send records."""
        # sets the parent's queue to None - parent only touches queue
        # in enqueue() which we override
        super().__init__(None)  # type: ignore
        self.conn = conn

    def enqueue(self, record):
        """Send a log record over the connection, tagged with ``'log'``."""
        self.conn.send(('log', record))


def process_loop(
    conn: Connection, user_init: Callable[[], None], loglevel, task, task_args
):
    """Entry point of a worker process: run its share of tasks and report back.

    Log records produced in the worker are forwarded over ``conn``. After
    ``user_init()`` has run, ``task`` is called once per entry of
    ``task_args``, and each return value is sent as a ``result`` message. The
    first exception is sent as an ``exception`` message and stops the loop.
    A ``complete`` message is always sent last, then ``conn`` is closed.
    """
    # Install SIGBUS handler (so our parent process can abort somewhat gracefully)
    with suppress(AttributeError):  # Windows and Cygwin do not have SIGBUS
        signal.signal(signal.SIGBUS, process_sigbus)

    # Route every log message of this process through the connection: replace
    # all handlers of the root logger with a single forwarding handler.
    forwarder = ConnectionLogHandler(conn)
    root_logger = logging.getLogger()
    remove_all_log_handlers(root_logger)
    root_logger.setLevel(loglevel)
    root_logger.addHandler(forwarder)

    user_init()

    for call_args in task_args:
        try:
            outcome = task(*call_args)
        except Exception as exc:  # pylint: disable=broad-except
            conn.send((MessageType.exception, exc))
            break
        conn.send((MessageType.result, outcome))

    conn.send((MessageType.complete, None))
    conn.close()


class LambdaExecutor(Executor):
    """Executor for AWS Lambda or similar environments that lack semaphores.

    Work is divided among the worker processes up front. Each worker gets its
    own pipe to the parent and never coordinates with the other workers.
    """

    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        progress_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):
        """Run ``task`` for every entry of ``task_arguments``.

        With ``use_threads`` and a single worker, the tasks run inline in the
        calling process. Otherwise the argument tuples are distributed over
        worker processes running ``process_loop``, and their messages are
        handled here: results go to ``task_finished``, log records are
        re-emitted through the local loggers, and the first exception
        terminates all workers and is re-raised.
        """
        if use_threads and max_workers == 1:
            with self.pbar_class(**progress_kwargs) as pbar:
                for args in task_arguments:
                    result = task(*args)
                    task_finished(result, pbar)
            return

        task_arguments = list(task_arguments)
        # split_every() makes rows of max_workers tasks; transposing the rows
        # yields one column of tasks per worker, padded with None.
        grouped_args = list(
            zip_longest(*list(split_every(max_workers, task_arguments)))
        )
        if not grouped_args:
            return

        processes: list[Process] = []
        connections: list[Connection] = []
        child_connections: list[Connection] = []
        for chunk in grouped_args:
            parent_conn, child_conn = Pipe()

            worker_args = [args for args in chunk if args is not None]
            process = Process(
                target=process_loop,
                args=(
                    child_conn,
                    worker_initializer,
                    logging.getLogger("").level,
                    task,
                    worker_args,
                ),
            )
            process.daemon = True
            processes.append(process)
            connections.append(parent_conn)
            child_connections.append(child_conn)

        for process in processes:
            process.start()

        # The workers now own their ends of the pipes. Close the parent's
        # copies, otherwise a worker that dies without sending 'complete'
        # never produces EOFError on recv() and the wait loop below would
        # block forever.
        for child_conn in child_connections:
            child_conn.close()

        with self.pbar_class(**progress_kwargs) as pbar:
            while connections:
                for result in wait(connections):
                    if not isinstance(result, Connection):
                        raise NotImplementedError("We only support Connection()")
                    try:
                        msg_type, msg = result.recv()
                    except EOFError:
                        # Worker end closed without a 'complete' message
                        connections.remove(result)
                        continue

                    if msg_type == MessageType.result:
                        task_finished(msg, pbar)
                    elif msg_type == 'log':
                        record = msg
                        logger = logging.getLogger(record.name)
                        logger.handle(record)
                    elif msg_type == MessageType.complete:
                        connections.remove(result)
                    elif msg_type == MessageType.exception:
                        for process in processes:
                            process.terminate()
                        raise msg

        for process in processes:
            process.join()


@hookimpl
def get_executor(progressbar_class):
    """Return a LambdaExecutor instance that uses ``progressbar_class``."""
    return LambdaExecutor(pbar_class=progressbar_class)


@hookimpl
def get_logging_console():
    """Create the console log handler.

    Returns:
        A ``logging.StreamHandler`` constructed with default arguments.
    """
    console_handler = logging.StreamHandler()
    return console_handler


@hookimpl
def get_progressbar_class():
    """Return the NullProgressBar class (the class itself, not an instance).

    This executor cannot use a progress bar.
    """
    progressbar_class = NullProgressBar
    return progressbar_class


================================================
FILE: src/ocrmypdf/font/__init__.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Font management for OCRmyPDF PDF rendering.

This module provides font infrastructure for the fpdf2 PDF renderer. It includes:

- FontManager: Base class for font loading and glyph checking
- FontProvider: Protocol and implementations for font discovery
- MultiFontManager: Automatic font selection for multilingual documents
- SystemFontProvider: System font discovery
"""
from __future__ import annotations

from ocrmypdf.font.font_manager import FontManager
from ocrmypdf.font.font_provider import (
    BuiltinFontProvider,
    ChainedFontProvider,
    FontProvider,
)
from ocrmypdf.font.multi_font_manager import MultiFontManager
from ocrmypdf.font.system_font_provider import SystemFontProvider

__all__ = [
    "FontManager",
    "FontProvider",
    "BuiltinFontProvider",
    "ChainedFontProvider",
    "MultiFontManager",
    "SystemFontProvider",
]


================================================
FILE: src/ocrmypdf/font/font_manager.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Base font management for PDF rendering.

This module provides the base FontManager class that handles font loading
and glyph checking using uharfbuzz.
"""

from __future__ import annotations

from pathlib import Path

import uharfbuzz as hb


class FontManager:
    """Load a single font face and answer glyph/metric queries about it.

    The font file is read into memory and opened with uharfbuzz; the
    resulting face and font objects back every query method of this class.
    Renderer-specific subclasses should extend this to add their own
    font objects.

    Attributes:
        font_path: Path to the font file
        font_data: Raw font file bytes
        font_index: Index within TTC collection (0 for single-font files)
        hb_face: uharfbuzz Face object
        hb_font: uharfbuzz Font object
    """

    def __init__(self, font_path: Path, font_index: int = 0):
        """Read the font file and open it with uharfbuzz.

        Args:
            font_path: Path to TrueType/OpenType font file
            font_index: Index of font within a TTC collection (default 0).
                        For single-font files (.ttf, .otf), use 0.
        """
        self.font_path = font_path
        self.font_index = font_index

        # Keep the raw bytes around; the uharfbuzz face is built from them.
        raw_bytes = font_path.read_bytes()
        self.font_data = raw_bytes

        # font_index selects the face inside a TTC collection.
        face = hb.Face(raw_bytes, font_index)
        self.hb_face = face
        self.hb_font = hb.Font(face)

    def get_hb_font(self) -> hb.Font:
        """Return the uharfbuzz Font object used for lookups and measurement.

        Returns:
            The uharfbuzz Font instance created in ``__init__``.
        """
        return self.hb_font

    def has_glyph(self, codepoint: int) -> bool:
        """Tell whether the font maps a codepoint to a real glyph.

        Args:
            codepoint: Unicode codepoint

        Returns:
            True if the nominal glyph lookup yields a glyph id other than
            0 (.notdef); False if it yields None or 0.
        """
        glyph_id = self.hb_font.get_nominal_glyph(codepoint)
        if glyph_id is None:
            return False
        return glyph_id != 0

    def get_font_metrics(self) -> tuple[float, float, float]:
        """Return (ascent, descent, units_per_em) for the font.

        Returns:
            Tuple of (ascent, descent, units_per_em) where ascent and descent
            are in font units. Ascent is positive (above baseline), descent
            is typically negative (below baseline).
        """
        font_extents = self.hb_font.get_font_extents('ltr')
        upem = self.hb_face.upem
        return (font_extents.ascender, font_extents.descender, upem)

    def get_left_side_bearing(self, char: str, font_size: float) -> float:
        """Compute the left side bearing of a character at a font size.

        The left side bearing (lsb) is the horizontal distance from the glyph
        origin (x=0) to the leftmost pixel of the glyph. A positive lsb means
        there's whitespace before the glyph starts.

        Args:
            char: Single character to get lsb for
            font_size: Font size in points

        Returns:
            Left side bearing in points. Returns 0 for an empty string, for a
            character without a real glyph, or when no extents are available.
        """
        if not char:
            return 0.0

        glyph_id = self.hb_font.get_nominal_glyph(ord(char))
        if glyph_id is None or glyph_id == 0:
            # No glyph, or only .notdef: nothing to measure.
            return 0.0

        glyph_extents = self.hb_font.get_glyph_extents(glyph_id)
        if glyph_extents is None:
            return 0.0

        # x_bearing is expressed in font units; scale it to points.
        upem = self.hb_face.upem
        return glyph_extents.x_bearing * font_size / upem


================================================
FILE: src/ocrmypdf/font/font_provider.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Font provider protocol and implementations for PDF rendering."""

from __future__ import annotations

import logging
from pathlib import Path
from typing import Protocol

from ocrmypdf.font.font_manager import FontManager

log = logging.getLogger(__name__)


class FontProvider(Protocol):
    """Protocol for providing fonts to MultiFontManager.

    Implementations are responsible for knowing where fonts are located
    and loading them. MultiFontManager asks for fonts by name and uses
    them for glyph coverage checking.

    This is a ``typing.Protocol``: a class satisfies it by defining the
    three methods below with compatible signatures; it does not need to
    inherit from FontProvider. The method bodies here are placeholders.
    """

    def get_font(self, font_name: str) -> FontManager | None:
        """Get a FontManager for the named font.

        Args:
            font_name: Logical font name (e.g., 'NotoSans-Regular')

        Returns:
            FontManager if font is available, None otherwise
        """
        ...

    def get_available_fonts(self) -> list[str]:
        """Get list of available font names.

        Returns:
            List of font names that can be retrieved with get_font()
        """
        ...

    def get_fallback_font(self) -> FontManager:
        """Get the glyphless fallback font.

        This font must always be available and handles any codepoint.
        An implementation that cannot supply a fallback is expected to
        raise instead of returning None, since the return type is not
        optional.

        Returns:
            FontManager for the glyphless fallback font (Occulta.ttf)
        """
        ...


class BuiltinFontProvider:
    """Serve the fonts bundled with the package (ocrmypdf/data by default)."""

    # Logical font name -> file name inside the font directory.
    # Only Latin (NotoSans) and the glyphless fallback (Occulta.ttf) are bundled.
    # All other scripts (Arabic, Devanagari, CJK, etc.) are discovered from
    # system fonts by SystemFontProvider to reduce package size.
    FONT_FILES = {
        'NotoSans-Regular': 'NotoSans-Regular.ttf',
        'Occulta': 'Occulta.ttf',
    }

    def __init__(self, font_dir: Path | None = None):
        """Set the font directory and eagerly load every bundled font.

        Args:
            font_dir: Directory containing font files. If None, uses
                      the default ocrmypdf/data directory.

        Raises:
            FileNotFoundError: If the required fallback font file is missing.
            ValueError: If the required fallback font file cannot be loaded.
        """
        self.font_dir = (
            font_dir
            if font_dir is not None
            else Path(__file__).parent.parent / "data"
        )
        self._fonts: dict[str, FontManager] = {}
        self._load_fonts()

    def _load_fonts(self) -> None:
        """Load each entry of FONT_FILES, warning about optional failures."""
        for font_name, font_file in self.FONT_FILES.items():
            self._load_single_font(font_name, font_file)

    def _load_single_font(self, font_name: str, font_file: str) -> None:
        """Load one font; only problems with 'Occulta' are fatal."""
        is_required = font_name == 'Occulta'
        font_path = self.font_dir / font_file

        if not font_path.exists():
            if is_required:
                raise FileNotFoundError(
                    f"Required fallback font not found: {font_path}"
                )
            log.warning(
                "Font %s not found at %s - OCR output quality for some "
                "scripts may be affected",
                font_name,
                font_path,
            )
            return

        try:
            manager = FontManager(font_path)
        except Exception as e:
            if is_required:
                raise ValueError(
                    f"Failed to load required fallback font {font_file}: {e}"
                ) from e
            log.warning(
                "Failed to load font %s: %s - OCR output quality may be affected",
                font_name,
                e,
            )
        else:
            self._fonts[font_name] = manager

    def get_font(self, font_name: str) -> FontManager | None:
        """Return the loaded font with this name, or None if not loaded."""
        return self._fonts.get(font_name)

    def get_available_fonts(self) -> list[str]:
        """Return the names of all successfully loaded fonts."""
        return list(self._fonts)

    def get_fallback_font(self) -> FontManager:
        """Return the glyphless fallback font ('Occulta')."""
        return self._fonts['Occulta']


class ChainedFontProvider:
    """Font provider that tries multiple providers in order.

    This allows combining builtin fonts with system fonts, trying
    the builtin provider first and falling back to system fonts
    for fonts not bundled with the package.
    """

    def __init__(self, providers: list[FontProvider]):
        """Initialize chained font provider.

        Args:
            providers: List of font providers to try in order.
                       The first provider that returns a font wins.

        Raises:
            ValueError: If ``providers`` is empty.
        """
        if not providers:
            raise ValueError("At least one provider is required")
        self.providers = providers

    def get_font(self, font_name: str) -> FontManager | None:
        """Get a FontManager for the named font.

        Tries each provider in order until one returns a font.

        Args:
            font_name: Logical font name (e.g., 'NotoSans-Regular')

        Returns:
            FontManager if any provider has the font, None otherwise
        """
        for provider in self.providers:
            font = provider.get_font(font_name)
            # The provider contract is "FontManager or None", so test for
            # None explicitly rather than relying on the object's truthiness.
            if font is not None:
                return font
        return None

    def get_available_fonts(self) -> list[str]:
        """Get list of available font names from all providers.

        Returns:
            Combined list of font names (deduplicated, order preserved)
        """
        # dict.fromkeys keeps the first occurrence of each name in order.
        return list(
            dict.fromkeys(
                name
                for provider in self.providers
                for name in provider.get_available_fonts()
            )
        )

    def get_fallback_font(self) -> FontManager:
        """Get the glyphless fallback font.

        Tries each provider until one provides a fallback font. A provider
        that raises NotImplementedError, AttributeError or KeyError is
        treated as "has no fallback" and skipped.

        Returns:
            FontManager for the fallback font

        Raises:
            RuntimeError: If no provider can provide a fallback font
        """
        for provider in self.providers:
            try:
                return provider.get_fallback_font()
            except (NotImplementedError, AttributeError, KeyError):
                continue
        raise RuntimeError("No fallback font available from any provider")


================================================
FILE: src/ocrmypdf/font/multi_font_manager.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Multi-font management for PDF rendering.

Provides automatic font selection for multilingual documents based on
language hints and glyph coverage analysis.
"""

from __future__ import annotations

import logging
from pathlib import Path

from ocrmypdf.font.font_manager import FontManager
from ocrmypdf.font.font_provider import (
    BuiltinFontProvider,
    ChainedFontProvider,
    FontProvider,
)
from ocrmypdf.font.system_font_provider import SystemFontProvider

log = logging.getLogger(__name__)


class MultiFontManager:
    """Manages multiple fonts with automatic selection and fallback.

    This class orchestrates multiple FontManager instances to provide
    word-level font selection for multilingual documents. It uses a hybrid
    approach combining language hints from hOCR with glyph coverage analysis.

    Font selection strategy:
    1. Try language-preferred font (if language hint available)
    2. Try fallback fonts in order by glyph coverage
    3. Fall back to Occulta.ttf (glyphless fallback)
    """

    # Language to font mapping
    # Keys are ISO 639-2/3 codes or Tesseract language codes
    LANGUAGE_FONT_MAP = {
        # Arabic script
        'ara': 'NotoSansArabic-Regular',  # Arabic
        'per': 'NotoSansArabic-Regular',  # Persian (uses Arabic script)
        'fas': 'NotoSansArabic-Regular',  # Farsi (alternative code for Persian)
        'urd': 'NotoSansArabic-Regular',  # Urdu (uses Arabic script)
        'pus': 'NotoSansArabic-Regular',  # Pashto
        'kur': 'NotoSansArabic-Regular',  # Kurdish (Arabic script variant)
        # Western Punjabi is written in Shahmukhi (a Perso-Arabic script),
        # not Gurmukhi, so it belongs with the Arabic-script languages.
        'pnb': 'NotoSansArabic-Regular',  # Western Punjabi
        # Devanagari script
        'hin': 'NotoSansDevanagari-Regular',  # Hindi
        'san': 'NotoSansDevanagari-Regular',  # Sanskrit
        'mar': 'NotoSansDevanagari-Regular',  # Marathi
        'nep': 'NotoSansDevanagari-Regular',  # Nepali
        'kok': 'NotoSansDevanagari-Regular',  # Konkani
        'bho': 'NotoSansDevanagari-Regular',  # Bhojpuri
        'mai': 'NotoSansDevanagari-Regular',  # Maithili
        # CJK
        'chi': 'NotoSansCJK-Regular',  # Chinese (generic)
        'zho': 'NotoSansCJK-Regular',  # Chinese (ISO 639-3)
        'chi_sim': 'NotoSansCJK-Regular',  # Chinese Simplified (Tesseract)
        'chi_tra': 'NotoSansCJK-Regular',  # Chinese Traditional (Tesseract)
        'jpn': 'NotoSansCJK-Regular',  # Japanese
        'kor': 'NotoSansCJK-Regular',  # Korean
        # Thai
        'tha': 'NotoSansThai-Regular',  # Thai
        # Hebrew
        'heb': 'NotoSansHebrew-Regular',  # Hebrew
        'yid': 'NotoSansHebrew-Regular',  # Yiddish (uses Hebrew script)
        # Bengali script
        'ben': 'NotoSansBengali-Regular',  # Bengali
        'asm': 'NotoSansBengali-Regular',  # Assamese (uses Bengali script)
        # Tamil
        'tam': 'NotoSansTamil-Regular',  # Tamil
        # Gujarati
        'guj': 'NotoSansGujarati-Regular',  # Gujarati
        # Telugu
        'tel': 'NotoSansTelugu-Regular',  # Telugu
        # Kannada
        'kan': 'NotoSansKannada-Regular',  # Kannada
        # Malayalam
        'mal': 'NotoSansMalayalam-Regular',  # Malayalam
        # Myanmar (Burmese)
        'mya': 'NotoSansMyanmar-Regular',  # Myanmar
        # Khmer (Cambodian)
        'khm': 'NotoSansKhmer-Regular',  # Khmer
        # Lao
        'lao': 'NotoSansLao-Regular',  # Lao
        # Georgian
        'kat': 'NotoSansGeorgian-Regular',  # Georgian
        'geo': 'NotoSansGeorgian-Regular',  # Georgian (alternative)
        # Armenian
        'hye': 'NotoSansArmenian-Regular',  # Armenian
        'arm': 'NotoSansArmenian-Regular',  # Armenian (alternative)
        # Ethiopic
        'amh': 'NotoSansEthiopic-Regular',  # Amharic
        'tir': 'NotoSansEthiopic-Regular',  # Tigrinya
        # Sinhala
        'sin': 'NotoSansSinhala-Regular',  # Sinhala
        # Gurmukhi (Punjabi)
        'pan': 'NotoSansGurmukhi-Regular',  # Punjabi
        # Oriya
        'ori': 'NotoSansOriya-Regular',  # Oriya
        'ory': 'NotoSansOriya-Regular',  # Oriya (alternative)
        # Tibetan
        'bod': 'NotoSansTibetan-Regular',  # Tibetan
        'tib': 'NotoSansTibetan-Regular',  # Tibetan (alternative)
    }

    # Ordered fallback chain for fonts (after language-preferred font)
    # Order matters: most common scripts first for faster matching
    FALLBACK_FONTS = [
        'NotoSans-Regular',  # Latin, Greek, Cyrillic
        'NotoSansArabic-Regular',
        'NotoSansDevanagari-Regular',
        'NotoSansCJK-Regular',
        'NotoSansThai-Regular',
        'NotoSansHebrew-Regular',
        'NotoSansBengali-Regular',
        'NotoSansTamil-Regular',
        'NotoSansGujarati-Regular',
        'NotoSansTelugu-Regular',
        'NotoSansKannada-Regular',
        'NotoSansMalayalam-Regular',
        'NotoSansMyanmar-Regular',
        'NotoSansKhmer-Regular',
        'NotoSansLao-Regular',
        'NotoSansGeorgian-Regular',
        'NotoSansArmenian-Regular',
        'NotoSansEthiopic-Regular',
        'NotoSansSinhala-Regular',
        'NotoSansGurmukhi-Regular',
        'NotoSansOriya-Regular',
        'NotoSansTibetan-Regular',
    ]

    def __init__(
        self,
        font_dir: Path | None = None,
        *,
        font_provider: FontProvider | None = None,
    ):
        """Initialize multi-font manager.

        Args:
            font_dir: Directory containing font files. If font_provider is
                      not specified, this is passed to BuiltinFontProvider.
            font_provider: Provider for loading fonts. If None, uses a
                           ChainedFontProvider that tries builtin fonts first,
                           then searches system fonts.
        """
        if font_provider is not None:
            self.font_provider = font_provider
        else:
            # Use chained provider: try builtin fonts first, then system fonts
            self.font_provider = ChainedFontProvider(
                [
                    BuiltinFontProvider(font_dir),
                    SystemFontProvider(),
                ]
            )

        # Font selection cache: (word_text, language) -> font_name
        self._selection_cache: dict[tuple[str, str | None], str] = {}
        # Languages (or 'unknown') already warned about, to warn only once each
        self._warned_scripts: set[str] = set()

    @property
    def fonts(self) -> dict[str, FontManager]:
        """Get all loaded fonts (backward compatibility)."""
        return self.get_all_fonts()

    def _try_font(
        self, font_name: str, word_text: str, cache_key: tuple[str, str | None]
    ) -> FontManager | None:
        """Try to use a font for the given word.

        On success the choice is recorded in the selection cache.

        Args:
            font_name: Name of font to try
            word_text: Text content to check
            cache_key: Cache key for storing successful result

        Returns:
            FontManager if font exists and has all glyphs, None otherwise
        """
        font = self.font_provider.get_font(font_name)
        if font is None:
            return None
        if self._has_all_glyphs(font, word_text):
            self._selection_cache[cache_key] = font_name
            return font
        return None

    def select_font_for_word(
        self, word_text: str, line_language: str | None
    ) -> FontManager:
        """Select appropriate font for a word.

        Uses a hybrid approach:
        1. Language-based selection (if language hint available)
        2. Ordered fallback through available fonts by glyph coverage
        3. Final fallback to Occulta.ttf (glyphless)

        Args:
            word_text: The text content of the word
            line_language: Language code from hOCR (e.g., 'ara', 'eng')

        Returns:
            FontManager instance to use for rendering this word
        """
        cache_key = (word_text, line_language)
        cached_name = self._selection_cache.get(cache_key)
        if cached_name is not None:
            font = self.font_provider.get_font(cached_name)
            # If the provider no longer has the cached font, fall through
            # and redo the selection.
            if font is not None:
                return font

        # Phase 1 candidate: the language-preferred font, if any
        preferred = (
            self.LANGUAGE_FONT_MAP.get(line_language) if line_language else None
        )
        candidates = [] if preferred is None else [preferred]
        # Phase 2 candidates: the fallback chain, minus the font already tried
        candidates.extend(
            font_name for font_name in self.FALLBACK_FONTS if font_name != preferred
        )

        for font_name in candidates:
            result = self._try_font(font_name, word_text, cache_key)
            if result is not None:
                return result

        # Phase 3: Glyphless fallback (always succeeds)
        # Warn if we're falling back for non-ASCII text (likely missing font)
        self._warn_missing_font(word_text, line_language)
        self._selection_cache[cache_key] = 'Occulta'
        return self.font_provider.get_fallback_font()

    def _warn_missing_font(self, word_text: str, line_language: str | None) -> None:
        """Warn user about missing font for non-Latin text.

        Only warns once per language (or once for text without a language
        hint) to avoid log spam. Pure-ASCII text never triggers a warning.
        """
        # Deduplication key: the language code, or 'unknown' without a hint
        warn_key = line_language if line_language else 'unknown'

        if warn_key in self._warned_scripts:
            return

        # Only non-ASCII text indicates a genuinely missing font
        if word_text.isascii():
            return

        self._warned_scripts.add(warn_key)

        if line_language and line_language in self.LANGUAGE_FONT_MAP:
            font_name = self.LANGUAGE_FONT_MAP[line_language]
            log.warning(
                "No font found with glyphs for '%s' text. "
                "Install %s for better rendering. "
                "See https://fonts.google.com/noto",
                line_language,
                font_name,
            )
        else:
            log.warning(
                "No font found with glyphs for some text. "
                "Install Noto fonts for better rendering. "
                "See https://fonts.google.com/noto"
            )

    def _has_all_glyphs(self, font: FontManager, text: str) -> bool:
        """Check if a font has glyphs for all characters in text.

        Args:
            font: FontManager instance to check
            text: Text to verify coverage for

        Returns:
            True if font has real glyphs for all characters (not .notdef).
            Empty text is trivially covered.
        """
        if not text:
            return True

        hb_font = font.get_hb_font()
        glyph_ids = (hb_font.get_nominal_glyph(ord(char)) for char in text)
        # None = no mapping, 0 = .notdef glyph; all() stops at the first miss
        return all(
            glyph_id is not None and glyph_id != 0 for glyph_id in glyph_ids
        )

    def has_font(self, font_name: str) -> bool:
        """Check if a named font is available.

        Args:
            font_name: Name of font to check

        Returns:
            True if font is available
        """
        return self.font_provider.get_font(font_name) is not None

    def has_all_glyphs(self, font_name: str, text: str) -> bool:
        """Check if a named font has glyphs for all characters in text.

        Args:
            font_name: Name of font to check
            text: Text to verify coverage for

        Returns:
            True if font has real glyphs for all characters (not .notdef).
            False if the font is not available.
        """
        font = self.font_provider.get_font(font_name)
        if font is None:
            return False
        return self._has_all_glyphs(font, text)

    def get_all_fonts(self) -> dict[str, FontManager]:
        """Get all loaded font managers.

        Names the provider lists but cannot actually supply are omitted.

        Returns:
            Dictionary mapping font names to FontManager instances
        """
        provider = self.font_provider
        return {
            name: font
            for name in provider.get_available_fonts()
            if (font := provider.get_font(name)) is not None
        }


================================================
FILE: src/ocrmypdf/font/system_font_provider.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""System font discovery for PDF rendering.

Provides lazy discovery of Noto fonts installed on the system across
Linux, macOS, and Windows platforms.
"""

from __future__ import annotations

import logging
import os
import sys
from pathlib import Path

from ocrmypdf.font.font_manager import FontManager

log = logging.getLogger(__name__)


class SystemFontProvider:
    """Discovers and provides system-installed Noto fonts with lazy scanning.

    This provider searches standard system font directories for Noto fonts.
    Scanning is performed lazily - only when a font is actually requested
    and not found in the builtin fonts. Results are cached for the lifetime
    of the provider instance.
    """

    # System font directories by platform
    # NOTE: Path.home() is evaluated when this class body runs, i.e. when the
    # module is imported, not when fonts are first searched for.
    SYSTEM_FONT_DIRS: dict[str, list[Path]] = {
        'linux': [
            Path('/usr/share/fonts'),
            Path('/usr/local/share/fonts'),
            Path.home() / '.fonts',
            Path.home() / '.local/share/fonts',
        ],
        'freebsd': [
            Path('/usr/local/share/fonts'),
            Path.home() / '.fonts',
        ],
        'darwin': [
            Path('/Library/Fonts'),
            Path('/System/Library/Fonts'),
            Path.home() / 'Library/Fonts',
        ],
        # Windows is handled dynamically in _get_font_dirs()
    }

    # Noto font logical names → possible filenames (priority order)
    # The first match found will be used
    NOTO_FONT_PATTERNS: dict[str, list[str]] = {
        'NotoSans-Regular': [
            'NotoSans-Regular.ttf',
            'NotoSans-Regular.otf',
        ],
        'NotoSansArabic-Regular': [
            'NotoSansArabic-Regular.ttf',
            'NotoSansArabic-Regular.otf',
        ],
        'NotoSansDevanagari-Regular': [
            'NotoSansDevanagari-Regular.ttf',
            'NotoSansDevanagari-Regular.otf',
        ],
        'NotoSansCJK-Regular': [
            # Language-specific variants (any will work for CJK)
            'NotoSansCJKsc-Regular.otf',  # Simplified Chinese
            'NotoSansCJKtc-Regular.otf',  # Traditional Chinese
            'NotoSansCJKjp-Regular.otf',  # Japanese
            'NotoSansCJKkr-Regular.otf',  # Korean
            # TTC collections (common on Linux distros)
            'NotoSansCJK-Regular.ttc',
            'NotoSansCJKsc-Regular.ttc',
            # Variable fonts
            'NotoSansCJKsc-VF.otf',
        ],
        'NotoSansThai-Regular': [
            'NotoSansThai-Regular.ttf',
            'NotoSansThai-Regular.otf',
        ],
        'NotoSansHebrew-Regular': [
            'NotoSansHebrew-Regular.ttf',
            'NotoSansHebrew-Regular.otf',
        ],
        'NotoSansBengali-Regular': [
            'NotoSansBengali-Regular.ttf',
            'NotoSansBengali-Regular.otf',
        ],
        'NotoSansTamil-Regular': [
            'NotoSansTamil-Regular.ttf',
            'NotoSansTamil-Regular.otf',
        ],
        'NotoSansGujarati-Regular': [
            'NotoSansGujarati-Regular.ttf',
            'NotoSansGujarati-Regular.otf',
        ],
        'NotoSansTelugu-Regular': [
            'NotoSansTelugu-Regular.ttf',
            'NotoSansTelugu-Regular.otf',
        ],
        'NotoSansKannada-Regular': [
            'NotoSansKannada-Regular.ttf',
            'NotoSansKannada-Regular.otf',
        ],
        'NotoSansMalayalam-Regular': [
            'NotoSansMalayalam-Regular.ttf',
            'NotoSansMalayalam-Regular.otf',
        ],
        'NotoSansMyanmar-Regular': [
            'NotoSansMyanmar-Regular.ttf',
            'NotoSansMyanmar-Regular.otf',
        ],
        'NotoSansKhmer-Regular': [
            'NotoSansKhmer-Regular.ttf',
            'NotoSansKhmer-Regular.otf',
        ],
        'NotoSansLao-Regular': [
            'NotoSansLao-Regular.ttf',
            'NotoSansLao-Regular.otf',
        ],
        'NotoSansGeorgian-Regular': [
            'NotoSansGeorgian-Regular.ttf',
            'NotoSansGeorgian-Regular.otf',
        ],
        'NotoSansArmenian-Regular': [
            'NotoSansArmenian-Regular.ttf',
            'NotoSansArmenian-Regular.otf',
        ],
        'NotoSansEthiopic-Regular': [
            'NotoSansEthiopic-Regular.ttf',
            'NotoSansEthiopic-Regular.otf',
        ],
        'NotoSansSinhala-Regular': [
            'NotoSansSinhala-Regular.ttf',
            'NotoSansSinhala-Regular.otf',
        ],
        'NotoSansGurmukhi-Regular': [
            'NotoSansGurmukhi-Regular.ttf',
            'NotoSansGurmukhi-Regular.otf',
        ],
        'NotoSansOriya-Regular': [
            'NotoSansOriya-Regular.ttf',
            'NotoSansOriya-Regular.otf',
        ],
        'NotoSansTibetan-Regular': [
            'NotoSansTibetan-Regular.ttf',
            'NotoSansTibetan-Regular.otf',
        ],
    }

    def __init__(self) -> None:
        """Initialize system font provider with empty caches."""
        # Cache: font_name -> FontManager (successfully loaded fonts)
        self._font_cache: dict[str, FontManager] = {}
        # Negative cache: font names we've searched for but not found
        self._not_found: set[str] = set()
        # Cached font directories (computed lazily)
        self._font_dirs: list[Path] | None = None

    def _get_platform(self) -> str:
        """Get the current platform identifier.

        Any platform that is not Windows, macOS or FreeBSD is treated
        as 'linux'.

        Returns:
            Platform string: 'linux', 'darwin', 'windows', or 'freebsd'
        """
        if sys.platform == 'win32':
            return 'windows'
        if sys.platform == 'darwin':
            return 'darwin'
        if 'freebsd' in sys.platform:
            return 'freebsd'
        return 'linux'

    def _get_font_dirs(self) -> list[Path]:
        """Get font directories for the current platform.

        The list is computed on first use and cached on the instance.

        Returns:
            List of paths to search for fonts (may include non-existent paths)
        """
        if self._font_dirs is not None:
            return self._font_dirs

        platform = self._get_platform()

        if platform == 'windows':
            # Get Windows font directories from environment
            windir = os.environ.get('WINDIR', r'C:\Windows')
            self._font_dirs = [Path(windir) / 'Fonts']
            # User-installed fonts (Windows 10+)
            localappdata = os.environ.get('LOCALAPPDATA')
            if localappdata:
                self._font_dirs.append(
                    Path(localappdata) / 'Microsoft' / 'Windows' / 'Fonts'
                )
        else:
            # Copy so that the class-level list is never mutated via the cache
            self._font_dirs = list(self.SYSTEM_FONT_DIRS.get(platform, []))

        return self._font_dirs

    def _find_font_file(self, font_name: str) -> Path | None:
        """Search system directories for a font file.

        Directories are searched in order; within each directory the
        filename patterns are tried in priority order, recursively.

        Args:
            font_name: Logical font name (e.g., 'NotoSansCJK-Regular')

        Returns:
            Path to font file if found, None otherwise
        """
        patterns = self.NOTO_FONT_PATTERNS.get(font_name)
        if patterns is None:
            return None

        for font_dir in self._get_font_dirs():
            if not font_dir.exists():
                continue

            for pattern in patterns:
                try:
                    # rglob() is lazy: take the first hit and stop walking
                    # instead of materializing every match in the tree.
                    match = next(font_dir.rglob(pattern), None)
                except PermissionError:
                    # Skip directories we can't read
                    continue
                if match is not None:
                    log.debug("Found system font %s at %s", font_name, match)
                    return match

        return None

    def get_font(self, font_name: str) -> FontManager | None:
        """Get a FontManager for the named font (lazy loading).

        This method implements lazy scanning: fonts are only searched for
        when first requested. Results (both positive and negative) are
        cached for subsequent calls. A font file that is found but fails
        to load is logged and then treated as not found.

        Args:
            font_name: Logical font name (e.g., 'NotoSansCJK-Regular')

        Returns:
            FontManager if font is found and loadable, None otherwise
        """
        # Check positive cache first
        if font_name in self._font_cache:
            return self._font_cache[font_name]

        # Check negative cache (already searched, not found)
        if font_name in self._not_found:
            return None

        # Lazy scan for this specific font
        font_path = self._find_font_file(font_name)
        if font_path is not None:
            try:
                fm = FontManager(font_path)
                self._font_cache[font_name] = fm
                return fm
            except Exception as e:
                log.warning(
                    "Found font %s at %s but failed to load: %s",
                    font_name,
                    font_path,
                    e,
                )

        # Cache negative result
        self._not_found.add(font_name)
        return None

    def get_available_fonts(self) -> list[str]:
        """Get list of font names this provider can potentially find.

        Note: This returns all font names we know patterns for, not
        necessarily fonts that are actually installed. Use get_font()
        to check if a specific font is available.

        Returns:
            List of logical font names
        """
        return list(self.NOTO_FONT_PATTERNS)

    def get_fallback_font(self) -> FontManager:
        """Get the glyphless fallback font.

        Raises:
            NotImplementedError: System provider doesn't provide fallback.
                Use BuiltinFontProvider for the fallback font.
        """
        raise NotImplementedError(
            "SystemFontProvider does not provide a fallback font. "
            "Use BuiltinFontProvider for Occulta.ttf fallback."
        )


================================================
FILE: src/ocrmypdf/fpdf_renderer/__init__.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""fpdf2-based PDF renderer for OCR text layers.

This module provides the PDF renderer using fpdf2 for creating
searchable OCR text layers.
"""
from __future__ import annotations

from ocrmypdf.fpdf_renderer.renderer import (
    DebugRenderOptions,
    Fpdf2MultiPageRenderer,
    Fpdf2PdfRenderer,
)

# Public names of this package; each one is imported above from
# ocrmypdf.fpdf_renderer.renderer.
__all__ = [
    "DebugRenderOptions",
    "Fpdf2PdfRenderer",
    "Fpdf2MultiPageRenderer",
]


================================================
FILE: src/ocrmypdf/fpdf_renderer/renderer.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""fpdf2-based PDF renderer for OCR text layers.

This module provides PDF rendering using fpdf2 for creating searchable
OCR text layers.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass
from math import atan, cos, degrees, radians, sin, sqrt
from pathlib import Path

from fpdf import FPDF
from fpdf.enums import PDFResourceType, TextMode
from pikepdf import Matrix, Rectangle

from ocrmypdf.font import FontManager, MultiFontManager
from ocrmypdf.models.ocr_element import OcrClass, OcrElement

log = logging.getLogger(__name__)


def transform_point(matrix: Matrix, x: float, y: float) -> tuple[float, float]:
    """Apply a matrix to a single point.

    Args:
        matrix: pikepdf Matrix to apply
        x: X coordinate of the point
        y: Y coordinate of the point

    Returns:
        Tuple of (transformed_x, transformed_y)
    """
    # A zero-area rectangle whose two corners coincide stands in for the
    # point; after the transform its lower-left corner is the mapped point.
    mapped = matrix.transform(Rectangle(x, y, x, y))
    return mapped.llx, mapped.lly


def transform_box(
    matrix: Matrix, left: float, top: float, right: float, bottom: float
) -> tuple[float, float, float, float]:
    """Apply a matrix to a bounding box and report the resulting rectangle.

    Args:
        matrix: pikepdf Matrix to apply
        left: Left edge of box
        top: Top edge of box
        right: Right edge of box
        bottom: Bottom edge of box

    Returns:
        Tuple of (llx, lly, width, height) of the transformed box
    """
    source_box = Rectangle(left, top, right, bottom)
    mapped = matrix.transform(source_box)
    return mapped.llx, mapped.lly, mapped.width, mapped.height


@dataclass
class DebugRenderOptions:
    """Switches for drawing OCR structure overlays while rendering.

    Every switch is off by default. Turning one on asks the renderer to
    draw the matching colored guide so the OCR geometry can be inspected.

    Attributes:
        render_baseline: Draw magenta lines along text baselines.
        render_line_bbox: Draw blue rectangles around line boxes.
        render_word_bbox: Draw green rectangles around word boxes.
    """

    render_baseline: bool = False
    render_line_bbox: bool = False
    render_word_bbox: bool = False


class CoordinateTransform:
    """Converts OCR pixel measurements into PDF points for fpdf2.

    OCR coordinates use a top-left origin, and so does fpdf2, so only the
    unit changes (pixels at ``dpi`` become points at 72 per inch); no
    Y-axis flip is applied.
    """

    def __init__(self, dpi: float, page_width_px: float, page_height_px: float):
        """Store the source resolution and the page size in pixels."""
        self.dpi = dpi
        self.page_width_px = page_width_px
        self.page_height_px = page_height_px

    def px_to_pt(self, value: float) -> float:
        """Scale a pixel length to PDF points using the stored DPI."""
        return value * 72.0 / self.dpi

    @property
    def page_width_pt(self) -> float:
        """Width of the page expressed in PDF points."""
        return self.px_to_pt(self.page_width_px)

    @property
    def page_height_pt(self) -> float:
        """Height of the page expressed in PDF points."""
        return self.px_to_pt(self.page_height_px)

    def bbox_to_pt(self, bbox) -> tuple[float, float, float, float]:
        """Return the box edges (left, top, right, bottom) in PDF points."""
        to_pt = self.px_to_pt
        edges_px = (bbox.left, bbox.top, bbox.right, bbox.bottom)
        left, top, right, bottom = (to_pt(edge) for edge in edges_px)
        return (left, top, right, bottom)


class Fpdf2PdfRenderer:
    """Renders OcrElement trees to PDF using fpdf2.

    This class provides the core rendering logic for converting OCR output
    into PDF text layers using fpdf2's text drawing capabilities.
    """

    def __init__(
        self,
        page: OcrElement,
        dpi: float,
        multi_font_manager: MultiFontManager,
        invisible_text: bool = True,
        image: Path | None = None,
        debug_render_options: DebugRenderOptions | None = None,
    ):
        """Set up a renderer for one OCR page.

        Args:
            page: Root OcrElement (must be ocr_page)
            dpi: Source image DPI
            multi_font_manager: MultiFontManager instance
            invisible_text: If True, render text as invisible (text mode 3)
            image: Optional path to an image drawn after the text layer,
                producing a sandwich PDF (text underneath, image on top)
            debug_render_options: Options for debug visualization; defaults
                are used when omitted

        Raises:
            ValueError: If page is not an ocr_page or lacks a bounding box
        """
        if page.ocr_class != OcrClass.PAGE:
            raise ValueError("Root element must be ocr_page")
        page_bbox = page.bbox
        if page_bbox is None:
            raise ValueError("Page must have bounding box")

        if not debug_render_options:
            debug_render_options = DebugRenderOptions()

        self.page = page
        self.dpi = dpi
        self.multi_font_manager = multi_font_manager
        self.invisible_text = invisible_text
        self.image = image
        self.debug_options = debug_render_options

        # Pixel-to-point conversion for this page's size and resolution
        self.coord_transform = CoordinateTransform(
            dpi=dpi,
            page_width_px=page_bbox.width,
            page_height_px=page_bbox.height,
        )

        # Maps font file path -> family name already added to the FPDF object
        self._registered_fonts: dict[str, str] = {}
        # Ensures the info-level suppression notice is logged only once
        self._logged_aspect_ratio_suppression = False

    def render(self, output_path: Path) -> None:
        """Build a one-page PDF for this page and write it to disk.

        Args:
            output_path: Output PDF file path
        """
        page_size_pt = (
            self.coord_transform.page_width_pt,
            self.coord_transform.page_height_pt,
        )
        pdf = FPDF(unit="pt", format=page_size_pt)
        pdf.set_auto_page_break(auto=False)

        # Complex scripts need text shaping turned on
        pdf.set_text_shaping(True)

        # fpdf2's cell() pads text by c_margin, which would shift it;
        # zero the margin so text lands exactly where requested
        pdf.c_margin = 0

        # Invisible text for a hidden OCR layer, filled text otherwise
        pdf.text_mode = (
            TextMode.INVISIBLE if self.invisible_text else TextMode.FILL
        )

        self.render_to_pdf(pdf)
        pdf.output(str(output_path))

    def render_to_pdf(self, pdf: FPDF) -> None:
        """Render page content to an existing FPDF instance.

        This method adds a page and renders all content. Used by both
        single-page rendering and multi-page rendering.

        Args:
            pdf: FPDF instance to render into
        """
        # Add page with correct dimensions
        pdf.add_page(
            format=(
                self.coord_transform.page_width_pt,
                self.coord_transform.page_height_pt,
            )
        )

        # Render all paragraphs
        for para in self.page.paragraphs:
            self._render_paragraph(pdf, para)

        # If no paragraphs, render lines directly
        if not self.page.paragraphs:
            for line in self.page.lines:
                self._render_line(pdf, line)

        # Place image on top of text layer (sandwich mode)
        if self.image is not None:
            pdf.image(
                str(self.image),
                x=0,
                y=0,
                w=self.coord_transform.page_width_pt,
                h=self.coord_transform.page_height_pt,
            )

    def _register_font(self, pdf: FPDF, font_manager: FontManager) -> str:
        """Register font with fpdf2 if not already registered.

        Args:
            pdf: FPDF instance
            font_manager: FontManager containing the font

        Returns:
            Font family name to use with pdf.set_font()
        """
        font_path_str = str(font_manager.font_path)

        if font_path_str not in self._registered_fonts:
            # Use the font filename stem as the family name
            family_name = font_manager.font_path.stem
            pdf.add_font(family=family_name, fname=font_path_str)
            self._registered_fonts[font_path_str] = family_name

        return self._registered_fonts[font_path_str]

    def _render_paragraph(self, pdf: FPDF, para: OcrElement) -> None:
        """Render every line-type child of a paragraph.

        Children whose class is not one of ``OcrClass.LINE_TYPES`` are
        skipped.

        Args:
            pdf: FPDF instance
            para: Paragraph OCR element
        """
        line_children = (
            child
            for child in para.children
            if child.ocr_class in OcrClass.LINE_TYPES
        )
        for line in line_children:
            self._render_line(pdf, line)

    def _render_line(self, pdf: FPDF, line: OcrElement) -> None:
        """Render a line element with baseline support.

        Strategy (following pikepdf reference implementation):
        1. Build a baseline_matrix that maps baseline-aligned coordinates
           (x along the text, origin at the start of the baseline) onto
           page coordinates in points
        2. For each word, transform its page-space bbox with
           baseline_matrix.inverse() to get its x position along the baseline
        3. Emit all words of the line in one text block, each with its own
           horizontal scaling so it fits its bbox width

        Lines without a bbox, with a non-positive bbox height, with an
        implausible aspect ratio, or with no renderable words produce no
        output.

        Args:
            pdf: FPDF instance
            line: Line OCR element
        """
        if line.bbox is None:
            return

        # Validate line bbox
        if line.bbox.height <= 0:
            log.error(
                "line box is invalid so we cannot render it: box=%s text=%s",
                line.bbox,
                line.text if hasattr(line, 'text') else '',
            )
            return

        # Convert line bbox to PDF points
        line_left_pt = self.coord_transform.px_to_pt(line.bbox.left)
        line_top_pt = self.coord_transform.px_to_pt(line.bbox.top)
        line_right_pt = self.coord_transform.px_to_pt(line.bbox.right)
        line_bottom_pt = self.coord_transform.px_to_pt(line.bbox.bottom)
        # Note: line_width_pt and line_height_pt not needed since we compute
        # dimensions in the un-rotated coordinate system via matrix transform

        # Debug rendering: draw line bbox (in page coordinates)
        if self.debug_options.render_line_bbox:
            self._render_debug_line_bbox(
                pdf, line_left_pt, line_top_pt, line_right_pt, line_bottom_pt
            )

        # Get textangle (rotation of the entire line); missing means 0
        textangle = line.textangle or 0.0

        # Read baseline early so we can detect rotation from steep slopes.
        # When Tesseract doesn't report textangle for rotated text, the
        # rotation gets encoded as a very steep baseline slope instead.
        slope = 0.0
        intercept_pt = 0.0
        has_meaningful_baseline = False
        if line.baseline is not None:
            slope = line.baseline.slope
            intercept_pt = self.coord_transform.px_to_pt(line.baseline.intercept)
            # Slopes this close to zero are treated as exactly horizontal
            if abs(slope) < 0.005:
                slope = 0.0
            has_meaningful_baseline = True

        # Detect text rotation from steep baseline slope.
        # A slope magnitude > 1.0 corresponds to > 45° from horizontal,
        # which indicates the line is rotated, not merely skewed.
        if textangle == 0.0 and abs(slope) > 1.0:
            textangle = degrees(atan(slope))
            # The original baseline slope and intercept are not meaningful
            # after extracting rotation; recalculate intercept from font
            # metrics below.
            slope = 0.0
            has_meaningful_baseline = False

        # Build line_size_aabb_matrix: places the un-rotated line frame on
        # the page, with its origin at the top-left corner of the line bbox
        # and rotated by -textangle. Its inverse (below) takes page
        # coordinates back into that un-rotated frame. The hOCR bbox is the
        # minimum axis-aligned bounding box enclosing the rotated text.
        line_size_aabb_matrix = (
            Matrix()
            .translated(line_left_pt, line_top_pt)
            .rotated(-textangle)  # textangle is counter-clockwise per hOCR spec
        )

        # Get the line dimensions in the un-rotated coordinate system by
        # pushing the page-space line bbox through the inverse matrix and
        # keeping only the resulting width and height
        inv_line_matrix = line_size_aabb_matrix.inverse()
        _, _, line_size_width, line_size_height = transform_box(
            inv_line_matrix, line_left_pt, line_top_pt, line_right_pt, line_bottom_pt
        )

        # Get baseline intercept
        if not has_meaningful_baseline:
            # No baseline provided or baseline was used for rotation detection:
            # calculate intercept from font metrics
            # NOTE(review): assumes 'NotoSans-Regular' is always present in
            # multi_font_manager.fonts — a missing key raises KeyError here.
            default_font_manager = self.multi_font_manager.fonts['NotoSans-Regular']
            ascent, descent, units_per_em = default_font_manager.get_font_metrics()
            ascent_norm = ascent / units_per_em
            descent_norm = descent / units_per_em
            # Baseline intercept based on font metrics: the descender's
            # share of (ascent + |descent|), scaled to the line height and
            # negated so the baseline sits above the bottom of the line
            intercept_pt = (
                -abs(descent_norm)
                * line_size_height
                / (ascent_norm + abs(descent_norm))
            )

        slope_angle_deg = degrees(atan(slope)) if slope != 0.0 else 0.0

        # Build baseline_matrix: maps baseline coordinates onto the page
        # (transform_point(baseline_matrix, 0, 0) below yields page points).
        # 1. Start with line_size_aabb_matrix (translates to line corner, rotates)
        # 2. Translate down to bottom of un-rotated line (line_size_height)
        # 3. Apply baseline intercept offset
        # 4. Rotate by baseline slope
        baseline_matrix = (
            line_size_aabb_matrix.translated(
                0, line_size_height
            )  # Move to bottom of line
            .translated(0, intercept_pt)  # Apply baseline intercept
            .rotated(slope_angle_deg)  # Rotate by baseline slope
        )

        # Calculate font size: height from baseline to top of line.
        # If that comes out below 1pt, fall back to 80% of the line height.
        font_size = line_size_height + intercept_pt
        if font_size < 1.0:
            font_size = line_size_height * 0.8

        # Total rotation for rendering (textangle + slope)
        total_rotation_deg = -textangle + slope_angle_deg

        # Debug rendering: draw baseline
        if self.debug_options.render_baseline:
            # Baseline starts at origin in baseline coords, extends line width
            baseline_start = transform_point(baseline_matrix, 0, 0)
            baseline_end = transform_point(baseline_matrix, line_size_width, 0)
            pdf.set_draw_color(255, 0, 255)  # Magenta
            pdf.set_line_width(0.75)
            pdf.line(
                baseline_start[0], baseline_start[1], baseline_end[0], baseline_end[1]
            )

        # Extract line language for font selection
        line_language = line.language

        # Inverse of baseline_matrix: takes page-space word bboxes into
        # baseline coordinates
        inv_baseline_matrix = baseline_matrix.inverse()

        # Collect words to render: WORD children that have non-empty text
        words: list[OcrElement | None] = [
            w for w in line.children if w.ocr_class == OcrClass.WORD and w.text
        ]

        # Suppress lines where the text aspect ratio is implausible.
        # This catches cases where Tesseract failed to detect rotation
        # entirely (slope=0, no textangle) and produced garbage text in a
        # bounding box whose shape doesn't match the text content at all.
        if not self._check_aspect_ratio_plausible(
            pdf, words, font_size, slope_angle_deg,
            line_size_width, line_size_height, line_language,
        ):
            return

        # Collect word rendering data: (text, x_baseline, font_family, word_tz)
        word_render_data: list[tuple[str, float, str, float]] = []
        for word in words:
            # Words without a bbox cannot be positioned, so they are skipped
            if word is None or not word.text or word.bbox is None:
                continue

            word_left_pt = self.coord_transform.px_to_pt(word.bbox.left)
            word_top_pt = self.coord_transform.px_to_pt(word.bbox.top)
            word_right_pt = self.coord_transform.px_to_pt(word.bbox.right)
            word_bottom_pt = self.coord_transform.px_to_pt(word.bbox.bottom)
            word_width_pt = word_right_pt - word_left_pt

            # Debug rendering: draw word bbox (in page coordinates)
            if self.debug_options.render_word_bbox:
                self._render_debug_word_bbox(
                    pdf, word_left_pt, word_top_pt, word_right_pt, word_bottom_pt
                )

            # Get x position in baseline coordinate system
            box_llx, _, _, _ = transform_box(
                inv_baseline_matrix,
                word_left_pt,
                word_top_pt,
                word_right_pt,
                word_bottom_pt,
            )

            # Select font and compute word-only Tz: the horizontal scaling
            # (percent) that makes the word's natural width match its bbox
            font_manager = self.multi_font_manager.select_font_for_word(
                word.text, line_language
            )
            font_family = self._register_font(pdf, font_manager)
            pdf.set_font(font_family, size=font_size)
            natural_width = pdf.get_string_width(word.text)
            if natural_width > 0 and word_width_pt > 0:
                word_tz = (word_width_pt / natural_width) * 100
            else:
                # Degenerate widths: leave the text unscaled
                word_tz = 100.0

            word_render_data.append((word.text, box_llx, font_family, word_tz))

        if not word_render_data:
            return

        # Emit single BT block for the entire line using raw PDF operators.
        # This avoids a poppler bug where Tz (horizontal scaling) is not
        # carried across BT/ET boundaries, affecting all poppler-based tools
        # and viewers (Evince, pdftotext, etc.). By keeping all words in a
        # single BT block with relative Td positioning and per-word Tz, we
        # ensure correct inter-word spacing.
        self._emit_line_bt_block(
            pdf,
            word_render_data,
            baseline_matrix,
            font_size,
            total_rotation_deg,
        )

    def _check_aspect_ratio_plausible(
        self,
        pdf: FPDF,
        words: list[OcrElement | None],
        font_size: float,
        slope_angle_deg: float,
        line_size_width: float,
        line_size_height: float,
        line_language: str | None,
    ) -> bool:
        """Check whether the line's aspect ratio is plausible for its text.

        Compares the aspect ratio of the OCR bounding box to the aspect ratio
        the text would have if rendered normally (accounting for baseline
        slope). A large mismatch indicates Tesseract misread rotated text
        without detecting the rotation.

        Returns:
            True if plausible (rendering should proceed), False to suppress.
        """
        if line_size_width <= 0 or line_size_height <= 0 or font_size <= 0:
            return True

        # Fast path: most lines are wider than they are tall, which is
        # the normal shape for horizontal text. Only tall-narrow boxes
        # (height > width) need the expensive font measurement check.
        if line_size_width >= line_size_height:
            return True

        line_text = ' '.join(
            w.text for w in words if w is not None and w.text
        )
        if not line_text:
            return True

        # Measure the natural rendered width of the line text
        font_manager = self.multi_font_manager.select_font_for_word(
            line_text, line_language
        )
        font_family = self._register_font(pdf, font_manager)
        pdf.set_font(font_family, size=round(font_size))
        natural_width = pdf.get_string_width(line_text)

        if natural_width <= 0:
            return True

        # Compute the AABB the text would occupy considering baseline slope
        theta = radians(abs(slope_angle_deg))
        expected_w = natural_width * cos(theta) + font_size * sin(theta)
        expected_h = natural_width * sin(theta) + font_size * cos(theta)

        if expected_h <= 0:
            return True

        actual_aspect = line_size_width / line_size_height
        expected_aspect = expected_w / expected_h
        ratio = actual_aspect / expected_aspect

        if ratio >= 0.1:
            return True

        # Implausible aspect ratio — suppress this line
        log.debug(
            "Suppressing text with improbable aspect ratio: "
            "actual=%.3f expected=%.3f ratio=%.4f text=%r",
            actual_aspect,
            expected_aspect,
            ratio,
            line_text[:80],
        )
        if not self._logged_aspect_ratio_suppression:
            log.info(
                "Suppressing OCR output text with improbable aspect ratio"
            )
            self._logged_aspect_ratio_suppression = True
        return False

    def _emit_line_bt_block(
        self,
        pdf: FPDF,
        word_render_data: list[tuple[str, float, str, float]],
        baseline_matrix: Matrix,
        font_size: float,
        total_rotation_deg: float,
    ) -> None:
        """Emit a single BT block for the entire line using raw PDF operators.

        Writes all words in a single BT..ET block with relative Td positioning
        and per-word Tz. Each non-last word gets a trailing space appended, with
        Tz calculated so the rendered width of "word " spans from the current
        word's start to the next word's start. This works around a poppler bug
        where Tz is not carried across BT/ET boundaries, which affects all
        poppler-based viewers and tools (Evince, pdftotext, etc.).

        Args:
            pdf: FPDF instance
            word_render_data: List of (text, x_baseline, font_family, word_tz)
                tuples, one per word on this line
            baseline_matrix: Transform from baseline coords to page coords
            font_size: Font size in points
            total_rotation_deg: Total rotation angle (textangle + slope)
        """
        page_height = self.coord_transform.page_height_pt

        # Compute baseline direction in PDF coordinates for rotation
        has_rotation = abs(total_rotation_deg) > 0.01
        bx0, by0_fpdf = transform_point(baseline_matrix, 0, 0)
        by0_pdf = page_height - by0_fpdf

        ops: list[str] = []

        if has_rotation:
            # Compute direction vector along the baseline in PDF coordinates
            bx1, by1_fpdf = transform_point(baseline_matrix, 100, 0)
            by1_pdf = page_height - by1_fpdf
            dx = bx1 - bx0
            dy = by1_pdf - by0_pdf
            length = sqrt(dx * dx + dy * dy)
            if length > 0:
                cos_a = dx / length
                sin_a = dy / length
            else:
                cos_a = 1.0
                sin_a = 0.0

            # Save graphics state, apply rotation+translation via cm.
            # The cm maps local coordinates (baseline-aligned, x along text)
            # to PDF page coordinates.
            ops.append('q')
            ops.append(
                f'{cos_a:.6f} {sin_a:.6f} {-sin_a:.6f} {cos_a:.6f} '
                f'{bx0:.2f} {by0_pdf:.2f} cm'
            )

        # Begin text object
        ops.append('BT')

        # Text render mode: 3 = invisible, 0 = fill
        tr = 3 if self.invisible_text else 0
        ops.append(f'{tr} Tr')

        # Initial text position
        first_x_baseline = word_render_data[0][1]
        if has_rotation:
            # In the cm-transformed space, origin is at the baseline start
            ops.append(f'{first_x_baseline:.2f} 0 Td')
        else:
            # Direct PDF coordinates
            page_x, page_y_fpdf = transform_point(
                baseline_matrix, first_x_baseline, 0
            )
            page_y_pdf = page_height - page_y_fpdf
            ops.append(f'{page_x:.2f} {page_y_pdf:.2f} Td')

        prev_font_family: str | None = None
        prev_x_baseline = first_x_baseline

        for i, (text, x_baseline, font_family, word_tz) in enumerate(
            word_render_data
        ):
            is_last = i == len(word_render_data) - 1

            # Set font if changed
            if font_family != prev_font_family:
                pdf.set_font(font_family, size=font_size)
                # Register font resource on this page
                pdf._resource_catalog.add(
                    PDFResourceType.FONT, pdf.current_font.i, pdf.page
                )
                ops.append(
                    f'/F{pdf.current_font.i} {pdf.font_size_pt:.2f} Tf'
                )
                prev_font_family = font_family

            # Relative positioning (for words after the first)
            if i > 0:
                if has_rotation:
                    # In rotated space, advance is purely along x-axis
                    dx_baseline = x_baseline - prev_x_baseline
                    ops.append(f'{dx_baseline:.2f} 0 Td')
                else:
                    # Non-rotated: compute delta in PDF coordinates
                    px_prev, py_prev_f = transform_point(
                        baseline_matrix, prev_x_baseline, 0
                    )
                    px_curr, py_curr_f = transform_point(
                        baseline_matrix, x_baseline, 0
                    )
                    dx_pdf = px_curr - px_prev
                    # Flip y delta for PDF coordinates (y-up)
                    dy_pdf = -(py_curr_f - py_prev_f)
                    ops.append(f'{dx_pdf:.2f} {dy_pdf:.2f} Td')

            # Determine text to render
            if not is_last:
                next_text, next_x_baseline, _, _ = word_render_data[i + 1]
                advance = next_x_baseline - x_baseline

                # Add trailing space for text extraction unless both are CJK
                if (
                    advance > 0
                    and not (
                        self._is_cjk_only(text)
                        and self._is_cjk_only(next_text)
                    )
                ):
                    text_to_render = text + ' '
                else:
                    text_to_render = text
            else:
                text_to_render = text

            # Use word_tz (fits word into its hOCR bbox) — Td handles
            # inter-word gaps, so Tz should not stretch to fill them.
            render_tz = word_tz

            ops.append(f'{render_tz:.2f} Tz')
            ops.append(self._encode_shaped_text(pdf, text_to_render))

            prev_x_baseline = x_baseline

        # End text object
        ops.append('ET')

        if has_rotation:
            ops.append('Q')

        pdf._out('\n'.join(ops))

        # Reset fpdf2's internal stretching tracking so subsequent API calls
        # don't think Tz is still set from our raw operators
        pdf.font_stretching = 100

    def _encode_shaped_text(self, pdf: FPDF, text: str) -> str:
        """Encode text using HarfBuzz text shaping for complex script support.

        Unlike font.encode_text() which maps unicode characters one-by-one to
        glyph IDs, this uses HarfBuzz to handle BiDi reordering, Arabic joining
        forms, Devanagari conjuncts, and other complex script shaping. Falls
        back to encode_text() when text shaping is not enabled.
        """
        font = pdf.current_font
        if pdf.text_shaping and pdf.text_shaping.get("use_shaping_engine"):
            shaped = font.shape_text(text, pdf.font_size_pt, pdf.text_shaping)
            if shaped:
                mapped = "".join(
                    chr(ti["mapped_char"])
                    for ti in shaped
                    if ti["mapped_char"] is not None
                )
                if mapped:
                    return f"({font.escape_text(mapped)}) Tj"
        return font.encode_text(text)

    def _is_cjk_only(self, text: str) -> bool:
        """Check if text contains only CJK characters.

        CJK scripts don't use spaces between words, so we should not insert
        spaces between adjacent CJK words.

        Args:
            text: Text to check

        Returns:
            True if text contains only CJK characters
        """
        for char in text:
            cp = ord(char)
            # Check if character is in CJK ranges
            if not (
                0x4E00 <= cp <= 0x9FFF  # CJK Unified Ideographs
                or 0x3400 <= cp <= 0x4DBF  # CJK Extension A
                or 0x20000 <= cp <= 0x2A6DF  # CJK Extension B
                or 0x2A700 <= cp <= 0x2B73F  # CJK Extension C
                or 0x2B740 <= cp <= 0x2B81F  # CJK Extension D
                or 0x2B820 <= cp <= 0x2CEAF  # CJK Extension E
                or 0x2CEB0 <= cp <= 0x2EBEF  # CJK Extension F
                or 0x30000 <= cp <= 0x3134F  # CJK Extension G
                or 0x3040 <= cp <= 0x309F  # Hiragana
                or 0x30A0 <= cp <= 0x30FF  # Katakana
                or 0x31F0 <= cp <= 0x31FF  # Katakana Phonetic Extensions
                or 0xAC00 <= cp <= 0xD7AF  # Hangul Syllables
                or 0x1100 <= cp <= 0x11FF  # Hangul Jamo
                or 0x3130 <= cp <= 0x318F  # Hangul Compatibility Jamo
                or 0xA960 <= cp <= 0xA97F  # Hangul Jamo Extended-A
                or 0xD7B0 <= cp <= 0xD7FF  # Hangul Jamo Extended-B
                or 0x3000 <= cp <= 0x303F  # CJK Symbols and Punctuation
                or 0xFF00 <= cp <= 0xFFEF  # Halfwidth and Fullwidth Forms
            ):
                return False
        return True

    def _render_debug_line_bbox(
        self,
        pdf: FPDF,
        left: float,
        top: float,
        right: float,
        bottom: float,
    ) -> None:
        """Draw a blue box around the line bbox."""
        pdf.set_draw_color(0, 0, 255)  # Blue
        pdf.set_line_width(0.5)
        pdf.rect(left, top, right - left, bottom - top)

    def _render_debug_baseline(
        self,
        pdf: FPDF,
        x: float,
        y: float,
        width: float,
        rotation_deg: float,
    ) -> None:
        """Draw a magenta line along the baseline."""
        pdf.set_draw_color(255, 0, 255)  # Magenta
        pdf.set_line_width(0.75)

        if abs(rotation_deg) > 0.1:
            with pdf.rotation(rotation_deg, x=x, y=y):
                pdf.line(x, y, x + width, y)
        else:
            pdf.line(x, y, x + width, y)

    def _render_debug_word_bbox(
        self,
        pdf: FPDF,
        left: float,
        top: float,
        right: float,
        bottom: float,
    ) -> None:
        """Draw a green box around the word bbox."""
        pdf.set_draw_color(0, 255, 0)  # Green
        pdf.set_line_width(0.3)
        pdf.rect(left, top, right - left, bottom - top)


class Fpdf2MultiPageRenderer:
    """Render several OcrElement pages into one PDF document.

    Every page is drawn by its own Fpdf2PdfRenderer, but all of them write
    into the same FPDF instance and share one font registration table.
    """

    def __init__(
        self,
        pages_data: list[tuple[int, OcrElement, float]],
        multi_font_manager: MultiFontManager,
        invisible_text: bool = True,
        debug_render_options: DebugRenderOptions | None = None,
    ):
        """Store the rendering inputs.

        Args:
            pages_data: List of (pageno, ocr_tree, dpi) tuples
            multi_font_manager: Font manager shared by every page
            invisible_text: Whether text is rendered invisibly
            debug_render_options: Debug visualization settings; defaults
                to a fresh DebugRenderOptions() when omitted
        """
        if not debug_render_options:
            debug_render_options = DebugRenderOptions()

        self.pages_data = pages_data
        self.multi_font_manager = multi_font_manager
        self.invisible_text = invisible_text
        self.debug_options = debug_render_options

    def render(self, output_path: Path) -> None:
        """Write all pages into a single PDF file.

        Pages whose OCR tree has no bounding box are skipped.

        Args:
            output_path: Output PDF file path

        Raises:
            ValueError: If there are no pages to render
        """
        if not self.pages_data:
            raise ValueError("No pages to render")

        # One FPDF document for everything; page size is set per page
        pdf = FPDF(unit="pt")
        pdf.set_auto_page_break(auto=False)
        pdf.set_text_shaping(True)

        # fpdf2's cell() pads text by c_margin by default, which would shift
        # the text away from its exact position
        pdf.c_margin = 0

        pdf.text_mode = TextMode.INVISIBLE if self.invisible_text else TextMode.FILL

        # A single registry handed to every page renderer so that fonts are
        # registered only once
        font_registry: dict[str, str] = {}

        for _pageno, page, dpi in self.pages_data:
            if page.bbox is not None:
                single_page = Fpdf2PdfRenderer(
                    page=page,
                    dpi=dpi,
                    multi_font_manager=self.multi_font_manager,
                    invisible_text=self.invisible_text,
                    debug_render_options=self.debug_options,
                )
                single_page._registered_fonts = font_registry
                single_page.render_to_pdf(pdf)

        pdf.output(str(output_path))


================================================
FILE: src/ocrmypdf/helpers.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Support functions."""

from __future__ import annotations

import logging
import multiprocessing
import os
import shutil
import warnings
from collections.abc import Callable, Iterable, Sequence
from contextlib import suppress
from decimal import Decimal
from io import StringIO
from math import isclose, isfinite
from pathlib import Path
from statistics import harmonic_mean
from typing import (
    Any,
    Generic,
    TypeVar,
)

import img2pdf
import pikepdf

# Module-level logger used by the helpers below.
log = logging.getLogger(__name__)

# Shared img2pdf options (presumably unpacked as keyword arguments by callers).
IMG2PDF_KWARGS = {
    'engine': img2pdf.Engine.pikepdf,
    'rotation': img2pdf.Rotation.ifvalid,
}


T = TypeVar('T', float, int, Decimal)


class Resolution(Generic[T]):
    """A pair of pixel densities (pixels per inch), one for each axis.

    The ``==`` comparison is approximate: two resolutions are equal when both
    axes agree within the relative tolerance ``CONVERSION_ERROR``.
    """

    __slots__ = ('x', 'y')

    x: T
    y: T

    # Relative tolerance that absorbs the error introduced when dpi is
    # converted to pixels per meter and stored as a rounded integer, as many
    # file formats do
    CONVERSION_ERROR = 0.002

    def __init__(self, x: T, y: T):
        """Create a resolution from its horizontal and vertical components."""
        self.x, self.y = x, y

    def round(self, ndigits: int) -> Resolution:
        """Return a copy with both axes rounded to ``ndigits`` decimal places."""
        rounded_x = round(self.x, ndigits)
        rounded_y = round(self.y, ndigits)
        return Resolution(rounded_x, rounded_y)

    def to_int(self) -> Resolution[int]:
        """Return a copy with both axes rounded to the nearest integer."""
        return Resolution(*(int(round(axis)) for axis in (self.x, self.y)))

    @classmethod
    def _isclose(cls, a, b):
        """Compare two numbers using the class-wide relative tolerance."""
        return isclose(a, b, rel_tol=cls.CONVERSION_ERROR)

    @property
    def is_square(self) -> bool:
        """Whether x and y are equal within tolerance."""
        return self._isclose(self.x, self.y)

    @property
    def is_finite(self) -> bool:
        """Whether both x and y are finite numbers."""
        if not isfinite(self.x):
            return False
        return isfinite(self.y)

    def to_scalar(self) -> float:
        """Collapse the resolution to one number: the harmonic mean of x and y.

        A resolution is usually "square" (x == y), in which case the result
        is simply that common value. For non-square resolutions the harmonic
        mean serves as the one-dimensional approximation.
        """
        return harmonic_mean([float(axis) for axis in (self.x, self.y)])

    def _take_minmax(
        self, vals: Iterable[Any], yvals: Iterable[Any] | None, cmp: Callable
    ) -> Resolution:
        """Combine this resolution with others, axis by axis, using ``cmp``.

        ``cmp`` is ``min`` or ``max``. When ``yvals`` is given, ``vals`` holds
        x candidates and ``yvals`` holds y candidates. Otherwise ``vals`` is
        an iterable of (x, y) pairs.
        """
        if yvals is None:
            best_x, best_y = self.x, self.y
            for cand_x, cand_y in vals:
                best_x = cmp(cand_x, best_x)
                best_y = cmp(cand_y, best_y)
            return Resolution(best_x, best_y)

        return Resolution(cmp(self.x, *vals), cmp(self.y, *yvals))

    def take_max(
        self, vals: Iterable[Any], yvals: Iterable[Any] | None = None
    ) -> Resolution:
        """Return a new Resolution holding the per-axis maximum of the inputs."""
        return self._take_minmax(vals, yvals, max)

    def take_min(
        self, vals: Iterable[Any], yvals: Iterable[Any] | None = None
    ) -> Resolution:
        """Return a new Resolution holding the per-axis minimum of the inputs."""
        return self._take_minmax(vals, yvals, min)

    def flip_axis(self) -> Resolution[T]:
        """Return a new Resolution whose x and y are exchanged."""
        return Resolution(self.y, self.x)

    def __getitem__(self, idx: int | slice) -> T:
        """Allow tuple-like access: [0] is x and [1] is y."""
        as_pair = (self.x, self.y)
        return as_pair[idx]

    def __str__(self):
        """Format as ``x×y`` using fixed-point notation."""
        return "{:f}×{:f}".format(self.x, self.y)

    def __repr__(self):  # pragma: no cover
        """Return an unambiguous representation for debugging."""
        return f"Resolution({self.x!r}, {self.y!r})"

    def __eq__(self, other):
        """Approximate equality against another Resolution or a 2-tuple."""
        if isinstance(other, tuple) and len(other) == 2:
            other = Resolution(*other)
        if isinstance(other, Resolution):
            return self._isclose(self.x, other.x) and self._isclose(
                self.y, other.y
            )
        return NotImplemented


def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike) -> None:
    """Make ``soft_link_name`` a symbolic link that points at ``input_file``.

    This is effectively a low-overhead way of copying ``input_file`` to
    ``soft_link_name``.

    Safety rules: a file is never linked to itself, an existing link at the
    destination is replaced, and an existing regular file at the destination
    is never overwritten. On Windows the file is copied instead, because
    creating symlinks may require administrator privileges.

    Raises:
        FileExistsError: If the destination exists and is not a symlink.
        FileNotFoundError: If ``input_file`` does not exist.
    """
    source = os.fspath(input_file)
    link_path = os.fspath(soft_link_name)

    # Linking a path to itself would create a loop
    if source == link_path:
        log.warning(
            "No symbolic link created. You are using the original data directory "
            "as the working directory."
        )
        return

    if os.path.lexists(link_path):
        # Only a stale link may be replaced; real files are left untouched
        if not os.path.islink(link_path):
            raise FileExistsError(f"{link_path} exists and is not a link")
        os.unlink(link_path)

    if not os.path.exists(source):
        raise FileNotFoundError(f"trying to create a broken symlink to {source}")

    if os.name != 'nt':
        log.debug("os.symlink(%s, %s)", source, link_path)
        # The link target is always stored as an absolute path
        os.symlink(os.path.abspath(source), link_path)
        return

    # Windows: copy rather than link, due to permission issues
    shutil.copyfile(source, link_path)


def samefile(file1: os.PathLike, file2: os.PathLike) -> bool:
    """Tell whether two paths refer to the same file.

    Outside Windows this defers to ``os.path.samefile``, so different
    relative paths to one file are recognized as the same. On Windows the
    two arguments are simply compared for equality.
    """
    if os.name != 'nt':
        return os.path.samefile(file1, file2)
    return file1 == file2


def is_iterable_notstr(thing: Any) -> bool:
    """Return True if ``thing`` is iterable but is not a ``str``."""
    if isinstance(thing, str):
        return False
    return isinstance(thing, Iterable)


def monotonic(seq: Sequence) -> bool:
    """Does this sequence increase monotonically?"""
    return all(b > a for a, b in zip(seq, seq[1:], strict=False))


def page_number(input_file: os.PathLike) -> int:
    """Return the one-based page number encoded in a filename.

    The first six characters of the basename are parsed as an integer,
    e.g. ``000002.pdf`` yields 2.
    """
    filename = os.path.basename(os.fspath(input_file))
    leading_digits = filename[:6]
    return int(leading_digits)


def available_cpu_count() -> int:
    """Return the number of CPUs in the system.

    If the platform cannot report a count, a warning is issued and 1 is
    returned.
    """
    with suppress(NotImplementedError):
        return multiprocessing.cpu_count()

    # Only reached when cpu_count() raised NotImplementedError
    warnings.warn(
        "Could not get CPU count. Assuming one (1) CPU. Use -j N to set manually."
    )
    return 1


def is_file_writable(test_file: os.PathLike) -> bool:
    """Check up front whether the output target can be written (racy by design).

    The output file is only written once processing succeeds and it can be
    replaced atomically. This check runs before the OCR work so that an
    unwritable location is detected early.

    A symlink is resolved first. An existing regular file (or the null
    device) is checked with ``os.access``. Otherwise the file is briefly
    created and then removed again.
    """
    try:
        target = Path(test_file)
        if target.is_symlink():
            target = target.resolve(strict=False)

        # is_file() can raise in some cases; the outer handler covers that
        if target.exists() and (target.is_file() or target.samefile(os.devnull)):
            use_effective_ids = os.access in os.supports_effective_ids
            return os.access(
                os.fspath(target), os.W_OK, effective_ids=use_effective_ids
            )

        try:
            handle = target.open('wb')
        except OSError:
            return False

        handle.close()
        # Best effort removal of the probe file
        with suppress(OSError):
            target.unlink()
        return True
    except (OSError, RuntimeError) as e:
        log.debug(e)
        log.error(str(e))
        return False


def check_pdf(input_file: Path) -> bool:
    """Validate a PDF's syntax and linearization with pikepdf.

    pikepdf (which in turn uses QPDF) performs the checks. Every reported
    syntax message is logged and fails the check, except for one known
    spurious ``/DecodeParms`` warning. Linearization that is absent is
    accepted; linearization that is present but reports problems fails
    the check.

    Returns:
        True if the file opened and no syntax or linearization problems
        were reported, otherwise False.
    """
    try:
        pdf = pikepdf.open(input_file)
    except pikepdf.PdfError as e:
        log.error(e)
        return False

    # Known harmless warning that should not fail the check
    spurious_warning = (
        "/DecodeParms: operation for dictionary attempted on object "
        "of type null"
    )

    with pdf:
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', message=r'pikepdf.*JBIG2.*')
            messages = pdf.check_pdf_syntax()

        syntax_ok = True
        for msg in messages:
            if 'error' in msg.lower():
                log.error(msg)
                syntax_ok = False
            elif spurious_warning not in msg:
                log.warning(msg)
                syntax_ok = False

        linearize_msgs = ''
        report = StringIO()
        try:
            # Missing linearization is not a complaint; linearization that is
            # present but incorrect is.
            pdf.check_linearization(report)
        except (RuntimeError, pikepdf.ForeignObjectError):
            pass
        else:
            linearize_msgs = report.getvalue()
            if linearize_msgs:
                log.warning(linearize_msgs)

        return bool(syntax_ok and not linearize_msgs)


def clamp(n: T, smallest: T, largest: T) -> T:
    """Restrict ``n`` to the range from ``smallest`` to ``largest``."""
    capped = min(n, largest)
    return max(smallest, capped)


def remove_all_log_handlers(logger: logging.Logger) -> None:
    """Detach and close every handler attached to ``logger``.

    This is typically used in a child process. After a fork the child
    inherits the parent's log handlers; they are usually removed so that the
    child can install a single queue handler that forwards its messages to
    the parent process.
    """
    # Iterate over a snapshot, since removeHandler() mutates logger.handlers
    inherited = list(logger.handlers)
    for handler in inherited:
        logger.removeHandler(handler)
        # Closing releases any resources the handler holds open
        handler.close()


def pikepdf_enable_mmap() -> None:
    """Ask pikepdf to use memory mapping by default and log the outcome.

    If the installed pikepdf lacks the required functions, a debug message
    is logged instead.
    """
    try:
        core = pikepdf._core
        core.set_access_default_mmap(True)
        # Read the setting back rather than assuming the request took effect
        mmap_on = core.get_access_default_mmap()  # type: ignore[attr-defined]
        state = 'enabled' if mmap_on else 'disabled'
        log.debug("pikepdf mmap " + state)
    except AttributeError:
        log.debug("pikepdf mmap not available")


def running_in_docker() -> bool:
    """Return True if this process appears to run inside a Docker container.

    Detection relies on the presence of ``/.dockerenv``.
    """
    marker = Path('/.dockerenv')
    return marker.exists()


def running_in_snap() -> bool:
    """Return True if this process appears to run inside a Snap container.

    Detection looks for ``snap.ocrmypdf`` in ``/proc/self/cgroup``. A missing
    file counts as "not in a Snap".
    """
    try:
        cgroup_text = Path('/proc/self/cgroup').read_text()
    except FileNotFoundError:
        return False
    return 'snap.ocrmypdf' in cgroup_text


================================================
FILE: src/ocrmypdf/hocrtransform/__init__.py
================================================
# SPDX-FileCopyrightText: 2023-2025 James R. Barlow
# SPDX-License-Identifier: MIT

"""Transform OCR output to text-only PDFs.

This package provides tools for:
1. Parsing OCR output (hOCR format) into generic OcrElement structures
2. Rendering OcrElement structures to searchable PDF text layers

The architecture separates parsing from rendering, allowing:
- Support for multiple OCR input formats (hOCR, ALTO, custom engines)
- Independent improvements to text rendering
- Reuse of the OcrElement data model for other purposes

Main components:
- OcrElement: Generic dataclass representing OCR output structure
- HocrParser: Parses hOCR files into OcrElement trees
- Fpdf2PdfRenderer: Renders OcrElement trees to PDF text layers (via fpdf2)

For PDF rendering, use the ocrmypdf.fpdf_renderer module:
    from ocrmypdf.fpdf_renderer import Fpdf2PdfRenderer, DebugRenderOptions
"""

from __future__ import annotations

from ocrmypdf.hocrtransform.hocr_parser import (
    HocrParseError,
    HocrParser,
)
from ocrmypdf.models.ocr_element import (
    Baseline,
    BoundingBox,
    FontInfo,
    OcrClass,
    OcrElement,
)

# Public names re-exported by this package (imported above).
__all__ = (
    # hOCR parsing
    'HocrParser',
    'HocrParseError',
    # OCR element data model
    'OcrElement',
    'OcrClass',
    'BoundingBox',
    'Baseline',
    'FontInfo',
)


================================================
FILE: src/ocrmypdf/hocrtransform/__main__.py
================================================
# SPDX-FileCopyrightText: 2023-2025 James R. Barlow
# SPDX-License-Identifier: MIT

"""Simple CLI for testing HOCR to PDF conversion using fpdf2 renderer."""

from __future__ import annotations

import argparse
from pathlib import Path

from ocrmypdf.font import MultiFontManager
from ocrmypdf.fpdf_renderer import DebugRenderOptions, Fpdf2PdfRenderer
from ocrmypdf.hocrtransform.hocr_parser import HocrParser

if __name__ == "__main__":
    # Command-line interface
    cli = argparse.ArgumentParser(description='Convert hocr file to PDF')
    cli.add_argument(
        '-b',
        '--boundingboxes',
        action="store_true",
        default=False,
        help='Show bounding boxes borders (debug mode)',
    )
    cli.add_argument(
        '-r',
        '--resolution',
        type=int,
        default=300,
        help='Resolution of the image that was OCRed',
    )
    cli.add_argument(
        '-i',
        '--image',
        default=None,
        help='Path to the image to overlay on top of the text layer',
    )
    cli.add_argument('hocrfile', help='Path to the hocr file to be parsed')
    cli.add_argument('outputfile', help='Path to the PDF file to be generated')
    args = cli.parse_args()

    # Read the hOCR input into an OcrElement tree
    ocr_page = HocrParser(args.hocrfile).parse()

    # A DPI recorded in the hOCR wins over the command-line resolution
    dpi = ocr_page.dpi or args.resolution

    # Debug overlays are switched on all together, or not at all
    debug_options = (
        DebugRenderOptions(
            render_line_bbox=True,
            render_word_bbox=True,
            render_baseline=True,
        )
        if args.boundingboxes
        else None
    )

    # Fonts come from the "data" directory next to this package's parent
    package_root = Path(__file__).parent.parent
    multi_font_manager = MultiFontManager(package_root / "data")

    # Text is invisible only when an image is supplied
    has_image = bool(args.image)
    renderer = Fpdf2PdfRenderer(
        page=ocr_page,
        dpi=dpi,
        multi_font_manager=multi_font_manager,
        invisible_text=has_image,
        image=Path(args.image) if args.image else None,
        debug_render_options=debug_options,
    )
    renderer.render(Path(args.outputfile))


================================================
FILE: src/ocrmypdf/hocrtransform/hocr_parser.py
================================================
# SPDX-FileCopyrightText: 2010 Jonathan Brinley
# SPDX-FileCopyrightText: 2013-2014 Julien Pfefferkorn
# SPDX-FileCopyrightText: 2023-2025 James R. Barlow
# SPDX-License-Identifier: MIT

"""Parser for hOCR format files.

This module provides functionality to parse hOCR files (HTML-based OCR format)
and convert them to the engine-agnostic OcrElement tree structure.

For details of the hOCR format, see:
http://kba.github.io/hocr-spec/1.2/
"""

from __future__ import annotations

import logging
import os
import re
import unicodedata
from pathlib import Path
from typing import Literal, cast
from xml.etree import ElementTree as ET

from ocrmypdf.models.ocr_element import (
    Baseline,
    BoundingBox,
    FontInfo,
    OcrClass,
    OcrElement,
)

# Values of the hOCR ``dir`` attribute that the parser accepts.
TextDirection = Literal["ltr", "rtl"]

log = logging.getLogger(__name__)

# Short alias for the ElementTree element type, used in annotations below.
Element = ET.Element


class HocrParseError(Exception):
    """Raised when an hOCR file cannot be parsed or lacks required structure.

    The parser raises it for malformed XML, for a missing ``ocr_page``
    element, and for an ``ocr_page`` without a bounding box.
    """


class HocrParser:
    """Parser for hOCR format files.

    Converts hOCR XML/HTML files into OcrElement trees.

    The hOCR format uses HTML with special class attributes (ocr_page, ocr_line,
    ocrx_word, etc.) and a title attribute containing properties like bbox,
    baseline, and confidence scores.
    """

    # hOCR classes that are treated as text lines inside a paragraph
    _line_classes = frozenset(
        (
            'ocr_line',
            'ocr_header',
            'ocr_footer',
            'ocr_caption',
            'ocr_textfloat',
        )
    )

    # Regex patterns for parsing hOCR title attributes
    _bbox_pattern = re.compile(
        r'''
        bbox \s+
        (\d+) \s+   # left: uint
        (\d+) \s+   # top: uint
        (\d+) \s+   # right: uint
        (\d+)       # bottom: uint
        ''',
        re.VERBOSE,
    )

    _baseline_pattern = re.compile(
        r'''
        baseline \s+
        ([\-\+]?\d*\.?\d*) \s+  # slope: +/- decimal float
        ([\-\+]?\d+)            # intercept: +/- int
        ''',
        re.VERBOSE,
    )

    _textangle_pattern = re.compile(
        r'''
        textangle \s+
        ([\-\+]?\d*\.?\d*)  # angle: +/- decimal float
        ''',
        re.VERBOSE,
    )

    _x_wconf_pattern = re.compile(
        r'''
        x_wconf \s+
        (\d+)  # confidence: uint (0-100)
        ''',
        re.VERBOSE,
    )

    _x_fsize_pattern = re.compile(
        r'''
        x_fsize \s+
        (\d*\.?\d+)  # font size: float
        ''',
        re.VERBOSE,
    )

    _x_font_pattern = re.compile(
        r'''
        x_font \s+
        ([^\s;]+)  # font name: non-whitespace, non-semicolon string
        ''',
        re.VERBOSE,
    )

    _ppageno_pattern = re.compile(
        r'''
        ppageno \s+
        (\d+)  # page number: uint
        ''',
        re.VERBOSE,
    )

    _scan_res_pattern = re.compile(
        r'''
        scan_res \s+
        (\d+) \s+  # x resolution
        (\d+)      # y resolution
        ''',
        re.VERBOSE,
    )

    def __init__(self, hocr_file: str | Path):
        """Initialize the parser with an hOCR file.

        Args:
            hocr_file: Path to the hOCR file to parse

        Raises:
            HocrParseError: If the file cannot be parsed
        """
        self._hocr_path = Path(hocr_file)
        try:
            self._tree = ET.parse(os.fspath(hocr_file))
        except ET.ParseError as e:
            raise HocrParseError(f"Failed to parse hOCR file: {e}") from e

        # Detect XML namespace: a namespaced root tag looks like "{uri}html"
        root_tag = self._tree.getroot().tag
        matches = re.match(r'({.*})html', root_tag)
        self._xmlns = matches.group(1) if matches else ''

    def parse(self) -> OcrElement:
        """Parse the hOCR file and return an OcrElement tree.

        Only the first ocr_page element in the file is parsed.

        Returns:
            The root OcrElement (ocr_page) containing the document structure

        Raises:
            HocrParseError: If no ocr_page element is found
        """
        page_div = self._tree.find(self._xpath('div', 'ocr_page'))
        if page_div is None:
            raise HocrParseError("No ocr_page element found in hOCR file")

        return self._parse_page(page_div)

    def _xpath(self, html_tag: str, html_class: str | None = None) -> str:
        """Build an XPath expression for finding elements.

        The expression matches descendants at any depth and includes the
        XML namespace detected at construction time.

        Args:
            html_tag: HTML tag name (e.g., 'div', 'span', 'p')
            html_class: Optional class attribute to match

        Returns:
            XPath expression string
        """
        xpath = f".//{self._xmlns}{html_tag}"
        if html_class:
            xpath += f"[@class='{html_class}']"
        return xpath

    def _parse_page(self, page_elem: Element) -> OcrElement:
        """Parse an ocr_page element.

        Args:
            page_elem: The XML element with class="ocr_page"

        Returns:
            OcrElement representing the page

        Raises:
            HocrParseError: If the page has no bbox
        """
        title = page_elem.attrib.get('title', '')

        bbox = self._parse_bbox(title)
        if bbox is None:
            raise HocrParseError("ocr_page missing bbox")

        # Parse page-level properties
        page_number = self._parse_ppageno(title)
        dpi = self._parse_scan_res(title)

        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=bbox,
            page_number=page_number,
            dpi=dpi,
        )

        # Parse child paragraphs
        for par_elem in page_elem.iterfind(self._xpath('p', 'ocr_par')):
            paragraph = self._parse_paragraph(par_elem)
            if paragraph is not None:
                page.children.append(paragraph)

        # If no paragraphs found, check for words directly under page
        # (some Tesseract output structures)
        if not page.children:
            for word_elem in page_elem.iterfind(self._xpath('span', 'ocrx_word')):
                word = self._parse_word(word_elem)
                if word is not None:
                    page.children.append(word)

        return page

    def _parse_paragraph(self, par_elem: Element) -> OcrElement | None:
        """Parse an ocr_par element.

        Args:
            par_elem: The XML element with class="ocr_par"

        Returns:
            OcrElement representing the paragraph, or None if empty
        """
        title = par_elem.attrib.get('title', '')
        bbox = self._parse_bbox(title)

        # Get direction and language from attributes; unknown direction
        # values are treated as unspecified
        dir_attr = par_elem.attrib.get('dir')
        direction: TextDirection | None = (
            cast(TextDirection, dir_attr) if dir_attr in ('ltr', 'rtl') else None
        )

        language = par_elem.attrib.get('lang')

        paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH,
            bbox=bbox,
            direction=direction,
            language=language,
        )

        # Parse child lines. The XPath matches every descendant span, so
        # spans that are not one of the line classes (e.g. words) are skipped.
        for span_elem in par_elem.iterfind(self._xpath('span')):
            elem_class = span_elem.attrib.get('class', '')
            if elem_class in self._line_classes:
                line = self._parse_line(span_elem, elem_class, direction, language)
                if line is not None:
                    paragraph.children.append(line)

        # Return None if paragraph is empty
        if not paragraph.children:
            return None

        return paragraph

    def _parse_line(
        self,
        line_elem: Element,
        ocr_class: str,
        parent_direction: TextDirection | None,
        parent_language: str | None,
    ) -> OcrElement | None:
        """Parse a line element (ocr_line, ocr_header, etc.).

        Args:
            line_elem: The XML element representing the line
            ocr_class: The hOCR class of the line
            parent_direction: Text direction inherited from parent
            parent_language: Language inherited from parent

        Returns:
            OcrElement representing the line, or None if the line has no
            bbox or contains no words
        """
        title = line_elem.attrib.get('title', '')
        bbox = self._parse_bbox(title)

        if bbox is None:
            return None

        baseline = self._parse_baseline(title)
        textangle = self._parse_textangle(title)

        # Inherit direction and language from parent if not specified
        dir_attr = line_elem.attrib.get('dir')
        if dir_attr in ('ltr', 'rtl'):
            direction: TextDirection | None = cast(TextDirection, dir_attr)
        else:
            direction = parent_direction

        language = line_elem.attrib.get('lang') or parent_language

        line = OcrElement(
            ocr_class=ocr_class,
            bbox=bbox,
            baseline=baseline,
            textangle=textangle,
            direction=direction,
            language=language,
        )

        # Parse child words
        for word_elem in line_elem.iterfind(self._xpath('span', 'ocrx_word')):
            word = self._parse_word(word_elem)
            if word is not None:
                line.children.append(word)

        # Return None if line has no words
        if not line.children:
            return None

        return line

    def _parse_word(self, word_elem: Element) -> OcrElement | None:
        """Parse an ocrx_word element.

        Args:
            word_elem: The XML element with class="ocrx_word"

        Returns:
            OcrElement representing the word, or None if empty
        """
        title = word_elem.attrib.get('title', '')
        bbox = self._parse_bbox(title)

        # Get the text content
        text = self._get_element_text(word_elem)
        text = self._normalize_text(text)

        if not text:
            return None

        # Parse confidence (x_wconf is 0-100, convert to 0.0-1.0)
        confidence = self._parse_x_wconf(title)
        if confidence is not None:
            confidence = confidence / 100.0

        # Parse font info
        font = self._parse_font_info(title)

        return OcrElement(
            ocr_class=OcrClass.WORD,
            bbox=bbox,
            text=text,
            confidence=confidence,
            font=font,
        )

    def _get_element_text(self, element: Element) -> str:
        """Get the full text content of an element including children.

        The text of nested children, and the text that follows each child
        inside this element, is included in document order. The element's
        own tail is excluded: that text sits after the element's closing
        tag and belongs to the parent, not to this element.

        Args:
            element: XML element

        Returns:
            Combined text content
        """
        return ''.join(element.itertext())

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Normalize text using NFKC normalization.

        This splits ligatures and combines diacritics.

        Args:
            text: Raw text

        Returns:
            Normalized text, stripped of leading/trailing whitespace
        """
        return unicodedata.normalize("NFKC", text).strip()

    def _parse_bbox(self, title: str) -> BoundingBox | None:
        """Parse a bbox from an hOCR title attribute.

        Args:
            title: The title attribute value

        Returns:
            BoundingBox, or None if no bbox is found or BoundingBox rejects
            the values with a ValueError
        """
        match = self._bbox_pattern.search(title)
        if not match:
            return None

        try:
            return BoundingBox(
                left=float(match.group(1)),
                top=float(match.group(2)),
                right=float(match.group(3)),
                bottom=float(match.group(4)),
            )
        except ValueError:
            return None

    def _parse_baseline(self, title: str) -> Baseline | None:
        """Parse baseline from an hOCR title attribute.

        Args:
            title: The title attribute value

        Returns:
            Baseline or None if not found or not parseable
        """
        match = self._baseline_pattern.search(title)
        if not match:
            return None

        try:
            return Baseline(
                # The slope group may match an empty string; treat as 0
                slope=float(match.group(1)) if match.group(1) else 0.0,
                intercept=float(match.group(2)),
            )
        except ValueError:
            return None

    def _parse_textangle(self, title: str) -> float | None:
        """Parse textangle from an hOCR title attribute.

        Args:
            title: The title attribute value

        Returns:
            Angle in degrees or None if not found or not parseable
        """
        match = self._textangle_pattern.search(title)
        if not match:
            return None

        try:
            return float(match.group(1))
        except ValueError:
            return None

    def _parse_x_wconf(self, title: str) -> float | None:
        """Parse word confidence from an hOCR title attribute.

        Args:
            title: The title attribute value

        Returns:
            Confidence (0-100) or None if not found
        """
        match = self._x_wconf_pattern.search(title)
        if not match:
            return None

        try:
            return float(match.group(1))
        except ValueError:
            return None

    def _parse_ppageno(self, title: str) -> int | None:
        """Parse physical page number from an hOCR title attribute.

        Args:
            title: The title attribute value

        Returns:
            Page number or None if not found
        """
        match = self._ppageno_pattern.search(title)
        if not match:
            return None

        try:
            return int(match.group(1))
        except ValueError:
            return None

    def _parse_scan_res(self, title: str) -> float | None:
        """Parse scan resolution (DPI) from an hOCR title attribute.

        Args:
            title: The title attribute value

        Returns:
            DPI (using first value if x and y differ) or None if not found
        """
        match = self._scan_res_pattern.search(title)
        if not match:
            return None

        try:
            # Use the first (x) resolution value
            return float(match.group(1))
        except ValueError:
            return None

    def _parse_font_info(self, title: str) -> FontInfo | None:
        """Parse font information from an hOCR title attribute.

        Args:
            title: The title attribute value

        Returns:
            FontInfo with whichever of name and size is present, or None if
            neither is found
        """
        font_match = self._x_font_pattern.search(title)
        size_match = self._x_fsize_pattern.search(title)

        if not font_match and not size_match:
            return None

        return FontInfo(
            name=font_match.group(1) if font_match else None,
            size=float(size_match.group(1)) if size_match else None,
        )


================================================
FILE: src/ocrmypdf/imageops.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""OCR-related image manipulation."""

from __future__ import annotations

import logging
from math import floor, sqrt

from PIL import Image

log = logging.getLogger(__name__)


def bytes_per_pixel(mode: str) -> int:
    """Return how many bytes one pixel occupies, with padding, for a PIL mode.

    Modes '1', 'L' and 'P' count as one byte. Modes 'LA', 'PA', 'La' and any
    mode starting with 'I;16' count as two bytes. Every other mode, including
    RGB, is assumed to be padded to four bytes per pixel, which is the case
    for most consumers.
    """
    single_byte_modes = ('1', 'L', 'P')
    double_byte_modes = ('LA', 'PA', 'La')

    if mode in single_byte_modes:
        return 1

    is_double_byte = mode in double_byte_modes or mode.startswith('I;16')
    return 2 if is_double_byte else 4


def _calculate_downsample(
    image_size: tuple[int, int],
    bytes_per_pixel: int,
    *,
    max_size: tuple[int, int] | None = None,
    max_pixels: int | None = None,
    max_bytes: int | None = None,
) -> tuple[int, int]:
    """Calculate image size required to downsample an image to fit limits.

    The limits are applied in order: dimensions, then pixel count, then byte
    size. Each step works on the size produced by the previous one. If no
    limit is exceeded, the input image's size is returned.

    When scaling would round a dimension down to zero (very thin or very
    short images), that dimension is set to one pixel and the other dimension
    is capped so the limit being applied is still respected.

    Args:
        image_size: Dimensions of image, as (width, height).
        bytes_per_pixel: Number of bytes per pixel.
        max_size: The maximum width and height of the image.
        max_pixels: The maximum number of pixels in the image. Some image consumers
            limit the total number of pixels as some value other than width*height.
        max_bytes: The maximum number of bytes in the image, counted as
            width * bytes_per_pixel * height.

    Returns:
        The (width, height) to downsample to.
    """
    size = image_size

    if max_size is not None:
        overage = max_size[0] / size[0], max_size[1] / size[1]
        # Use the tighter of the two ratios so both dimensions fit
        size_factor = min(overage)
        if size_factor < 1.0:
            log.debug("Resizing image to fit image dimensions limit")
            size = floor(size[0] * size_factor), floor(size[1] * size_factor)
            if size[0] == 0:
                size = 1, min(size[1], max_size[1])
            elif size[1] == 0:
                size = min(size[0], max_size[0]), 1

    if max_pixels is not None and size[0] * size[1] > max_pixels:
        log.debug("Resizing image to fit image pixel limit")
        pixels_factor = sqrt(max_pixels / (size[0] * size[1]))
        scaled_width = floor(size[0] * pixels_factor)
        scaled_height = floor(size[1] * pixels_factor)
        # A one-pixel-wide (or -high) strip must not collapse to zero; keep
        # one pixel and spend the whole pixel budget on the other dimension.
        if scaled_width == 0:
            scaled_width = 1
            scaled_height = max(1, min(max_pixels, scaled_height))
        elif scaled_height == 0:
            scaled_height = 1
            scaled_width = max(1, min(max_pixels, scaled_width))
        size = scaled_width, scaled_height

    if max_bytes is not None:
        bpp = bytes_per_pixel
        # stride = bytes per line
        stride = size[0] * bpp
        height = size[1]
        if stride * height > max_bytes:
            log.debug("Resizing image to fit image byte size limit")
            bytes_factor = sqrt(max_bytes / (stride * height))
            # Test the resulting width, not the stride: a scaled stride
            # smaller than one pixel would otherwise floor to width zero.
            scaled_width = floor(stride * bytes_factor) // bpp
            scaled_height = floor(height * bytes_factor)
            if scaled_width == 0:
                scaled_width = 1
                scaled_height = min(max_bytes // bpp, scaled_height)
            if scaled_height == 0:
                scaled_height = 1
                scaled_width = max(1, min(max_bytes // bpp, scaled_width))
            size = scaled_width, scaled_height

    return size


def calculate_downsample(
    image: Image.Image,
    *,
    max_size: tuple[int, int] | None = None,
    max_pixels: int | None = None,
    max_bytes: int | None = None,
) -> tuple[int, int]:
    """Work out the size an image must be reduced to in order to fit limits.

    The image itself is not modified; only its size and mode are read. When
    none of the limits is exceeded, the image's current size is returned.

    Args:
        image: The image whose target size is wanted.
        max_size: Largest permitted width and height.
        max_pixels: Largest permitted total pixel count. Some image consumers
            limit the total number of pixels as some value other than
            width*height.
        max_bytes: Largest permitted size in bytes. The per-pixel byte count
            is taken from bytes_per_pixel() for the image's mode.
    """
    dimensions = image.size
    pixel_bytes = bytes_per_pixel(image.mode)
    limits = {
        'max_size': max_size,
        'max_pixels': max_pixels,
        'max_bytes': max_bytes,
    }
    return _calculate_downsample(dimensions, pixel_bytes, **limits)


def downsample_image(
    image: Image.Image,
    new_size: tuple[int, int],
    *,
    resample_mode: Image.Resampling = Image.Resampling.BICUBIC,
    reducing_gap: int = 3,
) -> Image.Image:
    """Resize an image to a new size, scaling its recorded DPI to match.

    The DPI is scaled by the same ratio as the pixel dimensions, which is how
    we can ensure the OCR is positioned correctly. If the image already has
    the requested size, it is returned unchanged.

    Args:
        image: The image to downsample. Its ``info`` must contain a 'dpi'
            entry whenever a resize is actually needed.
        new_size: The new size of the image.
        resample_mode: The resampling mode to use when downsampling.
        reducing_gap: The reducing gap to use when downsampling (for larger
            reductions).

    Returns:
        The resized image, or the input image itself when no resize is needed.
    """
    if image.size == new_size:
        return image

    # Capture the source geometry before ``image`` is rebound to the result
    old_width, old_height = image.size
    source_dpi = image.info['dpi']

    image = image.resize(new_size, resample=resample_mode, reducing_gap=reducing_gap)

    dpi_x = round(source_dpi[0] * new_size[0] / old_width)
    dpi_y = round(source_dpi[1] * new_size[1] / old_height)
    image.info['dpi'] = (dpi_x, dpi_y)

    log.debug(f"Rescaled image to {image.size} pixels and {image.info['dpi']} dpi")
    return image


================================================
FILE: src/ocrmypdf/languages.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Language codes and names from ISO 639.

Derived from
https://www.loc.gov/standards/iso639-2/ascii_8bits.html
"""
from __future__ import annotations

from typing import NamedTuple


class ISOCodeData(NamedTuple):
    """Data for a single ISO 639 code.

    Instances are the values of the code table defined in this module, which
    is keyed by three-letter code. The table constructs them positionally, so
    the field order here is significant.
    """

    # Alternate three-letter code for the same entry (e.g. 'sqi' on the 'alb'
    # entry); empty string when there is none.
    # NOTE(review): presumably the ISO 639-2/T code — confirm against the source.
    alt: str
    # Two-letter code (e.g. 'sq' on the 'alb' entry); empty string when there
    # is none.
    alpha_2: str
    # English name; entries with several names separate them with '; '.
    english: str
    # French name; entries with several names separate them with '; '.
    french: str


ISO_639_3 = {
    'aar': ISOCodeData('', 'aa', 'Afar', 'afar'),
    'abk': ISOCodeData('', 'ab', 'Abkhazian', 'abkhaze'),
    'ace': ISOCodeData('', '', 'Achinese', 'aceh'),
    'ach': ISOCodeData('', '', 'Acoli', 'acoli'),
    'ada': ISOCodeData('', '', 'Adangme', 'adangme'),
    'ady': ISOCodeData('', '', 'Adyghe; Adygei', 'adyghé'),
    'afa': ISOCodeData(
        '',
        '',
        'Afro-Asiatic languages',
        'afro-asiatiques, langues',
    ),
    'afh': ISOCodeData('', '', 'Afrihili', 'afrihili'),
    'afr': ISOCodeData('', 'af', 'Afrikaans', 'afrikaans'),
    'ain': ISOCodeData('', '', 'Ainu', 'aïnou'),
    'aka': ISOCodeData('', 'ak', 'Akan', 'akan'),
    'akk': ISOCodeData('', '', 'Akkadian', 'akkadien'),
    'alb': ISOCodeData('sqi', 'sq', 'Albanian', 'albanais'),
    'ale': ISOCodeData('', '', 'Aleut', 'aléoute'),
    'alg': ISOCodeData(
        '',
        '',
        'Algonquian languages',
        'algonquines, langues',
    ),
    'alt': ISOCodeData('', '', 'Southern Altai', 'altai du Sud'),
    'amh': ISOCodeData('', 'am', 'Amharic', 'amharique'),
    'ang': ISOCodeData(
        '',
        '',
        'English, Old (ca.450-1100)',
        'anglo-saxon (ca.450-1100)',
    ),
    'anp': ISOCodeData('', '', 'Angika', 'angika'),
    'apa': ISOCodeData('', '', 'Apache languages', 'apaches, langues'),
    'ara': ISOCodeData('', 'ar', 'Arabic', 'arabe'),
    'arc': ISOCodeData(
        '',
        '',
        'Official Aramaic (700-300 BCE); Imperial Aramaic (700-300 BCE)',
        "araméen d'empire (700-300 BCE)",
    ),
    'arg': ISOCodeData('', 'an', 'Aragonese', 'aragonais'),
    'arm': ISOCodeData('hye', 'hy', 'Armenian', 'arménien'),
    'arn': ISOCodeData(
        '',
        '',
        'Mapudungun; Mapuche',
        'mapudungun; mapuche; mapuce',
    ),
    'arp': ISOCodeData('', '', 'Arapaho', 'arapaho'),
    'art': ISOCodeData(
        '',
        '',
        'Artificial languages',
        'artificielles, langues',
    ),
    'arw': ISOCodeData('', '', 'Arawak', 'arawak'),
    'asm': ISOCodeData('', 'as', 'Assamese', 'assamais'),
    'ast': ISOCodeData(
        '',
        '',
        'Asturian; Bable; Leonese; Asturleonese',
        'asturien; bable; léonais; asturoléonais',
    ),
    'ath': ISOCodeData(
        '',
        '',
        'Athapascan languages',
        'athapascanes, langues',
    ),
    'aus': ISOCodeData(
        '',
        '',
        'Australian languages',
        'australiennes, langues',
    ),
    'ava': ISOCodeData('', 'av', 'Avaric', 'avar'),
    'ave': ISOCodeData('', 'ae', 'Avestan', 'avestique'),
    'awa': ISOCodeData('', '', 'Awadhi', 'awadhi'),
    'aym': ISOCodeData('', 'ay', 'Aymara', 'aymara'),
    'aze': ISOCodeData('', 'az', 'Azerbaijani', 'azéri'),
    'bad': ISOCodeData('', '', 'Banda languages', 'banda, langues'),
    'bai': ISOCodeData('', '', 'Bamileke languages', 'bamiléké, langues'),
    'bak': ISOCodeData('', 'ba', 'Bashkir', 'bachkir'),
    'bal': ISOCodeData('', '', 'Baluchi', 'baloutchi'),
    'bam': ISOCodeData('', 'bm', 'Bambara', 'bambara'),
    'ban': ISOCodeData('', '', 'Balinese', 'balinais'),
    'baq': ISOCodeData('eus', 'eu', 'Basque', 'basque'),
    'bas': ISOCodeData('', '', 'Basa', 'basa'),
    'bat': ISOCodeData('', '', 'Baltic languages', 'baltes, langues'),
    'bej': ISOCodeData('', '', 'Beja; Bedawiyet', 'bedja'),
    'bel': ISOCodeData('', 'be', 'Belarusian', 'biélorusse'),
    'bem': ISOCodeData('', '', 'Bemba', 'bemba'),
    'ben': ISOCodeData('', 'bn', 'Bengali', 'bengali'),
    'ber': ISOCodeData('', '', 'Berber languages', 'berbères, langues'),
    'bho': ISOCodeData('', '', 'Bhojpuri', 'bhojpuri'),
    'bih': ISOCodeData('', 'bh', 'Bihari languages', 'langues biharis'),
    'bik': ISOCodeData('', '', 'Bikol', 'bikol'),
    'bin': ISOCodeData('', '', 'Bini; Edo', 'bini; edo'),
    'bis': ISOCodeData('', 'bi', 'Bislama', 'bichlamar'),
    'bla': ISOCodeData('', '', 'Siksika', 'blackfoot'),
    'bnt': ISOCodeData('', '', 'Bantu languages', 'bantou, langues'),
    'bos': ISOCodeData('', 'bs', 'Bosnian', 'bosniaque'),
    'bra': ISOCodeData('', '', 'Braj', 'braj'),
    'bre': ISOCodeData('', 'br', 'Breton', 'breton'),
    'btk': ISOCodeData('', '', 'Batak languages', 'batak, langues'),
    'bua': ISOCodeData('', '', 'Buriat', 'bouriate'),
    'bug': ISOCodeData('', '', 'Buginese', 'bugi'),
    'bul': ISOCodeData('', 'bg', 'Bulgarian', 'bulgare'),
    'bur': ISOCodeData('mya', 'my', 'Burmese', 'birman'),
    'byn': ISOCodeData('', '', 'Blin; Bilin', 'blin; bilen'),
    'cad': ISOCodeData('', '', 'Caddo', 'caddo'),
    'cai': ISOCodeData(
        '',
        '',
        'Central American Indian languages',
        "amérindiennes de L'Amérique centrale, langues",
    ),
    'car': ISOCodeData('', '', 'Galibi Carib', 'karib; galibi; carib'),
    'cat': ISOCodeData('', 'ca', 'Catalan; Valencian', 'catalan; valencien'),
    'cau': ISOCodeData(
        '',
        '',
        'Caucasian languages',
        'caucasiennes, langues',
    ),
    'ceb': ISOCodeData('', '', 'Cebuano', 'cebuano'),
    'cel': ISOCodeData(
        '',
        '',
        'Celtic languages',
        'celtiques, langues; celtes, langues',
    ),
    'cha': ISOCodeData('', 'ch', 'Chamorro', 'chamorro'),
    'chb': ISOCodeData('', '', 'Chibcha', 'chibcha'),
    'che': ISOCodeData('', 'ce', 'Chechen', 'tchétchène'),
    'chg': ISOCodeData('', '', 'Chagatai', 'djaghataï'),
    'chi': ISOCodeData('zho', 'zh', 'Chinese', 'chinois'),
    'chk': ISOCodeData('', '', 'Chuukese', 'chuuk'),
    'chm': ISOCodeData('', '', 'Mari', 'mari'),
    'chn': ISOCodeData('', '', 'Chinook jargon', 'chinook, jargon'),
    'cho': ISOCodeData('', '', 'Choctaw', 'choctaw'),
    'chp': ISOCodeData('', '', 'Chipewyan; Dene Suline', 'chipewyan'),
    'chr': ISOCodeData('', '', 'Cherokee', 'cherokee'),
    'chu': ISOCodeData(
        '',
        'cu',
        (
            'Church Slavic; Old Slavonic; Church Slavonic;'
            ' Old Bulgarian; Old Church Slavonic'
        ),
        "slavon d'église; vieux slave; slavon liturgique; vieux bulgare",
    ),
    'chv': ISOCodeData('', 'cv', 'Chuvash', 'tchouvache'),
    'chy': ISOCodeData('', '', 'Cheyenne', 'cheyenne'),
    'cmc': ISOCodeData('', '', 'Chamic languages', 'chames, langues'),
    'cnr': ISOCodeData('', '', 'Montenegrin', 'monténégrin'),
    'cop': ISOCodeData('', '', 'Coptic', 'copte'),
    'cor': ISOCodeData('', 'kw', 'Cornish', 'cornique'),
    'cos': ISOCodeData('', 'co', 'Corsican', 'corse'),
    'cpe': ISOCodeData(
        '',
        '',
        'Creoles and pidgins, English based',
        "créoles et pidgins basés sur l'anglais",
    ),
    'cpf': ISOCodeData(
        '',
        '',
        'Creoles and pidgins, French-based',
        'créoles et pidgins basés sur le français',
    ),
    'cpp': ISOCodeData(
        '',
        '',
        'Creoles and pidgins, Portuguese-based',
        'créoles et pidgins basés sur le portugais',
    ),
    'cre': ISOCodeData('', 'cr', 'Cree', 'cree'),
    'crh': ISOCodeData(
        '',
        '',
        'Crimean Tatar; Crimean Turkish',
        'tatar de Crimé',
    ),
    'crp': ISOCodeData('', '', 'Creoles and pidgins', 'créoles et pidgins'),
    'csb': ISOCodeData('', '', 'Kashubian', 'kachoube'),
    'cus': ISOCodeData('', '', 'Cushitic languages', 'couchitiques, langues'),
    'cze': ISOCodeData('ces', 'cs', 'Czech', 'tchèque'),
    'dak': ISOCodeData('', '', 'Dakota', 'dakota'),
    'dan': ISOCodeData('', 'da', 'Danish', 'danois'),
    'dar': ISOCodeData('', '', 'Dargwa', 'dargwa'),
    'day': ISOCodeData('', '', 'Land Dayak languages', 'dayak, langues'),
    'del': ISOCodeData('', '', 'Delaware', 'delaware'),
    'den': ISOCodeData('', '', 'Slave (Athapascan)', 'esclave (athapascan)'),
    'dgr': ISOCodeData('', '', 'Dogrib', 'dogrib'),
    'din': ISOCodeData('', '', 'Dinka', 'dinka'),
    'div': ISOCodeData('', 'dv', 'Divehi; Dhivehi; Maldivian', 'maldivien'),
    'doi': ISOCodeData('', '', 'Dogri', 'dogri'),
    'dra': ISOCodeData(
        '',
        '',
        'Dravidian languages',
        'dravidiennes, langues',
    ),
    'dsb': ISOCodeData('', '', 'Lower Sorbian', 'bas-sorabe'),
    'dua': ISOCodeData('', '', 'Duala', 'douala'),
    'dum': ISOCodeData(
        '',
        '',
        'Dutch, Middle (ca.1050-1350)',
        'néerlandais moyen (ca. 1050-1350)',
    ),
    'dut': ISOCodeData('nld', 'nl', 'Dutch; Flemish', 'néerlandais; flamand'),
    'dyu': ISOCodeData('', '', 'Dyula', 'dioula'),
    'dzo': ISOCodeData('', 'dz', 'Dzongkha', 'dzongkha'),
    'efi': ISOCodeData('', '', 'Efik', 'efik'),
    'egy': ISOCodeData('', '', 'Egyptian (Ancient)', 'égyptien'),
    'eka': ISOCodeData('', '', 'Ekajuk', 'ekajuk'),
    'elx': ISOCodeData('', '', 'Elamite', 'élamite'),
    'eng': ISOCodeData('', 'en', 'English', 'anglais'),
    'enm': ISOCodeData(
        '',
        '',
        'English, Middle (1100-1500)',
        'anglais moyen (1100-1500)',
    ),
    'epo': ISOCodeData('', 'eo', 'Esperanto', 'espéranto'),
    'est': ISOCodeData('', 'et', 'Estonian', 'estonien'),
    'ewe': ISOCodeData('', 'ee', 'Ewe', 'éwé'),
    'ewo': ISOCodeData('', '', 'Ewondo', 'éwondo'),
    'fan': ISOCodeData('', '', 'Fang', 'fang'),
    'fao': ISOCodeData('', 'fo', 'Faroese', 'féroïen'),
    'fat': ISOCodeData('', '', 'Fanti', 'fanti'),
    'fij': ISOCodeData('', 'fj', 'Fijian', 'fidjien'),
    'fil': ISOCodeData('', '', 'Filipino; Pilipino', 'filipino; pilipino'),
    'fin': ISOCodeData('', 'fi', 'Finnish', 'finnois'),
    'fiu': ISOCodeData(
        '',
        '',
        'Finno-Ugrian languages',
        'finno-ougriennes, langues',
    ),
    'fon': ISOCodeData('', '', 'Fon', 'fon'),
    'fre': ISOCodeData('fra', 'fr', 'French', 'français'),
    'frm': ISOCodeData(
        '',
        '',
        'French, Middle (ca.1400-1600)',
        'français moyen (1400-1600)',
    ),
    'fro': ISOCodeData(
        '',
        '',
        'French, Old (842-ca.1400)',
        'français ancien (842-ca.1400)',
    ),
    'frr': ISOCodeData('', '', 'Northern Frisian', 'frison septentrional'),
    'frs': ISOCodeData('', '', 'Eastern Frisian', 'frison oriental'),
    'fry': ISOCodeData('', 'fy', 'Western Frisian', 'frison occidental'),
    'ful': ISOCodeData('', 'ff', 'Fulah', 'peul'),
    'fur': ISOCodeData('', '', 'Friulian', 'frioulan'),
    'gaa': ISOCodeData('', '', 'Ga', 'ga'),
    'gay': ISOCodeData('', '', 'Gayo', 'gayo'),
    'gba': ISOCodeData('', '', 'Gbaya', 'gbaya'),
    'gem': ISOCodeData('', '', 'Germanic languages', 'germaniques, langues'),
    'geo': ISOCodeData('kat', 'ka', 'Georgian', 'géorgien'),
    'ger': ISOCodeData('deu', 'de', 'German', 'allemand'),
    'gez': ISOCodeData('', '', 'Geez', 'guèze'),
    'gil': ISOCodeData('', '', 'Gilbertese', 'kiribati'),
    'gla': ISOCodeData(
        '',
        'gd',
        'Gaelic; Scottish Gaelic',
        'gaélique; gaélique écossais',
    ),
    'gle': ISOCodeData('', 'ga', 'Irish', 'irlandais'),
    'glg': ISOCodeData('', 'gl', 'Galician', 'galicien'),
    'glv': ISOCodeData('', 'gv', 'Manx', 'manx; mannois'),
    'gmh': ISOCodeData(
        '',
        '',
        'German, Middle High (ca.1050-1500)',
        'allemand, moyen haut (ca. 1050-1500)',
    ),
    'goh': ISOCodeData(
        '',
        '',
        'German, Old High (ca.750-1050)',
        'allemand, vieux haut (ca. 750-1050)',
    ),
    'gon': ISOCodeData('', '', 'Gondi', 'gond'),
    'gor': ISOCodeData('', '', 'Gorontalo', 'gorontalo'),
    'got': ISOCodeData('', '', 'Gothic', 'gothique'),
    'grb': ISOCodeData('', '', 'Grebo', 'grebo'),
    'grc': ISOCodeData(
        '',
        '',
        'Greek, Ancient (to 1453)',
        "grec ancien (jusqu'à 1453)",
    ),
    'gre': ISOCodeData(
        'ell',
        'el',
        'Greek, Modern (1453-)',
        'grec moderne (après 1453)',
    ),
    'grn': ISOCodeData('', 'gn', 'Guarani', 'guarani'),
    'gsw': ISOCodeData(
        '',
        '',
        'Swiss German; Alemannic; Alsatian',
        'suisse alémanique; alémanique; alsacien',
    ),
    'guj': ISOCodeData('', 'gu', 'Gujarati', 'goudjrati'),
    'gwi': ISOCodeData('', '', "Gwich'in", "gwich'in"),
    'hai': ISOCodeData('', '', 'Haida', 'haida'),
    'hat': ISOCodeData(
        '',
        'ht',
        'Haitian; Haitian Creole',
        'haïtien; créole haïtien',
    ),
    'hau': ISOCodeData('', 'ha', 'Hausa', 'haoussa'),
    'haw': ISOCodeData('', '', 'Hawaiian', 'hawaïen'),
    'heb': ISOCodeData('', 'he', 'Hebrew', 'hébreu'),
    'her': ISOCodeData('', 'hz', 'Herero', 'herero'),
    'hil': ISOCodeData('', '', 'Hiligaynon', 'hiligaynon'),
    'him': ISOCodeData(
        '',
        '',
        'Himachali languages; Western Pahari languages',
        'langues himachalis; langues paharis occidentales',
    ),
    'hin': ISOCodeData('', 'hi', 'Hindi', 'hindi'),
    'hit': ISOCodeData('', '', 'Hittite', 'hittite'),
    'hmn': ISOCodeData('', '', 'Hmong; Mong', 'hmong'),
    'hmo': ISOCodeData('', 'ho', 'Hiri Motu', 'hiri motu'),
    'hrv': ISOCodeData('', 'hr', 'Croatian', 'croate'),
    'hsb': ISOCodeData('', '', 'Upper Sorbian', 'haut-sorabe'),
    'hun': ISOCodeData('', 'hu', 'Hungarian', 'hongrois'),
    'hup': ISOCodeData('', '', 'Hupa', 'hupa'),
    'iba': ISOCodeData('', '', 'Iban', 'iban'),
    'ibo': ISOCodeData('', 'ig', 'Igbo', 'igbo'),
    'ice': ISOCodeData('isl', 'is', 'Icelandic', 'islandais'),
    'ido': ISOCodeData('', 'io', 'Ido', 'ido'),
    'iii': ISOCodeData('', 'ii', 'Sichuan Yi; Nuosu', 'yi de Sichuan'),
    'ijo': ISOCodeData('', '', 'Ijo languages', 'ijo, langues'),
    'iku': ISOCodeData('', 'iu', 'Inuktitut', 'inuktitut'),
    'ile': ISOCodeData('', 'ie', 'Interlingue; Occidental', 'interlingue'),
    'ilo': ISOCodeData('', '', 'Iloko', 'ilocano'),
    'ina': ISOCodeData(
        '',
        'ia',
        'Interlingua (International Auxiliary Language Association)',
        'interlingua (langue auxiliaire internationale)',
    ),
    'inc': ISOCodeData('', '', 'Indic languages', 'indo-aryennes, langues'),
    'ind': ISOCodeData('', 'id', 'Indonesian', 'indonésien'),
    'ine': ISOCodeData(
        '',
        '',
        'Indo-European languages',
        'indo-européennes, langues',
    ),
    'inh': ISOCodeData('', '', 'Ingush', 'ingouche'),
    'ipk': ISOCodeData('', 'ik', 'Inupiaq', 'inupiaq'),
    'ira': ISOCodeData('', '', 'Iranian languages', 'iraniennes, langues'),
    'iro': ISOCodeData('', '', 'Iroquoian languages', 'iroquoises, langues'),
    'ita': ISOCodeData('', 'it', 'Italian', 'italien'),
    'jav': ISOCodeData('', 'jv', 'Javanese', 'javanais'),
    'jbo': ISOCodeData('', '', 'Lojban', 'lojban'),
    'jpn': ISOCodeData('', 'ja', 'Japanese', 'japonais'),
    'jpr': ISOCodeData('', '', 'Judeo-Persian', 'judéo-persan'),
    'jrb': ISOCodeData('', '', 'Judeo-Arabic', 'judéo-arabe'),
    'kaa': ISOCodeData('', '', 'Kara-Kalpak', 'karakalpak'),
    'kab': ISOCodeData('', '', 'Kabyle', 'kabyle'),
    'kac': ISOCodeData('', '', 'Kachin; Jingpho', 'kachin; jingpho'),
    'kal': ISOCodeData('', 'kl', 'Kalaallisut; Greenlandic', 'groenlandais'),
    'kam': ISOCodeData('', '', 'Kamba', 'kamba'),
    'kan': ISOCodeData('', 'kn', 'Kannada', 'kannada'),
    'kar': ISOCodeData('', '', 'Karen languages', 'karen, langues'),
    'kas': ISOCodeData('', 'ks', 'Kashmiri', 'kashmiri'),
    'kau': ISOCodeData('', 'kr', 'Kanuri', 'kanouri'),
    'kaw': ISOCodeData('', '', 'Kawi', 'kawi'),
    'kaz': ISOCodeData('', 'kk', 'Kazakh', 'kazakh'),
    'kbd': ISOCodeData('', '', 'Kabardian', 'kabardien'),
    'kha': ISOCodeData('', '', 'Khasi', 'khasi'),
    'khi': ISOCodeData('', '', 'Khoisan languages', 'khoïsan, langues'),
    'khm': ISOCodeData('', 'km', 'Central Khmer', 'khmer central'),
    'kho': ISOCodeData('', '', 'Khotanese; Sakan', 'khotanais; sakan'),
    'kik': ISOCodeData('', 'ki', 'Kikuyu; Gikuyu', 'kikuyu'),
    'kin': ISOCodeData('', 'rw', 'Kinyarwanda', 'rwanda'),
    'kir': ISOCodeData('', 'ky', 'Kirghiz; Kyrgyz', 'kirghiz'),
    'kmb': ISOCodeData('', '', 'Kimbundu', 'kimbundu'),
    'kok': ISOCodeData('', '', 'Konkani', 'konkani'),
    'kom': ISOCodeData('', 'kv', 'Komi', 'kom'),
    'kon': ISOCodeData('', 'kg', 'Kongo', 'kongo'),
    'kor': ISOCodeData('', 'ko', 'Korean', 'coréen'),
    'kos': ISOCodeData('', '', 'Kosraean', 'kosrae'),
    'kpe': ISOCodeData('', '', 'Kpelle', 'kpellé'),
    'krc': ISOCodeData('', '', 'Karachay-Balkar', 'karatchai balkar'),
    'krl': ISOCodeData('', '', 'Karelian', 'carélien'),
    'kro': ISOCodeData('', '', 'Kru languages', 'krou, langues'),
    'kru': ISOCodeData('', '', 'Kurukh', 'kurukh'),
    'kua': ISOCodeData('', 'kj', 'Kuanyama; Kwanyama', 'kuanyama; kwanyama'),
    'kum': ISOCodeData('', '', 'Kumyk', 'koumyk'),
    'kur': ISOCodeData('', 'ku', 'Kurdish', 'kurde'),
    'kut': ISOCodeData('', '', 'Kutenai', 'kutenai'),
    'lad': ISOCodeData('', '', 'Ladino', 'judéo-espagnol'),
    'lah': ISOCodeData('', '', 'Lahnda', 'lahnda'),
    'lam': ISOCodeData('', '', 'Lamba', 'lamba'),
    'lao': ISOCodeData('', 'lo', 'Lao', 'lao'),
    'lat': ISOCodeData('', 'la', 'Latin', 'latin'),
    'lav': ISOCodeData('', 'lv', 'Latvian', 'letton'),
    'lez': ISOCodeData('', '', 'Lezghian', 'lezghien'),
    'lim': ISOCodeData(
        '',
        'li',
        'Limburgan; Limburger; Limburgish',
        'limbourgeois',
    ),
    'lin': ISOCodeData('', 'ln', 'Lingala', 'lingala'),
    'lit': ISOCodeData('', 'lt', 'Lithuanian', 'lituanien'),
    'lol': ISOCodeData('', '', 'Mongo', 'mongo'),
    'loz': ISOCodeData('', '', 'Lozi', 'lozi'),
    'ltz': ISOCodeData(
        '',
        'lb',
        'Luxembourgish; Letzeburgesch',
        'luxembourgeois',
    ),
    'lua': ISOCodeData('', '', 'Luba-Lulua', 'luba-lulua'),
    'lub': ISOCodeData('', 'lu', 'Luba-Katanga', 'luba-katanga'),
    'lug': ISOCodeData('', 'lg', 'Ganda', 'ganda'),
    'lui': ISOCodeData('', '', 'Luiseno', 'luiseno'),
    'lun': ISOCodeData('', '', 'Lunda', 'lunda'),
    'luo': ISOCodeData(
        '',
        '',
        'Luo (Kenya and Tanzania)',
        'luo (Kenya et Tanzanie)',
    ),
    'lus': ISOCodeData('', '', 'Lushai', 'lushai'),
    'mac': ISOCodeData('mkd', 'mk', 'Macedonian', 'macédonien'),
    'mad': ISOCodeData('', '', 'Madurese', 'madourais'),
    'mag': ISOCodeData('', '', 'Magahi', 'magahi'),
    'mah': ISOCodeData('', 'mh', 'Marshallese', 'marshall'),
    'mai': ISOCodeData('', '', 'Maithili', 'maithili'),
    'mak': ISOCodeData('', '', 'Makasar', 'makassar'),
    'mal': ISOCodeData('', 'ml', 'Malayalam', 'malayalam'),
    'man': ISOCodeData('', '', 'Mandingo', 'mandingue'),
    'mao': ISOCodeData('mri', 'mi', 'Maori', 'maori'),
    'map': ISOCodeData(
        '',
        '',
        'Austronesian languages',
        'austronésiennes, langues',
    ),
    'mar': ISOCodeData('', 'mr', 'Marathi', 'marathe'),
    'mas': ISOCodeData('', '', 'Masai', 'massaï'),
    'may': ISOCodeData('msa', 'ms', 'Malay', 'malais'),
    'mdf': ISOCodeData('', '', 'Moksha', 'moksa'),
    'mdr': ISOCodeData('', '', 'Mandar', 'mandar'),
    'men': ISOCodeData('', '', 'Mende', 'mendé'),
    'mga': ISOCodeData(
        '',
        '',
        'Irish, Middle (900-1200)',
        'irlandais moyen (900-1200)',
    ),
    'mic': ISOCodeData('', '', "Mi'kmaq; Micmac", "mi'kmaq; micmac"),
    'min': ISOCodeData('', '', 'Minangkabau', 'minangkabau'),
    'mis': ISOCodeData('', '', 'Uncoded languages', 'langues non codées'),
    'mkh': ISOCodeData('', '', 'Mon-Khmer languages', 'môn-khmer, langues'),
    'mlg': ISOCodeData('', 'mg', 'Malagasy', 'malgache'),
    'mlt': ISOCodeData('', 'mt', 'Maltese', 'maltais'),
    'mnc': ISOCodeData('', '', 'Manchu', 'mandchou'),
    'mni': ISOCodeData('', '', 'Manipuri', 'manipuri'),
    'mno': ISOCodeData('', '', 'Manobo languages', 'manobo, langues'),
    'moh': ISOCodeData('', '', 'Mohawk', 'mohawk'),
    'mon': ISOCodeData('', 'mn', 'Mongolian', 'mongol'),
    'mos': ISOCodeData('', '', 'Mossi', 'moré'),
    'mul': ISOCodeData('', '', 'Multiple languages', 'multilingue'),
    'mun': ISOCodeData('', '', 'Munda languages', 'mounda, langues'),
    'mus': ISOCodeData('', '', 'Creek', 'muskogee'),
    'mwl': ISOCodeData('', '', 'Mirandese', 'mirandais'),
    'mwr': ISOCodeData('', '', 'Marwari', 'marvari'),
    'myn': ISOCodeData('', '', 'Mayan languages', 'maya, langues'),
    'myv': ISOCodeData('', '', 'Erzya', 'erza'),
    'nah': ISOCodeData('', '', 'Nahuatl languages', 'nahuatl, langues'),
    'nai': ISOCodeData(
        '',
        '',
        'North American Indian languages',
        'nord-amérindiennes, langues',
    ),
    'nap': ISOCodeData('', '', 'Neapolitan', 'napolitain'),
    'nau': ISOCodeData('', 'na', 'Nauru', 'nauruan'),
    'nav': ISOCodeData('', 'nv', 'Navajo; Navaho', 'navaho'),
    'nbl': ISOCodeData(
        '',
        'nr',
        'Ndebele, South; South Ndebele',
        'ndébélé du Sud',
    ),
    'nde': ISOCodeData(
        '',
        'nd',
        'Ndebele, North; North Ndebele',
        'ndébélé du Nord',
    ),
    'ndo': ISOCodeData('', 'ng', 'Ndonga', 'ndonga'),
    'nds': ISOCodeData(
        '',
        '',
        'Low German; Low Saxon; German, Low; Saxon, Low',
        'bas allemand; bas saxon; allemand, bas; saxon, bas',
    ),
    'nep': ISOCodeData('', 'ne', 'Nepali', 'népalais'),
    'new': ISOCodeData('', '', 'Nepal Bhasa; Newari', 'nepal bhasa; newari'),
    'nia': ISOCodeData('', '', 'Nias', 'nias'),
    'nic': ISOCodeData(
        '',
        '',
        'Niger-Kordofanian languages',
        'nigéro-kordofaniennes, langues',
    ),
    'niu': ISOCodeData('', '', 'Niuean', 'niué'),
    'nno': ISOCodeData(
        '',
        'nn',
        'Norwegian Nynorsk; Nynorsk, Norwegian',
        'norvégien nynorsk; nynorsk, norvégien',
    ),
    'nob': ISOCodeData(
        '',
        'nb',
        'Bokmål, Norwegian; Norwegian Bokmål',
        'norvégien bokmål',
    ),
    'nog': ISOCodeData('', '', 'Nogai', 'nogaï; nogay'),
    'non': ISOCodeData('', '', 'Norse, Old', 'norrois, vieux'),
    'nor': ISOCodeData('', 'no', 'Norwegian', 'norvégien'),
    'nqo': ISOCodeData('', '', "N'Ko", "n'ko"),
    'nso': ISOCodeData(
        '',
        '',
        'Pedi; Sepedi; Northern Sotho',
        'pedi; sepedi; sotho du Nord',
    ),
    'nub': ISOCodeData('', '', 'Nubian languages', 'nubiennes, langues'),
    'nwc': ISOCodeData(
        '',
        '',
        'Classical Newari; Old Newari; Classical Nepal Bhasa',
        'newari classique',
    ),
    'nya': ISOCodeData(
        '',
        'ny',
        'Chichewa; Chewa; Nyanja',
        'chichewa; chewa; nyanja',
    ),
    'nym': ISOCodeData('', '', 'Nyamwezi', 'nyamwezi'),
    'nyn': ISOCodeData('', '', 'Nyankole', 'nyankolé'),
    'nyo': ISOCodeData('', '', 'Nyoro', 'nyoro'),
    'nzi': ISOCodeData('', '', 'Nzima', 'nzema'),
    'oci': ISOCodeData(
        '',
        'oc',
        'Occitan (post 1500)',
        'occitan (après 1500)',
    ),
    'oji': ISOCodeData('', 'oj', 'Ojibwa', 'ojibwa'),
    'ori': ISOCodeData('', 'or', 'Oriya', 'oriya'),
    'orm': ISOCodeData('', 'om', 'Oromo', 'galla'),
    'osa': ISOCodeData('', '', 'Osage', 'osage'),
    'oss': ISOCodeData('', 'os', 'Ossetian; Ossetic', 'ossète'),
    'ota': ISOCodeData(
        '',
        '',
        'Turkish, Ottoman (1500-1928)',
        'turc ottoman (1500-1928)',
    ),
    'oto': ISOCodeData('', '', 'Otomian languages', 'otomi, langues'),
    'paa': ISOCodeData('', '', 'Papuan languages', 'papoues, langues'),
    'pag': ISOCodeData('', '', 'Pangasinan', 'pangasinan'),
    'pal': ISOCodeData('', '', 'Pahlavi', 'pahlavi'),
    'pam': ISOCodeData('', '', 'Pampanga; Kapampangan', 'pampangan'),
    'pan': ISOCodeData('', 'pa', 'Panjabi; Punjabi', 'pendjabi'),
    'pap': ISOCodeData('', '', 'Papiamento', 'papiamento'),
    'pau': ISOCodeData('', '', 'Palauan', 'palau'),
    'peo': ISOCodeData(
        '',
        '',
        'Persian, Old (ca.600-400 B.C.)',
        'perse, vieux (ca. 600-400 av. J.-C.)',
    ),
    'per': ISOCodeData('fas', 'fa', 'Persian', 'persan'),
    'phi': ISOCodeData(
        '',
        '',
        'Philippine languages',
        'philippines, langues',
    ),
    'phn': ISOCodeData('', '', 'Phoenician', 'phénicien'),
    'pli': ISOCodeData('', 'pi', 'Pali', 'pali'),
    'pol': ISOCodeData('', 'pl', 'Polish', 'polonais'),
    'pon': ISOCodeData('', '', 'Pohnpeian', 'pohnpei'),
    'por': ISOCodeData('', 'pt', 'Portuguese', 'portugais'),
    'pra': ISOCodeData('', '', 'Prakrit languages', 'prâkrit, langues'),
    'pro': ISOCodeData(
        '',
        '',
        'Provençal, Old (to 1500); Occitan, Old (to 1500)',
        "provençal ancien (jusqu'à 1500); occitan ancien (jusqu'à 1500)",
    ),
    'pus': ISOCodeData('', 'ps', 'Pushto; Pashto', 'pachto'),
    'qaa': ISOCodeData(
        '',
        '',
        'Reserved for local use',
        "réservée à l'usage local",
    ),
    'que': ISOCodeData('', 'qu', 'Quechua', 'quechua'),
    'raj': ISOCodeData('', '', 'Rajasthani', 'rajasthani'),
    'rap': ISOCodeData('', '', 'Rapanui', 'rapanui'),
    'rar': ISOCodeData(
        '',
        '',
        'Rarotongan; Cook Islands Maori',
        'rarotonga; maori des îles Cook',
    ),
    'roa': ISOCodeData('', '', 'Romance languages', 'romanes, langues'),
    'roh': ISOCodeData('', 'rm', 'Romansh', 'romanche'),
    'rom': ISOCodeData('', '', 'Romany', 'tsigane'),
    'rum': ISOCodeData(
        'ron',
        'ro',
        'Romanian; Moldavian; Moldovan',
        'roumain; moldave',
    ),
    'run': ISOCodeData('', 'rn', 'Rundi', 'rundi'),
    'rup': ISOCodeData(
        '',
        '',
        'Aromanian; Arumanian; Macedo-Romanian',
        'aroumain; macédo-roumain',
    ),
    'rus': ISOCodeData('', 'ru', 'Russian', 'russe'),
    'sad': ISOCodeData('', '', 'Sandawe', 'sandawe'),
    'sag': ISOCodeData('', 'sg', 'Sango', 'sango'),
    'sah': ISOCodeData('', '', 'Yakut', 'iakoute'),
    'sai': ISOCodeData(
        '',
        '',
        'South American Indian languages',
        'sud-amérindiennes, langues',
    ),
    'sal': ISOCodeData('', '', 'Salishan languages', 'salishennes, langues'),
    'sam': ISOCodeData('', '', 'Samaritan Aramaic', 'samaritain'),
    'san': ISOCodeData('', 'sa', 'Sanskrit', 'sanskrit'),
    'sas': ISOCodeData('', '', 'Sasak', 'sasak'),
    'sat': ISOCodeData('', '', 'Santali', 'santal'),
    'scn': ISOCodeData('', '', 'Sicilian', 'sicilien'),
    'sco': ISOCodeData('', '', 'Scots', 'écossais'),
    'sel': ISOCodeData('', '', 'Selkup', 'selkoupe'),
    'sem': ISOCodeData('', '', 'Semitic languages', 'sémitiques, langues'),
    'sga': ISOCodeData(
        '',
        '',
        'Irish, Old (to 900)',
        "irlandais ancien (jusqu'à 900)",
    ),
    'sgn': ISOCodeData('', '', 'Sign Languages', 'langues des signes'),
    'shn': ISOCodeData('', '', 'Shan', 'chan'),
    'sid': ISOCodeData('', '', 'Sidamo', 'sidamo'),
    'sin': ISOCodeData('', 'si', 'Sinhala; Sinhalese', 'singhalais'),
    'sio': ISOCodeData('', '', 'Siouan languages', 'sioux, langues'),
    'sit': ISOCodeData(
        '',
        '',
        'Sino-Tibetan languages',
        'sino-tibétaines, langues',
    ),
    'sla': ISOCodeData('', '', 'Slavic languages', 'slaves, langues'),
    'slo': ISOCodeData('slk', 'sk', 'Slovak', 'slovaque'),
    'slv': ISOCodeData('', 'sl', 'Slovenian', 'slovène'),
    'sma': ISOCodeData('', '', 'Southern Sami', 'sami du Sud'),
    'sme': ISOCodeData('', 'se', 'Northern Sami', 'sami du Nord'),
    'smi': ISOCodeData('', '', 'Sami languages', 'sames, langues'),
    'smj': ISOCodeData('', '', 'Lule Sami', 'sami de Lule'),
    'smn': ISOCodeData('', '', 'Inari Sami', "sami d'Inari"),
    'smo': ISOCodeData('', 'sm', 'Samoan', 'samoan'),
    'sms': ISOCodeData('', '', 'Skolt Sami', 'sami skolt'),
    'sna': ISOCodeData('', 'sn', 'Shona', 'shona'),
    'snd': ISOCodeData('', 'sd', 'Sindhi', 'sindhi'),
    'snk': ISOCodeData('', '', 'Soninke', 'soninké'),
    'sog': ISOCodeData('', '', 'Sogdian', 'sogdien'),
    'som': ISOCodeData('', 'so', 'Somali', 'somali'),
    'son': ISOCodeData('', '', 'Songhai languages', 'songhai, langues'),
    'sot': ISOCodeData('', 'st', 'Sotho, Southern', 'sotho du Sud'),
    'spa': ISOCodeData('', 'es', 'Spanish; Castilian', 'espagnol; castillan'),
    'srd': ISOCodeData('', 'sc', 'Sardinian', 'sarde'),
    'srn': ISOCodeData('', '', 'Sranan Tongo', 'sranan tongo'),
    'srp': ISOCodeData('', 'sr', 'Serbian', 'serbe'),
    'srr': ISOCodeData('', '', 'Serer', 'sérère'),
    'ssa': ISOCodeData(
        '',
        '',
        'Nilo-Saharan languages',
        'nilo-sahariennes, langues',
    ),
    'ssw': ISOCodeData('', 'ss', 'Swati', 'swati'),
    'suk': ISOCodeData('', '', 'Sukuma', 'sukuma'),
    'sun': ISOCodeData('', 'su', 'Sundanese', 'soundanais'),
    'sus': ISOCodeData('', '', 'Susu', 'soussou'),
    'sux': ISOCodeData('', '', 'Sumerian', 'sumérien'),
    'swa': ISOCodeData('', 'sw', 'Swahili', 'swahili'),
    'swe': ISOCodeData('', 'sv', 'Swedish', 'suédois'),
    'syc': ISOCodeData('', '', 'Classical Syriac', 'syriaque classique'),
    'syr': ISOCodeData('', '', 'Syriac', 'syriaque'),
    'tah': ISOCodeData('', 'ty', 'Tahitian', 'tahitien'),
    'tai': ISOCodeData('', '', 'Tai languages', 'tai, langues'),
    'tam': ISOCodeData('', 'ta', 'Tamil', 'tamoul'),
    'tat': ISOCodeData('', 'tt', 'Tatar', 'tatar'),
    'tel': ISOCodeData('', 'te', 'Telugu', 'télougou'),
    'tem': ISOCodeData('', '', 'Timne', 'temne'),
    'ter': ISOCodeData('', '', 'Tereno', 'tereno'),
    'tet': ISOCodeData('', '', 'Tetum', 'tetum'),
    'tgk': ISOCodeData('', 'tg', 'Tajik', 'tadjik'),
    'tgl': ISOCodeData('', 'tl', 'Tagalog', 'tagalog'),
    'tha': ISOCodeData('', 'th', 'Thai', 'thaï'),
    'tib': ISOCodeData('bod', 'bo', 'Tibetan', 'tibétain'),
    'tig': ISOCodeData('', '', 'Tigre', 'tigré'),
    'tir': ISOCodeData('', 'ti', 'Tigrinya', 'tigrigna'),
    'tiv': ISOCodeData('', '', 'Tiv', 'tiv'),
    'tkl': ISOCodeData('', '', 'Tokelau', 'tokelau'),
    'tlh': ISOCodeData('', '', 'Klingon; tlhIngan-Hol', 'klingon'),
    'tli': ISOCodeData('', '', 'Tlingit', 'tlingit'),
    'tmh': ISOCodeData('', '', 'Tamashek', 'tamacheq'),
    'tog': ISOCodeData('', '', 'Tonga (Nyasa)', 'tonga (Nyasa)'),
    'ton': ISOCodeData(
        '',
        'to',
        'Tonga (Tonga Islands)',
        'tongan (Îles Tonga)',
    ),
    'tpi': ISOCodeData('', '', 'Tok Pisin', 'tok pisin'),
    'tsi': ISOCodeData('', '', 'Tsimshian', 'tsimshian'),
    'tsn': ISOCodeData('', 'tn', 'Tswana', 'tswana'),
    'tso': ISOCodeData('', 'ts', 'Tsonga', 'tsonga'),
    'tuk': ISOCodeData('', 'tk', 'Turkmen', 'turkmène'),
    'tum': ISOCodeData('', '', 'Tumbuka', 'tumbuka'),
    'tup': ISOCodeData('', '', 'Tupi languages', 'tupi, langues'),
    'tur': ISOCodeData('', 'tr', 'Turkish', 'turc'),
    'tut': ISOCodeData('', '', 'Altaic languages', 'altaïques, langues'),
    'tvl': ISOCodeData('', '', 'Tuvalu', 'tuvalu'),
    'twi': ISOCodeData('', 'tw', 'Twi', 'twi'),
    'tyv': ISOCodeData('', '', 'Tuvinian', 'touva'),
    'udm': ISOCodeData('', '', 'Udmurt', 'oudmourte'),
    'uga': ISOCodeData('', '', 'Ugaritic', 'ougaritique'),
    'uig': ISOCodeData('', 'ug', 'Uighur; Uyghur', 'ouïgour'),
    'ukr': ISOCodeData('', 'uk', 'Ukrainian', 'ukrainien'),
    'umb': ISOCodeData('', '', 'Umbundu', 'umbundu'),
    'und': ISOCodeData('', '', 'Undetermined', 'indéterminée'),
    'urd': ISOCodeData('', 'ur', 'Urdu', 'ourdou'),
    'uzb': ISOCodeData('', 'uz', 'Uzbek', 'ouszbek'),
    'vai': ISOCodeData('', '', 'Vai', 'vaï'),
    'ven': ISOCodeData('', 've', 'Venda', 'venda'),
    'vie': ISOCodeData('', 'vi', 'Vietnamese', 'vietnamien'),
    'vol': ISOCodeData('', 'vo', 'Volapük', 'volapük'),
    'vot': ISOCodeData('', '', 'Votic', 'vote'),
    'wak': ISOCodeData('', '', 'Wakashan languages', 'wakashanes, langues'),
    'wal': ISOCodeData('', '', 'Wolaitta; Wolaytta', 'wolaitta; wolaytta'),
    'war': ISOCodeData('', '', 'Waray', 'waray'),
    'was': ISOCodeData('', '', 'Washo', 'washo'),
    'wel': ISOCodeData('cym', 'cy', 'Welsh', 'gallois'),
    'wen': ISOCodeData('', '', 'Sorbian languages', 'sorabes, langues'),
    'wln': ISOCodeData('', 'wa', 'Walloon', 'wallon'),
    'wol': ISOCodeData('', 'wo', 'Wolof', 'wolof'),
    'xal': ISOCodeData('', '', 'Kalmyk; Oirat', 'kalmouk; oïrat'),
    'xho': ISOCodeData('', 'xh', 'Xhosa', 'xhosa'),
    'yao': ISOCodeData('', '', 'Yao', 'yao'),
    'yap': ISOCodeData('', '', 'Yapese', 'yapois'),
    'yid': ISOCodeData('', 'yi', 'Yiddish', 'yiddish'),
    'yor': ISOCodeData('', 'yo', 'Yoruba', 'yoruba'),
    'ypk': ISOCodeData('', '', 'Yupik languages', 'yupik, langues'),
    'zap': ISOCodeData('', '', 'Zapotec', 'zapotèque'),
    'zbl': ISOCodeData(
        '',
        '',
        'Blissymbols; Blissymbolics; Bliss',
        'symboles Bliss; Bliss',
    ),
    'zen': ISOCodeData('', '', 'Zenaga', 'zenaga'),
    'zgh': ISOCodeData(
        '',
        '',
        'Standard Moroccan Tamazight',
        'amazighe standard marocain',
    ),
    'zha': ISOCodeData('', 'za', 'Zhuang; Chuang', 'zhuang; chuang'),
    'znd': ISOCodeData('', '', 'Zande languages', 'zandé, langues'),
    'zul': ISOCodeData('', 'zu', 'Zulu', 'zoulou'),
    'zun': ISOCodeData('', '', 'Zuni', 'zuni'),
    'zxx': ISOCodeData(
        '',
        '',
        'No linguistic content; Not applicable',
        'pas de contenu linguistique; non applicable',
    ),
    'zza': ISOCodeData(
        '',
        '',
        'Zaza; Dimili; Dimli; Kirdki; Kirmanjki; Zazaki',
        'zaza; dimili; dimli; kirdki; kirmanjki; zazaki',
    ),
}


def iso_639_2_from_3(iso3: str) -> str:
    """Return the ``alpha_2`` field recorded for a three-letter language code.

    Args:
        iso3: Key to look up in the ``ISO_639_3`` table.

    Returns:
        The ``alpha_2`` attribute of the matching table entry, or an empty
        string when ``iso3`` is not a key of the table.
    """
    try:
        entry = ISO_639_3[iso3]
    except KeyError:
        return ""
    return entry.alpha_2


================================================
FILE: src/ocrmypdf/models/__init__.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""OCRmyPDF models for plugin options and cross-cutting concerns."""

from __future__ import annotations


================================================
FILE: src/ocrmypdf/models/ocr_element.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""OCR element dataclasses for representing OCR output structure.

This module provides a generic, engine-agnostic representation of OCR output.
The OcrElement dataclass can represent structural units from any OCR source
(hOCR, ALTO, custom engines, etc.) in a unified format suitable for rendering.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Literal


@dataclass
class BoundingBox:
    """Axis-aligned rectangle expressed in pixel coordinates.

    The origin is the top-left corner (as is standard for images and hOCR),
    so ``top`` is never numerically greater than ``bottom``.

    Attributes:
        left: x-coordinate of the left edge
        top: y-coordinate of the top edge
        right: x-coordinate of the right edge
        bottom: y-coordinate of the bottom edge

    Raises:
        ValueError: On construction, if ``right < left`` or ``bottom < top``.
    """

    left: float
    top: float
    right: float
    bottom: float

    def __post_init__(self):
        """Reject boxes whose far edge precedes the near edge on either axis."""
        # The horizontal axis is checked before the vertical one.
        for far_name, near_name in (("right", "left"), ("bottom", "top")):
            far = getattr(self, far_name)
            near = getattr(self, near_name)
            if far < near:
                raise ValueError(
                    f"Invalid bounding box: {far_name} ({far}) < {near_name} ({near})"
                )

    @property
    def width(self) -> float:
        """Horizontal extent: ``right - left``."""
        extent = self.right - self.left
        return extent

    @property
    def height(self) -> float:
        """Vertical extent: ``bottom - top``."""
        extent = self.bottom - self.top
        return extent


@dataclass
class Baseline:
    """Linear description of the line on which text characters sit.

    The baseline follows ``y = slope * x + intercept``, measured relative to
    the bottom-left corner of the enclosing line's bounding box.

    This mirrors hOCR, where the baseline is given relative to the bottom of
    the line's bbox: the intercept is the vertical offset from that bottom
    edge and the slope expresses rotation (positive means the text ascends
    from left to right).

    Attributes:
        slope: Rise over run of the baseline; defaults to 0.0
        intercept: Vertical offset from the bbox bottom; defaults to 0.0
    """

    slope: float = 0.0
    intercept: float = 0.0


@dataclass
class FontInfo:
    """Typeface and styling details used when rendering text.

    Every field is optional; an instance created with no arguments carries
    no font name or size and has all style flags switched off.

    Attributes:
        name: Font family name (e.g., "Times New Roman"), or None
        size: Font size in points, or None
        bold: True if the font is bold
        italic: True if the font is italic
        monospace: True if the font is monospace
        serif: True if the font is serif (as opposed to sans-serif)
        smallcaps: True if the font uses small caps
        underline: True if the text is underlined
    """

    name: str | None = None
    size: float | None = None
    bold: bool = False
    italic: bool = False
    monospace: bool = False
    serif: bool = False
    smallcaps: bool = False
    underline: bool = False


@dataclass
class OcrElement:
    """One node in the tree of OCR output, at any structural level.

    Elements nest: a page holds paragraphs, a paragraph holds lines, a line
    holds words, and so on. The exact hierarchy is up to the OCR engine; this
    dataclass is able to describe any of those levels.

    ``ocr_class`` borrows the hOCR vocabulary (ocr_page, ocr_par, ocr_line,
    ocrx_word, ...) as a shared naming scheme. Output from other sources can
    be mapped onto the same names.

    Common hOCR classes:
        - ocr_page: The root element for a page
        - ocr_carea: A content/column area
        - ocr_par: A paragraph
        - ocr_line: A line of text
        - ocr_header: A header line
        - ocr_footer: A footer line
        - ocr_caption: A caption line
        - ocr_textfloat: A floating text element
        - ocrx_word: A single word

    Attributes:
        ocr_class: The element type (e.g., "ocr_page", "ocr_line", "ocrx_word")
        bbox: Axis-aligned bounding box in source pixel coordinates (top-left origin)
        poly: Polygon vertices for oriented/non-rectangular bounds
        text: Text content (primarily for leaf nodes like words)
        confidence: OCR confidence score (0.0-1.0)
        children: Child elements (hierarchical structure)
        direction: Text direction ("ltr" or "rtl")
        language: Language code (e.g., "eng", "deu", "chi_sim")
        baseline: Text baseline information (slope and intercept)
        textangle: Text rotation angle in degrees (counter-clockwise from horizontal)
        font: Font information (name, size, style)
        dpi: Image resolution in dots per inch (typically for page-level)
        page_number: Physical page number (0-indexed)
        logical_page_number: Logical page number (as printed on the page)
    """

    ocr_class: str

    # Geometry
    bbox: BoundingBox | None = None
    poly: list[tuple[float, float]] | None = None

    # Recognized text
    text: str = ""

    # Recognition confidence in the range 0.0-1.0
    confidence: float | None = None

    # Nested elements; each instance gets its own fresh list
    children: list[OcrElement] = field(default_factory=list)

    # Reading direction and language
    direction: Literal["ltr", "rtl"] | None = None
    language: str | None = None

    # Baseline (relevant for line elements)
    baseline: Baseline | None = None

    # Counter-clockwise rotation in degrees
    textangle: float | None = None

    # Font details
    font: FontInfo | None = None

    # Properties that normally only apply to a page element
    dpi: float | None = None
    page_number: int | None = None
    logical_page_number: int | None = None

    def iter_by_class(self, *ocr_classes: str) -> list[OcrElement]:
        """Collect every element of the subtree whose class is requested.

        The search covers this element itself as well as all descendants.

        Args:
            *ocr_classes: One or more ocr_class values to match

        Returns:
            Matching elements as a list, in depth-first (pre-order) order
        """
        matches = [self] if self.ocr_class in ocr_classes else []
        for child in self.children:
            matches += child.iter_by_class(*ocr_classes)
        return matches

    def find_by_class(self, *ocr_classes: str) -> OcrElement | None:
        """Return the first element of the subtree whose class is requested.

        This element itself is tested before its children; children are then
        searched depth-first in order.

        Args:
            *ocr_classes: One or more ocr_class values to match

        Returns:
            The first matching element, or None when nothing matches
        """
        if self.ocr_class in ocr_classes:
            return self
        # Lazy generator: stops descending as soon as one child yields a hit.
        hits = (child.find_by_class(*ocr_classes) for child in self.children)
        return next((hit for hit in hits if hit is not None), None)

    def get_text_recursive(self) -> str:
        """Return the text of this element, or else of its descendants.

        If this element has non-empty ``text`` it is returned as is and the
        children are not consulted. Otherwise the non-empty texts of the
        children (computed the same way) are joined with single spaces.

        Returns:
            The combined text content
        """
        if self.text:
            return self.text
        pieces = (child.get_text_recursive() for child in self.children)
        return " ".join(filter(None, pieces))

    @property
    def words(self) -> list[OcrElement]:
        """All ocrx_word elements found in this element's subtree."""
        return self.iter_by_class("ocrx_word")

    @property
    def lines(self) -> list[OcrElement]:
        """All line-like elements found in this element's subtree."""
        line_classes = (
            "ocr_line",
            "ocr_header",
            "ocr_footer",
            "ocr_caption",
            "ocr_textfloat",
        )
        return self.iter_by_class(*line_classes)

    @property
    def paragraphs(self) -> list[OcrElement]:
        """All ocr_par elements found in this element's subtree."""
        return self.iter_by_class("ocr_par")


# Type alias for the two permitted text direction values:
# "ltr" (left-to-right) and "rtl" (right-to-left)
TextDirection = Literal["ltr", "rtl"]


# Symbolic names for the hOCR class strings
class OcrClass:
    """Namespace of string constants naming common OCR element classes."""

    # Page and content-area level
    PAGE = "ocr_page"
    CAREA = "ocr_carea"

    # Paragraph (block) level
    PARAGRAPH = "ocr_par"

    # Line level, including the specialised line kinds
    LINE = "ocr_line"
    HEADER = "ocr_header"
    FOOTER = "ocr_footer"
    CAPTION = "ocr_caption"
    TEXTFLOAT = "ocr_textfloat"

    # Word level
    WORD = "ocrx_word"

    # Character level
    CHAR = "ocrx_cinfo"

    # Every class that should be treated as a line
    LINE_TYPES = frozenset((LINE, HEADER, FOOTER, CAPTION, TEXTFLOAT))


================================================
FILE: src/ocrmypdf/optimize.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Post-processing image optimization of OCR PDFs."""

from __future__ import annotations

import logging
import sys
import tempfile
import threading
from collections.abc import Callable, Iterator, MutableSet, Sequence
from os import fspath
from pathlib import Path
from typing import Any, NamedTuple, NewType
from zlib import compress

import img2pdf
from packaging.version import Version
from pikepdf import (
    Array,
    Dictionary,
    Name,
    Object,
    ObjectStreamMode,
    Pdf,
    PdfError,
    PdfImage,
    Stream,
    UnsupportedImageTypeError,
)
from pikepdf.models.image import HifiPrintImageNotTranscodableError
from PIL import Image

from ocrmypdf._concurrent import Executor, SerialExecutor
from ocrmypdf._exec import ghostscript, jbig2enc, pngquant
from ocrmypdf._jobcontext import PdfContext
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.exceptions import OutputFileAccessError
from ocrmypdf.helpers import IMG2PDF_KWARGS, safe_symlink

log = logging.getLogger(__name__)

DEFAULT_JPEG_QUALITY = 75
DEFAULT_PNG_QUALITY = 70
FLATE_JPEG_THRESHOLD = 10000


# Distinct static type for a PDF object number (the first item of an
# object's ``objgen`` pair); at runtime it is a plain int.
Xref = NewType('Xref', int)


class XrefExt(NamedTuple):
    """Pairs the xref of a PDF image with a file extension.

    Attributes:
        xref: Object number of the image in the PDF
        ext: File extension, including its leading dot (e.g. ".png")
    """

    xref: Xref
    ext: str


def img_name(root: Path, xref: Xref, ext: str) -> Path:
    """Build the path of the image file for ``xref`` inside ``root``.

    The file name is the xref zero-padded to at least eight decimal digits,
    followed directly by ``ext``. No dot is inserted, so ``ext`` must carry
    its own leading dot.
    """
    filename = f'{xref:08d}{ext}'
    return root / filename


def png_name(root: Path, xref: Xref) -> Path:
    """Build the path of the ``.png`` file for ``xref`` inside ``root``."""
    png_path = img_name(root, xref, '.png')
    return png_path


def jpg_name(root: Path, xref: Xref) -> Path:
    """Build the path of the ``.jpg`` file for ``xref`` inside ``root``."""
    jpg_path = img_name(root, xref, '.jpg')
    return jpg_path


def extract_image_filter(
    image: Stream, xref: Xref
) -> tuple[PdfImage, tuple[Name, Object]] | None:
    """Determine if an image is extractable.

    Applies a series of rejection tests to the stream. Each rejection (other
    than a wrong Subtype) is reported with a debug log message.

    Args:
        image: The PDF stream object to examine.
        xref: Object number of ``image``; only used in log messages.

    Returns:
        ``None`` if the image should be left alone. Otherwise a tuple of the
        ``PdfImage`` wrapper and the ``(filter, decode parms)`` pair that
        describes the image's relevant compression filter.
    """
    # Only image XObjects are of interest.
    if image.Subtype != Name.Image:
        return None
    # Streams shorter than 100 bytes, or with a non-integer Length, are skipped.
    if not isinstance(image.Length, int) or image.Length < 100:
        log.debug(f"xref {xref}: skipping image with small stream size")
        return None
    # Images under 8 pixels on either side, or with non-integer dimensions,
    # are skipped.
    if (
        not isinstance(image.Width, int)
        or not isinstance(image.Height, int)
        or image.Width < 8
        or image.Height < 8
    ):  # Issue 732
        log.debug(f"xref {xref}: skipping image with unusually small dimensions")
        return None

    pim = PdfImage(image)

    if len(pim.filter_decodeparms) > 1:
        first_filtdp = pim.filter_decodeparms[0]
        second_filtdp = pim.filter_decodeparms[1]
        # The only multi-filter chain accepted is exactly two filters:
        # FlateDecode with a Predictor of 1 (or none specified), followed by
        # DCTDecode with no (or empty) decode parameters. In that case the
        # DCTDecode entry is the one returned.
        if (
            len(pim.filter_decodeparms) == 2
            and first_filtdp[0] == Name.FlateDecode
            and first_filtdp[1] is not None
            and first_filtdp[1].get(Name.Predictor, 1) == 1
            and second_filtdp[0] == Name.DCTDecode
            and not second_filtdp[1]
        ):
            log.debug(
                f"xref {xref}: found image compressed as /FlateDecode /DCTDecode, "
                "marked for JPEG optimization"
            )
            filtdp = pim.filter_decodeparms[1]
        else:
            log.debug(f"xref {xref}: skipping image with multiple compression filters")
            return None
    else:
        # NOTE(review): assumes filter_decodeparms has at least one entry;
        # confirm what pikepdf reports for a stream with no /Filter.
        filtdp = pim.filter_decodeparms[0]

    if pim.bits_per_component > 8:
        log.debug(f"xref {xref}: skipping wide gamut image")
        return None  # Don't mess with wide gamut images

    if filtdp[0] == Name.JPXDecode:
        log.debug(f"xref {xref}: skipping JPEG2000 image")
        return None  # Don't do JPEG2000

    # A /K value >= 0 (or a missing /K, which defaults to 0 here) is treated
    # as CCITT Group 3.
    # NOTE(review): assumes filtdp[1] is a mapping rather than None here.
    if filtdp[0] == Name.CCITTFaxDecode and filtdp[1].get('/K', 0) >= 0:
        log.debug(f"xref {xref}: skipping CCITT Group 3 image")
        return None  # pikepdf doesn't support Group 3 yet

    if Name.Decode in image:
        log.debug(f"xref {xref}: skipping image with Decode table")
        return None  # Don't mess with custom Decode tables
    if image.get(Name.SMask, Dictionary()).get(Name.Matte, None) is not None:
        # https://github.com/ocrmypdf/OCRmyPDF/issues/1536
        # Do not attempt to optimize images that have a SMask with a Matte.
        # That means alpha channel pre-blending is used, and we're not prepared
        # to deal with the complexities of that.
        log.debug(f"xref {xref}: skipping image whose SMask has Matte")
        return None

    return pim, filtdp


def extract_image_jbig2(
    *, pdf: Pdf, root: Path, image: Stream, xref: Xref, options
) -> XrefExt | None:
    """Extract an image, saving it as a JBIG2 file.

    Only 1 bit per component images that are not already JBIG2-encoded are
    considered, and only when jbig2enc is available. The image is written to
    ``root`` as ``<xref, zero-padded to 8 digits>.prejbig2<ext>``, where
    ``<ext>`` is whatever ``PdfImage.extract_to`` reports.

    Args:
        pdf: Unused here; accepted so all extract functions share a signature.
        root: Directory that receives the extracted file.
        image: The PDF image stream to extract.
        xref: Object number of ``image``.
        options: Unused.

    Returns:
        ``XrefExt(xref, ".prejbig2" + ext)`` when a file was written,
        otherwise ``None``.

    Raises:
        NotImplementedError: Re-raised from extraction unless its message
            mentions ``/Decode``.
    """
    del options  # unused arg

    result = extract_image_filter(image, xref)
    if result is None:
        return None
    pim, filtdp = result

    if (
        pim.bits_per_component == 1
        and filtdp[0] != Name.JBIG2Decode
        and jbig2enc.available()
    ):
        # Save any colorspace associated with the image, so that we
        # will export a pure 1-bit PNG with no palette or ICC profile.
        # Showing the palette or ICC to jbig2enc will cause it to perform
        # a colorspace transform to 1bpp, which will conflict with the palette
        # or ICC if it exists.
        colorspace = pim.obj.get(Name.ColorSpace, None)
        # Images with neither a ColorSpace nor the image-mask flag fall
        # through to the final ``return None``.
        if colorspace is not None or pim.image_mask:
            try:
                # Set to DeviceGray temporarily; we are already in 1 bpc.
                pim.obj.ColorSpace = Name.DeviceGray
                imgname = root / f'{xref:08d}'
                with imgname.open('wb') as f:
                    ext = pim.extract_to(stream=f)
                # Rename the file so it has .prejbig2.ext extension
                # Making it unique avoids problems with Windows if the
                # same image is extracted multiple times
                imgname.rename(imgname.with_suffix(".prejbig2" + ext))
            except NotImplementedError as e:
                if '/Decode' in str(e):
                    log.debug(
                        f"xref {xref}: skipping image with unsupported Decode table"
                    )
                    return None
                raise
            except UnsupportedImageTypeError:
                return None
            finally:
                # Restore image colorspace after temporarily setting it to DeviceGray
                if colorspace is not None:
                    pim.obj.ColorSpace = colorspace
                else:
                    # There was no ColorSpace originally, so remove the
                    # temporary one.
                    del pim.obj.ColorSpace
            return XrefExt(xref, ".prejbig2" + ext)
    return None


def _should_optimize_jpeg(options, filtdp):
    """Decide whether a JPEG image should be extracted for recompression.

    Args:
        options: Object whose ``optimize`` attribute is the optimization level.
        filtdp: Not used by the decision.

    Returns:
        True when the optimization level is 2 or higher, or when the installed
        Ghostscript is version 10.6.0 or newer.
    """
    # Ghostscript 10.6.0+ introduced some sort of JPEG encoding issue.
    # To resolve this, re-optimize the JPEG anyway, even below level 2.
    return options.optimize >= 2 or (
        options.optimize < 2 and ghostscript.version() >= Version('10.6.0')
    )


def extract_image_generic(
    *, pdf: Pdf, root: Path, image: Stream, xref: Xref, options
) -> XrefExt | None:
    """Generic image extraction.

    Writes the image into ``root`` either in the format reported by
    ``PdfImage.extract_to`` (for DCTDecode images) or as a PNG, depending on
    the image's filter, colorspace and the optimization level.

    Args:
        pdf: Not used in this function.
        root: Directory that receives the extracted file.
        image: The PDF image stream to extract.
        xref: Object number of ``image``; determines the output file name.
        options: Provides ``optimize``, the optimization level.

    Returns:
        ``XrefExt(xref, ext)`` naming the file that was written, or ``None``
        if the image was skipped or could not be extracted.
    """
    result = extract_image_filter(image, xref)
    if result is None:
        return None
    pim, filtdp = result

    # Don't try to PNG-optimize 1bpp images, since JBIG2 does it better.
    if pim.bits_per_component == 1:
        return None

    if filtdp[0] == Name.DCTDecode and _should_optimize_jpeg(options, filtdp):
        try:
            # Write to an extension-less name first, then rename once the
            # extension is known.
            imgname = root / f'{xref:08d}'
            with imgname.open('wb') as f:
                ext = pim.extract_to(stream=f)
            imgname.rename(imgname.with_suffix(ext))
        except (UnsupportedImageTypeError, HifiPrintImageNotTranscodableError):
            return None
        return XrefExt(xref, ext)
    elif (
        pim.indexed
        and pim.colorspace in pim.SIMPLE_COLORSPACES
        and options.optimize >= 3
    ):
        # Try to improve on indexed images - these are far from low hanging
        # fruit in most cases
        pim.as_pil_image().save(png_name(root, xref))
        return XrefExt(xref, '.png')
    elif not pim.indexed and pim.colorspace in pim.SIMPLE_COLORSPACES:
        # An optimization opportunity here, not currently taken, is directly
        # generating a PNG from compressed data
        try:
            pim.as_pil_image().save(png_name(root, xref))
        except NotImplementedError:
            log.warning("PDF contains an atypical image that cannot be optimized.")
            return None
        return XrefExt(xref, '.png')
    elif (
        not pim.indexed
        and pim.colorspace == Name.ICCBased
        and pim.bits_per_component == 1
    ):
        # NOTE(review): this branch looks unreachable, because images with
        # bits_per_component == 1 already returned None above - confirm intent.
        # We can losslessly optimize 1-bit images to CCITT or JBIG2 without
        # paying any attention to the ICC profile
        pim.as_pil_image().save(png_name(root, xref))
        return XrefExt(xref, '.png')

    return None


def _find_image_xrefs_container(
    pdf: Pdf,
    container: Object,
    pageno: int,
    include_xrefs: MutableSet[Xref],
    exclude_xrefs: MutableSet[Xref],
    pageno_for_xref: dict[Xref, int],
    depth: int = 0,
) -> None:
    """Find all image XRefs or Form XObject and add to the include/exclude sets.

    Walks ``container.Resources.XObject`` and recurses into Form XObjects.
    Results are reported by mutating the set and dict arguments.

    Args:
        pdf: Passed through to recursive calls; not otherwise used.
        container: A page object or Form XObject whose resources are scanned.
        pageno: Page number recorded for xrefs first seen in this container.
        include_xrefs: Receives the xref of each candidate XObject.
        exclude_xrefs: Receives the xref of each soft mask encountered.
        pageno_for_xref: Receives ``xref -> pageno`` for the first sighting
            of each candidate.
        depth: Current recursion depth; the scan stops beyond 10.
    """
    if depth > 10:
        # NOTE(review): the message names _find_image_xrefs_page, which is
        # not this function's name.
        log.warning("Recursion depth exceeded in _find_image_xrefs_page")
        return
    try:
        xobjs = container.Resources.XObject
    except AttributeError:
        # No Resources or no XObject dictionary: nothing to scan.
        return
    for _imname, image in dict(xobjs).items():
        if image.objgen[1] != 0:
            continue  # Ignore images in an incremental PDF
        xref = Xref(image.objgen[0])
        if xref in include_xrefs or xref in exclude_xrefs:
            continue  # Already processed
        if Name.Subtype in image and image.Subtype == Name.Form:
            # Recurse into Form XObjects
            log.debug(f"Recursing into Form XObject {_imname} in page {pageno}")
            _find_image_xrefs_container(
                pdf,
                image,
                pageno,
                include_xrefs,
                exclude_xrefs,
                pageno_for_xref,
                depth + 1,
            )
            continue
        if Name.SMask in image:
            # Ignore soft masks: the mask's own xref is excluded, while the
            # image that references it is still added as a candidate below.
            smask_xref = Xref(image.SMask.objgen[0])
            exclude_xrefs.add(smask_xref)
            log.debug(f"xref {smask_xref}: skipping image because it is an SMask")
        include_xrefs.add(xref)
        log.debug(f"xref {xref}: treating as an optimization candidate")
        # Keep the page of the first sighting only.
        if xref not in pageno_for_xref:
            pageno_for_xref[xref] = pageno


def _find_image_xrefs(pdf: Pdf):
    """Scan every page of ``pdf`` for image xrefs worth optimizing.

    Returns:
        A tuple ``(working_xrefs, pageno_for_xref)``: the set of candidate
        xrefs with soft-mask xrefs removed, and a dict mapping each candidate
        xref to the index of the first page on which it was seen.
    """
    candidates: MutableSet[Xref] = set()
    soft_masks: MutableSet[Xref] = set()
    first_page_of: dict[Xref, int] = {}

    for page_index, page in enumerate(pdf.pages):
        _find_image_xrefs_container(
            pdf, page.obj, page_index, candidates, soft_masks, first_page_of
        )

    return candidates - soft_masks, first_page_of


def extract_images(
    pdf: Pdf,
    root: Path,
    options,
    extract_fn: Callable[..., XrefExt | None],
) -> Iterator[tuple[int, XrefExt]]:
    """Extract images using extract_fn.

    Enumerate images on each page, lookup their xref/ID number in the PDF.
    Exclude images that are soft masks (i.e. alpha transparency related).
    Record the page number on which an image is first used, since images may be
    used on multiple pages (or multiple times on the same page).

    Images nested inside Form XObjects are included, since the xref search
    recurses into them. Alternate images and thumbnails are not evaluated.

    extract_fn must decide if it wants to extract the image in this context.
    If it does, a tuple should be returned: (xref, ext) where ext is the file
    extension. extract_fn must also extract the file it finds interesting.

    Args:
        pdf: The PDF whose images are examined.
        root: Directory passed to extract_fn for its output files.
        options: Passed through to extract_fn unchanged.
        extract_fn: Called with keyword arguments ``pdf``, ``root``,
            ``image``, ``xref`` and ``options`` for each candidate image.

    Yields:
        ``(pageno, XrefExt(xref, ext))`` for each image extract_fn accepted,
        where pageno is the page on which the image was first seen.

    Any exception raised by extract_fn is logged with its traceback and the
    image is skipped; extraction continues with the next image.
    """
    working_xrefs, pageno_for_xref = _find_image_xrefs(pdf)
    for xref in working_xrefs:
        image = pdf.get_object((xref, 0))
        try:
            result = extract_fn(
                pdf=pdf, root=root, image=image, xref=xref, options=options
            )
        except Exception:  # pylint: disable=broad-except
            # Best effort: one unreadable image must not abort optimization.
            log.exception(
                f"xref {xref}: While extracting this image, an error occurred"
            )
            continue
        # The yield stays outside the try block so that exceptions thrown
        # into the generator by its consumer are not swallowed.
        if result:
            _, ext = result
            yield pageno_for_xref[xref], XrefExt(xref, ext)


def extract_images_generic(
    pdf: Pdf, root: Path, options
) -> tuple[list[Xref], list[Xref]]:
    """Extract any >=2bpp image we think we can improve.

    Returns:
        A tuple ``(jpegs, pngs)`` listing the xrefs of images extracted with
        a ``.jpg`` and a ``.png`` extension respectively. Images extracted
        with any other extension appear in neither list.
    """
    jpegs = []
    pngs = []
    # Route each extracted image to the list matching its file extension.
    list_for_ext = {'.png': pngs, '.jpg': jpegs}
    for _, xref_ext in extract_images(pdf, root, options, extract_image_generic):
        log.debug('%s', xref_ext)
        target = list_for_ext.get(xref_ext.ext)
        if target is not None:
            target.append(xref_ext.xref)
    log.debug(f"Optimizable images: JPEGs: {len(jpegs)} PNGs: {len(pngs)}")
    return jpegs, pngs


def extract_images_jbig2(pdf: Pdf, root: Path, options) -> list[XrefExt]:
    """Extract any bitonal image that we think we can improve as JBIG2.

    Returns:
        The ``XrefExt`` of every image that ``extract_image_jbig2`` accepted;
        the page numbers reported by ``extract_images`` are discarded.
    """
    jbig2_images = [
        xref_ext
        for _pageno, xref_ext in extract_images(
            pdf, root, options, extract_image_jbig2
        )
    ]

    log.debug(f"Optimizable images: JBIG2: {len(jbig2_images)}")
    return jbig2_images


def _produce_jbig2_images(
    jbig2_images: list[XrefExt], root: Path, options, executor: Executor
) -> None:
    """Run ``jbig2enc.convert_single`` on every extracted image.

    Each task receives the working directory, the path of the extracted
    input file, the path of the ``.jbig2`` file to produce and
    ``options.jbig2_threshold``. Tasks are dispatched through ``executor``
    using threads, with at most ``options.jobs`` workers.
    """
    # Lazily built argument tuples, one per image.
    conversions = (
        (
            fspath(root),
            img_name(root, xref, ext),
            root / f'{xref:08d}.jbig2',
            options.jbig2_threshold,
        )
        for xref, ext in jbig2_images
    )

    executor(
        use_threads=True,
        max_workers=options.jobs,
        progress_kwargs={
            'total': len(jbig2_images),
            'desc': "JBIG2",
            'unit': 'image',
            'disable': not options.progress_bar,
        },
        task=jbig2enc.convert_single,
        task_arguments=conversions,
    )


def convert_to_jbig2(
    pdf: Pdf,
    jbig2_images: list[XrefExt],
    root: Path,
    options,
    executor: Executor,
) -> None:
    """Convert images to JBIG2 and insert into PDF.

    All images are encoded first; afterwards each resulting ``.jbig2`` file
    is read from ``root`` and written into the corresponding PDF object with
    the JBIG2Decode filter and no decode parameters, i.e. without a symbol
    dictionary (JBIG2Globals).
    """
    _produce_jbig2_images(jbig2_images, root, options, executor)

    for xref, _ext in jbig2_images:
        encoded = (root / f'{xref:08d}.jbig2').read_bytes()
        target = pdf.get_object(xref, 0)
        target.write(encoded, filter=Name.JBIG2Decode, decode_parms=None)


def _optimize_jpeg(
    xref: Xref, in_jpg: Path, opt_jpg: Path, jpg_quality: int
) -> tuple[Xref, Path | None]:
    """Re-save a JPEG with optimization and keep it unless it grew.

    Args:
        xref: Object number of the image; returned unchanged.
        in_jpg: Path of the existing JPEG file.
        opt_jpg: Path at which the re-encoded JPEG is written.
        jpg_quality: Quality passed to the encoder when it is an int in the
            range 1-100; otherwise no quality is passed.

    Returns:
        ``(xref, opt_jpg)`` when the new file is no larger than the input,
        or ``(xref, None)`` after deleting the new file when it is larger.
    """
    quality_usable = isinstance(jpg_quality, int) and 0 < jpg_quality <= 100
    save_kwargs: dict[str, Any] = {'optimize': True}
    if quality_usable:
        save_kwargs['quality'] = jpg_quality

    with Image.open(in_jpg) as im:
        im.save(opt_jpg, **save_kwargs)

    grew = opt_jpg.stat().st_size > in_jpg.stat().st_size
    if not grew:
        return xref, opt_jpg

    log.debug(f"xref {xref}, jpeg, made larger - skip")
    opt_jpg.unlink()
    return xref, None


def transcode_jpegs(
    pdf: Pdf, jpegs: Sequence[Xref], root: Path, options, executor: Executor
) -> None:
    """Recompress the listed JPEGs and store each accepted result in ``pdf``.

    Args:
        pdf: The open PDF whose image objects are updated in place.
        jpegs: Object numbers of the JPEG images extracted to ``root``.
        root: Directory holding the extracted JPEG files.
        options: Supplies ``jpg_quality``, ``jobs`` and ``progress_bar``.
        executor: Callable that schedules the recompression tasks.
    """

    def recompression_jobs() -> Iterator[tuple[Xref, Path, Path, int]]:
        for xref in jpegs:
            source = jpg_name(root, xref)
            yield xref, source, source.with_suffix('.opt.jpg'), options.jpg_quality

    def install_result(result: tuple[Xref, Path | None], pbar: ProgressBar):
        xref, optimized = result
        if optimized:
            # A JPEG file can be placed in the PDF stream as-is under DCTDecode
            payload = optimized.read_bytes()
            pdf.get_object(xref, 0).write(payload, filter=Name.DCTDecode)
        pbar.update()

    progress = {
        'desc': "Recompressing JPEGs",
        'total': len(jpegs),
        'unit': 'image',
        'disable': not options.progress_bar,
    }
    executor(
        use_threads=True,  # Processes are significantly slower at this task
        max_workers=options.jobs,
        progress_kwargs=progress,
        task=_optimize_jpeg,
        task_arguments=recompression_jobs(),
        task_finished=install_result,
    )


def _already_flate_encoded(image: Stream) -> bool:
    """Tell whether ``/FlateDecode`` appears among the image's ``/Filter`` entries.

    ``/Filter`` may be absent, a single name, or an array of names; all three
    shapes are handled.
    """
    filters = image.get(Name.Filter)
    if filters is None:
        return False
    if not isinstance(filters, Array):
        # A lone filter name rather than a filter chain.
        return filters == Name.FlateDecode
    return any(entry == Name.FlateDecode for entry in filters)


def _find_deflatable_jpeg(
    *, pdf: Pdf, root: Path, image: Stream, xref: Xref, options
) -> XrefExt | None:
    """Decide whether an image is a plain JPEG worth wrapping in FlateDecode.

    Args:
        pdf: Unused here; part of the common selector signature.
        root: Unused here; part of the common selector signature.
        image: The image stream under consideration.
        xref: Object number of ``image``.
        options: Supplies the ``optimize`` level.

    Returns:
        ``XrefExt(xref, '.memory')`` when the image should be deflated,
        otherwise ``None``.
    """
    extracted = extract_image_filter(image, xref)
    if extracted is None:
        return None
    _pim, filtdp = extracted

    # Skip if already FlateDecode compressed - would double-compress
    if _already_flate_encoded(image):
        return None

    # Only a DCTDecode image without decode parameters qualifies.
    if not (filtdp[0] == Name.DCTDecode and not filtdp[1]):
        return None

    if options.optimize == 3:
        return XrefExt(xref, '.memory')

    # Don't flate very large images because it will slow down PDF viewers
    if (
        1 <= options.optimize <= 2
        and image.get(Name.Width, 0) < FLATE_JPEG_THRESHOLD
        and image.get(Name.Height, 0) < FLATE_JPEG_THRESHOLD
    ):
        return XrefExt(xref, '.memory')

    return None


def _deflate_jpeg(
    pdf: Pdf, lock: threading.Lock, xref: Xref, complevel: int
) -> tuple[Xref, bytes]:
    with lock:
        xobj = pdf.get_object(xref, 0)
        try:
            data = xobj.read_raw_bytes()
        except PdfError:
            return xref, b''
    compdata = compress(data, complevel)
    if len(compdata) >= len(data):
        return xref, b''
    return xref, compdata


def deflate_jpegs(pdf: Pdf, root: Path, options, executor: Executor) -> None:
    """Wrap eligible JPEG streams in an additional FlateDecode layer.

    The JPEG data itself is untouched, so the step is lossless; a stream is
    only rewritten when the deflated bytes are smaller than the original.

    Args:
        pdf: The open PDF, shared between worker threads under a lock.
        root: Directory used by image extraction.
        options: Supplies ``optimize``, ``jobs`` and ``progress_bar``.
        executor: Callable that schedules the compression tasks.
    """
    deflatable = []
    for _pageno, candidate in extract_images(
        pdf, root, options, _find_deflatable_jpeg
    ):
        log.debug(f'xref {candidate.xref}: marking this JPEG as deflatable')
        deflatable.append(candidate.xref)

    if options.optimize == 3:
        complevel = 9
    else:
        complevel = 6

    # Reads in the workers and writes in the completion callback all go
    # through the same Pdf object, so they are coordinated with one lock.
    lock = threading.Lock()

    def store_deflated(result: tuple[Xref, bytes], pbar: ProgressBar):
        xref, compdata = result
        if compdata:
            with lock:
                target = pdf.get_object(xref, 0)
                target.write(compdata, filter=[Name.FlateDecode, Name.DCTDecode])
        pbar.update()

    progress = {
        'desc': "Deflating JPEGs",
        'total': len(deflatable),
        'unit': 'image',
        'disable': not options.progress_bar,
    }
    executor(
        use_threads=True,  # We're sharing the pdf directly, must use threads
        max_workers=options.jobs,
        progress_kwargs=progress,
        task=_deflate_jpeg,
        task_arguments=((pdf, lock, xref, complevel) for xref in deflatable),
        task_finished=store_deflated,
    )


def _transcode_png(pdf: Pdf, filename: Path, xref: Xref) -> bool:
    """Replace image object ``xref`` in ``pdf`` with the PNG at ``filename``.

    The PNG is first wrapped in a one-page PDF by img2pdf (written next to the
    PNG with a ``.png.pdf`` suffix); the first image of that page is copied
    into ``pdf`` and its data and dictionary entries are transferred onto the
    existing image object.

    Args:
        pdf: The open PDF being optimized.
        filename: Path of the PNG file to insert.
        xref: Object number of the image to overwrite.

    Returns:
        Always ``True``.
    """
    # Entries that stay exactly as they are on the existing image object:
    # they are neither overwritten from the replacement nor deleted.
    preserved_fields = {
        '/ID',
        '/Intent',
        '/Interpolate',
        '/Mask',
        '/Metadata',
        '/OC',
        '/OPI',
        '/SMask',
        '/StructParent',
    }

    wrapper_pdf = filename.with_suffix('.png.pdf')
    with wrapper_pdf.open('wb') as stream:
        img2pdf.convert(fspath(filename), outputstream=stream, **IMG2PDF_KWARGS)

    with Pdf.open(wrapper_pdf) as donor:
        first_image = next(iter(donor.pages[0].images.values()))
        replacement = pdf.copy_foreign(first_image)

        target = pdf.get_object(xref, 0)
        target.write(
            replacement.read_raw_bytes(),
            filter=replacement.Filter,
            decode_parms=replacement.DecodeParms,
        )

        # Keys present on the old image but absent from the replacement are
        # stale and get removed, unless they are in the preserved set.
        stale_keys = set(target.keys()) - set(replacement.keys()) - preserved_fields

        # Take over every entry of the replacement except /Length and the
        # preserved fields.
        for key in replacement.keys():
            if key != Name.Length and str(key) not in preserved_fields:
                target[key] = replacement[key]
        for key in stale_keys:
            del target[key]
    return True


def transcode_pngs(
    pdf: Pdf,
    images: Sequence[Xref],
    image_name_fn: Callable[[Path, Xref], Path],
    root: Path,
    options,
    executor: Executor,
) -> None:
    """Quantize images with pngquant and insert the results into ``pdf``.

    Nothing is done unless ``options.optimize`` is at least 2.

    Args:
        pdf: The open PDF whose image objects are replaced in place.
        images: Object numbers of the images to transcode.
        image_name_fn: Maps ``(root, xref)`` to the path of the source image.
        root: Directory holding the extracted images and the PNG output.
        options: Supplies ``optimize``, ``png_quality``, ``jobs`` and
            ``progress_bar``.
        executor: Callable that schedules the quantization tasks.
    """
    modified: MutableSet[Xref] = set()

    if options.optimize >= 2:
        # Quality window of +/-10 around the requested value, clamped to 10..100.
        quality_floor = max(10, options.png_quality - 10)
        quality_ceiling = min(100, options.png_quality + 10)

        def quantize_jobs():
            for xref in images:
                source = image_name_fn(root, xref)
                log.debug(source)
                yield (source, png_name(root, xref), quality_floor, quality_ceiling)
                # Recorded only once the consumer asks for the next job.
                modified.add(xref)

        progress = {
            'desc': "PNGs",
            'total': len(images),
            'unit': 'image',
            'disable': not options.progress_bar,
        }
        executor(
            use_threads=True,
            max_workers=options.jobs,
            progress_kwargs=progress,
            task=pngquant.quantize,
            task_arguments=quantize_jobs(),
        )

    for xref in modified:
        _transcode_png(pdf, png_name(root, xref), xref)


# Executor that optimize() falls back to when the caller does not pass one.
DEFAULT_EXECUTOR = SerialExecutor()


def optimize(
    input_file: Path,
    output_file: Path,
    context: PdfContext,
    save_settings: dict[str, Any],
    executor: Executor = DEFAULT_EXECUTOR,
) -> Path:
    """Optimize the images of a PDF and produce ``output_file``.

    At optimization level 0 the output is simply a symlink to the input.
    Otherwise the images are recompressed into ``<output>.opt.pdf``; if that
    file is not larger than the input, ``output_file`` becomes a symlink to it,
    else the input is re-saved to ``output_file`` without the image changes.

    Args:
        input_file: PDF to optimize.
        output_file: Location of the result; an ``images`` directory is
            created next to it for extracted images.
        context: Provides ``options``. Unset (falsy) ``jpg_quality`` and
            ``png_quality`` are filled in on that options object.
        save_settings: Keyword arguments forwarded to ``Pdf.save``.
        executor: Callable that schedules the per-image tasks.

    Returns:
        ``output_file``.

    Raises:
        OutputFileAccessError: If the optimized file ends up with zero size.
    """
    options = context.options
    if options.optimize == 0:
        safe_symlink(input_file, output_file)
        return output_file

    # Fill in quality defaults; level 3 uses more aggressive fixed values.
    below_max_level = options.optimize < 3
    if not options.jpg_quality:
        options.jpg_quality = DEFAULT_JPEG_QUALITY if below_max_level else 40
    if not options.png_quality:
        options.png_quality = DEFAULT_PNG_QUALITY if below_max_level else 30

    target_file = output_file.with_suffix('.opt.pdf')

    with Pdf.open(input_file) as pdf:
        root = output_file.parent / 'images'
        root.mkdir(exist_ok=True)

        jpegs, pngs = extract_images_generic(pdf, root, options)
        transcode_jpegs(pdf, jpegs, root, options, executor)
        deflate_jpegs(pdf, root, options, executor)
        # Converting the JPEGs to PNG at optimize >= 2 was tried here at some
        # point and is intentionally not done.
        transcode_pngs(pdf, pngs, png_name, root, options, executor)

        jbig2_images = extract_images_jbig2(pdf, root, options)
        convert_to_jbig2(pdf, jbig2_images, root, options, executor)

        pdf.remove_unreferenced_resources()
        pdf.save(target_file, **save_settings)

    input_size = input_file.stat().st_size
    output_size = target_file.stat().st_size
    if output_size == 0:
        raise OutputFileAccessError(
            f"Output file not created after optimizing. We probably ran "
            f"out of disk space in the temporary folder: {tempfile.gettempdir()}."
        )

    # Fraction of the input size that was saved; negative means it grew.
    savings = 1 - output_size / input_size
    if not savings < 0:
        safe_symlink(target_file, output_file)
        return output_file

    log.info(
        "Image optimization did not improve the file - "
        "optimizations will not be used"
    )
    # We still need to save the file
    with Pdf.open(input_file) as pdf:
        pdf.remove_unreferenced_resources()
        pdf.save(output_file, **save_settings)
    return output_file


def main(infile, outfile, level, jobs=1):
    """Optimize ``infile`` directly and copy the result to ``outfile``.

    Args:
        infile: Path of the PDF to optimize.
        outfile: Destination path for the optimized PDF.
        level: Optimization level; converted with ``int()``.
        jobs: Number of workers handed to the options object.
    """
    from shutil import copy  # pylint: disable=import-outside-toplevel
    from tempfile import TemporaryDirectory  # pylint: disable=import-outside-toplevel

    from ocrmypdf._options import OcrOptions  # pylint: disable=import-outside-toplevel

    infile = Path(infile)

    # Zero quality values make optimize() choose its per-level defaults.
    options = OcrOptions(
        input_file=infile,
        output_file=outfile,  # Required field
        jobs=jobs,
        optimize=int(level),
        jpg_quality=0,
        png_quality=0,
        jbig2_threshold=0.85,
        quiet=True,
        progress_bar=False,
    )
    save_settings = {
        'compress_streams': True,
        'preserve_pdfa': True,
        'object_stream_mode': ObjectStreamMode.generate,
    }

    with TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        context = PdfContext(options, workdir, infile, None, None)
        tmpout = workdir / 'out.pdf'
        optimize(infile, tmpout, context, save_settings)
        copy(fspath(tmpout), fspath(outfile))


# Script entry point: the three positional command line arguments are passed
# to main() as input file, output file and optimization level.
if __name__ == '__main__':
    main(sys.argv[1], sys.argv[2], sys.argv[3])


================================================
FILE: src/ocrmypdf/pdfa.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Utilities for PDF/A production and confirmation with Ghostscript."""

from __future__ import annotations

import base64
import logging
from collections.abc import Iterator
from importlib.resources import files as package_files
from pathlib import Path

import pikepdf
from pikepdf import Array, Dictionary, Name, Pdf, Stream

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# File name of the sRGB ICC profile looked up inside the ``ocrmypdf.data`` package.
SRGB_ICC_PROFILE_NAME = 'sRGB.icc'


def _postscript_objdef(
    alias: str,
    dictionary: dict[str, str],
    *,
    stream_name: str | None = None,
    stream_data: bytes | None = None,
) -> Iterator[str]:
    assert (stream_name is None) == (stream_data is None)

    objtype = '/stream' if stream_name else '/dict'

    if stream_name:
        assert stream_data is not None
        a85_data = base64.a85encode(stream_data, adobe=True).decode('ascii')
        yield f'{stream_name} ' + a85_data
        yield 'def'

    if alias != '{Catalog}':  # Catalog needs no definition
        yield f'[/_objdef {alias} /type {objtype} /OBJ pdfmark'

    yield f'[{alias} <<'
    for key, val in dictionary.items():
        yield f'  {key} {val}'
    yield '>> /PUT pdfmark'

    if stream_name:
        yield f'[{alias} {stream_name[1:]} /PUT pdfmark'


def _make_postscript(icc_name: str, icc_data: bytes, colors: int) -> Iterator[str]:
    """Yield the lines of the pdfmark file that declares a PDF/A OutputIntent.

    Three objects are emitted, separated by empty lines: the ICC profile
    stream, the OutputIntent dictionary referring to it, and the Catalog entry
    that lists the OutputIntent.

    Args:
        icc_name: Identifier written as ``/OutputConditionIdentifier``.
        icc_data: Bytes of the ICC profile.
        colors: Number of color components, written as ``/N``.
    """
    # The brace-delimited aliases below are literal pdfmark names, not
    # string formatting placeholders.
    profile_entries = {'/N': str(colors)}
    intent_entries = {
        '/Type': '/OutputIntent',
        '/S': '/GTS_PDFA1',
        '/DestOutputProfile': '{icc_PDFA}',
        '/OutputConditionIdentifier': f'({icc_name})',
    }
    catalog_entries = {'/OutputIntents': '[ {OutputIntent_PDFA} ]'}

    yield '%!'
    yield from _postscript_objdef(
        '{icc_PDFA}',
        profile_entries,
        stream_name='/ICCProfile',
        stream_data=icc_data,
    )
    yield ''
    yield from _postscript_objdef('{OutputIntent_PDFA}', intent_entries)
    yield ''
    yield from _postscript_objdef('{Catalog}', catalog_entries)


def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):
    """Write a PostScript pdfmark file for Ghostscript PDF/A conversion.

    pdfmark is an extension to the PostScript language that describes PDF
    features such as bookmarks and annotations. It was originally specified
    for Adobe Distiller, for PostScript to PDF conversion. Ghostscript also
    uses pdfmark when converting PDF to PDF/A, so producing a PDF/A with
    Ghostscript requires a pdfmark file carrying the necessary metadata.

    The file written here only declares an OutputIntent backed by the bundled
    sRGB ICC profile, i.e. the color space Ghostscript should convert to when
    it decides an object must be converted.

    Arguments:
        target_filename: filename to save
        icc: ICC identifier; only ``'sRGB'`` is supported
    Returns:
        ``target_filename``, as passed in.
    Raises:
        NotImplementedError: If ``icc`` is anything other than ``'sRGB'``.
    References:
        Adobe PDFMARK Reference:
        https://opensource.adobe.com/dc-acrobat-sdk-docs/library/pdfmark/
    """
    if icc != 'sRGB':
        raise NotImplementedError("Only supporting sRGB")

    profile_resource = package_files('ocrmypdf.data') / SRGB_ICC_PROFILE_NAME
    profile_bytes = profile_resource.read_bytes()
    lines = _make_postscript(icc, profile_bytes, 3)

    # Everything should already be pure ASCII at this point; writing with the
    # ascii codec makes any stray non-ASCII character fail loudly.
    Path(target_filename).write_text('\n'.join(lines), encoding='ascii')
    return target_filename


def file_claims_pdfa(filename: Path) -> dict[str, str | bool]:
    """Determine whether the file claims to be PDF/A compliant.

    This only checks if the XMP metadata contains a PDF/A marker. It does not
    do full PDF/A validation.

    Args:
        filename: Path of the PDF to inspect.

    Returns:
        A dict that always contains three keys:

        - ``'pass'``: ``True`` only when the XMP metadata carries a recognized
          PDF/A part and conformance level.
        - ``'output'``: ``'pdfa'`` when ``'pass'`` is true, otherwise ``'pdf'``.
        - ``'conformance'``: ``'PDF/A-<status>'`` with the status in lower
          case, or an explanatory message when no PDF/A metadata is present.
    """
    with pikepdf.open(filename) as pdf:
        pdfmeta = pdf.open_metadata()
        if not pdfmeta.pdfa_status:
            return {
                'pass': False,
                'output': 'pdf',
                'conformance': 'No PDF/A metadata in XMP',
            }
        # Raw value in XMP metadata returned by pikepdf is uppercase, but ISO
        # uses lower case for conformance levels.
        pdfa_status_iso = pdfmeta.pdfa_status.lower()

    valid_part_conforms = {'1a', '1b', '2a', '2b', '2u', '3a', '3b', '3u'}
    recognized = pdfa_status_iso in valid_part_conforms

    # An unrecognized status must still yield 'pass' and 'output', otherwise
    # callers that index the result would fail with KeyError.
    return {
        'pass': recognized,
        'output': 'pdfa' if recognized else 'pdf',
        'conformance': f'PDF/A-{pdfa_status_iso}',
    }


def _load_srgb_icc_profile() -> bytes:
    """Return the bytes of the sRGB ICC profile shipped in ``ocrmypdf.data``."""
    profile_resource = package_files('ocrmypdf.data') / SRGB_ICC_PROFILE_NAME
    return profile_resource.read_bytes()


def _pdfa_part_conformance(output_type: str) -> tuple[str, str]:
    """Extract PDF/A part and conformance from output_type.

    Args:
        output_type: One of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'

    Returns:
        Tuple of (part, conformance) e.g., ('2', 'B')
    """
    mapping = {
        'pdfa': ('2', 'B'),
        'pdfa-1': ('1', 'B'),
        'pdfa-2': ('2', 'B'),
        'pdfa-3': ('3', 'B'),
    }
    return mapping.get(output_type, ('2', 'B'))


def add_pdfa_metadata(pdf: Pdf, part: str, conformance: str) -> None:
    """Declare PDF/A conformance in the XMP metadata of a PDF.

    Args:
        pdf: An open pikepdf.Pdf object
        part: PDF/A part number ('1', '2', or '3')
        conformance: Conformance level ('A', 'B', or 'U')
    """
    declarations = (
        ('pdfaid:part', part),
        ('pdfaid:conformance', conformance),
    )
    with pdf.open_metadata() as meta:
        for key, value in declarations:
            meta[key] = value


def add_srgb_output_intent(pdf: Pdf) -> None:
    """Add sRGB ICC profile as OutputIntent to PDF catalog.

    This creates the required PDF/A OutputIntent structure with:
    - An ICC profile stream containing sRGB profile
    - An OutputIntent dictionary pointing to that profile
    - Updates the Catalog's OutputIntents array

    If the catalog already lists an OutputIntent whose
    ``/OutputConditionIdentifier`` is ``sRGB``, nothing is added and no new
    objects are created in the PDF.

    Args:
        pdf: An open pikepdf.Pdf object
    """
    # Make sure the catalog has an OutputIntents array to inspect and extend.
    if Name.OutputIntents not in pdf.Root:
        pdf.Root[Name.OutputIntents] = Array([])

    # Look for an existing sRGB OutputIntent before doing any work, so that
    # the early return does not leave an unused ICC stream behind in the PDF.
    for intent in pdf.Root.OutputIntents:  # type: ignore[attr-defined]
        if str(intent.get(Name.OutputConditionIdentifier)) == 'sRGB':
            log.debug('sRGB OutputIntent already exists, skipping')
            return

    # Create ICC profile stream
    icc_stream = Stream(pdf, _load_srgb_icc_profile())
    icc_stream[Name.N] = 3  # RGB has 3 components

    # Create OutputIntent dictionary
    output_intent = Dictionary({
        '/Type': Name.OutputIntent,
        '/S': Name('/GTS_PDFA1'),
        '/OutputConditionIdentifier': 'sRGB',
        '/DestOutputProfile': icc_stream,
    })

    pdf.Root.OutputIntents.append(output_intent)


def speculative_pdfa_conversion(
    input_file: Path,
    output_file: Path,
    output_type: str,
) -> Path:
    """Try to turn a PDF into a PDF/A by adding only the formal declarations.

    The input is opened, given an sRGB OutputIntent and an XMP declaration of
    PDF/A conformance, and saved to ``output_file``. Nothing else is changed:
    there is no color conversion, font embedding or any other transformation
    of the kind Ghostscript performs, so this only helps for files that are
    already close to PDF/A compliant.

    Args:
        input_file: Path to input PDF
        output_file: Path where output PDF should be written
        output_type: One of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'

    Returns:
        Path to the output file

    Raises:
        pikepdf.PdfError: If the PDF cannot be opened or modified
    """
    pdfa_part, pdfa_conformance = _pdfa_part_conformance(output_type)

    with Pdf.open(input_file) as document:
        add_srgb_output_intent(document)
        add_pdfa_metadata(document, pdfa_part, pdfa_conformance)
        document.save(output_file)

    log.debug('Speculative PDF/A conversion complete: %s', output_file)
    return output_file


================================================
FILE: src/ocrmypdf/pdfinfo/__init__.py
================================================
#!/usr/bin/env python3

# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""For extracting information about PDFs prior to OCR."""

from __future__ import annotations

from ocrmypdf.pdfinfo._types import Colorspace, Encoding, FloatRect
from ocrmypdf.pdfinfo.info import PageInfo, PdfInfo

# Names re-exported from the submodules imported above.
__all__ = ["Colorspace", "Encoding", "FloatRect", "PageInfo", "PdfInfo"]


================================================
FILE: src/ocrmypdf/pdfinfo/_contentstream.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""PDF content stream interpretation."""

from __future__ import annotations

import re
from collections import defaultdict
from collections.abc import Mapping
from math import hypot, inf, isclose
from typing import NamedTuple
from warnings import warn

from pikepdf import Matrix, Object, PdfInlineImage, parse_content_stream

from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import Resolution
from ocrmypdf.pdfinfo._types import UNIT_SQUARE


class XobjectSettings(NamedTuple):
    """Info about an XObject found in a PDF.

    ``_interpret_contents`` records one of these for every ``Do`` operator it
    encounters in a content stream.
    """

    # First operand of the ``Do`` operator.
    name: str
    # Shorthand of the current transformation matrix when ``Do`` was reached.
    shorthand: tuple[float, float, float, float, float, float]
    # Number of entries on the saved-CTM stack (unmatched ``q``) at that point.
    stack_depth: int


class InlineSettings(NamedTuple):
    """Info about an inline image found in a PDF.

    ``_interpret_contents`` records one of these for every ``INLINE IMAGE``
    instruction it encounters in a content stream.
    """

    # First operand of the ``INLINE IMAGE`` instruction.
    iimage: PdfInlineImage
    # Shorthand of the current transformation matrix at the inline image.
    shorthand: tuple[float, float, float, float, float, float]
    # Number of entries on the saved-CTM stack (unmatched ``q``) at that point.
    stack_depth: int


class ContentsInfo(NamedTuple):
    """Info about various objects found in a PDF.

    This is the result type of ``_interpret_contents`` for one content stream.
    """

    # One entry per ``Do`` operator, in the order encountered.
    xobject_settings: list[XobjectSettings]
    # One entry per inline image, in the order encountered.
    inline_images: list[InlineSettings]
    # True if any path-painting operator (S, s, f, F, f*, B, B*, b, b*) was seen.
    found_vector: bool
    # True if any text-showing operator (TJ, Tj, ", ') was seen.
    found_text: bool
    # The same XobjectSettings objects, grouped by ``str()`` of the XObject name.
    name_index: Mapping[str, list[XobjectSettings]]


class TextboxInfo(NamedTuple):
    """Info about a text box found in a PDF."""

    # NOTE(review): nothing in this module constructs TextboxInfo, so the
    # field meanings below are inferred from the names only - confirm against
    # the code that produces these records.
    # Presumably the bounding box as four coordinates.
    bbox: tuple[float, float, float, float]
    # Presumably whether the text is rendered visibly.
    is_visible: bool
    # Presumably whether the text was detected as corrupt.
    is_corrupt: bool


class VectorMarker:
    """Sentinel indicating vector drawing operations were found on a page.

    The class defines no attributes or methods of its own.
    """


class TextMarker:
    """Sentinel indicating text drawing operations were found on a page.

    The class defines no attributes or methods of its own.
    """


def _is_unit_square(shorthand):
    """Tell whether a matrix shorthand is (approximately) ``UNIT_SQUARE``.

    Each component is converted to ``float`` and compared with the matching
    component of ``UNIT_SQUARE`` using a relative tolerance of ``1e-3``.
    """
    # NOTE(review): with only rel_tol given, math.isclose treats a reference
    # value of 0.0 as matching nothing but exactly 0.0 - confirm that is the
    # intended strictness for any zero components of UNIT_SQUARE.
    for actual, expected in zip(map(float, shorthand), UNIT_SQUARE, strict=False):
        if not isclose(actual, expected, rel_tol=1e-3):
            return False
    return True


def _normalize_stack(graphobjs):
    """Convert runs of qQ's in the stack into single graphobjs."""
    for operands, operator in graphobjs:
        operator = str(operator)
        if re.match(r'Q*q+$', operator):  # Zero or more Q, one or more q
            for char in operator:  # Split into individual
                yield ([], char)  # Yield individual
        else:
            yield (operands, operator)


def _interpret_contents(contentstream: Object, initial_shorthand=UNIT_SQUARE):
    """Interpret the PDF content stream.

    The stack represents the state of the PDF graphics stack.  We are only
    interested in the current transformation matrix (CTM) so we only track
    this object; a full implementation would need to track many other items.

    The CTM is initialized to the mapping from user space to device space.
    PDF units are 1/72".  In a PDF viewer or printer this matrix is initialized
    to the transformation to device space.  For example if set to
    (1/72, 0, 0, 1/72, 0, 0) then all units would be calculated in inches.

    Images are always considered to be (0, 0) -> (1, 1).  Before drawing an
    image there should be a 'cm' that sets up an image coordinate system
    where drawing from (0, 0) -> (1, 1) will draw on the desired area of the
    page.

    PDF units suit our needs so we initialize ctm to the identity matrix.

    According to the PDF specification, the maximum stack depth is 32. Other
    viewers tolerate some amount beyond this.  We issue a warning if the
    stack depth exceeds the spec limit and set a hard limit beyond this to
    bound our memory requirements.  If the stack underflows behavior is
    undefined in the spec, but we just pretend nothing happened and leave the
    CTM unchanged.
    """
    stack = []
    ctm = Matrix(initial_shorthand)
    xobject_settings: list[XobjectSettings] = []
    inline_images: list[InlineSettings] = []
    name_index = defaultdict(lambda: [])
    found_vector = False
    found_text = False
    vector_ops = set('S s f F f* B B* b b*'.split())
    text_showing_ops = set("""TJ Tj " '""".split())
    image_ops = set('BI ID EI q Q Do cm'.split())
    operator_whitelist = ' '.join(vector_ops | text_showing_ops | image_ops)

    for n, graphobj in enumerate(
        _normalize_stack(parse_content_stream(contentstream, operator_whitelist))
    ):
        operands, operator = graphobj
        if operator == 'q':
            stack.append(ctm)
            if len(stack) > 32:  # See docstring
                if len(stack) > 128:
                    raise RuntimeError(
                        f"PDF graphics stack overflowed hard limit at operator {n}"
                    )
                warn("PDF graphics stack overflowed spec limit")
        elif operator == 'Q':
            try:
                ctm = stack.pop()
            except IndexError:
                # Keeping the ctm the same seems to be the only sensible thing
                # to do. Just pretend nothing happened, keep calm and carry on.
                warn("PDF graphics stack underflowed - PDF may be malformed")
        elif operator == 'cm':
            try:
                ctm = Matrix(operands) @ ctm
            except ValueError as e:
                raise InputFileError(
                    "PDF content stream is corrupt - this PDF is malformed. "
                    "Use a PDF editor that is capable of visually inspecting the PDF."
                ) from e
        elif operator == 'Do':
            image_name = operands[0]
            settings = XobjectSettings(
                name=image_name, shorthand=ctm.shorthand, stack_depth=len(stack)
            )
            xobject_settings.append(settings)
            name_index[str(image_name)].append(settings)
        elif operator == 'INLINE IMAGE':  # BI/ID/EI are grouped into this
            iimage = operands[0]
            inline = InlineSettings(
                iimage=iimage, shorthand=ctm.shorthand, stack_depth=len(stack)
            )
            inline_images.append(inline)
        elif operator in vector_ops:
            found_vector = True
        elif operator in text_showing_ops:
            found_text = True

    return ContentsInfo(
        xobject_settings=xobject_settings,
        inline_images=inline_images,
        found_vector=found_vector,
        found_text=found_text,
        name_index=name_index,
    )


def _get_dpi(ctm_shorthand, image_size) -> Resolution:
    """Compute the effective DPI of an image from its CTM and pixel size.

    A PDF image carries no resolution of its own. The page content stream
    defines the area onto which the image is rasterized, and the effective
    resolution is the ratio of the pixel dimensions to the size of that area.

    The page size cannot be used as a shortcut: a scanned PDF usually has a
    matching paper size, but e.g. cropping may change the page size
    (/CropBox) without touching the content stream, so the image does not
    necessarily fill the page.

    An image may be scaled (always), cropped, translated, rotated to an
    arbitrary angle (rarely) and skewed. The DPI does not vary with position,
    so it is enough to look at how the transformation maps the image's axes.

    To find the scale, transform an offset vector v0 (0, 0), a width-axis
    vector vw (1, 0) and a height-axis vector vh (0, 1) with the matrix (PDF
    uses row vector * matrix rather than matrix * column vector) and take
        magnitude(width_vector - offset_vector)
    and
        magnitude(height_vector - offset_vector)
    Worked out algebraically, the translation cancels and the magnitudes
    depend only on the first four matrix entries: hypot(a, b) for the width
    and hypot(c, d) for the height, in PDF units.

    pdfimages -list computes DPI in a way that is wrong for rotated images,
    so it cannot be used to validate this; Photoshop, or rotating the image
    back with Acrobat, works.

    Whether the image is partially cropped or lies outside the /MediaBox makes
    no difference.

    Args:
        ctm_shorthand: Six numbers (a, b, c, d, e, f) of the CTM.
        image_size: Pixel width and height, indexable as ``[0]`` and ``[1]``.

    Returns:
        ``Resolution(dpi_w, dpi_h)``; an axis drawn with zero extent yields
        ``inf`` for that axis.
    """
    a, b, c, d, _, _ = ctm_shorthand  # pylint: disable=invalid-name

    # Size of the drawn image in units of default user space (1/72")
    drawn_width, drawn_height = hypot(a, b), hypot(c, d)

    def pixels_per_inch(pixels, drawn):
        if drawn == 0:
            return inf
        # pixels / drawn is pixels per 1/72"; there are 72 of those per inch.
        return pixels / drawn * 72.0

    return Resolution(
        pixels_per_inch(image_size[0], drawn_width),
        pixels_per_inch(image_size[1], drawn_height),
    )


================================================
FILE: src/ocrmypdf/pdfinfo/_image.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""PDF image analysis."""

from __future__ import annotations

import logging
from collections.abc import Iterator
from decimal import Decimal

from pikepdf import (
    Dictionary,
    Matrix,
    Name,
    Object,
    Pdf,
    PdfImage,
    PdfInlineImage,
    Stream,
    UnsupportedImageTypeError,
)

from ocrmypdf.helpers import Resolution
from ocrmypdf.pdfinfo._contentstream import (
    ContentsInfo,
    TextMarker,
    VectorMarker,
    _get_dpi,
    _interpret_contents,
    _is_unit_square,
)
from ocrmypdf.pdfinfo._types import (
    FRIENDLY_COLORSPACE,
    FRIENDLY_COMP,
    FRIENDLY_ENCODING,
    UNIT_SQUARE,
    Colorspace,
    Encoding,
)

# NOTE(review): getLogger() without a name returns the root logger rather
# than a logger named after this module - confirm this is intended.
logger = logging.getLogger()


class ImageInfo:
    """Information about an image found in a PDF.

    This gathers information from pikepdf and pdfminer.six, and is pickle-able
    so that it can be passed to a worker process, unlike objects from those
    libraries.

    Each instance describes one drawing of an image: the image's own
    properties (size, encoding, colorspace, ...) plus the transformation
    matrix shorthand that was in effect when it was drawn, from which
    :attr:`dpi` is derived.
    """

    DPI_PREC = Decimal('1.000')

    _comp: int | None
    _name: str

    def __init__(
        self,
        *,
        name='',
        pdfimage: Object | None = None,
        inline: PdfInlineImage | None = None,
        shorthand=None,
    ):
        """Initialize an ImageInfo.

        Args:
            name: Name of the image; stored as ``str(name)``.
            pdfimage: An image XObject. Only consulted when ``inline`` is
                None, and must be a ``Stream``.
            inline: An inline image. Takes precedence over ``pdfimage``.
            shorthand: Transformation matrix shorthand in effect where the
                image is drawn; handed to ``_get_dpi`` by :attr:`dpi`.

        Raises:
            ValueError: If ``inline`` is None and ``pdfimage`` is None or is
                not a ``Stream``.
        """
        self._name = str(name)
        self._shorthand = shorthand

        pim: PdfInlineImage | PdfImage

        if inline is not None:
            self._origin = 'inline'
            pim = inline
        elif pdfimage is not None and isinstance(pdfimage, Stream):
            self._origin = 'xobject'
            pim = PdfImage(pdfimage)
        else:
            raise ValueError("Either pdfimage or inline must be set")

        self._width = pim.width
        self._height = pim.height
        if (smask := pim.obj.get(Name.SMask, None)) is not None and isinstance(
            smask, Stream | Dictionary
        ):
            # SMask is pretty much an alpha channel, but in PDF it's possible
            # for channel to have different dimensions than the image
            # itself. Some PDF writers use this to create a grayscale stencil
            # mask. For our purposes, the effective size is the size of the
            # larger component (image or smask).
            self._width = max(smask.get(Name.Width, 0), self._width)
            self._height = max(smask.get(Name.Height, 0), self._height)
        if (mask := pim.obj.get(Name.Mask, None)) is not None and isinstance(
            mask, Stream | Dictionary
        ):
            # If the image has a /Mask entry, it has an explicit mask.
            # /Mask can be a Stream or an Array. If it's a Stream,
            # use its /Width and /Height if they are larger than the main
            # image's.
            self._width = max(mask.get(Name.Width, 0), self._width)
            self._height = max(mask.get(Name.Height, 0), self._height)

        # If /ImageMask is true, then this image is a stencil mask
        # (Images that draw with this stencil mask will have a reference to
        # it in their /Mask, but we don't actually need that information)
        if pim.image_mask:
            self._type = 'stencil'
        else:
            self._type = 'image'

        self._bpc = int(pim.bits_per_component)
        if (
            len(pim.filters) == 2
            and pim.filters[0] == '/FlateDecode'
            and pim.filters[1] == '/DCTDecode'
        ):
            # Special case: FlateDecode followed by DCTDecode
            self._enc = Encoding.flate_jpeg
        else:
            try:
                # Only the first filter is considered; filters missing from
                # the lookup table yield None.
                self._enc = FRIENDLY_ENCODING.get(pim.filters[0])
            except IndexError:
                # No filters at all
                self._enc = None

        try:
            self._color = FRIENDLY_COLORSPACE.get(pim.colorspace or '')
        except NotImplementedError:
            self._color = None
        if self._enc == Encoding.jpeg2000:
            # The encoding overrides whatever colorspace was declared
            self._color = Colorspace.jpeg2000

        # _comp must be assigned before _init_icc() runs, because the
        # warnings logged there format ``self`` and __repr__ reads it.
        self._comp = None
        if self._color == Colorspace.icc and isinstance(pim, PdfImage):
            self._comp = self._init_icc(pim)
        else:
            if isinstance(self._color, Colorspace):
                self._comp = FRIENDLY_COMP.get(self._color)
            # Bit of a hack... infer grayscale if component count is uncertain
            # but encoding only supports monochrome.
            if self._comp is None and self._enc in (Encoding.ccitt, Encoding.jbig2):
                self._comp = FRIENDLY_COMP[Colorspace.gray]

    def _init_icc(self, pim: PdfImage):
        """Return the component count implied by the image's ICC profile.

        Returns 1 for a 'GRAY' profile, 4 for 'CMYK' and 3 for anything else.
        Returns None, after logging a warning where applicable, when the
        profile is unreadable, missing, or lacks the expected attributes.
        """
        try:
            icc = pim.icc
        except UnsupportedImageTypeError as e:
            logger.warning(
                f"An image with a corrupt or unreadable ICC profile was found. "
                f"Output PDF may not match the input PDF visually: {e}. {self}"
            )
            return None
        # Check the ICC profile to determine actual colorspace
        if icc is None or not hasattr(icc, 'profile'):
            logger.warning(
                f"An image with an ICC profile but no ICC profile data was found. "
                f"The output PDF may not match the input PDF visually. {self}"
            )
            return None
        try:
            if icc.profile.xcolor_space == 'GRAY':
                return 1
            elif icc.profile.xcolor_space == 'CMYK':
                return 4
            else:
                return 3
        except AttributeError:
            return None

    @property
    def name(self):
        """Name of the image as it appears in the PDF."""
        return self._name

    @property
    def type_(self):
        """Type of image, either 'image' or 'stencil'."""
        return self._type

    @property
    def width(self) -> int:
        """Width of the image in pixels.

        This is the larger of the image's own width and the width of its
        /SMask or /Mask, when those are present.
        """
        return self._width

    @property
    def height(self) -> int:
        """Height of the image in pixels.

        This is the larger of the image's own height and the height of its
        /SMask or /Mask, when those are present.
        """
        return self._height

    @property
    def bpc(self):
        """Bits per component."""
        return self._bpc

    @property
    def color(self):
        """Colorspace of the image, or '?' if it could not be determined."""
        return self._color if self._color is not None else '?'

    @property
    def comp(self):
        """Number of components/channels in the image, or '?' if unknown."""
        return self._comp if self._comp is not None else '?'

    @property
    def enc(self):
        """Encoding of the image, or 'image' if it is not a known encoding."""
        return self._enc if self._enc is not None else 'image'

    @property
    def renderable(self) -> bool:
        """Whether the image is renderable.

        Some PDFs in the wild have invalid images that are not renderable,
        due to unusual dimensions.

        Stencil masks are also not renderable, since they are not
        drawn, but rather they control how rendering happens.
        """
        return (
            self.dpi.is_finite
            and self.width >= 0
            and self.height >= 0
            and self.type_ != 'stencil'
        )

    @property
    def dpi(self) -> Resolution:
        """Dots per inch of the image.

        Calculated based on where and how the image is drawn in the PDF.
        """
        return _get_dpi(self._shorthand, (self._width, self._height))

    @property
    def printed_area(self) -> float:
        """Physical area of the image in square inches.

        Returns 0.0 for images that are not renderable, and also for images
        whose DPI is zero on either axis (an image with no pixels), since
        the area cannot be derived from a zero DPI.
        """
        if not self.renderable:
            return 0.0
        dpi = self.dpi
        if dpi.x == 0 or dpi.y == 0:
            # A zero DPI is finite, so the image still counts as renderable,
            # but dividing by it would raise ZeroDivisionError.
            return 0.0
        return float((self.width / dpi.x) * (self.height / dpi.y))

    def __repr__(self):
        """Return a string representation of the image."""
        return (
            f"<ImageInfo '{self.name}' {self.type_} {self.width}×{self.height} "
            f"{self.color} {self.comp} {self.bpc} {self.enc} {self.dpi}>"
        )


def _find_inline_images(contentsinfo: ContentsInfo) -> Iterator[ImageInfo]:
    """Yield an ImageInfo for each inline image recorded in the contentstream.

    Images are named ``inline-NN`` after their position in
    ``contentsinfo.inline_images``.
    """
    yield from (
        ImageInfo(
            name=f'inline-{position:02d}',
            shorthand=entry.shorthand,
            inline=entry.iimage,
        )
        for position, entry in enumerate(contentsinfo.inline_images)
    )


def _image_xobjects(container) -> Iterator[tuple[Object, str]]:
    """Search for all XObject-based images in the container.

    Usually the container is a page, but it could also be a Form XObject
    that contains images. Form XObjects themselves are filtered out here;
    they are dealt with elsewhere.

    Generates tuples ``(image, xobj)`` where ``image`` is the image object
    itself and ``xobj`` is the name it is registered under in the
    container's /Resources /XObject dictionary, since the object does not
    know its own name.

    """
    if Name.Resources not in container:
        return
    resources = container[Name.Resources]
    if Name.XObject not in resources:
        return

    for xobj_name, xobj in resources[Name.XObject].items():
        is_image = (
            xobj is not None
            and Name.Subtype in xobj
            and xobj[Name.Subtype] == Name.Image
        )
        if is_image:
            yield (xobj, xobj_name)


def _find_regular_images(
    container: Object, contentsinfo: ContentsInfo
) -> Iterator[ImageInfo]:
    """Yield an ImageInfo for every drawing of an XObject image.

    The images come from the container's /Resources /XObject. Usually the
    container is a page, but it could also be a Form XObject that contains
    images.

    Each image is reported once per recorded draw, carrying the matrix
    shorthand in effect at that draw so its DPI can be computed. Images
    that are never drawn are not reported.
    """
    for pdfimage, xobj in _image_xobjects(container):
        draws_by_name = contentsinfo.name_index
        if xobj not in draws_by_name:
            continue
        for draw in draws_by_name[xobj]:
            # At least one PDF in the wild (and test suite) draws an image
            # when the graphics stack depth is 0, meaning that the image
            # gets drawn into a square of 1x1 PDF units (or 1/72",
            # or 0.35 mm).  The equivalent DPI will be >100,000.  Exclude
            # these from our DPI calculation for the page.
            drawn_into_unit_square = draw.stack_depth == 0 and _is_unit_square(
                draw.shorthand
            )
            if not drawn_into_unit_square:
                yield ImageInfo(
                    name=draw.name, pdfimage=pdfimage, shorthand=draw.shorthand
                )


def _find_form_xobject_images(pdf: Pdf, container: Object, contentsinfo: ContentsInfo):
    """Yield whatever is found inside the container's Form XObjects.

    The container may be a page, or a parent Form XObject. For each Form
    XObject in the container's resources, its content is processed once per
    matching entry in ``contentsinfo.xobject_settings``, using that entry's
    matrix shorthand.

    """
    if Name.Resources not in container:
        return
    resources = container[Name.Resources]
    if Name.XObject not in resources:
        return

    for xobj_name, candidate in resources[Name.XObject].as_dict().items():
        if candidate is None or candidate.get(Name.Subtype) != Name.Form:
            continue

        # Find images once for each time this Form XObject is drawn.
        # This could be optimized to cache the multiple drawing events
        # but in practice both Form XObjects and multiple drawing of the
        # same object are both very rare.
        for settings in contentsinfo.xobject_settings:
            if settings.name != xobj_name:
                continue
            yield from _process_content_streams(
                pdf=pdf, container=candidate, shorthand=settings.shorthand
            )


def _process_content_streams(
    *, pdf: Pdf, container: Object, shorthand=None
) -> Iterator[VectorMarker | TextMarker | ImageInfo]:
    """Find all individual instances of images drawn in the container.

    Usually the container is a page, but it may also be a Form XObject.
    Containers that are neither a page with /Contents nor a Form XObject
    produce nothing.

    On a typical page images are stored inline or as regular images
    in an XObject. Form XObjects may include inline images, XObject images,
    and recursively, other Form XObjects; and also vector graphic objects.

    Every instance of an image being drawn somewhere is flattened and
    treated as a unique image, since if the same image is drawn multiple times
    on one page it may be drawn at differing resolutions, and our objective
    is to find the resolution at which the page can be rastered without
    downsampling.

    A ``VectorMarker`` and/or ``TextMarker`` is yielded first when the
    content stream contains vector graphics or text respectively.

    """
    container_type = container.get(Name.Type)

    if container_type == Name.Page and Name.Contents in container:
        initial_shorthand = shorthand or UNIT_SQUARE
    elif container_type == Name.XObject and container[Name.Subtype] == Name.Form:
        # Start from the CTM as it was when the "Do" operator that draws
        # this instance of the Form XObject was encountered
        draw_ctm = Matrix(shorthand) if shorthand else Matrix()

        # A Form XObject may provide its own matrix to map form space into
        # user space; fall back to a default Matrix() if it has none
        form_matrix = Matrix(container.get(Name.Matrix, Matrix()))

        # Concatenate form matrix with CTM to ensure CTM is correct for
        # drawing this instance of the XObject
        initial_shorthand = (form_matrix @ draw_ctm).shorthand
    else:
        return

    contentsinfo = _interpret_contents(container, initial_shorthand)

    if contentsinfo.found_vector:
        yield VectorMarker()
    if contentsinfo.found_text:
        yield TextMarker()
    yield from _find_inline_images(contentsinfo)
    yield from _find_regular_images(container, contentsinfo)
    yield from _find_form_xobject_images(pdf, container, contentsinfo)


================================================
FILE: src/ocrmypdf/pdfinfo/_types.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""PDF type definitions and constants."""

from __future__ import annotations

from enum import Enum, auto


class Colorspace(Enum):
    """Description of common image colorspaces in a PDF.

    Member values are assigned by ``auto()``; the mapping from PDF
    colorspace names to members is in ``FRIENDLY_COLORSPACE``.
    """

    # pylint: disable=invalid-name
    gray = auto()  # /DeviceGray, /CalGray, or /G in inline images
    rgb = auto()  # /DeviceRGB, /CalRGB, or /RGB in inline images
    cmyk = auto()  # /DeviceCMYK, or /CMYK in inline images
    lab = auto()  # /Lab
    icc = auto()  # /ICCBased
    index = auto()  # /Indexed, or /I in inline images
    sep = auto()  # /Separation
    devn = auto()  # /DeviceN
    pattern = auto()  # /Pattern
    jpeg2000 = auto()  # Not in FRIENDLY_COLORSPACE; has no PDF name there


class Encoding(Enum):
    """Description of common image encodings in a PDF.

    Member values are assigned by ``auto()``; the mapping from PDF filter
    names to members is in ``FRIENDLY_ENCODING``.
    """

    # pylint: disable=invalid-name
    ccitt = auto()  # /CCITTFaxDecode, or /CCF in inline images
    jpeg = auto()  # /DCTDecode, or /DCT in inline images
    jpeg2000 = auto()  # /JPXDecode
    jbig2 = auto()  # /JBIG2Decode
    asciihex = auto()  # /AHx
    ascii85 = auto()  # /A85
    lzw = auto()  # /LZW
    flate = auto()  # /Fl
    runlength = auto()  # /RL
    flate_jpeg = auto()  # Not in FRIENDLY_ENCODING; has no filter name there


# A rectangle expressed as four floats.
FloatRect = tuple[float, float, float, float]

# Maps PDF colorspace names (including the abbreviations allowed in inline
# images) to Colorspace members. Names not listed here have no entry.
FRIENDLY_COLORSPACE: dict[str, Colorspace] = {
    '/DeviceGray': Colorspace.gray,
    '/CalGray': Colorspace.gray,
    '/DeviceRGB': Colorspace.rgb,
    '/CalRGB': Colorspace.rgb,
    '/DeviceCMYK': Colorspace.cmyk,
    '/Lab': Colorspace.lab,
    '/ICCBased': Colorspace.icc,
    '/Indexed': Colorspace.index,
    '/Separation': Colorspace.sep,
    '/DeviceN': Colorspace.devn,
    '/Pattern': Colorspace.pattern,
    '/G': Colorspace.gray,  # Abbreviations permitted in inline images
    '/RGB': Colorspace.rgb,
    '/CMYK': Colorspace.cmyk,
    '/I': Colorspace.index,
}

# Maps PDF filter names to Encoding members.
# NOTE(review): for the ASCIIHex, ASCII85, LZW, Flate and RunLength filters
# only the abbreviated names appear below; the full names (e.g.
# '/FlateDecode') have no entry - confirm this is intended.
FRIENDLY_ENCODING: dict[str, Encoding] = {
    '/CCITTFaxDecode': Encoding.ccitt,
    '/DCTDecode': Encoding.jpeg,
    '/JPXDecode': Encoding.jpeg2000,
    '/JBIG2Decode': Encoding.jbig2,
    '/CCF': Encoding.ccitt,  # Abbreviations permitted in inline images
    '/DCT': Encoding.jpeg,
    '/AHx': Encoding.asciihex,
    '/A85': Encoding.ascii85,
    '/LZW': Encoding.lzw,
    '/Fl': Encoding.flate,
    '/RL': Encoding.runlength,
}

# Number of components for the colorspaces listed. Colorspaces that are not
# listed (icc, sep, devn, pattern, jpeg2000) have no entry.
FRIENDLY_COMP: dict[Colorspace, int] = {
    Colorspace.gray: 1,
    Colorspace.rgb: 3,
    Colorspace.cmyk: 4,
    Colorspace.lab: 3,
    Colorspace.index: 1,
}

# Six-element matrix shorthand (a, b, c, d, e, f) of the identity
# transformation.
UNIT_SQUARE = (1.0, 0.0, 0.0, 1.0, 0.0, 0.0)


================================================
FILE: src/ocrmypdf/pdfinfo/_worker.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""PDF page info worker process handling."""

from __future__ import annotations

import atexit
import logging
from collections.abc import Container, Sequence
from contextlib import contextmanager
from functools import partial
from pathlib import Path
from typing import TYPE_CHECKING

from pikepdf import Pdf

from ocrmypdf._concurrent import Executor
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import available_cpu_count, pikepdf_enable_mmap

if TYPE_CHECKING:
    from ocrmypdf.pdfinfo.info import PageInfo
    from ocrmypdf.pdfinfo.layout import PdfMinerState

# getLogger() is called without a name, so this is the root logger.
logger = logging.getLogger()

# Module-level Pdf handle. It stays None unless _pdf_pageinfo_sync_init()
# is called with pdf=None, in which case that function opens the input file
# and stores the handle here.
worker_pdf = None  # pylint: disable=invalid-name


def _pdf_pageinfo_sync_init(pdf: Pdf, infile: Path, pdfminer_loglevel):
    """Initialize a worker before it gathers page information.

    Enables pikepdf mmap and applies ``pdfminer_loglevel`` to the 'pdfminer'
    logger. If no already-open ``pdf`` is supplied, ``infile`` is opened
    into the module-level ``worker_pdf`` and scheduled to be closed when the
    process exits.
    """
    global worker_pdf  # pylint: disable=global-statement,invalid-name
    pikepdf_enable_mmap()

    pdfminer_logger = logging.getLogger('pdfminer')
    pdfminer_logger.setLevel(pdfminer_loglevel)

    if pdf is not None:
        # The caller already has an open Pdf; nothing to open here
        return

    # Open a copy for our worker process to use, and close it when this
    # process exits. The global is looked up at exit time.
    worker_pdf = Pdf.open(infile)
    atexit.register(lambda: worker_pdf.close())


@contextmanager
def _pdf_pageinfo_sync_pdf(thread_pdf: Pdf | None, infile: Path):
    if thread_pdf is not None:
        yield thread_pdf
    elif worker_pdf is not None:
        yield worker_pdf
    else:
        with Pdf.open(infile) as pdf:
            yield pdf


def _pdf_pageinfo_sync(
    pageno: int,
    thread_pdf: Pdf | None,
    infile: Path,
    check_pages: Container[int],
    detailed_analysis: bool,
    miner_state: PdfMinerState | None,
) -> PageInfo:
    """Build the PageInfo for page ``pageno``.

    The Pdf to read from is selected by ``_pdf_pageinfo_sync_pdf``; the
    remaining arguments are forwarded to ``PageInfo`` unchanged.
    """
    # Imported locally to avoid a circular import: info.py imports this
    # module, but PageInfo is defined in info.py
    from ocrmypdf.pdfinfo.info import PageInfo

    with _pdf_pageinfo_sync_pdf(thread_pdf, infile) as pdf:
        pageinfo = PageInfo(
            pdf, pageno, infile, check_pages, detailed_analysis, miner_state
        )
    return pageinfo


def _pdf_pageinfo_concurrent(
    pdf,
    executor: Executor,
    max_workers: int | None,
    use_threads: bool,
    infile,
    progbar,
    check_pages,
    detailed_analysis: bool = False,
    miner_state: PdfMinerState | None = None,
) -> Sequence[PageInfo | None]:
    """Gather a PageInfo for every page of ``pdf`` using ``executor``.

    Args:
        pdf: The open Pdf; its page count determines how many tasks run.
        executor: Callable that runs the tasks.
        max_workers: Upper bound on workers; None means use
            ``available_cpu_count()``.
        use_threads: Whether to use threads rather than processes. Forced to
            True when only one worker would be used.
        infile: Path of the input file, passed to each task and to the
            worker initializer.
        progbar: Progress reporting is disabled when this is falsy.
        check_pages: Passed through to each task.
        detailed_analysis: Passed through to each task.
        miner_state: Passed through to each task.

    Returns:
        A list with one slot per page, indexed by page number. Slots start
        as None and are filled in as each task finishes.

    Raises:
        InputFileError: If a task finishes with a falsy result.
    """
    pages: list[PageInfo | None] = [None] * len(pdf.pages)

    def update_pageinfo(page: PageInfo, pbar: ProgressBar):
        # Called as each task finishes; store the result by page number
        if not page:
            raise InputFileError("Could not read a page in the PDF")
        pages[page.pageno] = page
        pbar.update()

    if max_workers is None:
        max_workers = available_cpu_count()

    total = len(pdf.pages)

    # One worker for every four pages (plus one), capped at max_workers
    n_workers = min(1 + len(pages) // 4, max_workers)
    if n_workers == 1:
        # If we decided on only one worker, there is no point in using
        # a separate process.
        use_threads = True

    if use_threads and n_workers > 1:
        # If we are using threads, there is no point in using more than one
        # worker thread - they will just fight over the GIL.
        n_workers = 1

    # If we use a thread, we can pass the already-open Pdf for them to use
    # If we use processes, we pass a None which tells the init function to open its
    # own
    initial_pdf = pdf if use_threads else None

    contexts = (
        (n, initial_pdf, infile, check_pages, detailed_analysis, miner_state)
        for n in range(total)
    )
    assert n_workers == 1 if use_threads else n_workers >= 1, "Not multithreadable"
    logger.debug(
        f"Gathering info with {n_workers} "
        + ('thread' if use_threads else 'process')
        + " workers"
    )
    executor(
        use_threads=use_threads,
        max_workers=n_workers,
        progress_kwargs=dict(
            total=total, desc="Scanning contents", unit='page', disable=not progbar
        ),
        worker_initializer=partial(
            _pdf_pageinfo_sync_init,
            initial_pdf,
            infile,
            logging.getLogger('pdfminer').level,
        ),
        task=_pdf_pageinfo_sync,
        task_arguments=contexts,
        task_finished=update_pageinfo,
    )
    return pages


================================================
FILE: src/ocrmypdf/pdfinfo/info.py
================================================
#!/usr/bin/env python3

# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Extract information about the content of a PDF."""

from __future__ import annotations

import logging
import statistics
from collections.abc import Callable, Container, Iterable, Iterator
from contextlib import nullcontext
from decimal import Decimal
from os import PathLike
from pathlib import Path
from typing import NamedTuple

from pdfminer.layout import LTPage, LTTextBox
from pikepdf import Name, Page, Pdf

from ocrmypdf._concurrent import Executor, SerialExecutor
from ocrmypdf.exceptions import EncryptedPdfError
from ocrmypdf.helpers import Resolution
from ocrmypdf.pdfinfo._contentstream import TextboxInfo, TextMarker, VectorMarker
from ocrmypdf.pdfinfo._image import ImageInfo, _process_content_streams
from ocrmypdf.pdfinfo._types import FloatRect
from ocrmypdf.pdfinfo._worker import _pdf_pageinfo_concurrent
from ocrmypdf.pdfinfo.layout import (
    LTStateAwareChar,
    PdfMinerState,
    get_text_boxes,
)

# getLogger() is called without a name, so this is the root logger.
logger = logging.getLogger()


def _page_has_text(text_blocks: Iterable[FloatRect], page_width, page_height) -> bool:
    """Smarter text detection that ignores text in margins."""
    pw, ph = float(page_width), float(page_height)  # pylint: disable=invalid-name

    margin_ratio = 0.125
    interior_bbox = (
        margin_ratio * pw,  # left
        (1 - margin_ratio) * ph,  # top
        (1 - margin_ratio) * pw,  # right
        margin_ratio * ph,  # bottom  (first quadrant: bottom < top)
    )

    def rects_intersect(a: FloatRect, b: FloatRect) -> bool:
        """Check if two 4-tuple rects intersect.

        Where (a,b) are 4-tuple rects (left-0, top-1, right-2, bottom-3)
        https://stackoverflow.com/questions/306316/determine-if-two-rectangles-overlap-each-other
        Formula assumes all boxes are in first quadrant.
        """
        return a[0] < b[2] and a[2] > b[0] and a[1] > b[3] and a[3] < b[1]

    has_text = False
    for bbox in text_blocks:
        if rects_intersect(bbox, interior_bbox):
            has_text = True
            break
    return has_text


def simplify_textboxes(
    miner_page: LTPage, textbox_getter: Callable[[LTPage], Iterator[LTTextBox]]
) -> Iterator[TextboxInfo]:
    """Reduce each text box on a page to a small TextboxInfo record.

    We do this to save memory and ensure that our objects are pickleable.

    Only the first character of the first line of each box is examined. The
    box counts as visible unless that character's render mode is 3, and as
    corrupt if that character is U+FFFD. Boxes whose first character is not
    an ``LTStateAwareChar`` are skipped.
    """
    for textbox in textbox_getter(miner_page):
        leading_line = textbox._objs[0]  # pylint: disable=protected-access
        leading_char = leading_line._objs[0]  # pylint: disable=protected-access
        if isinstance(leading_char, LTStateAwareChar):
            is_visible = leading_char.rendermode != 3
            is_corrupt = leading_char.get_text() == '\ufffd'
            yield TextboxInfo(textbox.bbox, is_visible, is_corrupt)


class PageResolutionProfile(NamedTuple):
    """Summary of the image resolutions found on a page."""

    #: The weighted average DPI of the page, weighted by the area of each
    #: image.
    weighted_dpi: float

    #: The maximum DPI of an image on the page.
    max_dpi: float

    #: The average DPI of the page divided by the maximum DPI of the page.
    #:
    #: This indicates the intensity of the resolution variation on the page.
    #:
    #: If the ratio is 1.0 or close to 1.0, the page has all of its content
    #: at a uniform resolution. If the ratio is much lower than 1.0, some
    #: content is at a higher resolution than the rest of the page.
    average_to_max_dpi_ratio: float

    #: The maximum-DPI area of the page divided by the total drawn area.
    #:
    #: This indicates the prevalence of high-resolution content on the page.
    area_ratio: float


class PageInfo:
    """Information about type of contents on each page in a PDF.

    All information is gathered eagerly in the constructor; afterwards the
    object holds only plain Python data about the page.
    """

    _has_text: bool | None
    _has_vector: bool | None
    _images: list[ImageInfo] = []

    def __init__(
        self,
        pdf: Pdf,
        pageno: int,
        infile: PathLike,
        check_pages: Container[int],
        detailed_analysis: bool = False,
        miner_state: PdfMinerState | None = None,
    ):
        """Initialize a PageInfo object.

        Args:
            pdf: The open PDF to read the page from.
            pageno: Zero-based index of the page in ``pdf.pages``.
            infile: Path of the input file; stored but not opened here.
            check_pages: Pages whose contents should be examined. For pages
                not in this container only the geometry is recorded.
            detailed_analysis: Whether to collect text boxes from
                ``miner_state`` for pages that are checked.
            miner_state: Source of the per-page layout analysis; used only
                when ``detailed_analysis`` is true and the page is checked.
        """
        self._pageno = pageno
        self._infile = infile
        self._detailed_analysis = detailed_analysis
        self._gather_pageinfo(
            pdf, pageno, infile, check_pages, detailed_analysis, miner_state
        )

    def _gather_pageinfo(
        self,
        pdf: Pdf,
        pageno: int,
        infile: PathLike,
        check_pages: Container[int],
        detailed_analysis: bool,
        miner_state: PdfMinerState | None,
    ):
        """Populate this object from page ``pageno`` of ``pdf``.

        Records the page boxes, user unit, size in inches and rotation. If
        the page is in ``check_pages``, its content streams are also
        interpreted to find text, vector graphics and images, and the DPI
        needed to render all renderable images is computed.
        """
        page: Page = pdf.pages[pageno]
        mediabox = [Decimal(d) for d in page.mediabox.as_list()]
        width_pt = mediabox[2] - mediabox[0]
        height_pt = mediabox[3] - mediabox[1]

        self._artbox = [float(d) for d in page.artbox.as_list()]
        self._bleedbox = [float(d) for d in page.bleedbox.as_list()]
        self._cropbox = [float(d) for d in page.cropbox.as_list()]
        self._mediabox = [float(d) for d in page.mediabox.as_list()]
        self._trimbox = [float(d) for d in page.trimbox.as_list()]

        check_this_page = pageno in check_pages

        if check_this_page and detailed_analysis:
            page_analysis = miner_state.get_page_analysis(pageno)
            if page_analysis is not None:
                self._textboxes = list(
                    simplify_textboxes(page_analysis, get_text_boxes)
                )
            else:
                self._textboxes = []
            bboxes = (box.bbox for box in self._textboxes)

            # NOTE(review): this value is always replaced further down,
            # because the content stream scan also runs whenever
            # check_this_page is true and resets _has_text.
            self._has_text = _page_has_text(bboxes, width_pt, height_pt)
        else:
            self._textboxes = []
            self._has_text = None  # i.e. "no information"

        userunit = page.get(Name.UserUnit, Decimal(1.0))
        if not isinstance(userunit, Decimal):
            userunit = Decimal(userunit)
        self._userunit = userunit
        self._width_inches = width_pt * userunit / Decimal(72.0)
        self._height_inches = height_pt * userunit / Decimal(72.0)
        self._rotate = int(getattr(page.obj, 'Rotate', 0))

        # Initial transformation matrix for the page: a pure scaling by the
        # user unit
        userunit_shorthand = (userunit, 0, 0, userunit, 0, 0)

        if check_this_page:
            self._has_vector = False
            self._has_text = False
            self._images = []
            for info in _process_content_streams(
                pdf=pdf, container=page, shorthand=userunit_shorthand
            ):
                if isinstance(info, VectorMarker):
                    self._has_vector = True
                elif isinstance(info, TextMarker):
                    self._has_text = True
                elif isinstance(info, ImageInfo):
                    self._images.append(info)
                else:
                    raise NotImplementedError()
        else:
            self._has_vector = None  # i.e. "no information"
            self._has_text = None
            self._images = []

        self._dpi = None
        if self._images:
            dpi = Resolution(0.0, 0.0).take_max(
                image.dpi for image in self._images if image.renderable
            )
            self._dpi = dpi
            self._width_pixels = int(round(dpi.x * float(self._width_inches)))
            self._height_pixels = int(round(dpi.y * float(self._height_inches)))

    @property
    def pageno(self) -> int:
        """Return page number (0-based)."""
        return self._pageno

    @property
    def has_text(self) -> bool:
        """Return True if page has text, False if not or unknown."""
        return bool(self._has_text)

    @property
    def has_corrupt_text(self) -> bool:
        """Return True if page has corrupt text, False if not or unknown.

        Raises:
            NotImplementedError: If detailed analysis was not requested.
        """
        if not self._detailed_analysis:
            raise NotImplementedError('Did not do detailed analysis')
        return any(tbox.is_corrupt for tbox in self._textboxes)

    @property
    def has_vector(self) -> bool:
        """Return True if page has vector graphics, False if not or unknown.

        Vector graphics are sometimes used to draw fonts, so it may not be
        obvious on visual inspection whether a page has text or not.
        """
        return bool(self._has_vector)

    @property
    def width_inches(self) -> Decimal:
        """Return width of page in inches."""
        return self._width_inches

    @property
    def height_inches(self) -> Decimal:
        """Return height of page in inches."""
        return self._height_inches

    @property
    def width_pixels(self) -> int:
        """Return width of page in pixels, at the resolution of :attr:`dpi`."""
        return int(round(float(self.width_inches) * self.dpi.x))

    @property
    def height_pixels(self) -> int:
        """Return height of page in pixels, at the resolution of :attr:`dpi`."""
        return int(round(float(self.height_inches) * self.dpi.y))

    @property
    def rotation(self) -> int:
        """Return rotation of page in degrees.

        Will only be a multiple of 90.
        """
        return self._rotate

    @rotation.setter
    def rotation(self, value):
        if value in (0, 90, 180, 270, 360, -90, -180, -270):
            self._rotate = value
        else:
            raise ValueError("rotation must be a cardinal angle")

    @property
    def cropbox(self) -> FloatRect:
        """Return cropbox of page in PDF coordinates."""
        return self._cropbox

    @property
    def mediabox(self) -> FloatRect:
        """Return mediabox of page in PDF coordinates."""
        return self._mediabox

    @property
    def trimbox(self) -> FloatRect:
        """Return trimbox of page in PDF coordinates."""
        return self._trimbox

    @property
    def artbox(self) -> FloatRect:
        """Return artbox of page in PDF coordinates."""
        return self._artbox

    @property
    def bleedbox(self) -> FloatRect:
        """Return bleedbox of page in PDF coordinates."""
        return self._bleedbox

    @property
    def images(self) -> list[ImageInfo]:
        """Return images."""
        return self._images

    def get_textareas(self, visible: bool | None = None, corrupt: bool | None = None):
        """Return textareas bounding boxes in PDF coordinates on the page.

        Args:
            visible: If not None, keep only text boxes whose visibility
                equals this value.
            corrupt: If not None, keep only text boxes whose corrupt flag
                equals this value.

        Returns:
            An iterable of bounding boxes. When the page has no recorded
            text boxes, the (empty) list of text boxes itself is returned.

        Raises:
            NotImplementedError: If there are no recorded text boxes and
                both ``visible`` and ``corrupt`` were specified.
        """

        def predicate(
            obj: TextboxInfo, want_visible: bool | None, want_corrupt: bool | None
        ) -> bool:
            result = True
            if want_visible is not None and obj.is_visible != want_visible:
                result = False
            if want_corrupt is not None and obj.is_corrupt != want_corrupt:
                result = False
            return result

        if not self._textboxes:
            if visible is not None and corrupt is not None:
                raise NotImplementedError('Incomplete information on textboxes')
            return self._textboxes

        return (obj.bbox for obj in self._textboxes if predicate(obj, visible, corrupt))

    @property
    def dpi(self) -> Resolution:
        """Return DPI needed to render all images on the page.

        Returns a resolution of zero if the page has no images or was not
        checked.
        """
        if self._dpi is None:
            return Resolution(0.0, 0.0)
        return self._dpi

    @property
    def userunit(self) -> Decimal:
        """Return user unit of page."""
        return self._userunit

    @property
    def min_version(self) -> str:
        """Return minimum PDF version needed to render this page."""
        # NOTE(review): _gather_pageinfo always stores a Decimal in
        # _userunit (defaulting to 1.0), so this condition is always true and
        # '1.5' is never returned. Left unchanged because callers may rely on
        # the current result - confirm the intended behavior.
        if self.userunit is not None:
            return '1.6'
        else:
            return '1.5'

    @staticmethod
    def _area_drawn_at_dpi(dpis, areas, target_dpi) -> float:
        """Return the summed area of all images whose DPI equals target_dpi.

        ``dpis`` and ``areas`` are parallel sequences with one entry per
        image.
        """
        return sum(area for dpi, area in zip(dpis, areas) if dpi == target_dpi)

    def page_dpi_profile(self) -> PageResolutionProfile | None:
        """Return information about the DPIs of the page.

        This is useful to detect pages with a small proportion of high-resolution
        content that is forcing us to use a high DPI for the whole page. The ratio
        is weighted by the area of each image. If images overlap, the overlapped
        area counts.

        When several images share the maximum DPI, all of them count towards
        the maximum-DPI area.

        Vector graphics and text are ignored.

        Returns None if there is no meaningful DPI for the page.
        """
        image_dpis = []
        image_areas = []
        for image in self._images:
            if not image.renderable:
                continue
            image_dpis.append(image.dpi.to_scalar())
            image_areas.append(image.printed_area)

        total_drawn_area = sum(image_areas)
        if total_drawn_area == 0:
            return None

        weights = [area / total_drawn_area for area in image_areas]
        # Calculate harmonic mean of DPIs weighted by area
        weighted_dpi = statistics.harmonic_mean(image_dpis, weights)
        max_dpi = max(image_dpis)
        dpi_average_max_ratio = weighted_dpi / max_dpi

        # Count every image drawn at the maximum DPI, not just the first one,
        # so that a page made of several equally high-resolution images is
        # not reported as having only a small high-resolution area.
        max_dpi_area = self._area_drawn_at_dpi(image_dpis, image_areas, max_dpi)
        max_area_ratio = max_dpi_area / total_drawn_area
        return PageResolutionProfile(
            weighted_dpi,
            max_dpi,
            dpi_average_max_ratio,
            max_area_ratio,
        )

    def __repr__(self):
        """Return string representation."""
        return (
            f'<PageInfo '
            f'pageno={self.pageno} {self.width_inches}"x{self.height_inches}" '
            f'rotation={self.rotation} dpi={self.dpi} has_text={self.has_text}>'
        )


# Shared executor instance used as the default for PdfInfo's ``executor``
# argument.
DEFAULT_EXECUTOR = SerialExecutor()


class PdfInfo:
    """Summary of a PDF, gathered once so the PDF itself need not be kept.

    The point is to hold the information in a pure Python form so that it can
    be pickled and handed to a worker process.
    """

    # Class-level defaults; ``__init__`` overrides them per instance as needed.
    _has_acroform: bool = False
    _has_signature: bool = False
    _needs_rendering: bool = False

    def __init__(
        self,
        infile: Path,
        *,
        detailed_analysis: bool = False,
        progbar: bool = False,
        max_workers: int | None = None,
        use_threads: bool = True,
        check_pages=None,
        executor: Executor = DEFAULT_EXECUTOR,
    ):
        """Open *infile*, collect per-page and document-level facts, then close it.

        Args:
            infile: The PDF to examine.
            detailed_analysis: When true, a ``PdfMinerState`` is created and
                passed to the page scan; otherwise a null context is used.
            progbar: Forwarded to the page scan.
            max_workers: Forwarded to the page scan.
            use_threads: Forwarded to the page scan.
            check_pages: Pages to examine; ``None`` selects
                ``range(0, 1_000_000_000)``.
            executor: Executor forwarded to the page scan.

        Raises:
            EncryptedPdfError: If the opened PDF reports that it is encrypted.
        """
        self._infile = infile
        pages_to_check = (
            range(0, 1_000_000_000) if check_pages is None else check_pages
        )

        with Pdf.open(infile) as pdf:
            if pdf.is_encrypted:
                raise EncryptedPdfError()  # Triggered by encryption with empty passwd

            creator = str(pdf.docinfo.get(Name.Creator, ""))
            pscript5_mode = creator.startswith('PScript5')

            if detailed_analysis:
                self._miner_state = PdfMinerState(infile, pscript5_mode)
            else:
                self._miner_state = nullcontext()
            with self._miner_state as miner_state:
                self._pages = _pdf_pageinfo_concurrent(
                    pdf,
                    executor,
                    max_workers,
                    use_threads,
                    infile,
                    progbar,
                    check_pages=pages_to_check,
                    detailed_analysis=detailed_analysis,
                    miner_state=miner_state,
                )

            root = pdf.Root
            self._needs_rendering = root.get(Name.NeedsRendering, False)
            if Name.AcroForm in root:
                acroform = root.AcroForm
                # A form counts only if it has fields or carries an XFA entry
                if len(acroform.get(Name.Fields, [])) > 0 or Name.XFA in acroform:
                    self._has_acroform = True
                # Lowest bit of /SigFlags decides the signature flag
                self._has_signature = bool(acroform.get(Name.SigFlags, 0) & 1)
            mark_info = root.get(Name.MarkInfo, {})
            self._is_tagged = bool(mark_info.get(Name.Marked, False))

    @property
    def pages(self) -> list[PageInfo | None]:
        """The list of per-page results produced by the page scan."""
        return self._pages

    @property
    def min_version(self) -> str:
        """The minimum PDF version needed to render this PDF."""
        # The document needs the highest version demanded by any of its pages
        versions = [page.min_version for page in self.pages if page]
        return max(versions)

    @property
    def has_userunit(self) -> bool:
        """Whether any page's ``userunit`` differs from 1.0."""
        return any(page.userunit != 1.0 for page in filter(None, self.pages))

    @property
    def has_acroform(self) -> bool:
        """Whether the catalog's AcroForm has fields or an XFA entry."""
        return self._has_acroform

    @property
    def has_signature(self) -> bool:
        """Whether the lowest bit of the AcroForm's ``/SigFlags`` is set."""
        return self._has_signature

    @property
    def is_tagged(self) -> bool:
        """Whether the catalog's ``/MarkInfo`` has a truthy ``/Marked`` entry."""
        return self._is_tagged

    @property
    def filename(self) -> str | Path:
        """The input file given at construction.

        Raises:
            NotImplementedError: If the input was neither a ``str`` nor a ``Path``.
        """
        if isinstance(self._infile, (str, Path)):
            return self._infile
        raise NotImplementedError("can't get filename from stream")

    @property
    def needs_rendering(self) -> bool:
        """The catalog's ``/NeedsRendering`` entry, or False when it is absent.

        XFA forms are not supported by most standard PDF renderers, so we
        need to detect and suppress them.
        """
        return self._needs_rendering

    def __getitem__(self, item) -> PageInfo:
        """Look up the page result stored at index `item`."""
        return self._pages[item]

    def __len__(self):
        """Count the entries in the page list."""
        return len(self._pages)

    def __repr__(self):
        """Short description that includes the page count."""
        return f"<PdfInfo('...'), page count={len(self)}>"


def main():  # pragma: no cover
    """Pretty-print the PdfInfo, its pages and their images for one input file."""
    import argparse  # pylint: disable=import-outside-toplevel
    from pprint import pprint  # pylint: disable=import-outside-toplevel

    cli = argparse.ArgumentParser()
    cli.add_argument('infile')
    info = PdfInfo(cli.parse_args().infile)

    pprint(info)
    for page in info.pages:
        pprint(page)
        for image in page.images:
            pprint(image)


# Run main() only when this module is executed as a script, not on import.
if __name__ == '__main__':
    main()


================================================
FILE: src/ocrmypdf/pdfinfo/layout.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Detailed text position and layout analysis, building on pdfminer.six."""

from __future__ import annotations

import re
from collections.abc import Iterator, Mapping
from contextlib import contextmanager
from math import copysign
from os import PathLike
from pathlib import Path
from typing import Any
from unittest.mock import patch

import pdfminer
import pdfminer.encodingdb
import pdfminer.pdfdevice
import pdfminer.pdfinterp
import pdfminer.psparser
from deprecation import deprecated
from pdfminer.converter import PDFLayoutAnalyzer
from pdfminer.layout import LAParams, LTChar, LTPage, LTTextBox
from pdfminer.pdfcolor import PDFColorSpace
from pdfminer.pdfdevice import PDFTextSeq
from pdfminer.pdfdocument import PDFTextExtractionNotAllowed
from pdfminer.pdffont import FontWidthDict, PDFFont, PDFSimpleFont, PDFUnicodeNotDefined
from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager, PDFTextState
from pdfminer.pdfpage import PDFPage
from pdfminer.utils import Matrix, bbox2str, matrix2str

from ocrmypdf.exceptions import EncryptedPdfError, InputFileError

# Precompiled pattern matching a run of one or more ASCII digits.
STRIP_NAME = re.compile(r'[0-9]+')


# Keep a reference to pdfminer.six's own initializer so that the replacement
# defined below can delegate to it.
original_pdfsimplefont_init = PDFSimpleFont.__init__


def pdfsimplefont__init__(
    self,
    descriptor: Mapping[str, Any],
    widths: FontWidthDict,
    spec: Mapping[str, Any],
) -> None:
    """Replacement for pdfminer.six ``PDFSimpleFont.__init__``.

    pdfminer.six assumes Unicode conversion is possible when a font has
    neither a ToUnicode map nor an Encoding. According to PDF Reference Manual
    9.10.2 that assumption is incorrect, so after running the original
    initializer this patch empties ``cid2unicode`` for such fonts.
    """
    original_pdfsimplefont_init(self, descriptor, widths, spec)
    # A unicode map or an explicit Encoding entry means the mapping set up by
    # the original initializer is left untouched.
    if self.unicode_map or 'Encoding' in spec:
        return
    self.cid2unicode = {}


# Install the patched initializer on pdfminer.six's PDFSimpleFont.
PDFSimpleFont.__init__ = pdfsimplefont__init__

# Patch pdfminer.six buffer size
# The parser doesn't properly handle keyword tokens that are split across the end
# of the buffer, so increase the buffer size to something far larger than will
# ever be seen.
pdfminer.psparser.PSBaseParser.BUFSIZ = 256 * 1024 * 1024


def pdftype3font__pscript5_get_height(self):
    """Height getter used to patch Type3 fonts in PScript5.dll PDFs.

    Type3 font heights are known to be wrong in PDFs generated by PScript5.dll.
    The height is taken from the bounding box (``bbox[3] - bbox[1]``); when
    that is zero, ``ascent - descent`` is used instead. The result carries
    the sign of ``vscale``.
    """
    box = self.bbox
    box_height = box[3] - box[1]
    height = box_height if box_height != 0 else self.ascent - self.descent
    return height * copysign(1.0, self.vscale)


def pdftype3font__pscript5_get_descent(self):
    """Descent getter used to patch Type3 fonts in PScript5.dll PDFs.

    Type3 font descents are known to be wrong in PDFs generated by
    PScript5.dll. The stored descent is multiplied by +1.0 or -1.0, matching
    the sign of ``vscale``.
    """
    direction = copysign(1.0, self.vscale)
    return self.descent * direction


def pdftype3font__pscript5_get_ascent(self):
    """Ascent getter used to patch Type3 fonts in PScript5.dll PDFs.

    Type3 font ascents are known to be wrong in PDFs generated by
    PScript5.dll. The stored ascent is multiplied by +1.0 or -1.0, matching
    the sign of ``vscale``.
    """
    direction = copysign(1.0, self.vscale)
    return self.ascent * direction


def _is_undefined_char(s: str) -> bool:
    """Check if a string is an undefined character."""
    return s.startswith('(cid:') and s.endswith(')')


class LTStateAwareChar(LTChar):
    """An LTChar that also remembers the text render mode it was drawn with."""

    __slots__ = (
        'rendermode',
        '_text',
        'matrix',
        'fontname',
        'adv',
        'upright',
        'size',
        'width',
        'height',
        'bbox',
        'x0',
        'x1',
        'y0',
        'y1',
    )

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: float | tuple[float | None, float],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
        textstate: PDFTextState,
    ) -> None:
        """Build the character, then record ``textstate.render`` as ``rendermode``.

        Every argument except *textstate* is passed through unchanged to
        ``LTChar.__init__``.
        """
        super().__init__(
            matrix, font, fontsize, scaling, rise,
            text, textwidth, textdisp, ncs, graphicstate,
        )  # fmt: skip
        self.rendermode = textstate.render

    def is_compatible(self, obj: object) -> bool:
        """Decide whether this character and *obj* may share a textline.

        Two characters are compatible when:
            - both have a known Unicode mapping and the same render mode, or
            - at least one mapping is unknown, and both the font name and the
              render mode agree.
        Anything that is not an ``LTStateAwareChar`` is never compatible.
        """
        # pylint: disable=protected-access
        if not isinstance(obj, LTStateAwareChar):
            return False
        # A shared render mode is required in every case
        if self.rendermode != obj.rendermode:
            return False
        if _is_undefined_char(self._text) or _is_undefined_char(obj._text):
            # Without a Unicode mapping, fall back to requiring the same font
            return self.fontname == obj.fontname
        return True

    def get_text(self) -> str:
        """Return the character's text, or U+FFFD when it has no Unicode mapping."""
        # '\ufffd' is the standard 'Unknown symbol'
        return '\ufffd' if _is_undefined_char(self._text) else self._text

    def __repr__(self) -> str:
        """Describe the bbox, matrix, render mode, font, advance and text."""
        fields = (
            f"{bbox2str(self.bbox)}",
            f"matrix={matrix2str(self.matrix)}",
            f"rendermode={self.rendermode!r}",
            f"font={self.fontname!r}",
            f"adv={self.adv}",
            f"text={self.get_text()!r}",
        )
        return f"<{self.__class__.__name__} " + " ".join(fields) + ">"


class TextPositionTracker(PDFLayoutAnalyzer):
    """Layout analyzer that records the text state in effect for each character."""

    # Copy of the text state captured by the most recent render_string() call
    textstate: PDFTextState

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: LAParams | None = None,
    ):
        """Set up the analyzer with no result available yet."""
        super().__init__(rsrcmgr, pageno, laparams)
        self.result: LTPage | None = None

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Start a page: defer to the base class, then install a fresh LTPage."""
        super().begin_page(page, ctm)
        self.cur_item = LTPage(self.pageno, page.mediabox)

    def end_page(self, page: PDFPage) -> None:
        """Finish a page: analyze it if parameters were given, then publish it."""
        assert not self._stack, str(len(self._stack))
        assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
        layout = self.cur_item
        if self.laparams is not None:
            layout.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(layout)

    def render_string(
        self,
        textstate: PDFTextState,
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> None:
        """Keep a copy of the text state and render the string with that copy."""
        snapshot = textstate.copy()
        self.textstate = snapshot
        super().render_string(snapshot, seq, ncs, graphicstate)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Add one character, tagged with the saved text state, to the page.

        Returns:
            The ``adv`` attribute of the character item that was created.
        """
        try:
            text = font.to_unichr(cid)
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        else:
            assert isinstance(text, str), str(type(text))
        item = LTStateAwareChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            font.char_width(cid),
            font.char_disp(cid),
            ncs,
            graphicstate,
            self.textstate,
        )
        self.cur_item.add(item)
        return item.adv

    def receive_layout(self, ltpage: LTPage) -> None:
        """Store the finished page layout as the current result."""
        self.result = ltpage

    def get_result(self) -> LTPage | None:
        """Return the stored page layout, or None if none was stored."""
        return self.result


@contextmanager
def patch_pdfminer(pscript5_mode: bool):
    """Temporarily patch pdfminer.six for PDFs created by PScript5.

    When *pscript5_mode* is false nothing is patched. Otherwise the
    ``get_ascent``, ``get_descent`` and ``get_height`` methods of
    ``pdfminer.pdffont.PDFType3Font`` are replaced for the duration of the
    ``with`` block.
    """
    if not pscript5_mode:
        yield
        return

    with patch.multiple(
        'pdfminer.pdffont.PDFType3Font',
        spec=True,
        get_ascent=pdftype3font__pscript5_get_ascent,
        get_descent=pdftype3font__pscript5_get_descent,
        get_height=pdftype3font__pscript5_get_height,
    ):
        yield


@deprecated(deprecated_in='16.6.0', details='Use PdfMinerState instead.')
def get_page_analysis(
    infile: PathLike, pageno: int, pscript5_mode: bool
) -> LTPage | None:
    """Run pdfminer layout analysis on one page of *infile*.

    Args:
        infile: Path of the PDF, opened in binary mode.
        pageno: Page to analyze (counting from 0, per the error message).
        pscript5_mode: Whether to apply the PScript5 patches during analysis.

    Returns:
        Whatever the ``TextPositionTracker`` device reports as its result.

    Raises:
        InputFileError: If pdfminer yields no page for *pageno*.
        EncryptedPdfError: If pdfminer raises ``PDFTextExtractionNotAllowed``.
    """
    resource_manager = pdfminer.pdfinterp.PDFResourceManager(caching=True)
    layout_params = LAParams(all_texts=True, detect_vertical=True, boxes_flow=None)
    tracker = TextPositionTracker(resource_manager, laparams=layout_params)
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(resource_manager, tracker)

    with patch_pdfminer(pscript5_mode):
        try:
            with Path(infile).open('rb') as stream:
                pages = PDFPage.get_pages(stream, pagenos=[pageno], maxpages=0)
                page = next(pages, None)
                if page is None:
                    raise InputFileError(
                        f"pdfminer could not process page {pageno} (counting from 0)."
                    )
                interpreter.process_page(page)
        except PDFTextExtractionNotAllowed as e:
            raise EncryptedPdfError() from e

    return tracker.get_result()


class PdfMinerState:
    """Provide a context manager for using pdfminer.six.

    This ensures that the file is closed. It also provides a cache of pages
    from the PDF so that they can be reused if needed, to improve performance.

    Exceptions raised inside the ``with`` block are never suppressed; they
    propagate to the caller after the file has been closed.
    """

    def __init__(self, infile: Path, pscript5_mode: bool) -> None:
        """Initialize the context manager.

        Args:
            infile: The path to the PDF file to be analyzed.
            pscript5_mode: Whether the PDF was generated by PScript5.dll.
        """
        self.infile = infile
        self.rman = pdfminer.pdfinterp.PDFResourceManager(caching=True)
        self.disable_boxes_flow = None
        self.page_iter = None
        self.page_cache: list[PDFPage] = []
        self.pscript5_mode = pscript5_mode
        self.file = None

    def __enter__(self):
        """Open the input file and prepare the page iterator.

        Returns:
            This object, so it can be bound with ``with ... as state``.
        """
        self.file = Path(self.infile).open('rb')
        self.page_iter = PDFPage.get_pages(self.file)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the input file, if one was opened.

        Returns:
            ``False``, so that an exception raised inside the ``with`` block
            continues to propagate. Returning a true value here would
            silently swallow every error, leaving the caller with partially
            initialized state and no indication of what went wrong.
        """
        if self.file:
            self.file.close()
        return False

    def get_page_analysis(self, pageno: int):
        """Get the page analysis for a given page.

        Pages are pulled from the iterator and cached until *pageno* is
        available, so earlier pages can be requested again without re-reading.

        Args:
            pageno: Index of the page in the cache (counting from 0).

        Returns:
            The result reported by the ``TextPositionTracker`` device after
            the page has been processed.

        Raises:
            InputFileError: If the iterator is exhausted before reaching
                *pageno*, or if the cached page is falsy.
        """
        while len(self.page_cache) <= pageno:
            try:
                self.page_cache.append(next(self.page_iter))
            except StopIteration:
                raise InputFileError(
                    f"pdfminer did not find page {pageno} in the input file."
                ) from None
        page = self.page_cache[pageno]
        if not page:
            raise InputFileError(
                f"pdfminer could not process page {pageno} (counting from 0)."
            )
        dev = TextPositionTracker(
            self.rman,
            laparams=LAParams(
                all_texts=True, detect_vertical=True, boxes_flow=self.disable_boxes_flow
            ),
        )
        interp = pdfminer.pdfinterp.PDFPageInterpreter(self.rman, dev)

        # Apply the PScript5 workarounds only while this page is interpreted
        with patch_pdfminer(self.pscript5_mode):
            interp.process_page(page)

        return dev.get_result()


def get_text_boxes(obj) -> Iterator[LTTextBox]:
    """Yield every LTTextBox found among the descendants of *obj*.

    Children that are text boxes are yielded as they are. Any other child is
    searched recursively, and a ``TypeError`` raised while doing so (such as
    from a child that cannot be iterated) ends the search of that child.
    """
    for node in obj:
        if isinstance(node, LTTextBox):
            yield node
            continue
        try:
            yield from get_text_boxes(node)
        except TypeError:
            continue


================================================
FILE: src/ocrmypdf/pluginspec.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""OCRmyPDF pluggy plugin specification."""

from __future__ import annotations

from abc import ABC, abstractmethod
from argparse import ArgumentParser
from collections.abc import Sequence, Set
from enum import StrEnum
from logging import Handler
from pathlib import Path
from typing import TYPE_CHECKING, NamedTuple

import pluggy
from pydantic import BaseModel

from ocrmypdf import Executor, PdfContext
from ocrmypdf._options import OcrOptions
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.helpers import Resolution

if TYPE_CHECKING:
    from PIL import Image

    # pylint: disable=ungrouped-imports
    from ocrmypdf._jobcontext import PageContext
    from ocrmypdf.hocrtransform import OcrElement
    from ocrmypdf.pdfinfo import PdfInfo

    # pylint: enable=ungrouped-imports


class GhostscriptRasterDevice(StrEnum):
    """Possible raster devices for Ghostscript.

    Each member is a string enum whose value is the device identifier.
    """

    # NOTE(review): the values look like Ghostscript output device names, where
    # the prefix gives the file format (JPEG or PNG) and the suffix the color
    # mode; confirm exact color depths against the Ghostscript device docs.
    JPEGGRAY = 'jpeggray'
    JPEGCOLOR = 'jpeg'
    PNGMONO = 'pngmono'
    PNGGRAY = 'pnggray'
    PNG256 = 'png256'
    PNG16M = 'png16m'


# Marker used to decorate the hook specifications in this module; it is
# registered under the project name 'ocrmypdf'.
hookspec = pluggy.HookspecMarker('ocrmypdf')

# pylint: disable=unused-argument
# mypy: disable-error-code=empty-body


@hookspec(firstresult=True)
def get_logging_console() -> Handler:  # type: ignore[return-value]
    """Returns a custom logging handler.

    Generally this is necessary when logging output and a progress bar are both
    writing to ``sys.stderr``.

    Returns:
        A :class:`logging.Handler` to use for console output.

    Note:
        This is a :ref:`firstresult hook<firstresult>`.
    """


@hookspec
def initialize(plugin_manager: pluggy.PluginManager) -> None:
    """Called when this plugin is first loaded into OCRmyPDF.

    The primary intended use of this is for plugins to check compatibility with other
    plugins and possibly block other plugins. For example, a plugin that wishes to
    block ocrmypdf's built-in optimize plugin could do:

    .. code-block::

        plugin_manager.set_blocked('ocrmypdf.builtin_plugins.optimize')

    It would also be reasonable for a plugin implementation to check if it is unable
    to proceed, for example, because a required dependency is missing. (If the plugin's
    ability to proceed depends on options and arguments, use ``validate`` instead.)

    Args:
        plugin_manager: The plugin manager into which this plugin was loaded.

    Raises:
        ocrmypdf.exceptions.ExitCodeException: If options are not acceptable
            and the application should terminate gracefully with an informative
            message and error code.

    Note:
        This hook will be called from the main process, and may modify global state
        before child worker processes are forked.
    """


@hookspec
def add_options(parser: ArgumentParser) -> None:
    """Allows the plugin to add its own command line and API arguments.

    OCRmyPDF converts command line arguments to API arguments, so adding
    arguments here will cause new arguments to be processed for API calls
    to ``ocrmypdf.ocr``, or when invoked on the command line.

    Args:
        parser: The argument parser to which the plugin may add its arguments.

    Note:
        This hook will be called from the main process, and may modify global state
        before child worker processes are forked.
    """


@hookspec
def register_options() -> dict[str, type[BaseModel]]:
    """Return the plugin's option models keyed by namespace.

    This hook allows plugins to register their option models with the
    plugin option registry. The returned dictionary should map namespace
    strings to Pydantic model classes.

    Returns:
        Dictionary mapping namespace strings to BaseModel classes.

    Example:
        @hookimpl
        def register_options():
            return {'tesseract': TesseractOptions}

    Note:
        This hook will be called from the main process during plugin
        infrastructure setup, before child worker processes are forked.
    """


@hookspec
def check_options(options: OcrOptions) -> None:
    """Called to ask the plugin to check all of the options.

    The plugin may check if options that it added are valid.

    Warnings or other messages may be passed to the user by creating a logger
    object using ``log = logging.getLogger(__name__)`` and logging to this.

    The plugin may also modify the *options*. All objects that are in options
    must be picklable so they can be marshalled to child worker processes.

    Args:
        options: The options to check; the plugin may modify them.

    Raises:
        ocrmypdf.exceptions.ExitCodeException: If options are not acceptable
            and the application should terminate gracefully with an informative
            message and error code.

    Note:
        This hook will be called from the main process, and may modify global state
        before child worker processes are forked.
    """


@hookspec(firstresult=True)
def get_executor(progressbar_class: type[ProgressBar]) -> Executor:  # type: ignore[return-value]
    """Called to obtain an object that manages parallel execution.

    This may be used to replace OCRmyPDF's default parallel execution system
    with a third party alternative. For example, you could make OCRmyPDF run in a
    distributed environment.

    OCRmyPDF's executors are analogous to the standard Python executors in
    ``concurrent.futures``, but they do not work the same way. Executors may
    be reused for different, unrelated batch operations, since all of the context
    for a given job is passed to :meth:`Executor.__call__`.

    The returned object should be of type :class:`Executor` or otherwise conform
    to the protocol of that class.

    Arguments:
        progressbar_class: A progress bar class (see ``get_progressbar_class``).
            NOTE(review): the original description was cut off after "which
            will be created when"; presumably the executor instantiates this
            class to report progress while it runs a batch. Confirm.

    Note:
        This hook will be called from the main process, and may modify global state
        before child worker processes are forked.

    Note:
        This is a :ref:`firstresult hook<firstresult>`.
    """


@hookspec(firstresult=True)
def get_progressbar_class() -> type[ProgressBar]:  # type: ignore[return-value]
    """Called to obtain a class that can be used to monitor progress.

    OCRmyPDF will call this function when it wants to display a progress bar.
    The class returned by this function must be compatible with the
    :class:`ProgressBar` protocol.

    Returns:
        A class (not an instance) compatible with :class:`ProgressBar`.

    Example:
        Here is how OCRmyPDF will use the progress bar:

        .. code-block:: python

            pbar_class = pm.hook.get_progressbar_class()
            with pbar_class(**progress_kwargs) as pbar:
                ... # do some work
                pbar.update(1)
    """


@hookspec
def validate(pdfinfo: PdfInfo, options: OcrOptions) -> None:
    """Called to give a plugin an opportunity to review *options* and *pdfinfo*.

    *options* contains the "work order" to process a particular file. *pdfinfo*
    contains information about the input file obtained after loading and
    parsing. The plugin may modify the *options*. For example, you could decide
    that a certain type of file should be treated with ``options.force_ocr = True``
    based on information in its *pdfinfo*.

    Args:
        pdfinfo: Information about the input file.
        options: The work order for this file; the plugin may modify it.

    Raises:
        ocrmypdf.exceptions.ExitCodeException: If options or pdfinfo are not acceptable
            and the application should terminate gracefully with an informative
            message and error code.

    Note:
        This hook will be called from the main process, and may modify global state
        before child worker processes are forked.
    """


@hookspec(firstresult=True)
def rasterize_pdf_page(
    input_file: Path,
    output_file: Path,
    raster_device: GhostscriptRasterDevice,
    raster_dpi: Resolution,
    pageno: int,
    page_dpi: Resolution | None,
    rotation: int | None,
    filter_vector: bool,
    stop_on_soft_error: bool,
    options: OcrOptions | None,
    use_cropbox: bool,
) -> Path:  # type: ignore[return-value]
    """Rasterize one page of a PDF at resolution raster_dpi in canvas units.

    The image is sized to match the integer pixel dimensions implied by
    raster_dpi even if those numbers are noninteger. The image's DPI will
    be overridden with the values in page_dpi.

    Args:
        input_file: The PDF to rasterize.
        output_file: The desired name of the rasterized image.
        raster_device: Type of image to produce at output_file.
        raster_dpi: Resolution in dots per inch at which to rasterize page.
        pageno: Page number to rasterize (beginning at page 1).
        page_dpi: Resolution, overriding output image DPI.
        rotation: Cardinal angle, clockwise, to rotate page.
        filter_vector: If True, remove vector graphics objects.
        stop_on_soft_error: If there is a "soft error" such that PDF page image
            generation can proceed, but may visually differ from the original,
            the implementer of this hook should raise a detailed exception. If
            ``False``, continue processing and report by logging it. If the hook
            cannot proceed, it should always raise an exception, regardless of
            this setting. One "soft error" would be a missing font that is
            required to properly rasterize the PDF.
        options: OCRmyPDF options. Plugins may use this to check settings like
            ``options.rasterizer`` to determine whether they should handle the
            request or defer to another plugin. Introduced in version 17.0.
        use_cropbox: If True, rasterize the page's CropBox instead of the
            MediaBox. Default is False (use MediaBox) for consistency with
            Ghostscript's default behavior.

    Returns:
        Path: output_file if successful

    Note:
        This hook will be called from child processes. Modifying global state
        will not affect the main process or other child processes.

    Note:
        This is a :ref:`firstresult hook<firstresult>`.
    """


@hookspec(firstresult=True)
def filter_ocr_image(page: PageContext, image: Image.Image) -> Image.Image:  # type: ignore[return-value]
    """Called to filter the image before it is sent to OCR.

    This is the image that OCR sees, not what the user sees when they view the
    PDF. In certain modes such as ``--redo-ocr``, portions of the image may be
    masked out to hide them from OCR.

    The main uses of this hook are expected to be hiding content from OCR,
    conditioning images to OCR better with filters, and adjusting images to
    match any constraints imposed by the OCR engine.

    The input image may be color, grayscale, or monochrome, and the
    output image may differ. For example, if you know that a custom OCR engine
    does not care about the color of the text, you could convert the image
    to grayscale or monochrome.

    Generally speaking, the output image should be a faithful representation
    of the input image. You *may* change the pixel width and height of
    the input image, but you must not change the aspect ratio, and you must
    calculate the DPI of the output image based on the new pixel width and
    height or the OCR text layer will be misaligned with the visual position.

    The built-in Tesseract OCR engine uses this hook itself to downsample
    very large images to fit its constraints.

    Note:
        This hook will be called from child processes. Modifying global state
        will not affect the main process or other child processes.

    Note:
        This is a :ref:`firstresult hook<firstresult>`.
    """


@hookspec(firstresult=True)
def filter_page_image(page: PageContext, image_filename: Path) -> Path:  # type: ignore[return-value]
    """Called to filter the whole page before it is inserted into the PDF.

    A whole page image is only produced when preprocessing command line arguments
    are issued or when ``--force-ocr`` is issued. If no whole page image is
    produced for a given page, this function will not be called. This is not
    the image that will be shown to OCR.

    If the function does not want to modify the image, it should return
    ``image_filename``. The hook may overwrite ``image_filename`` with a new file.

    The output image should preserve the same physical unit dimensions, that is
    ``(width / dpi_x, height / dpi_y)``. That is, if the image is resized, the DPI
    must be scaled by the same factor. If this is not preserved, the PDF page
    will be resized and the OCR layer misaligned. OCRmyPDF does nothing
    to enforce these constraints; it is up to the plugin to do sensible things.

    OCRmyPDF will create the PDF page based on the image format used (unless the
    hook is overridden). If you convert the image to a JPEG, the output page will
    be created as a JPEG, etc. If you change the colorspace, that change will be
    kept. Note that the OCRmyPDF image optimization stage, if enabled, may
    ultimately choose a different format.

    If the return value is a file that does not exist, ``FileNotFoundError``
    will occur. The return value should be a path to a file in the same folder
    as ``image_filename``.

    Note:
        This hook will be called from child processes. Modifying global state
        will not affect the main process or other child processes.

    Note:
        This is a :ref:`firstresult hook<firstresult>`.
    """


@hookspec(firstresult=True)
def filter_pdf_page(page: PageContext, image_filename: Path, output_pdf: Path) -> Path:  # type: ignore[return-value]
    """Called to convert a filtered whole page image into a PDF.

    A whole page image is only produced when preprocessing command line arguments
    are issued or when ``--force-ocr`` is issued. If no whole page image is
    produced for a given page, this function will not be called. This is not
    the image that will be shown to OCR. The whole page image is filtered in
    the hook above, ``filter_page_image``, then this function is called for
    PDF conversion.

    This function will only be called when OCRmyPDF runs in a mode such as
    "force OCR" mode where rasterizing of all content is performed.

    Clever things could be done at this stage such as segmenting the page image into
    color regions or vector equivalents.

    The provider of the hook implementation is responsible for ensuring that the
    OCR text layer is aligned with the PDF produced here, or text misalignment
    will result.

    Currently this function must produce a single page PDF or the pipeline will
    fail.  If the intent is to remove the PDF, then create a single page empty
    PDF.

    Args:
        page: Context for this page.
        image_filename: Filename of the input image used to create output_pdf,
            for "reference" if recreating the output_pdf entirely.
        output_pdf: The previously created output_pdf.

    Returns:
        output_pdf

    Note:
        This hook will be called from child processes. Modifying global state
        will not affect the main process or other child processes.

    Note:
        This is a :ref:`firstresult hook<firstresult>`.
    """


class OrientationConfidence(NamedTuple):
    """Expresses an OCR engine's confidence in page rotation.

    Attributes:
        angle: The clockwise angle (0, 90, 180, 270) that the page should be
            rotated. 0 means no rotation.
        confidence: How confident the OCR engine is that this is the correct
            rotation. 0 is not confident, 15 is very confident. Arbitrary units.
    """

    angle: int
    confidence: float


class OcrEngine(ABC):
    """A class representing an OCR engine with capabilities similar to Tesseract OCR.

    This could be used to create a plugin for another OCR engine instead of
    Tesseract OCR.

    Subclasses must implement every method marked abstract below. The methods
    ``get_deskew()``, ``supports_generate_ocr()`` and ``generate_ocr()`` have
    default implementations and only need to be overridden when the engine
    provides that capability.
    """

    @staticmethod
    @abstractmethod
    def version() -> str:
        """Returns the version of the OCR engine."""

    @staticmethod
    @abstractmethod
    def creator_tag(options: OcrOptions) -> str:
        """Returns the creator tag to identify this software's role in creating the PDF.

        This tag will be inserted in the XMP metadata and DocumentInfo dictionary
        as appropriate. Ideally you should include the name of the OCR engine and its
        version. The text should not contain line breaks. This is to help developers
        like yourself identify the software that produced this file.

        OCRmyPDF will always prepend its name to this value.
        """

    @abstractmethod
    def __str__(self) -> str:
        """Returns name of OCR engine and version.

        This is used when OCRmyPDF wants to mention the name of the OCR engine
        to the user, usually in an error message.
        """

    @staticmethod
    @abstractmethod
    def languages(options: OcrOptions) -> Set[str]:
        """Returns the set of all languages that are supported by the engine.

        Languages are typically given as 3-letter ISO 639-2 language codes, but
        actually can be any value understood by the OCR engine.
        """

    @staticmethod
    @abstractmethod
    def get_orientation(input_file: Path, options: OcrOptions) -> OrientationConfidence:
        """Returns the orientation of the image.

        Args:
            input_file: A page image whose orientation should be determined.
            options: The command line options.

        Returns:
            The suggested rotation angle and the engine's confidence in it.
        """

    @staticmethod
    def get_deskew(input_file: Path, options: OcrOptions) -> float:
        """Returns the deskew angle of the image, in degrees.

        The default implementation reports no skew (``0.0``); engines that can
        measure skew should override it.
        """
        return 0.0

    @staticmethod
    @abstractmethod
    def generate_hocr(
        input_file: Path, output_hocr: Path, output_text: Path, options: OcrOptions
    ) -> None:
        """Called to produce a hOCR file from a page image and sidecar text file.

        A hOCR file is an HTML-like file that describes the position of text on a
        page. OCRmyPDF can create a text only PDF from the hOCR file and graft it
        onto the output PDF.

        This function executes in a worker thread or worker process. OCRmyPDF
        automatically parallelizes OCR over pages. The OCR engine should not
        introduce more parallelism.

        Args:
            input_file: A page image on which to perform OCR.
            output_hocr: The expected name of the output hOCR file.
            output_text: The expected name of a text file containing the
                recognized text.
            options: The command line options.
        """

    @staticmethod
    @abstractmethod
    def generate_pdf(
        input_file: Path, output_pdf: Path, output_text: Path, options: OcrOptions
    ) -> None:
        """Called to produce a text only PDF from a page image.

        A text only PDF should contain no visible material of any kind, as it
        will be grafted onto the input PDF page. It must be sized to the
        exact dimensions of the input image.

        This function executes in a worker thread or worker process. OCRmyPDF
        automatically parallelizes OCR over pages. The OCR engine should not
        introduce more parallelism.

        Args:
            input_file: A page image on which to perform OCR.
            output_pdf: The expected name of the output PDF.
            output_text: The expected name of a text file containing the
                recognized text.
            options: The command line options.
        """

    @staticmethod
    def supports_generate_ocr() -> bool:
        """Return True if this engine supports the generate_ocr() API.

        The pipeline uses this to determine whether to call generate_ocr()
        or fall back to generate_hocr().

        Returns:
            False by default. Engines implementing generate_ocr() should
            override this to return True.
        """
        return False

    @staticmethod
    def generate_ocr(
        input_file: Path,
        options: OcrOptions,
        page_number: int = 0,
    ) -> tuple[OcrElement, str]:
        """Generate OCR results as an OcrElement tree.

        This is the modern API for OCR engines. Engines implementing this method
        can return structured OCR results directly without intermediate file formats.

        This function executes in a worker thread or worker process. OCRmyPDF
        automatically parallelizes OCR over pages. The OCR engine should not
        introduce more parallelism.

        Args:
            input_file: A page image on which to perform OCR.
            options: The command line options.
            page_number: Zero-indexed page number (for multi-page context).

        Returns:
            A tuple of (OcrElement tree for the page, plain text content).
            The OcrElement should have ocr_class=OcrClass.PAGE as its root.

        Raises:
            NotImplementedError: Always, in this default implementation.

        Note:
            This method is optional. Engines that don't implement it should
            leave the default implementation, and the pipeline will fall back to
            generate_hocr() or generate_pdf().
        """
        raise NotImplementedError("This OcrEngine does not implement generate_ocr()")


@hookspec(firstresult=True)
def get_ocr_engine(options: OcrOptions | None) -> OcrEngine:  # type: ignore[return-value]
    """Returns an OcrEngine to use for processing this file.

    The OcrEngine may be instantiated multiple times, by both the main process
    and child processes.

    When multiple OCR engine plugins are installed, plugins should check
    ``options.ocr_engine`` and return ``None`` if they are not the selected
    engine. The hook caller will then try the next plugin.

    Args:
        options: The current OcrOptions, used to determine which engine
            to select. May be None for backward compatibility with external
            plugins.

    Returns:
        An OcrEngine, or ``None`` if this plugin is not the selected engine.

    Note:
        This is a :ref:`firstresult hook<firstresult>`.
    """


@hookspec(firstresult=True)
def generate_pdfa(
    pdf_pages: list[Path],
    pdfmark: Path,
    output_file: Path,
    context: PdfContext,
    pdf_version: str,
    pdfa_part: str,
    progressbar_class: type[ProgressBar] | None,
    stop_on_soft_error: bool,
) -> Path:  # type: ignore[return-value]
    """Generate a PDF/A.

    This API strongly assumes a PDF/A generator with Ghostscript's semantics.

    OCRmyPDF will modify the metadata and possibly linearize the PDF/A after it
    is generated.

    Arguments:
        pdf_pages: A list of one or more filenames, will be merged into output_file.
        pdfmark: A PostScript file intended for Ghostscript with details on
            how to perform the PDF/A conversion.
        output_file: The name of the desired output file.
        context: The current context.
        pdf_version: The minimum PDF version that the output file should be.
            At its own discretion, the PDF/A generator may raise the version,
            but should not lower it.
        pdfa_part: The desired PDF/A compliance level, such as ``'2b'``.
        progressbar_class: The class of a progress bar, which must implement
            the ProgressBar protocol. If None, no progress is reported.
        stop_on_soft_error: If ``True`` and there is a "soft error" such that
            PDF/A generation can proceed and produce a valid PDF/A, but output
            may be invalid or may not visually resemble the original, the
            implementer of this hook should raise a detailed exception. If
            ``False``, continue processing and report by logging it. If the
            hook cannot proceed, it should always raise an exception,
            regardless of this setting.

    Returns:
        Path: If successful, the hook should return ``output_file``.

    Note:
        This is a :ref:`firstresult hook<firstresult>`.

    Note:
        Before version 15.0.0, the ``context`` was not provided and ``compression``
        was provided instead. Plugins should now read the context object to determine
        if compression is requested.
    """


@hookspec(firstresult=True)
def optimize_pdf(
    input_pdf: Path,
    output_pdf: Path,
    context: PdfContext,
    executor: Executor,
    linearize: bool,
) -> tuple[Path, Sequence[str]]:  # type: ignore[return-value]
    """Optimize a PDF after image, OCR and metadata processing.

    If the input_pdf is a PDF/A, the plugin should modify input_pdf in a way
    that preserves the PDF/A status, or report to the user when this is not possible.

    If the implementation fails to produce a smaller file than the input file, it
    should return input_pdf instead.

    A plugin that implements a new optimizer may need to suppress the built-in
    optimizer by implementing an ``initialize`` hook.

    Arguments:
        input_pdf: The input PDF, which has OCR added.
        output_pdf: The requested filename of the output PDF which should be created
            by this optimization hook.
        context: The current context.
        executor: An initialized executor which may be used during optimization,
            to distribute optimization tasks.
        linearize: If True, OCRmyPDF requires ``optimize_pdf`` to return a linearized,
            also known as fast web view PDF.

    Returns:
        Path: If optimization is successful, the hook should return ``output_pdf``.
            If optimization does not produce a smaller file, the hook should return
            ``input_pdf``.
        Sequence[str]: Any comments that the plugin wishes to report to the user,
            especially reasons it was not able to further optimize the file. For
            example, the plugin could report that a required third party was not
            installed, so a specific optimization was not attempted.

    Note:
        This is a :ref:`firstresult hook<firstresult>`.
    """


@hookspec(firstresult=True)
def is_optimization_enabled(context: PdfContext) -> bool:  # type: ignore[return-value]
    """For a given PdfContext, OCRmyPDF asks the plugin if optimization is enabled.

    An optimization plugin might be installed and active but could be disabled by
    user settings.

    If this returns False, OCRmyPDF will take certain actions to finalize the PDF.

    Args:
        context: The current context.

    Returns:
        True if the plugin's optimization is enabled.

    Note:
        This is a :ref:`firstresult hook<firstresult>`.
    """


================================================
FILE: src/ocrmypdf/py.typed
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
# ocrmypdf is typed


================================================
FILE: src/ocrmypdf/quality.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Utilities to measure OCR quality."""

from __future__ import annotations

import re
from collections.abc import Iterable


class OcrQualityDictionary:
    """Holds a word list used for a rough estimate of OCR quality."""

    def __init__(self, *, wordlist: Iterable[str]):
        """Build the lookup set from an iterable of words.

        Entries whose capitalization matters should be stored capitalized.
        Entries containing spaces or punctuation can never be matched, because
        the OCR text is split on those characters before lookup.
        """
        self.dictionary = set(wordlist)

    def measure_words_matched(self, ocr_text: str) -> float:
        """Return the fraction of unique OCR words found in the dictionary.

        Only unique words of three or more characters are considered. A word
        that contains uppercase letters matches either exactly as written or
        in its lowercased form; an all-lowercase word must appear in the
        dictionary exactly as written.

        Returns:
            number of unique words that match / number of unique words
            considered, or 0.0 when nothing matches.
        """
        # Digits and underscores count as \w, so blank them out before the
        # generic non-word pass.
        without_digits = re.sub(r"[0-9_]+", ' ', ocr_text)
        normalized = re.sub(r'\W+', ' ', without_digits)
        candidates = {
            token for token in re.split(r'\s+', normalized) if len(token) >= 3
        }

        def is_known(word: str) -> bool:
            if word in self.dictionary:
                return True
            lowered = word.lower()
            return lowered != word and lowered in self.dictionary

        matched = sum(1 for word in candidates if is_known(word))
        if matched > 0:
            return matched / len(candidates)
        return 0.0


================================================
FILE: src/ocrmypdf/subprocess/__init__.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Wrappers to manage subprocess calls."""

from __future__ import annotations

import logging
import os
import re
import sys
from collections.abc import Callable, Mapping, Sequence
from contextlib import suppress
from pathlib import Path
from subprocess import PIPE, STDOUT, CalledProcessError, CompletedProcess, Popen
from subprocess import run as subprocess_run

from packaging.version import Version

from ocrmypdf.exceptions import MissingDependencyError

# pylint: disable=logging-format-interpolation

log = logging.getLogger(__name__)

# A subprocess command line: the program followed by its arguments.
Args = Sequence[Path | str]
# Environment variables for a subprocess; ``os.environ`` itself is accepted.
Environ = Mapping[str, str] | os._Environ  # pylint: disable=protected-access


def run(
    args: Args,
    *,
    env: Environ | None = None,
    logs_errors_to_stdout: bool = False,
    check: bool = False,
    **kwargs,
) -> CompletedProcess:
    """Run a subprocess via :py:func:`subprocess.run` and log its error output.

    Compared with calling ``subprocess.run`` directly, this wrapper logs the
    subprocess's error stream through a logger named after the program, so
    that output is attributed to the responsible subprocess. On Windows it
    also searches additional locations for our dependencies when they are not
    on the system PATH.

    Arguments should be identical to ``subprocess.run``, except for following:

    Args:
        args: Positional arguments to pass to ``subprocess.run``.
        env: A set of environment variables. If None, the OS environment is used.
        logs_errors_to_stdout: If True, indicates that the process writes its error
            messages to stdout rather than stderr, so stdout should be logged
            if there is an error. If False, stderr is logged. Could be used with
            stderr=STDOUT, stdout=PIPE for example.
        check: If True, raise an exception if the process exits with a non-zero
            status code. If False, the return value will indicate success or failure.
        kwargs: Additional arguments to pass to ``subprocess.run``.

    Returns:
        The ``CompletedProcess`` produced by ``subprocess.run``.
    """
    args, env, process_log, _ = _fix_process_args(args, env, kwargs)

    # Decide once which captured stream carries the error messages and how
    # it is labelled in the debug log.
    if logs_errors_to_stdout:
        stream_attr, log_format = 'stdout', "stdout/stderr = %s"
    else:
        stream_attr, log_format = 'stderr', "stderr = %s"

    captured = None
    try:
        proc = subprocess_run(args, env=env, check=check, **kwargs)
        captured = getattr(proc, stream_attr, None)
    except CalledProcessError as exc:
        # Keep the failing process's output for the log, then propagate.
        captured = getattr(exc, stream_attr, None)
        raise
    finally:
        if process_log.isEnabledFor(logging.DEBUG) and captured:
            # Bytes are decoded for logging; str has no decode() and is
            # logged unchanged.
            with suppress(AttributeError, UnicodeDecodeError):
                captured = captured.decode('utf-8', 'replace')
            process_log.debug(log_format, captured)
    return proc


def run_polling_stderr(
    args: Args,
    *,
    callback: Callable[[str], None],
    check: bool = False,
    env: Environ | None = None,
    **kwargs,
) -> CompletedProcess:
    """Run a process like ``ocrmypdf.subprocess.run``, and poll stderr.

    Every line produced on stderr will be forwarded to the callback function.
    The intended use is monitoring progress of subprocesses that output their
    own progress indicators. In addition, each line will be logged if debug
    logging is enabled.

    Requires stderr to be opened in text mode for ease of handling errors. In
    addition the expected encoding= and errors= arguments should be set. Note
    that if stdout is already set up, it need not be binary.

    Args:
        args: The command line to execute.
        callback: Called with each line read from stderr, including its
            line ending.
        check: If True, raise ``CalledProcessError`` when the process exits
            with a non-zero status code.
        env: A set of environment variables. If None, the OS environment is used.
        kwargs: Additional arguments to pass to ``subprocess.Popen``. Must
            include ``text=True``.

    Returns:
        A ``CompletedProcess`` whose ``stderr`` holds all lines that were read
        and whose ``stdout`` is None.

    Raises:
        CalledProcessError: If ``check`` is True and the exit status is non-zero.
    """
    args, env, process_log, text = _fix_process_args(args, env, kwargs)
    assert text, "Must use text=True"

    with Popen(args, env=env, **kwargs) as proc:
        lines = []
        # Drain stderr until EOF regardless of whether the child has already
        # exited. Gating the read on proc.poll() would drop all output from a
        # child that exits before the first poll, and would busy-spin whenever
        # stderr is not piped or has already reached EOF.
        if proc.stderr is not None:
            for msg in iter(proc.stderr.readline, ''):
                if process_log.isEnabledFor(logging.DEBUG):
                    process_log.debug(msg.strip())
                callback(msg)
                lines.append(msg)
        # EOF on stderr does not imply the child has exited; block until it
        # does so that returncode is populated.
        proc.wait()
        stderr = ''.join(lines)

        if check and proc.returncode != 0:
            raise CalledProcessError(proc.returncode, args, output=None, stderr=stderr)
        return CompletedProcess(args, proc.returncode, None, stderr=stderr)


def _fix_process_args(
    args: Args, env: Environ | None, kwargs
) -> tuple[Args, Environ, logging.Logger, bool]:
    """Normalize subprocess arguments shared by the ``run*`` wrappers.

    Args:
        args: The command line; ``args[0]`` is the program.
        env: Environment for the subprocess. A falsy value selects
            ``os.environ``.
        kwargs: Keyword arguments destined for the subprocess call; only
            ``text`` is inspected.

    Returns:
        A tuple of (possibly adjusted args, effective environment, a child
        logger named after the program, whether text mode was requested).
    """
    effective_env = env if env else os.environ
    program = str(args[0])

    if sys.platform == 'win32':
        # On Windows, search additional locations for the program.
        # pylint: disable=import-outside-toplevel
        from ocrmypdf.subprocess._windows import fix_windows_args

        args = fix_windows_args(program, args, effective_env)

    log.debug("Running: %s", args)
    child_logger = log.getChild(os.path.basename(program))
    wants_text = bool(kwargs.get('text', False))
    return args, effective_env, child_logger, wants_text


def get_version(
    program: str,
    *,
    version_arg: str = '--version',
    regex=r'(\d+(\.\d+)*)',
    env: Environ | None = None,
) -> str:
    """Run a program and extract its version string from the output.

    Arguments:
        program: The program to version check.
        version_arg: The argument needed to ask for its version, e.g. ``--version``.
        regex: A regular expression applied to the start of the program's
            stripped output; its first group is the version.
        env: Custom ``os.environ`` in which to run program.

    Returns:
        The text captured by the first group of ``regex``.

    Raises:
        MissingDependencyError: If the program cannot be found, exits with an
            error, or its output does not match ``regex``.
    """
    command = [program, version_arg]
    try:
        # stderr is merged into stdout so the version is found whichever
        # stream the program reports it on.
        completed = run(
            command,
            close_fds=True,
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            check=True,
            env=env,
        )
        output: str = completed.stdout
    except FileNotFoundError as e:
        raise MissingDependencyError(
            f"Could not find program '{program}' on the PATH"
        ) from e
    except CalledProcessError as e:
        if e.returncode == 0:
            raise MissingDependencyError(
                f"Could not find program '{program}' on the PATH"
            ) from e
        log.exception(e)
        raise MissingDependencyError(
            f"Ran program '{program}' but it exited with an error:\n{e.output}"
        ) from e

    found = re.match(regex, output.strip())
    if found:
        return found.group(1)
    raise MissingDependencyError(
        f"The program '{program}' did not report its version. "
        f"Message was:\n{output}"
    )


# User-facing message templates, filled in with ``str.format``.

# A program could not be run at all.
MISSING_PROGRAM = '''
The program '{program}' could not be executed or was not found on your
system PATH.
'''

# A program needed only for particular command line arguments is missing.
MISSING_OPTIONAL_PROGRAM = '''
The program '{program}' could not be executed or was not found on your
system PATH.  This program is required when you use the
{required_for} arguments.  You could try omitting these arguments, or install
the package.
'''

# A recommended (not required) program is missing; processing continues.
MISSING_RECOMMEND_PROGRAM = '''
The program '{program}' could not be executed or was not found on your
system PATH.  This program is recommended when using the {required_for} arguments,
but not required, so we will proceed.  For best results, install the program.
'''

# The installed program is older than the minimum version.
OLD_VERSION = '''
OCRmyPDF requires '{program}' {need_version} or higher.  Your system appears
to have {found_version}.  Please update this program.
'''

# The installed program is too old for particular command line arguments.
OLD_VERSION_REQUIRED_FOR = '''
OCRmyPDF requires '{program}' {need_version} or higher when run with the
{required_for} arguments.  {program} {found_version} is installed.

If you omit these arguments, OCRmyPDF may be able to
proceed.  For best results, update the program.
'''

# Per-platform installation hints, keyed on ``{package}``.
OSX_INSTALL_ADVICE = '''
If you have homebrew installed, try these command to install the missing
package:
    brew install {package}
'''

LINUX_INSTALL_ADVICE = '''
On systems with the aptitude package manager (Debian, Ubuntu), try these
commands:
    sudo apt update
    sudo apt install {package}

On RPM-based systems (Red Hat, Fedora), try this command:
    sudo dnf install {package}
'''

WINDOWS_INSTALL_ADVICE = '''
If not already installed, install the Chocolatey package manager. Then use
a command prompt to install the missing package:
    choco install {package}
'''


def _get_platform() -> str:
    """Return a simplified platform name derived from ``sys.platform``.

    FreeBSD, Linux and Windows variants are collapsed to ``'freebsd'``,
    ``'linux'`` and ``'windows'``; any other value of ``sys.platform`` is
    returned unchanged.
    """
    known_prefixes = (
        ('freebsd', 'freebsd'),
        ('linux', 'linux'),
        ('win', 'windows'),
    )
    for prefix, canonical in known_prefixes:
        if sys.platform.startswith(prefix):
            return canonical
    return sys.platform


def _error_trailer(program: str, package: str | Mapping[str, str], **kwargs) -> None:
    """Log platform-specific advice on installing a missing package.

    Args:
        program: The program name, used as the package name when ``package``
            is a mapping with no entry for the current platform.
        package: The package name, or a mapping from platform name to
            package name.
        kwargs: Ignored; accepted so callers can forward extra keyword
            arguments.
    """
    del kwargs
    platform_name = _get_platform()
    if isinstance(package, Mapping):
        package = package.get(platform_name, program)

    advice_by_platform = {
        'darwin': OSX_INSTALL_ADVICE,
        'linux': LINUX_INSTALL_ADVICE,
        'windows': WINDOWS_INSTALL_ADVICE,
    }
    template = advice_by_platform.get(platform_name)
    if template is not None:
        log.info(template.format(program=program, package=package))


def _error_missing_program(
    program: str, package: str, required_for: str | None, recommended: bool
) -> None:
    """Log that a program is missing, followed by installation advice.

    A merely recommended program is reported as a warning; otherwise an
    error is logged, worded according to whether ``required_for`` is given.
    """
    fields = {
        'program': program,
        'package': package,
        'required_for': required_for,
        'recommended': recommended,
    }
    if recommended:
        log.warning(MISSING_RECOMMEND_PROGRAM.format(**fields))
    elif required_for:
        log.error(MISSING_OPTIONAL_PROGRAM.format(**fields))
    else:
        log.error(MISSING_PROGRAM.format(**fields))
    _error_trailer(**fields)


def _error_old_version(
    program: str,
    package: str,
    need_version: str,
    found_version: str,
    required_for: str | None,
) -> None:
    """Log that a program is too old, followed by installation advice.

    The wording depends on whether the version requirement applies only to
    the arguments named by ``required_for``.
    """
    fields = {
        'program': program,
        'package': package,
        'need_version': need_version,
        'found_version': found_version,
        'required_for': required_for,
    }
    template = OLD_VERSION_REQUIRED_FOR if required_for else OLD_VERSION
    log.error(template.format(**fields))
    _error_trailer(**fields)


def check_external_program(
    *,
    program: str,
    package: str,
    version_checker: Callable[[], Version],
    need_version: str | Version,
    required_for: str | None = None,
    recommended: bool = False,
    version_parser: type[Version] = Version,
) -> None:
    """Verify that an external program is present and recent enough.

    Args:
        program: The name of the program to test.
        package: The name of a software package that typically supplies this program.
            Usually the same as program.
        version_checker: A callable without arguments that retrieves the installed
            version of program.
        need_version: The minimum required version.
        required_for: The name of an argument or feature that requires this program.
        recommended: If this external program is recommended, instead of raising
            an exception, log a warning and allow execution to continue.
        version_parser: A class that should be used to parse and compare version
            numbers. Used when version numbers do not follow standard conventions.

    Raises:
        MissingDependencyError: If the program is missing or too old and
            ``recommended`` is False.
    """
    minimum = (
        need_version
        if isinstance(need_version, Version)
        else version_parser(need_version)
    )

    try:
        found_version = version_checker()
    except (CalledProcessError, FileNotFoundError) as e:
        _error_missing_program(program, package, required_for, recommended)
        if recommended:
            return
        raise MissingDependencyError(program) from e
    except MissingDependencyError:
        _error_missing_program(program, package, required_for, recommended)
        if recommended:
            return
        raise

    if found_version and found_version < minimum:
        _error_old_version(
            program, package, str(minimum), str(found_version), required_for
        )
        if not recommended:
            raise MissingDependencyError(program)

    log.debug('Found %s %s', program, found_version)


================================================
FILE: src/ocrmypdf/subprocess/_windows.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Find Tesseract and Ghostscript binaries on Windows using the registry."""

from __future__ import annotations

import logging
import os
import re
import shutil
import sys
from collections.abc import Callable, Iterable, Iterator
from itertools import chain
from pathlib import Path
from typing import Any, TypeAlias, TypeVar

from packaging.version import InvalidVersion, Version

if sys.platform == 'win32':
    # mypy understands 'if sys.platform' better than try/except ModuleNotFoundError
    import winreg  # pylint: disable=import-error

    HKEYType: TypeAlias = winreg.HKEYType
else:
    # winreg only exists on Windows. Elsewhere, substitute a Mock limited to
    # the attributes this module uses so the module can still be imported.
    from unittest.mock import Mock

    winreg = Mock(
        spec=['HKEYType', 'EnumKey', 'EnumValue', 'HKEY_LOCAL_MACHINE', 'OpenKey']
    )
    # mypy does not understand winreg.HKEYType where winreg is a Mock (fair enough!)
    HKEYType: TypeAlias = Any  # type: ignore


log = logging.getLogger(__name__)

T = TypeVar('T')
Tkey = TypeVar('Tkey')


def ghostscript_version_key(s: str) -> tuple[int, int, int]:
    """Convert a Ghostscript version string into a sortable 3-tuple.

    Missing components are treated as zero, so ``'9.24'`` becomes
    ``(9, 24, 0)``. A string with any non-integer component yields
    ``(0, 0, 0)``.
    """
    try:
        numbers = list(map(int, s.split('.', maxsplit=3)))
    except ValueError:
        return (0, 0, 0)
    # Pad to at least three components, then keep the first three.
    major, minor, patch = (numbers + [0, 0, 0])[:3]
    return (major, minor, patch)


def registry_enum(key: HKEYType, enum_fn: Callable[[HKEYType, int], T]) -> Iterator[T]:
    """Yield ``enum_fn(key, n)`` for n = 0, 1, 2, ... until ``OSError``.

    Args:
        key: The key handed to ``enum_fn`` on every call.
        enum_fn: An indexed enumeration function that raises ``OSError`` when
            the index is past the last item.

    Raises:
        ValueError: If 999 items were produced without reaching the end.
    """
    limit = 999
    for index in range(limit):
        try:
            yield enum_fn(key, index)
        except OSError:
            # OSError marks the end of the enumeration.
            return
    raise ValueError(f"Too many registry keys under {key}")


def registry_subkeys(key: HKEYType) -> Iterator[str]:
    """Iterate over the subkeys of an open registry key using ``winreg.EnumKey``."""
    return registry_enum(key, winreg.EnumKey)


def registry_values(key: HKEYType) -> Iterator[tuple[str, Any, int]]:
    """Iterate over the values of an open registry key using ``winreg.EnumValue``."""
    return registry_enum(key, winreg.EnumValue)


def registry_path_ghostscript(env=None) -> Iterator[Path]:
    """Yield candidate Ghostscript ``bin`` directories found via the registry.

    The subkey of ``SOFTWARE\\Artifex\\GPL Ghostscript`` with the highest
    version is selected, and each value stored under it is yielded with
    ``bin`` appended. Registry access errors are logged as warnings and end
    the iteration.

    Args:
        env: Unused; accepted so all path providers share one signature.
    """
    del env  # unused (but needed for protocol)
    try:
        with winreg.OpenKey(
            winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Artifex\GPL Ghostscript"
        ) as k:
            # default=None rather than a version tuple: the result is
            # interpolated into a registry path below, so a tuple default
            # would produce a nonsensical key name.
            latest_gs = max(
                registry_subkeys(k), key=ghostscript_version_key, default=None
            )
        if latest_gs is None:
            log.debug("No Ghostscript versions are listed in the registry")
            return
        with winreg.OpenKey(
            winreg.HKEY_LOCAL_MACHINE, rf"SOFTWARE\Artifex\GPL Ghostscript\{latest_gs}"
        ) as k:
            for _, gs_path, _ in registry_values(k):
                yield Path(gs_path) / 'bin'
    except OSError as e:
        log.warning(e)


def registry_path_tesseract(env=None) -> Iterator[Path]:
    """Yield the Tesseract install directory recorded in the registry.

    Every ``InstallDir`` value under ``SOFTWARE\\Tesseract-OCR`` is yielded as
    a ``Path``. Registry access errors are logged as warnings and end the
    iteration.

    Args:
        env: Unused; accepted so all path providers share one signature.
    """
    del env  # unused (but needed for protocol)
    try:
        with winreg.OpenKey(
            winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Tesseract-OCR"
        ) as key:
            for name, data, _kind in registry_values(key):
                if name != 'InstallDir':
                    continue
                yield Path(data)
    except OSError as e:
        log.warning(e)


def _gs_version_in_path_key(path: Path) -> tuple[str, Version | None]:
    """Sort key for candidate Ghostscript and Tesseract directories.

    Ghostscript installs on Windows:
        %PROGRAMFILES%/gs/gs9.56.1/bin -> ('gs', Version('9.56.1'))
        %PROGRAMFILES%/gs/9.24/bin -> ('gs', Version('9.24'))

    Tesseract looks like:
        %PROGRAMFILES%/Tesseract-OCR -> ('Tesseract-OCR', None)

    Comparing parsed versions rather than strings orders the alternatives
    correctly, e.g. gs10.0 > gs9.99. A path with no parseable Ghostscript
    version falls back to ``(path.name, None)``.
    """
    found = re.search(r'gs[/\\]?([0-9.]+)[/\\]bin', str(path))
    if found is not None:
        try:
            return 'gs', Version(found.group(1))
        except InvalidVersion:
            # The digits-and-dots run was not a valid version (e.g. '..').
            pass
    return path.name, None


def program_files_paths(env=None) -> Iterator[Path]:
    """Find Tesseract and Ghostscript directories under ``%PROGRAMFILES%``.

    Args:
        env: Environment mapping to read ``PROGRAMFILES`` from. A falsy value
            selects ``os.environ``.

    Returns:
        An iterator over a ``Tesseract-OCR`` directory and any ``bin``
        directories below ``gs``, sorted in descending order of
        ``_gs_version_in_path_key``. The iterator is empty when
        ``PROGRAMFILES`` is unset or empty, or when the directory cannot be
        read.
    """
    if not env:
        env = os.environ
    program_files = env.get('PROGRAMFILES', '')
    if not program_files:
        # Path('') is the current working directory; never search it for
        # executables just because PROGRAMFILES is not defined.
        return iter(())

    def path_walker() -> Iterator[Path]:
        try:
            for path in Path(program_files).iterdir():
                if not path.is_dir():
                    continue
                if path.name.lower() == 'tesseract-ocr':
                    yield path
                elif path.name.lower() == 'gs':
                    yield from (p for p in path.glob('**/bin') if p.is_dir())
        except OSError as e:
            # Best effort, like the registry based lookups: a missing or
            # unreadable directory simply contributes no candidates.
            log.warning(e)

    return iter(
        sorted(
            path_walker(),
            key=_gs_version_in_path_key,
            reverse=True,
        )
    )


def paths_from_env(env=None) -> Iterator[Path]:
    """Iterate over the non-empty executable search path entries as ``Path``.

    Args:
        env: Environment mapping passed to ``os.get_exec_path``.
    """
    search_dirs = os.get_exec_path(env)
    return map(Path, filter(None, search_dirs))


def shim_path(new_paths: Callable[[Any], Iterator[Path]], env=None) -> str:
    """Build a PATH-style string from the paths produced by ``new_paths``.

    Args:
        new_paths: A path provider called with the effective environment.
        env: Environment mapping; a falsy value selects ``os.environ``.

    Returns:
        The truthy results of ``new_paths`` joined with ``os.pathsep``.
    """
    effective_env = env if env else os.environ
    entries = [str(entry) for entry in new_paths(effective_env) if entry]
    return os.pathsep.join(entries)


# Search path providers. Each takes an environment mapping and returns an
# iterator of directories.
SHIMS = [
    paths_from_env,
    registry_path_ghostscript,
    registry_path_tesseract,
    program_files_paths,
]


def fix_windows_args(program: str, args, env):
    """Adjust our desired program and command line arguments for use on Windows.

    Args:
        program: The program name, i.e. ``str(args[0])``.
        args: The command line. Any sequence is accepted; it is not modified.
        env: Environment mapping used by the path providers in ``SHIMS``.

    Returns:
        A new list of arguments. ``sys.executable`` is prepended when the
        program is a ``.py`` file, and the first element is replaced by the
        full path of the executable if one of the ``SHIMS`` locates it.
    """
    # Copy so that tuples (or any Sequence) are accepted and the caller's
    # argument list is never mutated as a side effect.
    args = list(args)

    # If we are running a .py on Windows, ensure we call it with this Python
    # (to support test suite shims)
    if program.lower().endswith('.py'):
        args.insert(0, sys.executable)

    # If the program we want is not on the PATH, check elsewhere
    for shim in SHIMS:
        resolved = shutil.which(args[0], path=shim_path(shim, env))
        if resolved:
            args[0] = resolved
            break

    return args


def unique_everseen(
    iterable: Iterable[T], key: Callable[[T], Tkey] | None = None
) -> Iterator[T]:
    """List unique elements, preserving order.

    Args:
        iterable: The elements to filter.
        key: Optional function computing the value used to decide whether an
            element has been seen before. If None, elements are compared
            directly and must be hashable.

    Yields:
        Each element whose key has not been seen earlier in ``iterable``.
    """
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    # unique_everseen('ABBCcAD', str.lower) --> A B C D
    seen: set = set()
    for element in iterable:
        marker = element if key is None else key(element)
        if marker not in seen:
            seen.add(marker)
            yield element


def _casefold_path(path: Path) -> str:
    """Return the case-folded string form of ``path`` for caseless comparison."""
    return str(path).casefold()


def shim_env_path(env=None):
    """Build a PATH-style string from every provider in ``SHIMS``.

    Paths are emitted in provider order; later paths that differ from an
    earlier one only by letter case are dropped.

    Args:
        env: Environment mapping passed to each provider. ``None`` selects
            ``os.environ``.
    """
    environment = os.environ if env is None else env
    candidates = chain.from_iterable(shim(environment) for shim in SHIMS)
    deduplicated = unique_everseen(candidates, key=_casefold_path)
    return os.pathsep.join(map(str, deduplicated))


================================================
FILE: tests/__init__.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Tests."""

from __future__ import annotations


================================================
FILE: tests/cache/manifest.jsonl
================================================
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000003_rasterize.png__stdout", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000003_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000004_rasterize.png__stdout", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000004_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000005_rasterize.png__stdout", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000005_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000001_rasterize.png__stdout", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000001_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/2400dpi.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000006_rasterize.png__stdout", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000006_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__thresholding_method=1__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "-c", "thresholding_method=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__thresholding_method=2__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "-c", "thresholding_method=2", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/graph_ocred.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--oem__1__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "--oem", "1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000001_rasterize.png__stdout", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000001_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000001_rasterize.png__stdout", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000001_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000001_rasterize.png__stdout", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000001_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__2__000001_rasterize.png__stdout", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "--psm", "2", "$TMPDIR/000001_rasterize.png", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000001_ocr.png", "/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000003_ocr.png", "/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "/tmp/pytest-of-jb/pytest-73/popen-gw4/test_hocr_to_pdf_api0/000001_ocr.png", "/tmp/pytest-of-jb/pytest-73/popen-gw4/test_hocr_to_pdf_api0/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000002_ocr.png", "/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000002_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/2400dpi.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__thresholding_method=1__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "-c", "thresholding_method=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/graph_ocred.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__thresholding_method=2__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "-c", "thresholding_method=2", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__--oem__1__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/trivial.pdf", "args": ["-l", "eng", "--oem", "1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__osd__--psm__0__000001_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__osd__--psm__0__000002_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000002_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__osd__--psm__0__000003_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000003_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__osd__--psm__0__000004_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000004_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/poster.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__deu__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/francais.pdf", "args": ["-l", "deu", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "5.5.1", "system": "Linux", "python": "3.11.14", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}


================================================
FILE: tests/conftest.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import logging
import platform
import sys
from pathlib import Path
from subprocess import CompletedProcess, run

import pytest

from ocrmypdf import api, pdfinfo
from ocrmypdf._exec import unpaper
from ocrmypdf.api import setup_plugin_infrastructure
from ocrmypdf.cli import get_options_and_plugins
from ocrmypdf.exceptions import ExitCode


class Gs106WarningFilter(logging.Filter):
    """Logging filter that drops the expected Ghostscript 10.6.x warning."""

    def filter(self, record: logging.LogRecord) -> bool:
        """Return False for the Ghostscript 10.6.x warning, True otherwise."""
        message = record.getMessage()
        if "Ghostscript 10.6.x contains JPEG encoding errors" in message:
            return False
        return True


@pytest.fixture(autouse=True)
def suppress_gs106_warning():
    """Keep the expected Ghostscript 10.6.x JPEG warning out of test logs.

    A ``Gs106WarningFilter`` is attached to the root logger before the test
    body runs and detached again afterwards.
    """
    gs_filter = Gs106WarningFilter()
    root = logging.getLogger()
    root.addFilter(gs_filter)
    yield
    root.removeFilter(gs_filter)


def is_linux():
    """Report whether ``platform.system()`` identifies the host as Linux."""
    system_name = platform.system()
    return system_name == 'Linux'


def is_macos():
    """Report whether ``platform.system()`` identifies the host as macOS."""
    system_name = platform.system()
    return system_name == 'Darwin'


def have_unpaper():
    """Report whether ``unpaper.version()`` can be called without raising.

    Any ``Exception`` raised by the call is treated as "not available".
    """
    try:
        unpaper.version()
    except Exception:  # pylint: disable=broad-except
        available = False
    else:
        available = True
    return available


# Absolute, resolved path of the directory that contains this conftest.py.
TESTS_ROOT = Path(__file__).parent.resolve()
# NOTE(review): PROJECT_ROOT is bound to the tests directory itself, not to
# its parent — confirm this is intentional.
PROJECT_ROOT = TESTS_ROOT


@pytest.fixture(scope="session")
def resources() -> Path:
    """Session-scoped path of the ``resources`` directory under TESTS_ROOT."""
    tests_dir = Path(TESTS_ROOT)
    return tests_dir / 'resources'


@pytest.fixture
def ocrmypdf_exec() -> list[str]:
    """Command prefix that runs ocrmypdf as a module of this interpreter."""
    interpreter = sys.executable
    return [interpreter, '-m', 'ocrmypdf']


@pytest.fixture(scope="function")
def outdir(tmp_path) -> Path:
    """Per-test output directory; this is pytest's ``tmp_path`` unchanged."""
    output_directory = tmp_path
    return output_directory


@pytest.fixture(scope="function")
def outpdf(tmp_path) -> Path:
    """Path ``out.pdf`` inside the per-test ``tmp_path`` (file not created)."""
    return tmp_path.joinpath('out.pdf')


@pytest.fixture(scope="function")
def outtxt(tmp_path) -> Path:
    """Path ``out.txt`` inside the per-test ``tmp_path`` (file not created)."""
    return tmp_path.joinpath('out.txt')


@pytest.fixture(scope="function")
def no_outpdf(tmp_path) -> Path:
    """Output path for tests that are expected to produce no output.

    The fixture only documents that expectation; it performs no check. An
    assertion failing inside a fixture is reported as an error rather than a
    test failure, so each test must itself confirm that the file was never
    created.
    """
    return tmp_path.joinpath('no_output.pdf')


@pytest.fixture(scope="session")
def multipage(resources):
    """Session-scoped path of ``multipage.pdf`` in the resources directory."""
    return resources.joinpath('multipage.pdf')


def check_ocrmypdf(input_file: Path, output_file: Path, *args) -> Path:
    """Run the ocrmypdf pipeline in-process and assert a plausible PDF exists.

    Args:
        input_file: Input path, passed as the first positional argument.
        output_file: Output path, passed as the second positional argument.
        *args: Further command line arguments; ``None`` entries are dropped
            and the rest are converted with ``str``.

    Returns:
        ``output_file``, once the pipeline returned 0 and the file exists
        with a size above 100 bytes.
    """
    extra_args = [str(arg) for arg in args if arg is not None]
    api_args = [str(input_file), str(output_file), *extra_args]

    options, plugin_manager = get_options_and_plugins(args=api_args)
    api.check_options(options, plugin_manager)
    exit_status = api.run_pipeline(options, plugin_manager=plugin_manager)

    assert exit_status == 0
    assert output_file.exists(), "Output file not created"
    assert output_file.stat().st_size > 100, "PDF too small or empty"

    return output_file


def run_ocrmypdf_api(input_file: Path, output_file: Path, *args) -> ExitCode:
    """Run ocrmypdf in-process through its API and return a CLI-style code.

    This stands in for launching the command line interface in a subprocess,
    while staying in-process so debuggers and coverage tools can follow it.

    Exceptions are trapped and converted to an exit code, so the caller must
    inspect the returned value to decide whether the test passed.

    Args:
        input_file: Input path, passed as the first positional argument.
        output_file: Output path, passed as the second positional argument.
        *args: Further command line arguments; ``None`` entries are dropped
            and the rest are converted with ``str``.
    """
    extra_args = [str(arg) for arg in args if arg is not None]
    api_args = [str(input_file), str(output_file), *extra_args]

    options, plugin_manager = get_options_and_plugins(args=api_args)
    api.check_options(options, plugin_manager)
    exit_code = api.run_pipeline_cli(options, plugin_manager=plugin_manager)
    return exit_code


def run_ocrmypdf(
    input_file: Path, output_file: Path, *args, text: bool = True
) -> CompletedProcess:
    """Launch ocrmypdf in a child process and hand the result to the test.

    Failures inside ocrmypdf surface through the captured output and the
    return code of the result rather than as exception objects.

    Args:
        input_file: Input path, placed second to last on the command line.
        output_file: Output path, placed last on the command line.
        *args: Options placed before the two paths; ``None`` entries are
            dropped and the rest are converted with ``str``.
        text: Forwarded to ``subprocess.run`` as ``text``.

    Returns:
        The ``CompletedProcess`` with stdout and stderr captured.
    """
    command = [sys.executable, '-m', 'ocrmypdf']
    command.extend(str(arg) for arg in args if arg is not None)
    command.extend([str(input_file), str(output_file)])

    return run(command, capture_output=True, text=text, check=False)


def first_page_dimensions(pdf: Path):
    """Return ``(width_inches, height_inches)`` of the first page of *pdf*."""
    first_page = pdfinfo.PdfInfo(pdf)[0]
    return (first_page.width_inches, first_page.height_inches)


def pytest_addoption(parser):
    """Register the ``--runslow`` boolean flag (off by default) on *parser*."""
    help_text = (
        "run slow tests only useful for development (unlikely to be "
        "useful for downstream packagers)"
    )
    parser.addoption(
        "--runslow", action="store_true", default=False, help=help_text
    )


def pytest_collection_modifyitems(config, items):
    """Mark items tagged ``slow`` as skipped unless ``--runslow`` was given.

    Args:
        config: Object whose ``getoption("--runslow")`` tells whether slow
            tests were requested.
        items: Collected test items; those with ``"slow"`` in their
            ``keywords`` receive a skip marker.
    """
    run_slow = config.getoption("--runslow")
    if not run_slow:
        marker = pytest.mark.skip(reason="need --runslow option to run")
        slow_items = (item for item in items if "slow" in item.keywords)
        for item in slow_items:
            item.add_marker(marker)


def get_test_plugin_manager(plugins=None):
    """Return the result of ``setup_plugin_infrastructure`` for testing.

    Args:
        plugins: Plugins to pass along; any falsy value is replaced by an
            empty list.
    """
    plugin_list = plugins if plugins else []
    return setup_plugin_infrastructure(plugins=plugin_list)


================================================
FILE: tests/plugins/gs_feature_elision.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT

from __future__ import annotations

from unittest.mock import patch

from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins import ghostscript
from ocrmypdf.subprocess import run_polling_stderr

# Two-line message that run_append_stderr() appends to the stderr of the
# process result it returns.
ELISION_WARNING = """GPL Ghostscript 9.50: Setting Overprint Mode to 1
not permitted in PDF/A-2, overprint mode not set"""


def run_append_stderr(*args, **kwargs):
    """Call ``run_polling_stderr`` and append ELISION_WARNING to its stderr.

    All arguments are forwarded unchanged; the returned object is the one
    produced by ``run_polling_stderr`` with its ``stderr`` extended.
    """
    result = run_polling_stderr(*args, **kwargs)
    result.stderr += f'\n{ELISION_WARNING}\n'
    return result


@hookimpl
def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
    """Run the built-in PDF/A generation with the elision warning injected.

    While the built-in ``ghostscript.generate_pdfa`` runs,
    ``ocrmypdf._exec.ghostscript.run_polling_stderr`` is replaced by a mock
    that delegates to ``run_append_stderr``. The mock must have been called
    exactly once.

    Returns:
        ``output_file``.
    """
    target = 'ocrmypdf._exec.ghostscript.run_polling_stderr'
    with patch(target, side_effect=run_append_stderr) as mock:
        ghostscript.generate_pdfa(
            pdf_pages=pdf_pages,
            pdfmark=pdfmark,
            output_file=output_file,
            context=context,
            pdf_version=pdf_version,
            pdfa_part=pdfa_part,
            progressbar_class=None,
            stop_on_soft_error=True,
        )
        mock.assert_called_once()
    return output_file


================================================
FILE: tests/plugins/gs_pdfa_failure.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT

from __future__ import annotations

from unittest.mock import patch

from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins import ghostscript
from ocrmypdf.subprocess import run_polling_stderr


def run_rig_args(args, **kwargs):
    """Run the command with PDF/A switches and ``.ps`` arguments removed.

    Every argument that starts with ``-dPDFA`` or ends with ``.ps`` is
    dropped; the remaining arguments, in their original order, are passed to
    ``run_polling_stderr`` together with ``kwargs``. Stripping them is meant
    to make PDF/A creation fail.

    Returns:
        Whatever ``run_polling_stderr`` returns.
    """

    def _keep(arg):
        return not (arg.startswith('-dPDFA') or arg.endswith('.ps'))

    kept_args = list(filter(_keep, args))
    return run_polling_stderr(kept_args, **kwargs)


@hookimpl
def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
    """Run the built-in PDF/A generation with rigged process arguments.

    While the built-in ``ghostscript.generate_pdfa`` runs,
    ``ocrmypdf._exec.ghostscript.run_polling_stderr`` is replaced by a mock
    that delegates to ``run_rig_args``. The mock must have been called at
    least once.

    Returns:
        ``output_file``.
    """
    target = 'ocrmypdf._exec.ghostscript.run_polling_stderr'
    with patch(target, side_effect=run_rig_args) as mock:
        ghostscript.generate_pdfa(
            pdf_pages=pdf_pages,
            pdfmark=pdfmark,
            output_file=output_file,
            context=context,
            pdf_version=pdf_version,
            pdfa_part=pdfa_part,
            progressbar_class=None,
            stop_on_soft_error=True,
        )
        mock.assert_called()
        return output_file


================================================
FILE: tests/plugins/gs_raster_failure.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT

from __future__ import annotations

from pathlib import Path
from subprocess import CalledProcessError
from unittest.mock import patch

from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins import ghostscript


def raise_gs_fail(*args, **kwargs):
    """Ignore all arguments and raise a ``CalledProcessError`` for ``gs``.

    Raises:
        CalledProcessError: Always, with return code 1, empty output and a
            stderr marker naming this test plugin.
    """
    error = CalledProcessError(
        returncode=1,
        cmd='gs',
        output=b"",
        stderr=b"TEST ERROR: gs_raster_failure.py",
    )
    raise error


@hookimpl
def rasterize_pdf_page(
    input_file,
    output_file,
    raster_device,
    raster_dpi,
    pageno,
    page_dpi,
    rotation,
    filter_vector,
    stop_on_soft_error,
    options,
    use_cropbox,
) -> Path:
    """Run the built-in rasterizer with its process runner forced to fail.

    While the built-in ``ghostscript.rasterize_pdf_page`` runs,
    ``ocrmypdf._exec.ghostscript.run`` is replaced by a mock that delegates
    to ``raise_gs_fail``. All hook arguments are forwarded unchanged.

    Returns:
        ``output_file``, if the built-in call returns and the mock was
        called.
    """
    target = 'ocrmypdf._exec.ghostscript.run'
    with patch(target, side_effect=raise_gs_fail) as mock:
        ghostscript.rasterize_pdf_page(
            input_file=input_file,
            output_file=output_file,
            raster_device=raster_device,
            raster_dpi=raster_dpi,
            pageno=pageno,
            page_dpi=page_dpi,
            rotation=rotation,
            filter_vector=filter_vector,
            stop_on_soft_error=stop_on_soft_error,
            options=options,
            use_cropbox=use_cropbox,
        )
        mock.assert_called()
        return output_file


================================================
FILE: tests/plugins/gs_raster_soft_error.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT

from __future__ import annotations

from pathlib import Path
from subprocess import CalledProcessError
from unittest.mock import patch

from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins import ghostscript
from ocrmypdf.subprocess import run


def fail_if_stoponerror(args, **kwargs):
    """Fail only when Ghostscript is asked to stop on PDF errors.

    If ``-dPDFSTOPONERROR`` is among ``args``, raise ``CalledProcessError``
    (return code 1, command ``gs``). Otherwise forward ``args`` and
    ``kwargs`` to the real ``run`` and return its result.
    """
    if '-dPDFSTOPONERROR' not in args:
        return run(args, **kwargs)
    raise CalledProcessError(1, 'gs', output=b"", stderr=b"PDF STOP ON ERROR")


@hookimpl
def rasterize_pdf_page(
    input_file,
    output_file,
    raster_device,
    raster_dpi,
    pageno,
    page_dpi,
    rotation,
    filter_vector,
    stop_on_soft_error,
    options,
    use_cropbox,
) -> Path:
    """Rasterize through the built-in plugin with a soft-error-sensitive runner.

    ``ocrmypdf._exec.ghostscript.run`` is patched to ``fail_if_stoponerror``,
    which fails only when ``-dPDFSTOPONERROR`` is on the command line. All
    hook arguments are forwarded unchanged to the built-in
    ``rasterize_pdf_page``. If that call returns, the hook verifies the
    patched runner was used and returns ``output_file``.
    """
    with patch(
        'ocrmypdf._exec.ghostscript.run', side_effect=fail_if_stoponerror
    ) as gs_run:
        ghostscript.rasterize_pdf_page(
            input_file=input_file,
            output_file=output_file,
            raster_device=raster_device,
            raster_dpi=raster_dpi,
            pageno=pageno,
            page_dpi=page_dpi,
            rotation=rotation,
            filter_vector=filter_vector,
            stop_on_soft_error=stop_on_soft_error,
            options=options,
            use_cropbox=use_cropbox,
        )
        # Guard against the patch target silently going stale.
        gs_run.assert_called()
    return output_file


================================================
FILE: tests/plugins/gs_render_failure.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT

from __future__ import annotations

from subprocess import CalledProcessError
from unittest.mock import patch

from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins import ghostscript


def raise_gs_fail(*args, **kwargs):
    """Stand in for a Ghostscript invocation that always fails.

    All arguments are ignored; a ``CalledProcessError`` with return code 1 for
    the command ``gs`` is raised unconditionally.
    """
    failure = CalledProcessError(
        returncode=1,
        cmd='gs',
        output=b"",
        stderr=b"TEST ERROR: gs_render_failure.py",
    )
    raise failure


@hookimpl
def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
    """Generate PDF/A through the built-in plugin while Ghostscript always fails.

    ``ocrmypdf._exec.ghostscript.run_polling_stderr`` is patched so that every
    call raises via ``raise_gs_fail``. The built-in ``generate_pdfa`` is
    invoked with ``stop_on_soft_error=True`` and no progress bar class. If
    that call returns, the hook verifies the patched runner was used and
    returns ``output_file``.
    """
    with patch(
        'ocrmypdf._exec.ghostscript.run_polling_stderr', side_effect=raise_gs_fail
    ) as failing_runner:
        ghostscript.generate_pdfa(
            pdf_pages=pdf_pages,
            pdfmark=pdfmark,
            output_file=output_file,
            context=context,
            pdf_version=pdf_version,
            pdfa_part=pdfa_part,
            progressbar_class=None,
            stop_on_soft_error=True,
        )
        # Guard against the patch target silently going stale.
        failing_runner.assert_called()
    return output_file


================================================
FILE: tests/plugins/gs_render_soft_error.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT

from __future__ import annotations

from subprocess import CalledProcessError
from unittest.mock import patch

from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins import ghostscript
from ocrmypdf.subprocess import run_polling_stderr


def fail_if_stoponerror(args, **kwargs):
    """Fail only when Ghostscript is asked to stop on PDF errors.

    If ``-dPDFSTOPONERROR`` is among ``args``, raise ``CalledProcessError``
    (return code 1, command ``gs``). Otherwise forward ``args`` and
    ``kwargs`` to the real ``run_polling_stderr`` and return its result.
    """
    if '-dPDFSTOPONERROR' not in args:
        return run_polling_stderr(args, **kwargs)
    raise CalledProcessError(1, 'gs', output=b"", stderr=b"PDF STOP ON ERROR")


@hookimpl
def generate_pdfa(
    pdf_pages,
    pdfmark,
    output_file,
    context,
    pdf_version,
    pdfa_part,
    stop_on_soft_error,
):
    """Generate PDF/A through the built-in plugin with a soft-error-sensitive runner.

    ``ocrmypdf._exec.ghostscript.run_polling_stderr`` is patched to
    ``fail_if_stoponerror``, which fails only when ``-dPDFSTOPONERROR`` is on
    the command line. The caller's ``stop_on_soft_error`` is passed through to
    the built-in ``generate_pdfa``; no progress bar class is supplied. If that
    call returns, the hook verifies the patched runner was used and returns
    ``output_file``.
    """
    with patch(
        'ocrmypdf._exec.ghostscript.run_polling_stderr',
        side_effect=fail_if_stoponerror,
    ) as picky_runner:
        ghostscript.generate_pdfa(
            pdf_pages=pdf_pages,
            pdfmark=pdfmark,
            output_file=output_file,
            context=context,
            pdf_version=pdf_version,
            pdfa_part=pdfa_part,
            progressbar_class=None,
            stop_on_soft_error=stop_on_soft_error,
        )
        # Guard against the patch target silently going stale.
        picky_runner.assert_called()
    return output_file


================================================
FILE: tests/plugins/tesseract_badutf8.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT

"""Tesseract bad utf8.

In some cases, some versions of Tesseract can output binary gibberish or data
that is not UTF-8 compatible, so we are forced to check that we can convert it
and present it to the user.
"""

from __future__ import annotations

from contextlib import contextmanager
from subprocess import CalledProcessError
from unittest.mock import patch

from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine


def bad_utf8(*args, **kwargs):
    """Stand in for a Tesseract run that fails with non-UTF-8 output.

    All arguments are ignored; a ``CalledProcessError`` with return code 1 for
    the command ``tesseract`` is raised unconditionally.
    """
    # "Invalid UTF-8" in Shift JIS
    garbled_output = b'\x96\xb3\x8c\xf8\x82\xc8UTF-8\x0a'
    raise CalledProcessError(1, 'tesseract', output=garbled_output, stderr=b"")


@contextmanager
def patch_tesseract_run():
    """Patch ``ocrmypdf._exec.tesseract.run`` to fail with ``bad_utf8``.

    On a normal exit from the managed block, verify that the patched runner
    was called at least once.
    """
    with patch('ocrmypdf._exec.tesseract.run', side_effect=bad_utf8) as fake_run:
        yield
        fake_run.assert_called()


class BadUtf8OcrEngine(TesseractOcrEngine):
    """Tesseract engine whose OCR runs fail with output that is not UTF-8."""

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Run the stock hOCR generation with the failing runner patched in."""
        hocr_args = (input_file, output_hocr, output_text, options)
        with patch_tesseract_run():
            TesseractOcrEngine.generate_hocr(*hocr_args)

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Run the stock PDF generation with the failing runner patched in."""
        pdf_args = (input_file, output_pdf, output_text, options)
        with patch_tesseract_run():
            TesseractOcrEngine.generate_pdf(*pdf_args)


@hookimpl
def get_ocr_engine():
    """Provide a new ``BadUtf8OcrEngine`` as the OCR engine."""
    engine = BadUtf8OcrEngine()
    return engine


================================================
FILE: tests/plugins/tesseract_big_image_error.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
from __future__ import annotations

from contextlib import contextmanager
from subprocess import CalledProcessError
from unittest.mock import patch

from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine


def raise_size_exception(*args, **kwargs):
    """Stand in for a Tesseract run that rejects an oversized image.

    All arguments are ignored; a ``CalledProcessError`` with return code 1 for
    the command ``tesseract`` is raised unconditionally.
    """
    complaint = b"Image too large: (33830, 14959)\nError during processing."
    raise CalledProcessError(1, 'tesseract', output=complaint, stderr=b"")


@contextmanager
def patch_tesseract_run():
    """Patch ``ocrmypdf._exec.tesseract.run`` to fail with ``raise_size_exception``.

    On a normal exit from the managed block, verify that the patched runner
    was called at least once.
    """
    with patch(
        'ocrmypdf._exec.tesseract.run', side_effect=raise_size_exception
    ) as fake_run:
        yield
        fake_run.assert_called()


class BigImageErrorOcrEngine(TesseractOcrEngine):
    """Tesseract engine whose runs fail with an "Image too large" error."""

    @staticmethod
    def get_orientation(input_file, options):
        """Run the stock orientation check with the failing runner patched in."""
        with patch_tesseract_run():
            orientation = TesseractOcrEngine.get_orientation(input_file, options)
        return orientation

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Run the stock hOCR generation with the failing runner patched in."""
        hocr_args = (input_file, output_hocr, output_text, options)
        with patch_tesseract_run():
            TesseractOcrEngine.generate_hocr(*hocr_args)

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Run the stock PDF generation with the failing runner patched in."""
        pdf_args = (input_file, output_pdf, output_text, options)
        with patch_tesseract_run():
            TesseractOcrEngine.generate_pdf(*pdf_args)


@hookimpl
def get_ocr_engine():
    """Provide a new ``BigImageErrorOcrEngine`` as the OCR engine."""
    engine = BigImageErrorOcrEngine()
    return engine


================================================
FILE: tests/plugins/tesseract_cache.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Cache output of tesseract to speed up test suite.

The cache is keyed by the input test file.  The input arguments are slugged
into a hideous filename that more or less represents them literally.  Joined
together, this becomes the name of the cache folder.  A few named files like
stdout, stderr, hocr, pdf, describe the output to reproduce.

Changes to tests/resources/ or image processing algorithms don't trigger a
cache miss.  By design, an input image that varies according to platform
differences (e.g. JPEG decoders are allowed to produce differing outputs,
and in practice they do) will still be a cache hit.  By design, an
invocation of tesseract with the same parameters from a different test case
will be a hit.  It's fragile.

The tests/cache/manifest.jsonl is a JSON lines file that contains
information about the system that produced the results used when the cache
was generated.  This is mainly a log to answer questions about how the files
were produced.

Certain operations are not cached and routed to Tesseract OCR directly.

Assumes Tesseract 4+.

"""

from __future__ import annotations

import argparse
import json
import logging
import platform
import re
import shutil
import threading
from functools import partial
from pathlib import Path
from subprocess import PIPE, CalledProcessError, CompletedProcess
from unittest.mock import patch

from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine
from ocrmypdf.subprocess import run

log = logging.getLogger(__name__)

TESTS_ROOT = Path(__file__).resolve().parent.parent
CACHE_ROOT = TESTS_ROOT / 'cache'


def _build_parser() -> argparse.ArgumentParser:
    """Build the parser used to pick apart a Tesseract command line.

    Positional arguments are the image name, the output base and any number
    of config file names; the options are those registered below.
    """
    cli = argparse.ArgumentParser(
        description='cache output of tesseract',
        prog='tesseract-cache',
    )
    cli.add_argument('-l', '--language', action='append')

    # Positionals are consumed in registration order, so keep this sequence.
    cli.add_argument('imagename')
    cli.add_argument('outputbase')
    cli.add_argument('configfiles', nargs='*')

    cli.add_argument('--user-words', type=str)
    cli.add_argument('--user-patterns', type=str)
    cli.add_argument('-c', action='append')
    cli.add_argument('--psm', type=int)
    cli.add_argument('--oem', type=int)
    return cli


parser = _build_parser()


def get_cache_folder(source_pdf, run_args, parsed_args):
    """Return the cache folder for one Tesseract invocation.

    The folder lives under ``CACHE_ROOT``, inside a directory named after the
    stem of ``source_pdf``. Its own name is built from the arguments that
    follow the program name in ``run_args``: the image name and output base
    are reduced to their final path component, ``-c`` and arguments starting
    with ``textonly`` are left out, and everything else is used verbatim.
    """
    # Leading empty slug so the name never starts with a '-', which would
    # make the folder difficult to remove with rm.
    parts = ['']
    for arg in run_args[1:]:
        if arg == parsed_args.imagename:
            parts.append(Path(parsed_args.imagename).name)
        elif arg == parsed_args.outputbase:
            parts.append(Path(parsed_args.outputbase).name)
        elif arg != '-c' and not arg.startswith('textonly'):
            parts.append(arg)

    folder_name = '__'.join(parts).replace('/', '___')
    return Path(CACHE_ROOT) / Path(source_pdf).stem / folder_name


def cached_run(options, run_args, **run_kwargs):
    """Run Tesseract, replaying a previously cached result when one exists.

    Meant to stand in for ``ocrmypdf._exec.tesseract.run`` with ``options``
    pre-bound. Invocations that read the image from ``stdin``/``-`` bypass the
    cache and go straight to ``run``.

    Args:
        options: OCRmyPDF options. Only ``options.input_file`` is read, to
            select the cache folder and to label the manifest entry.
        run_args: Full Tesseract command line, program name first.
        **run_kwargs: Keyword arguments intended for ``run``. On a cache miss
            ``stdout``/``stderr`` are replaced with pipes, a ``timeout`` of
            ``0.0`` becomes ``None`` and ``check`` defaults to ``True``.

    Returns:
        On a cache hit, a ``CompletedProcess`` with return code 0 and the
        cached stdout/stderr; otherwise whatever ``run`` returns.

    Raises:
        CalledProcessError: Logged and re-raised when the real Tesseract
            invocation fails; nothing is cached in that case.
    """
    run_args = [str(arg) for arg in run_args]  # flatten PosixPaths
    args = parser.parse_args(run_args[1:])

    if args.imagename in ('stdin', '-'):
        return run(run_args, **run_kwargs)

    source_file = options.input_file
    cache_folder = get_cache_folder(source_file, run_args, args)
    cache_folder.mkdir(parents=True, exist_ok=True)

    log.debug("Using Tesseract cache %s", cache_folder)

    # Determine what configfiles we need
    configfiles = args.configfiles if args.configfiles else ['txt']

    # Check if cache has all required files
    def cache_complete():
        if not (cache_folder / 'stderr.bin').exists():
            return False
        if not (cache_folder / 'stdout.bin').exists():
            return False
        if args.outputbase != 'stdout':
            for configfile in configfiles:
                if not (cache_folder / f'{configfile}.bin').exists():
                    return False
        return True

    if cache_complete():
        log.debug("Cache HIT")

        # Replicate stdout/err
        if args.outputbase != 'stdout':
            for configfile in configfiles:
                # cp cache -> output
                tessfile = args.outputbase + '.' + configfile
                shutil.copy(str(cache_folder / configfile) + '.bin', tessfile)
        return CompletedProcess(
            args=run_args,
            returncode=0,
            stdout=(cache_folder / 'stdout.bin').read_bytes(),
            stderr=(cache_folder / 'stderr.bin').read_bytes(),
        )

    log.debug("Cache MISS")

    cache_kwargs = {
        k: v for k, v in run_kwargs.items() if k not in ('stdout', 'stderr')
    }
    # Don't pass timeout=0 to the actual run call - it would timeout immediately
    # A timeout of 0 means "use default/no timeout" in the caching context
    if cache_kwargs.get('timeout') == 0.0:
        cache_kwargs['timeout'] = None
    if 'check' not in cache_kwargs:
        cache_kwargs['check'] = True
    try:
        p = run(run_args, stdout=PIPE, stderr=PIPE, **cache_kwargs)
    except CalledProcessError as e:
        log.exception(e)
        raise  # Pass exception onward

    # Update cache
    (cache_folder / 'stdout.bin').write_bytes(p.stdout)
    (cache_folder / 'stderr.bin').write_bytes(p.stderr)

    if args.outputbase != 'stdout':
        for configfile in configfiles:
            # NOTE: only these outputs are stored. cache_complete() requires
            # a .bin file for every requested configfile, so an invocation
            # that names any other configfile is re-run each time.
            if configfile not in ('fpdf2', 'hocr', 'pdf', 'txt'):
                continue
            # cp pwd/{outputbase}.{configfile} -> {cache}/{configfile}
            tessfile = args.outputbase + '.' + configfile
            shutil.copy(tessfile, str(cache_folder / configfile) + '.bin')

    def clean_sys_argv():
        for arg in run_args[1:]:
            yield re.sub(r'.*/ocrmypdf[.]io[.][^/]+[/](.*)', r'$TMPDIR/\1', arg)

    try:
        sourcefile = str(Path(source_file).relative_to(TESTS_ROOT))
    except ValueError:
        # The input is not located under tests/ (for example a file created
        # in a temporary directory, or a relative path). Record it as given
        # instead of failing after the result has already been cached.
        sourcefile = str(source_file)

    manifest = {
        'tesseract_version': TesseractOcrEngine.version().replace('\n', ' '),
        'system': platform.system(),
        'python': platform.python_version(),
        'argv_slug': cache_folder.name,
        'sourcefile': sourcefile,
        'args': list(clean_sys_argv()),
    }

    with (Path(CACHE_ROOT) / 'manifest.jsonl').open('a') as f:
        json.dump(manifest, f)
        f.write('\n')
        f.flush()
    return p


class CacheOcrEngine(TesseractOcrEngine):
    """Tesseract engine that routes its subprocess calls through ``cached_run``.

    Each overridden method holds the class-wide lock, patches
    ``ocrmypdf._exec.tesseract.run`` with ``cached_run`` bound to the current
    ``options``, and then delegates to the stock ``TesseractOcrEngine`` method.
    """

    # Concurrent threads (with --use-threads) might try to use different parts
    # of the OcrEngine, so we need a lock to protect the state of the patched
    # module whenever it's patched. Should refactor ocrmypdf._exec.tesseract so
    # that it does not need to be patched at all for testing.
    lock = threading.Lock()

    @staticmethod
    def get_orientation(input_file, options):
        """Return the stock orientation result, computed via the cache."""
        with (
            CacheOcrEngine.lock,
            patch('ocrmypdf._exec.tesseract.run', new=partial(cached_run, options)),
        ):
            return TesseractOcrEngine.get_orientation(input_file, options)

    @staticmethod
    def get_deskew(input_file, options) -> float:
        """Return the stock deskew result, computed via the cache."""
        with (
            CacheOcrEngine.lock,
            patch('ocrmypdf._exec.tesseract.run', new=partial(cached_run, options)),
        ):
            return TesseractOcrEngine.get_deskew(input_file, options)

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Run the stock hOCR generation via the cache."""
        with (
            CacheOcrEngine.lock,
            patch('ocrmypdf._exec.tesseract.run', new=partial(cached_run, options)),
        ):
            TesseractOcrEngine.generate_hocr(
                input_file, output_hocr, output_text, options
            )

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Run the stock PDF generation via the cache."""
        with (
            CacheOcrEngine.lock,
            patch('ocrmypdf._exec.tesseract.run', new=partial(cached_run, options)),
        ):
            TesseractOcrEngine.generate_pdf(
                input_file, output_pdf, output_text, options
            )


@hookimpl
def get_ocr_engine():
    """Provide a new ``CacheOcrEngine`` as the OCR engine."""
    engine = CacheOcrEngine()
    return engine


================================================
FILE: tests/plugins/tesseract_crash.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
from __future__ import annotations

import signal
from contextlib import contextmanager
from subprocess import CalledProcessError
from unittest.mock import patch

from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine


def raise_crash(*args, **kwargs):
    """Stand in for a Tesseract run that aborts with ``std::bad_alloc``.

    All arguments are ignored; a ``CalledProcessError`` for the command
    ``tesseract`` with return code ``128 + SIGABRT`` is raised
    unconditionally.
    """
    abort_code = 128 + signal.SIGABRT
    abort_message = (
        b"libc++abi.dylib: terminating with uncaught exception of type "
        + b"std::bad_alloc: std::bad_alloc"
    )
    raise CalledProcessError(
        abort_code, 'tesseract', output=b"", stderr=abort_message
    )


@contextmanager
def patch_tesseract_run():
    """Patch ``ocrmypdf._exec.tesseract.run`` to fail with ``raise_crash``.

    On a normal exit from the managed block, verify that the patched runner
    was called at least once.
    """
    with patch('ocrmypdf._exec.tesseract.run', side_effect=raise_crash) as fake_run:
        yield
        fake_run.assert_called()


class CrashOcrEngine(TesseractOcrEngine):
    """Tesseract engine whose runs fail as if Tesseract had aborted."""

    @staticmethod
    def get_orientation(input_file, options):
        """Run the stock orientation check with the crashing runner patched in."""
        with patch_tesseract_run():
            orientation = TesseractOcrEngine.get_orientation(input_file, options)
        return orientation

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Run the stock hOCR generation with the crashing runner patched in."""
        hocr_args = (input_file, output_hocr, output_text, options)
        with patch_tesseract_run():
            TesseractOcrEngine.generate_hocr(*hocr_args)

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Run the stock PDF generation with the crashing runner patched in."""
        pdf_args = (input_file, output_pdf, output_text, options)
        with patch_tesseract_run():
            TesseractOcrEngine.generate_pdf(*pdf_args)


@hookimpl
def get_ocr_engine():
    """Provide a new ``CrashOcrEngine`` as the OCR engine."""
    engine = CrashOcrEngine()
    return engine


================================================
FILE: tests/plugins/tesseract_debug_rotate.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract no-op/fixed rotate plugin.

To quickly run tests where getting OCR output is not necessary and we want to test
the rotation pipeline.

In generate_hocr mode, create a .hocr file that specifies no text found.

In 'pdf' mode, create a PDF with a single blank page the same size as the image.

In orientation check mode, report 0, 90, 180, 270... based on page number.
"""

from __future__ import annotations

import pikepdf
from PIL import Image

from ocrmypdf import OcrEngine, OrientationConfidence, hookimpl
from ocrmypdf.helpers import page_number

# Minimal hOCR document used as the OCR result for every page: one page, one
# area, one paragraph, one line and one word whose text is a single space.
# ``{0}`` and ``{1}`` are ``str.format`` placeholders that generate_hocr fills
# with the image width and height.
HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
  <title></title>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  <meta name='ocr-system' content='tesseract 4.1.1' />
  <meta name='ocr-capabilities'
    content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
 </head>
 <body>
  <div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
   <div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
    <p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
     <span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}">
       <span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
     </span>
    </p>
   </div>
  </div>
 </body>
</html>'''


class FixedRotateNoopOcrEngine(OcrEngine):
    """OCR engine stub that finds no text and reports a page-dependent rotation."""

    @staticmethod
    def version():
        """Return the fixed version string this stub reports."""
        return '4.1.1'

    @staticmethod
    def creator_tag(options):
        """Return the creator tag; its middle part depends on the PDF renderer."""
        if options.pdf_renderer == 'sandwich':
            tag = '-PDF'
        else:
            tag = '-hOCR'
        return f"NO-OP {tag} {FixedRotateNoopOcrEngine.version()}"

    def __str__(self):
        return f"NO-OP {FixedRotateNoopOcrEngine.version()}"

    @staticmethod
    def languages(options):
        """Return the only language this stub claims to support."""
        return {'eng'}

    @staticmethod
    def get_orientation(input_file, options):
        """Report an angle derived from the page number of ``input_file``."""
        # Page 1 -> 0, page 2 -> 90, page 3 -> 180, page 4 -> 270, then repeat.
        quarter_turns = page_number(input_file) - 1
        return OrientationConfidence(
            angle=(quarter_turns * 90) % 360, confidence=99.9
        )

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Write the hOCR template sized to the image, and an empty text file."""
        with Image.open(input_file) as im:
            with open(output_hocr, 'w', encoding='utf-8') as hocr_out:
                width, height = im.size
                hocr_out.write(HOCR_TEMPLATE.format(str(width), str(height)))
        with open(output_text, 'w') as text_out:
            text_out.write('')

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Write a one-page blank PDF sized like the image, and an empty text file."""
        with Image.open(input_file) as im:
            dpi = im.info['dpi']
            width_px, height_px = im.size
        # pixels / DPI gives inches; 72 PDF points per inch.
        ptsize = width_px / dpi[0] * 72, height_px / dpi[1] * 72
        blank = pikepdf.new()
        blank.add_blank_page(page_size=ptsize)
        blank.save(output_pdf, static_id=True)
        output_text.write_text('')


@hookimpl
def get_ocr_engine():
    """Provide a new ``FixedRotateNoopOcrEngine`` as the OCR engine."""
    engine = FixedRotateNoopOcrEngine()
    return engine


================================================
FILE: tests/plugins/tesseract_noop.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract no-op plugin.

To quickly run tests where getting OCR output is not necessary.

In generate_hocr mode, create a .hocr file that specifies no text found.

In 'pdf' mode, create a PDF with a single blank page the same size as the image.

In orientation check mode, report the orientation is upright.
"""

from __future__ import annotations

import pikepdf
from PIL import Image

from ocrmypdf import OcrEngine, OrientationConfidence, hookimpl

# Minimal hOCR document used as the OCR result for every page: one page, one
# area, one paragraph, one line and one word whose text is a single space.
# ``{0}`` and ``{1}`` are ``str.format`` placeholders that generate_hocr fills
# with the image width and height.
HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
  <title></title>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  <meta name='ocr-system' content='tesseract 4.1.1' />
  <meta name='ocr-capabilities'
    content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
 </head>
 <body>
  <div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
   <div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
    <p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
     <span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}">
       <span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
     </span>
    </p>
   </div>
  </div>
 </body>
</html>'''


class NoopOcrEngine(OcrEngine):
    """OCR engine stub that finds no text, no rotation and no skew."""

    @staticmethod
    def version():
        """Return the fixed version string this stub reports."""
        return '4.1.1'

    @staticmethod
    def creator_tag(options):
        """Return the creator tag; its middle part depends on the PDF renderer."""
        if options.pdf_renderer == 'sandwich':
            tag = '-PDF'
        else:
            tag = '-hOCR'
        return f"NO-OP {tag} {NoopOcrEngine.version()}"

    def __str__(self):
        return f"NO-OP {NoopOcrEngine.version()}"

    @staticmethod
    def languages(options):
        """Return the only language this stub claims to support."""
        return {'eng'}

    @staticmethod
    def get_orientation(input_file, options):
        """Report an angle of 0 with a confidence of 0.0."""
        upright = OrientationConfidence(angle=0, confidence=0.0)
        return upright

    @staticmethod
    def get_deskew(input_file, options):
        """Report a deskew value of 0.0."""
        return 0.0

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Write the hOCR template sized to the image, and an empty text file."""
        with Image.open(input_file) as im:
            with open(output_hocr, 'w', encoding='utf-8') as hocr_out:
                width, height = im.size
                hocr_out.write(HOCR_TEMPLATE.format(str(width), str(height)))
        with open(output_text, 'w') as text_out:
            text_out.write('')

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Write a one-page blank PDF sized like the image, and an empty text file."""
        with Image.open(input_file) as im:
            dpi = im.info['dpi']
            width_px, height_px = im.size
        # pixels / DPI gives inches; 72 PDF points per inch.
        ptsize = width_px / dpi[0] * 72, height_px / dpi[1] * 72
        blank = pikepdf.new()
        blank.add_blank_page(page_size=ptsize)
        blank.save(output_pdf, static_id=True)
        output_text.write_text('')


@hookimpl
def get_ocr_engine():
    """Provide a new ``NoopOcrEngine`` as the OCR engine."""
    engine = NoopOcrEngine()
    return engine


================================================
FILE: tests/plugins/tesseract_simulate_oom_killer.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract no-op plugin that simulates the OOM killer on page 4.

OCRmyPDF can use a lot of memory, so much that it might trigger the
OOM killer on Linux or similar features on other platforms. We want to
ensure we fail with an error rather than deadlock in such cases.

Page 4 was chosen because of this number's association with bad luck
in many East Asian cultures.
"""

# type: ignore
from __future__ import annotations

import os
import signal
from pathlib import Path

from ocrmypdf import hookimpl

# type: ignore


# Ugly hack that lets us use the NoopOcrEngine without setting up packaging
# for our tests: the sibling file tesseract_noop.py is read, compiled and
# executed right here, so every name it defines lands in this module.
# This hack also requires us to set type: ignore
parent_file = Path(__file__).with_name('tesseract_noop.py')
# Passing the real path as the filename keeps tracebacks pointing at the
# original source file.
parent = compile(parent_file.read_text(), parent_file, mode='exec')
exec(parent)
# Look the class up explicitly now that exec() has defined it; at module
# level locals() is the module namespace.
NoopOcrEngine = locals()['NoopOcrEngine']


class Page4Engine(NoopOcrEngine):  # type: ignore
    """No-op engine that kills its own process when asked to OCR page 4.

    Page 4 is recognised by an input file whose stem starts with ``000004``;
    every other page is handled by ``NoopOcrEngine``.
    """

    def __str__(self):
        return f"NO-OP Page 4 {NoopOcrEngine.version()}"

    @staticmethod
    def generate_hocr(input_file: Path, output_hocr, output_text, options):
        """Produce no-op hOCR output, or kill this process on page 4."""
        if not input_file.stem.startswith('000004'):
            return NoopOcrEngine.generate_hocr(
                input_file, output_hocr, output_text, options
            )
        # Simulate the OOM killer by sending SIGKILL to our own process.
        os.kill(os.getpid(), signal.SIGKILL)

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Produce the no-op PDF output, or kill this process on page 4."""
        if not input_file.stem.startswith('000004'):
            return NoopOcrEngine.generate_pdf(
                input_file, output_pdf, output_text, options
            )
        # Simulate the OOM killer by sending SIGKILL to our own process.
        os.kill(os.getpid(), signal.SIGKILL)


@hookimpl
def check_options(options):
    """Refuse to run when ``options.use_threads`` is set.

    Presumably because the simulated kill targets the current process, which
    with thread workers would be the whole program - confirm with the tests
    that load this plugin.

    Raises:
        ValueError: If ``options.use_threads`` is truthy.
    """
    threads_requested = options.use_threads
    if threads_requested:
        raise ValueError("I'm not compatible with use_threads")


@hookimpl
def get_ocr_engine():
    """Provide a new ``Page4Engine`` as the OCR engine."""
    engine = Page4Engine()
    return engine


================================================
FILE: tests/resources/README.rst
================================================
.. SPDX-FileCopyrightText: 2022 James R. Barlow
.. SPDX-License-Identifier: CC-BY-SA-4.0

These test files are used in OCRmyPDF's test suite. They do not necessarily produce OCR results
at all and are not necessarily meant as examples of OCR output. Some are even invalid PDFs that might
crash certain PDF viewers.

Some of these images were obtained from the public domain. Others are copyrighted and may have
licenses associated. Refer to the ``REUSE.toml`` file in OCRmyPDF's Git repository for information on
the copyright holder(s) and license(s) applicable to these resources.

.. list-table::
    :widths: 15 35 50
    :header-rows: 1

    *   - File
        - Source
        - Purpose
    *   - c02-22.pdf
        - `Project Gutenberg`_, Adventures of Huckleberry Finn, page 22
        - difficult OCR image (obscure fonts and illustrations)
    *   - graph.pdf
        - `Wikimedia: Simple_line_graph_of_ACE_2012_results_by_candidate_sj01.png`_
        - image with slanted text
    *   - lichtenstein.pdf
        - `Wikimedia: JPEG2000 Lichtenstein`_
        - JPEG2000 image
    *   - linn.png, linn.pdf, linn.txt
        - `Wikimedia: LinnSequencer`_
        - image with two columns
    *   - typewriter.png, 2400dpi.pdf
        - `Wikimedia: Triumph typewrtier text Linzensoep`_
        - simple text
    *   - baiona.png
        - `Wikimedia: Baionako udalerri mugakideak`_
        - multilingual text and images
    *   - aspect.pdf
        - synthetic
        - test image with 200 x 100 DPI resolution
    *   - blank.pdf
        - synthetic
        - blank PDF generated by Adobe Illustrator CC 17, containing a lot of application-specific metadata/bloat
    *   - cmyk.pdf
        - synthetic
        - a CMYK image created in Photoshop
    *   - crom.png
        - synthetic
        - test for non-dictionary words
    *   - enormous.pdf
        - synthetic
        - very large PDF page
    *   - epson.pdf
        - synthetic
        - a linearized PDF containing some unusual indirect objects, created by an Epson printer; printout of a Wikipedia article (CC-BY-SA)
    *   - formxobject.pdf
        - synthetic
        - hand-crafted PDF containing an image inside a Form XObject
    *   - francais.pdf
        - synthetic
        - a page containing French accents (diacritics)
    *   - hugemono.pdf
        - synthetic
        - large monochrome 35000x35000 image in JBIG2 encoding
    *   - invalid.pdf
        - synthetic
        - a PDF file header followed by EOF marker
    *   - kcs.pdf
        - synthetic
        - PDF file generated by Kodak Capture Desktop Software 1.2; has invalid table of contents
    *   - livecycle.pdf
        - synthetic
        - a minimal PDF that claims to use dynamic XFA forms
    *   - masks.pdf
        - synthetic
        - file containing explicit masks and a stencil mask drawn without a proper transformation matrix; printout of a German Wikipedia article (CC-BY-SA)
    *   - missing_docinfo.pdf
        - synthetic
        - PDF file with no /DocumentInfo section
    *   - overlay.pdf
        - synthetic
        - PDF file generated by PDFPen pro that triggered content stream parse errors
    *   - negzero.pdf
        - synthetic
        - copy of formxobject.pdf with token that qpdf doesn't like
    *   - no_contents.pdf
        - synthetic
        - synthetic PDF with a blank page that has no /Contents entry
    *   - truetype_font_nomapping.pdf
        - synthetic
        - example of a PDF with an embedded subsetted TrueType font with no Unicode mapping
    *   - trivial.pdf
        - synthetic
        - smallest possible valid PDF-1.3 with all required fields
    *   - type3_font_nomapping.pdf
        - synthetic
        - example of a PDF with an embedded Type 3 font with no Unicode mapping
    *   - vector.pdf
        - synthetic
        - a PDF with vector art and text rendered as curves with no fonts

Assemblies
==========

These test resources are assemblies or derivatives from other previously mentioned files, released under the same license terms as their input files.

- baiona_gray.png (from baiona.png, grayscale version)
- baiona_colormapped.png (from baiona.png, palette version)
- baiona_alpha.png (from baiona.png, RGB+A version)
- cardinal.pdf (four cardinal directions, baked-in rotated copies of linn.png)
- ccitt.pdf (linn.png, converted to CCITT encoding)
- graph_ocred.pdf (from graph.pdf)
- jbig2.pdf (from linn.png)
- multipage.pdf (from several other files)
- palette.pdf (from baiona_colormapped.png)
- poster.pdf (from linn.png)
- rotated_skew.pdf (a /Rotate'd and skewed document from linn.png)
- skew.pdf (from linn.png, skew simulated by adjusting the transformation matrix)
- toc.pdf (from formxobject.pdf, trivial.pdf)


.. _`Wikimedia: LinnSequencer`: https://upload.wikimedia.org/wikipedia/en/b/b7/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg

.. _`Project Gutenberg`: https://www.gutenberg.org/files/76/76-h/76-h.htm#c2

.. _`Wikimedia: Simple_line_graph_of_ACE_2012_results_by_candidate_sj01.png`: https://en.wikipedia.org/wiki/File:Simple_line_graph_of_ACE_2012_results_by_candidate_sj01.png

.. _`Wikimedia: JPEG2000 Lichtenstein`: https://en.wikipedia.org/wiki/JPEG_2000#/media/File:Jpeg2000_2-level_wavelet_transform-lichtenstein.png

.. _`Linux (Wikipedia Article)`: https://de.wikipedia.org/wiki/Linux

.. _`Wikimedia: Triumph typewrtier text Linzensoep`: https://commons.wikimedia.org/wiki/File:Triumph.typewriter_text_Linzensoep.gif

.. _`Wikimedia: Baionako udalerri mugakideak`: https://commons.wikimedia.org/wiki/File:Baionako_udalerri_mugakideak.png


================================================
FILE: tests/resources/arabic.hocr
================================================
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ar" lang="ar">
<head>
<title></title>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 5.0.0' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "test.png"; bbox 0 0 2550 3300; ppageno 0; scan_res 300 300'>
<div class='ocr_carea' id='carea_1_1' title="bbox 200 200 2350 1200">
<p class='ocr_par' id='par_1_1' lang='ara' dir='rtl' title="bbox 200 200 2350 400">
<span class='ocr_line' id='line_1_1' title="bbox 200 200 2350 400; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_1' title='bbox 200 200 600 400; x_wconf 95'>مرحبا</span>
<span class='ocrx_word' id='word_1_2' title='bbox 650 200 1050 400; x_wconf 95'>بالعالم</span>
</span>
</p>
<p class='ocr_par' id='par_1_2' lang='ara' dir='rtl' title="bbox 200 500 2350 700">
<span class='ocr_line' id='line_1_2' title="bbox 200 500 2350 700; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_3' title='bbox 200 500 600 700; x_wconf 95'>هذا</span>
<span class='ocrx_word' id='word_1_4' title='bbox 650 500 1050 700; x_wconf 95'>نص</span>
<span class='ocrx_word' id='word_1_5' title='bbox 1100 500 1500 700; x_wconf 95'>عربي</span>
</span>
</p>
<p class='ocr_par' id='par_1_3' lang='per' dir='rtl' title="bbox 200 800 2350 1000">
<span class='ocr_line' id='line_1_3' title="bbox 200 800 2350 1000; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_6' title='bbox 200 800 600 1000; x_wconf 95'>سلام</span>
<span class='ocrx_word' id='word_1_7' title='bbox 650 800 1050 1000; x_wconf 95'>فارسی</span>
</span>
</p>
</div>
</div>
</body>
</html>


================================================
FILE: tests/resources/cjk.hocr
================================================
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh" lang="zh">
<head>
<title></title>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 5.0.0' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "test.png"; bbox 0 0 2550 3300; ppageno 0; scan_res 300 300'>
<div class='ocr_carea' id='carea_1_1' title="bbox 200 200 2350 1500">
<p class='ocr_par' id='par_1_1' lang='chi_sim' title="bbox 200 200 2350 400">
<span class='ocr_line' id='line_1_1' title="bbox 200 200 2350 400; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_1' title='bbox 200 200 600 400; x_wconf 95'>你好</span>
<span class='ocrx_word' id='word_1_2' title='bbox 650 200 1050 400; x_wconf 95'>世界</span>
</span>
</p>
<p class='ocr_par' id='par_1_2' lang='chi_tra' title="bbox 200 500 2350 700">
<span class='ocr_line' id='line_1_2' title="bbox 200 500 2350 700; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_3' title='bbox 200 500 600 700; x_wconf 95'>繁體</span>
<span class='ocrx_word' id='word_1_4' title='bbox 650 500 1050 700; x_wconf 95'>中文</span>
</span>
</p>
<p class='ocr_par' id='par_1_3' lang='jpn' title="bbox 200 800 2350 1000">
<span class='ocr_line' id='line_1_3' title="bbox 200 800 2350 1000; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_5' title='bbox 200 800 600 1000; x_wconf 95'>こんにちは</span>
<span class='ocrx_word' id='word_1_6' title='bbox 650 800 1050 1000; x_wconf 95'>世界</span>
</span>
</p>
<p class='ocr_par' id='par_1_4' lang='kor' title="bbox 200 1100 2350 1300">
<span class='ocr_line' id='line_1_4' title="bbox 200 1100 2350 1300; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_7' title='bbox 200 1100 600 1300; x_wconf 95'>안녕하세요</span>
<span class='ocrx_word' id='word_1_8' title='bbox 650 1100 1050 1300; x_wconf 95'>세계</span>
</span>
</p>
</div>
</div>
</body>
</html>


================================================
FILE: tests/resources/devanagari.hocr
================================================
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="hi" lang="hi">
<head>
<title></title>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 5.0.0' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "test.png"; bbox 0 0 2550 3300; ppageno 0; scan_res 300 300'>
<div class='ocr_carea' id='carea_1_1' title="bbox 200 200 2350 1200">
<p class='ocr_par' id='par_1_1' lang='hin' title="bbox 200 200 2350 400">
<span class='ocr_line' id='line_1_1' title="bbox 200 200 2350 400; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_1' title='bbox 200 200 600 400; x_wconf 95'>नमस्ते</span>
<span class='ocrx_word' id='word_1_2' title='bbox 650 200 1050 400; x_wconf 95'>दुनिया</span>
</span>
</p>
<p class='ocr_par' id='par_1_2' lang='hin' title="bbox 200 500 2350 700">
<span class='ocr_line' id='line_1_2' title="bbox 200 500 2350 700; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_3' title='bbox 200 500 600 700; x_wconf 95'>यह</span>
<span class='ocrx_word' id='word_1_4' title='bbox 650 500 1050 700; x_wconf 95'>हिंदी</span>
<span class='ocrx_word' id='word_1_5' title='bbox 1100 500 1500 700; x_wconf 95'>पाठ</span>
<span class='ocrx_word' id='word_1_6' title='bbox 1550 500 1950 700; x_wconf 95'>है</span>
</span>
</p>
<p class='ocr_par' id='par_1_3' lang='san' title="bbox 200 800 2350 1000">
<span class='ocr_line' id='line_1_3' title="bbox 200 800 2350 1000; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_7' title='bbox 200 800 700 1000; x_wconf 95'>संस्कृत</span>
<span class='ocrx_word' id='word_1_8' title='bbox 750 800 1250 1000; x_wconf 95'>भाषा</span>
</span>
</p>
</div>
</div>
</body>
</html>


================================================
FILE: tests/resources/hello_world_scripts.hocr
================================================
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>Multilingual Hello World Script Test</title>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 5.0.0' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<!-- Page: 8.5x11 inches at 300 DPI = 2550x3300 pixels -->
<div class='ocr_page' id='page_1' title='image "hello_scripts.png"; bbox 0 0 2550 3300; ppageno 0; scan_res 300 300'>

<!-- Row 1: English and Spanish (Latin script with accents/punctuation) -->
<div class='ocr_carea' id='carea_1_1' title="bbox 150 150 1200 400">
<p class='ocr_par' id='par_1_1' lang='eng' title="bbox 150 150 600 350">
<span class='ocr_line' id='line_1_1' title="bbox 150 150 600 350; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_1_1' title='bbox 150 150 600 350; x_wconf 98'>Hello!</span>
</span>
</p>
</div>

<div class='ocr_carea' id='carea_1_2' title="bbox 1400 150 2400 400">
<p class='ocr_par' id='par_1_2' lang='spa' title="bbox 1400 150 2400 350">
<span class='ocr_line' id='line_1_2' title="bbox 1400 150 2000 350; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_1_2' title='bbox 1400 150 2000 350; x_wconf 97'>¡Hola!</span>
</span>
</p>
</div>

<!-- Row 2: French (accents) and German (umlauts, eszett) -->
<div class='ocr_carea' id='carea_2_1' title="bbox 150 450 1200 700">
<p class='ocr_par' id='par_2_1' lang='fra' title="bbox 150 450 800 650">
<span class='ocr_line' id='line_2_1' title="bbox 150 450 800 650; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_2_1' title='bbox 150 450 800 650; x_wconf 96'>Bonjour!</span>
</span>
</p>
</div>

<div class='ocr_carea' id='carea_2_2' title="bbox 1400 450 2400 700">
<p class='ocr_par' id='par_2_2' lang='deu' title="bbox 1400 450 2100 650">
<span class='ocr_line' id='line_2_2' title="bbox 1400 450 2100 650; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_2_2' title='bbox 1400 450 2100 650; x_wconf 95'>Grüß Gott!</span>
</span>
</p>
</div>

<!-- Row 3: Russian (Cyrillic) and Greek -->
<div class='ocr_carea' id='carea_3_1' title="bbox 150 750 1200 1000">
<p class='ocr_par' id='par_3_1' lang='rus' title="bbox 150 750 900 950">
<span class='ocr_line' id='line_3_1' title="bbox 150 750 900 950; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_3_1' title='bbox 150 750 900 950; x_wconf 94'>Привет!</span>
</span>
</p>
</div>

<div class='ocr_carea' id='carea_3_2' title="bbox 1400 750 2400 1000">
<p class='ocr_par' id='par_3_2' lang='ell' title="bbox 1400 750 2200 950">
<span class='ocr_line' id='line_3_2' title="bbox 1400 750 2200 950; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_3_2' title='bbox 1400 750 2200 950; x_wconf 93'>Γειά σου!</span>
</span>
</p>
</div>

<!-- Row 4: Chinese (Simplified) and Japanese -->
<div class='ocr_carea' id='carea_4_1' title="bbox 150 1050 1200 1300">
<p class='ocr_par' id='par_4_1' lang='chi_sim' title="bbox 150 1050 700 1250">
<span class='ocr_line' id='line_4_1' title="bbox 150 1050 700 1250; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_4_1' title='bbox 150 1050 700 1250; x_wconf 92'>你好！</span>
</span>
</p>
</div>

<div class='ocr_carea' id='carea_4_2' title="bbox 1400 1050 2400 1300">
<p class='ocr_par' id='par_4_2' lang='jpn' title="bbox 1400 1050 2300 1250">
<span class='ocr_line' id='line_4_2' title="bbox 1400 1050 2300 1250; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_4_2' title='bbox 1400 1050 2300 1250; x_wconf 91'>こんにちは！</span>
</span>
</p>
</div>

<!-- Row 5: Korean and Turkish (Latin with special chars) -->
<div class='ocr_carea' id='carea_5_1' title="bbox 150 1350 1200 1600">
<p class='ocr_par' id='par_5_1' lang='kor' title="bbox 150 1350 900 1550">
<span class='ocr_line' id='line_5_1' title="bbox 150 1350 900 1550; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_5_1' title='bbox 150 1350 900 1550; x_wconf 90'>안녕하세요!</span>
</span>
</p>
</div>

<div class='ocr_carea' id='carea_5_2' title="bbox 1400 1350 2400 1600">
<p class='ocr_par' id='par_5_2' lang='tur' title="bbox 1400 1350 2300 1550">
<span class='ocr_line' id='line_5_2' title="bbox 1400 1350 2300 1550; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_5_2' title='bbox 1400 1350 2300 1550; x_wconf 89'>Merhaba!</span>
</span>
</p>
</div>

<!-- Row 6: Hindi (Devanagari) and Arabic (RTL) -->
<div class='ocr_carea' id='carea_6_1' title="bbox 150 1650 1200 1900">
<p class='ocr_par' id='par_6_1' lang='hin' title="bbox 150 1650 900 1850">
<span class='ocr_line' id='line_6_1' title="bbox 150 1650 900 1850; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_6_1' title='bbox 150 1650 900 1850; x_wconf 88'>नमस्ते!</span>
</span>
</p>
</div>

<div class='ocr_carea' id='carea_6_2' title="bbox 1400 1650 2400 1900">
<p class='ocr_par' id='par_6_2' lang='ara' dir='rtl' title="bbox 1400 1650 2300 1850">
<span class='ocr_line' id='line_6_2' title="bbox 1400 1650 2300 1850; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_6_2' title='bbox 1400 1650 2300 1850; x_wconf 87'>!مرحبا</span>
</span>
</p>
</div>

<!-- Row 7: Hebrew (RTL) and Portuguese (accents) -->
<div class='ocr_carea' id='carea_7_1' title="bbox 150 1950 1200 2200">
<p class='ocr_par' id='par_7_1' lang='heb' dir='rtl' title="bbox 150 1950 800 2150">
<span class='ocr_line' id='line_7_1' title="bbox 150 1950 800 2150; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_7_1' title='bbox 150 1950 800 2150; x_wconf 86'>שלום</span>
</span>
</p>
</div>

<div class='ocr_carea' id='carea_7_2' title="bbox 1400 1950 2000 2200">
<p class='ocr_par' id='par_7_2' lang='por' title="bbox 1400 1950 1900 2150">
<span class='ocr_line' id='line_7_2' title="bbox 1400 1950 1900 2150; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35">
<span class='ocrx_word' id='word_7_2' title='bbox 1400 1950 1900 2150; x_wconf 85'>Olá!</span>
</span>
</p>
</div>

<!-- Rotated text section: Various scripts at angles -->
<!-- Rotated baseline: 15 degrees clockwise (baseline slope ~0.27) -->
<div class='ocr_carea' id='carea_8_1' title="bbox 200 2150 900 2700">
<p class='ocr_par' id='par_8_1' lang='ita' title="bbox 200 2150 900 2650">
<span class='ocr_line' id='line_8_1' title="bbox 200 2150 900 2450; baseline 0.27 -30; x_size 130; x_descenders 26; x_ascenders 32">
<span class='ocrx_word' id='word_8_1' title='bbox 200 2150 900 2450; x_wconf 84'>Ciao!</span>
</span>
</p>
</div>

<!-- Rotated baseline: -10 degrees (baseline slope ~-0.18) -->
<div class='ocr_carea' id='carea_8_2' title="bbox 1000 2350 1700 2700">
<p class='ocr_par' id='par_8_2' lang='pol' title="bbox 1000 2400 1700 2650">
<span class='ocr_line' id='line_8_2' title="bbox 1000 2400 1700 2650; baseline -0.18 -25; x_size 130; x_descenders 26; x_ascenders 32">
<span class='ocrx_word' id='word_8_2' title='bbox 1000 2400 1700 2650; x_wconf 83'>Cześć!</span>
</span>
</p>
</div>

<!-- Rotated baseline: 8 degrees clockwise (baseline slope ~0.14) - Chinese -->
<div class='ocr_carea' id='carea_8_3' title="bbox 1800 2350 2450 2700">
<p class='ocr_par' id='par_8_3' lang='chi_tra' title="bbox 1800 2400 2450 2650">
<span class='ocr_line' id='line_8_3' title="bbox 1800 2400 2450 2650; baseline 0.14 -35; x_size 130; x_descenders 26; x_ascenders 32">
<span class='ocrx_word' id='word_8_3' title='bbox 1800 2400 2450 2650; x_wconf 82'>您好！</span>
</span>
</p>
</div>

<!-- Bottom row: More rotated examples -->
<!-- Rotated baseline: -20 degrees (baseline slope ~-0.36) - Russian -->
<div class='ocr_carea' id='carea_9_1' title="bbox 200 2750 900 3100">
<p class='ocr_par' id='par_9_1' lang='rus' title="bbox 200 2800 900 3050">
<span class='ocr_line' id='line_9_1' title="bbox 200 2800 900 3050; baseline -0.36 -20; x_size 120; x_descenders 24; x_ascenders 30">
<span class='ocrx_word' id='word_9_1' title='bbox 200 2800 900 3050; x_wconf 81'>Здравствуй!</span>
</span>
</p>
</div>

<!-- Rotated baseline: 12 degrees clockwise (baseline slope ~0.21) - Greek -->
<div class='ocr_carea' id='carea_9_2' title="bbox 1000 2750 1700 3100">
<p class='ocr_par' id='par_9_2' lang='ell' title="bbox 1000 2780 1700 3050">
<span class='ocr_line' id='line_9_2' title="bbox 1000 2780 1700 3050; baseline 0.21 -30; x_size 120; x_descenders 24; x_ascenders 30">
<span class='ocrx_word' id='word_9_2' title='bbox 1000 2780 1700 3050; x_wconf 80'>Χαίρετε!</span>
</span>
</p>
</div>

<!-- Rotated baseline: -5 degrees (baseline slope ~-0.09) - Arabic RTL rotated -->
<div class='ocr_carea' id='carea_9_3' title="bbox 1800 2750 2450 3100">
<p class='ocr_par' id='par_9_3' lang='ara' dir='rtl' title="bbox 1800 2800 2450 3050">
<span class='ocr_line' id='line_9_3' title="bbox 1800 2800 2450 3050; baseline -0.09 -25; x_size 120; x_descenders 24; x_ascenders 30">
<span class='ocrx_word' id='word_9_3' title='bbox 1800 2800 2450 3050; x_wconf 79'>!أهلاً</span>
</span>
</p>
</div>

</div>
</body>
</html>


================================================
FILE: tests/resources/latin.hocr
================================================
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 5.0.0' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "test.png"; bbox 0 0 2550 3300; ppageno 0; scan_res 300 300'>
<div class='ocr_carea' id='carea_1_1' title="bbox 200 200 2350 1200">
<p class='ocr_par' id='par_1_1' lang='eng' title="bbox 200 200 2350 400">
<span class='ocr_line' id='line_1_1' title="bbox 200 200 2350 400; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_1' title='bbox 200 200 600 400; x_wconf 95'>The</span>
<span class='ocrx_word' id='word_1_2' title='bbox 650 200 1050 400; x_wconf 95'>quick</span>
<span class='ocrx_word' id='word_1_3' title='bbox 1100 200 1500 400; x_wconf 95'>brown</span>
<span class='ocrx_word' id='word_1_4' title='bbox 1550 200 1850 400; x_wconf 95'>fox</span>
<span class='ocrx_word' id='word_1_5' title='bbox 1900 200 2350 400; x_wconf 95'>jumps</span>
</span>
</p>
<p class='ocr_par' id='par_1_2' lang='fra' title="bbox 200 500 2350 700">
<span class='ocr_line' id='line_1_2' title="bbox 200 500 2350 700; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_6' title='bbox 200 500 500 700; x_wconf 95'>Café</span>
<span class='ocrx_word' id='word_1_7' title='bbox 550 500 950 700; x_wconf 95'>résumé</span>
<span class='ocrx_word' id='word_1_8' title='bbox 1000 500 1400 700; x_wconf 95'>naïve</span>
</span>
</p>
<p class='ocr_par' id='par_1_3' lang='deu' title="bbox 200 800 2350 1000">
<span class='ocr_line' id='line_1_3' title="bbox 200 800 2350 1000; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_9' title='bbox 200 800 700 1000; x_wconf 95'>Größe</span>
<span class='ocrx_word' id='word_1_10' title='bbox 750 800 1250 1000; x_wconf 95'>Zürich</span>
<span class='ocrx_word' id='word_1_11' title='bbox 1300 800 1800 1000; x_wconf 95'>Ärger</span>
</span>
</p>
</div>
</div>
</body>
</html>


================================================
FILE: tests/resources/linn.txt
================================================
The LinnSequencer
32 Track MIDI Sequence Recorder

The LinnSequencer is a state—of—the-art composition and performance tool for the professional musician. It is

extremely powerful, yet amazingly simple to learn and use. It’s many remarkable features include:

0 Operation is similar to multi-track tape recorder with PLAY, STOP, RECORD, FAST
FORWARD, REWIND, and LOCATE controls.

0 Each of the 100 sequences contains 32 simultaneous, polyphonic tracks. Each track may
be assigned to one of 16 MIDI channels. Simultaneously plays up to 16 polyphonic

synthesizers !

0 Ultra-fast 3 1/2 ” disk drive stores complex songs in seconds and holds over 110,000 notes

per disk!

0 One or all tracks may be TRANSPOSED at the touch of a key.
0 Exclusive real—time ERASE function makes editing FAST.
0 Exclusive REPEAT function automatically repeats any held notes at a pre-selected

rhythmic value.

0 TIMING CORRECTION works during playback and operates without ‘chopping’ notes.

0 Optional SMPTE time code synchronization.

0 Optional remote control.

Recording a Sequence

To record a sequence, simply press RECORD and PLAY,
then play your MIDI keyboard in time to the Sequencer’s
click track. When the sequence loops back around to bar 1,
you’ll hear what you played—only all timing errors will be

corrected! (Timing correction may be adjusted 0r defeated).

Any additional notes played will be added into the track
—existing notes are not erased while recording!

FAST FORWARD, REWIND, and LOCATE controls
may be used at any time to quickly access any location in
your sequence for spot-recording. To overdub a new part,
select a different track and start recording—while you
record, the ﬁrst‘track will play in perfect sync (unless you
MUTE it, or SOLO another track). In this way, up to 32
tracks may be overdubbed! All MIDI effects are recorded
including pitch bend, modulation, velocity, aftertouch,
sustain pedal, and program changes!

Editing

To erase a wrong note, simply hold ERASE and press
the note to be erased just before it plays in the sequence-—
when played back, it will be gone. Notes may also be

added, erased, or changed using the SINGLE STEP func-
tion. To overdub notes at specific points within a sequence,

Additional Features

simply use LOCATE, FAST FORWARD, or REWIND to
find the desired bar number, then start recording.

The INSERT/ COPY function allows you to move bars
from one location to another—in the same sequence or a
different one. For example, you might insert a copy of the
first verse between the second chorus and the bridge.
DELETE BARS operates the same way to remove
unwanted sections.

Creating a Song

One way to create a song is to record each track all the
way through (up to 999 bars). Another way is to record
each basic section (verse, chorus, etc.) in individual
sequences, then use the CREATE SONG function to “chain”
them together. CREATE SONG will then automatically
copy all the parts into a new sequence. If desired, you can
even set the last few bars to repeat infinitely, for a fadeout.

Composition Without Compromise

The technology you use should never be so complex that
it interferes with the creative process. That’s precisely why
the LinnSequencer is designed to let you compose, record
and edit while devoting your undivided attention to your
music. See your Linn dealer today for a demonstration!

0 Simple, easy to learn operation—the 32 character LCD display clearly guides you through all operations. If needed, the

HELP button displays additional explanations.

0 Non-destructive recording—existing notes are not erased while recording.
0 Two FOOTSWIT CH INPUTS may be assigned to remotely control many of the commonly used functions, including

ERASE, REPEAT, PLAY/ STOP, or LOCATE.

0 Two TRIGGER OUTPUTS may be programmed to output pulses at any selected note value.

0 Will sync to standard LinnDrum or Linn 9000 sync tone.

0 Utilizes ultra high—speed, 8 MHZ 80186 16 bit computer internally for FAST operation.
0 TEMPO may be specified in BEATS-PER—MINUTE or FRAMES-PER—BEAT at 24, 25, or 30 frames per second,

(even drop frame!)

0 TEMPO may be entered numerically, adjustable in tenths of a Beat-Per-Minute increments, or by tapping quarter notes

on the TAP TEMPO button.

0 TEMPO CHANGES may be programmed into a sequence, with smooth transitions if desired.
0 Any TIME SIGNATURE may be used, and may be changed within a song.

EDI]
Linn Electronics, Inc.

18720 Oxnard Street, Tarzana, CA 91356
(818) 708-8131 TELEX #298949 LINN UR


================================================
FILE: tests/resources/multilingual.hocr
================================================
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 5.0.0' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "test.png"; bbox 0 0 2550 3300; ppageno 0'>
<div class='ocr_carea' id='carea_1_1' title="bbox 200 200 2350 800">
<p class='ocr_par' id='par_1_1' lang='eng' title="bbox 200 200 2350 400">
<span class='ocr_line' id='line_1_1' title="bbox 200 200 2350 400; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_1' title='bbox 200 200 500 400; x_wconf 95'>English</span>
<span class='ocrx_word' id='word_1_2' title='bbox 550 200 750 400; x_wconf 95'>Text</span>
<span class='ocrx_word' id='word_1_3' title='bbox 800 200 1000 400; x_wconf 95'>Here</span>
</span>
</p>
<p class='ocr_par' id='par_1_2' lang='ara' dir='rtl' title="bbox 200 500 2350 800">
<span class='ocr_line' id='line_1_2' title="bbox 200 500 2350 800; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40">
<span class='ocrx_word' id='word_1_4' title='bbox 200 500 600 800; x_wconf 95'>مرحبا</span>
<span class='ocrx_word' id='word_1_5' title='bbox 650 500 950 800; x_wconf 95'>بك</span>
</span>
</p>
</div>
</div>
</body>
</html>


================================================
FILE: tests/test_acroform.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import logging

import pikepdf
import pytest

import ocrmypdf

from .conftest import check_ocrmypdf

# pylint: disable=redefined-outer-name


@pytest.fixture
def acroform(resources):
    """Path to the ``acroform.pdf`` file inside the test resources directory."""
    pdf_path = resources / 'acroform.pdf'
    return pdf_path


def test_acroform_and_redo(acroform, no_outpdf):
    """``--redo-ocr`` on the AcroForm file must raise ``InputFileError``."""
    expected_failure = pytest.raises(
        ocrmypdf.exceptions.InputFileError,
        match=r'.*--redo-ocr.*is not currently possible.*',
    )
    with expected_failure:
        check_ocrmypdf(acroform, no_outpdf, '--redo-ocr')


def test_acroform_message(acroform, caplog, outpdf):
    """Processing the AcroForm sample logs a fillable-form notice citing --force-ocr."""
    caplog.set_level(logging.INFO)
    noop_plugin_args = ('--plugin', 'tests/plugins/tesseract_noop.py')
    check_ocrmypdf(acroform, outpdf, *noop_plugin_args)

    # Both fragments must appear somewhere in the captured log output
    for fragment in ('fillable form', '--force-ocr'):
        assert fragment in caplog.text


@pytest.fixture
def digitally_signed(acroform, outdir):
    """Yield a copy of the AcroForm sample whose ``SigFlags`` entry is set to 3."""
    signed_path = outdir / 'acroform_signed.pdf'
    with pikepdf.open(acroform) as source_pdf:
        form = source_pdf.Root.AcroForm
        form.SigFlags = 3
        source_pdf.save(signed_path)
    yield signed_path


def test_digital_signature(digitally_signed, no_outpdf):
    """A PDF carrying signature flags is refused with DigitalSignatureError."""
    expected_error = ocrmypdf.exceptions.DigitalSignatureError
    with pytest.raises(expected_error):
        check_ocrmypdf(digitally_signed, no_outpdf)


def test_digital_signature_invalidate(digitally_signed, no_outpdf):
    """The signed sample is processed when invalidation is explicitly requested."""
    override_args = ['--force-ocr', '--invalidate-digital-signatures']
    check_ocrmypdf(digitally_signed, no_outpdf, *override_args)


================================================
FILE: tests/test_annots.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

from pikepdf import Array, Dictionary, Name, NameTree, Pdf

from ocrmypdf._annots import remove_broken_goto_annotations


def test_remove_broken_goto_annotations(resources):
    """GoTo actions with unresolvable destinations lose /D; resolvable ones keep it."""
    with Pdf.open(resources / 'link.pdf') as doc:
        was_modified = remove_broken_goto_annotations(doc)
        assert not was_modified, "File should not be modified"

        # Build a /Dests name tree and hang it off the document catalog
        dests_tree = NameTree.new(doc)
        names_dict = doc.make_indirect(Dictionary())
        doc.Root[Name.Names] = names_dict
        names_dict[Name.Dests] = dests_tree.obj

        # One broken named destination (an empty dictionary) ...
        dests_tree['Invalid'] = doc.make_indirect(Dictionary())
        # ... and one valid destination pointing at the first page
        first_page_obj = doc.pages[0].obj
        dests_tree['Valid'] = Array([first_page_obj, Name.XYZ, 0, 0, 0])

        # Page 0 links to a name absent from the tree; page 1 to the valid one
        doc.pages[0].Annots[0].A.D = 'Missing'
        doc.pages[1].Annots[0].A.D = 'Valid'

        was_modified = remove_broken_goto_annotations(doc)
        assert was_modified, "File should be modified"

        broken_action = doc.pages[0].Annots[0].A
        assert Name.D not in broken_action
        intact_action = doc.pages[1].Annots[0].A
        assert Name.D in intact_action


================================================
FILE: tests/test_api.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import pickle
from io import BytesIO
from pathlib import Path

import pytest
from pdfminer.high_level import extract_text

import ocrmypdf
import ocrmypdf._pipelines
import ocrmypdf.api


def test_language_list():
    """Passing ``language`` as a list reaches input validation or dependency checks."""
    acceptable_errors = (
        ocrmypdf.exceptions.InputFileError,
        ocrmypdf.exceptions.MissingDependencyError,
    )
    requested_languages = ['eng', 'deu']
    with pytest.raises(acceptable_errors):
        ocrmypdf.ocr('doesnotexist.pdf', '_.pdf', language=requested_languages)


def test_language_parameter_mapped_to_languages():
    """Test that the API 'language' parameter is mapped to OcrOptions 'languages'.

    Regression test for GitHub issue #1640: the Python API ignored the language
    parameter, always defaulting to 'eng'.
    """
    from ocrmypdf.api import create_options, setup_plugin_infrastructure
    from ocrmypdf.cli import get_parser

    setup_plugin_infrastructure()
    parser = get_parser()

    # (value passed as ``language``, expected ``options.languages``)
    cases = [
        # single-element list
        (['tam'], ['tam']),
        # list of multiple languages
        (['fra', 'deu'], ['fra', 'deu']),
        # bare string (single language)
        ('tam', ['tam']),
        # '+'-separated string is split like CLI --language
        ('eng+spa', ['eng', 'spa']),
        # '+'-separated entry within a list is also split
        (['eng+spa'], ['eng', 'spa']),
    ]

    for language, expected in cases:
        options = create_options(
            input_file='test.pdf',
            output_file='output.pdf',
            parser=parser,
            language=language,
        )
        # The message identifies which table row failed
        assert options.languages == expected, f'language={language!r}'


def test_stream_api(resources: Path):
    """Test that ``ocrmypdf.ocr`` accepts binary streams for input and output.

    The input is an open file object and the output an in-memory buffer; the
    result must start with a PDF header.
    """
    out = BytesIO()

    # Use a context manager so the input handle is closed even if ocr() raises
    with (resources / 'graph.pdf').open('rb') as in_:
        ocrmypdf.ocr(in_, out, tesseract_timeout=0.0)

    out.seek(0)
    assert b'%PDF' in out.read(1024)


def test_sidecar_stringio(resources: Path, outdir: Path, outpdf: Path):
    """Sidecar text can be written into an in-memory binary buffer."""
    sidecar_buffer = BytesIO()
    ocr_kwargs = {
        'plugins': ['tests/plugins/tesseract_cache.py'],
        'sidecar': sidecar_buffer,
    }
    ocrmypdf.ocr(resources / 'ccitt.pdf', outpdf, **ocr_kwargs)

    sidecar_buffer.seek(0)
    sidecar_bytes = sidecar_buffer.getvalue()
    assert b'the' in sidecar_bytes


def test_hocr_api_multipage(resources: Path, outdir: Path, outpdf: Path):
    """hOCR files appear for pages 1 and 6 but not page 4, then merge to a PDF."""
    ocrmypdf.api._pdf_to_hocr(
        resources / 'multipage.pdf',
        outdir,
        language='eng',
        skip_text=True,
        plugins=['tests/plugins/tesseract_cache.py'],
    )

    def hocr_for(page_number: int) -> Path:
        # Per-page hOCR files are named with a zero-padded six-digit page number
        return outdir / f'{page_number:06d}_ocr_hocr.hocr'

    assert hocr_for(1).exists()
    assert hocr_for(6).exists()
    assert not hocr_for(4).exists()

    ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf)
    assert outpdf.exists()


def test_hocr_to_pdf_api(resources: Path, outdir: Path, outpdf: Path):
    """Edits made to the intermediate hOCR file show up in the final PDF text."""
    ocrmypdf.api._pdf_to_hocr(
        resources / 'ccitt.pdf',
        outdir,
        language='eng',
        skip_text=True,
        plugins=['tests/plugins/tesseract_cache.py'],
    )
    first_page_hocr = outdir / '000001_ocr_hocr.hocr'
    assert first_page_hocr.exists()

    # Swap every "the" for "hocr" so the edit is detectable after conversion
    original_markup = first_page_hocr.read_text(encoding='utf-8')
    edited_markup = original_markup.replace('the', 'hocr')
    first_page_hocr.write_text(edited_markup, encoding='utf-8')

    ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf, optimize=0)

    text = extract_text(outpdf)
    assert 'hocr' in text
    assert 'the' not in text


def test_hocr_result_json():
    """HOCRResult serializes to the expected JSON text and round-trips."""
    result_cls = ocrmypdf._pipelines._common.HOCRResult
    result = result_cls(
        pageno=1,
        pdf_page_from_image=Path('a'),
        hocr=Path('b'),
        textpdf=Path('c'),
        orientation_correction=180,
    )
    expected_json = (
        '{"pageno": 1, "pdf_page_from_image": {"Path": "a"}, "hocr": {"Path": "b"}, '
        '"textpdf": {"Path": "c"}, "orientation_correction": 180, "ocr_tree": null}'
    )
    assert result.to_json() == expected_json

    restored = result_cls.from_json(result.to_json())
    assert restored == result


def test_hocr_result_pickle():
    """HOCRResult survives a pickle round-trip unchanged."""
    field_values = {
        'pageno': 1,
        'pdf_page_from_image': Path('a'),
        'hocr': Path('b'),
        'textpdf': Path('c'),
        'orientation_correction': 180,
    }
    result = ocrmypdf._pipelines._common.HOCRResult(**field_values)
    restored = pickle.loads(pickle.dumps(result))
    assert result == restored


def test_nested_plugin_option_access():
    """Plugin settings are readable as flat fields and via nested namespaces."""
    from ocrmypdf._options import OcrOptions
    from ocrmypdf.api import setup_plugin_infrastructure

    # Register plugin models before any options object is built
    setup_plugin_infrastructure()

    flat_settings = {
        'tesseract_timeout': 120.0,
        'tesseract_oem': 1,
        'optimize': 2,
    }
    options = OcrOptions(
        input_file='test.pdf',
        output_file='output.pdf',
        **flat_settings,
    )

    # Flat attribute access still returns what was passed in
    for field_name, expected in flat_settings.items():
        assert getattr(options, field_name) == expected

    # The tesseract namespace mirrors the flat tesseract_* values
    tesseract = options.tesseract
    assert tesseract is not None
    assert tesseract.timeout == 120.0
    assert tesseract.oem == 1

    # The ghostscript namespace exposes its own default
    ghostscript = options.ghostscript
    assert ghostscript is not None
    assert ghostscript.color_conversion_strategy == "LeaveColorUnchanged"

    # A second lookup hands back the very same namespace object
    tesseract_again = options.tesseract
    assert tesseract_again is tesseract


def test_default_tesseract_timeout():
    """An unset tesseract_timeout stays None and resolves to the plugin default.

    Regression test for GitHub issue #1636: through the Python API, omitting
    tesseract_timeout produced a default of 0.0, so Tesseract timed out
    immediately and no OCR output was generated.
    """
    from ocrmypdf._options import OcrOptions
    from ocrmypdf.api import setup_plugin_infrastructure

    setup_plugin_infrastructure()

    minimal_args = {'input_file': 'test.pdf', 'output_file': 'output.pdf'}
    options = OcrOptions(**minimal_args)

    # The flat field is left unset ...
    flat_timeout = options.tesseract_timeout
    assert flat_timeout is None

    # ... while the nested namespace reports the plugin default of 180s
    nested_timeout = options.tesseract.timeout
    assert nested_timeout == 180.0


================================================
FILE: tests/test_check_pdf.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

from ocrmypdf.helpers import check_pdf


def test_pdf_error(resources):
    """check_pdf accepts a real PDF and rejects a non-PDF file."""
    valid_pdf = resources / 'blank.pdf'
    not_a_pdf = __file__  # this Python source file is not a PDF
    assert check_pdf(valid_pdf)
    assert not check_pdf(not_a_pdf)


================================================
FILE: tests/test_completion.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import os
from subprocess import run

import pytest

from ocrmypdf.helpers import running_in_docker

# Module-wide marker: every test in this file is skipped inside Docker.
_inside_docker = running_in_docker()
pytestmark = pytest.mark.skipif(_inside_docker, reason="docker can't complete")


def test_fish():
    """Run ``fish -n`` on the fish completion script and expect empty stderr."""
    command = ['fish', '-n', 'misc/completion/ocrmypdf.fish']
    try:
        proc = run(command, check=True, encoding='utf-8', capture_output=True)
    except FileNotFoundError:
        # The fish executable itself could not be launched
        pytest.xfail('fish is not installed')
    else:
        assert proc.stderr == '', proc.stderr


@pytest.mark.skipif(
    os.name == 'nt', reason="Windows CI workers have bash but are best left alone"
)
def test_bash():
    """Run ``bash -n`` on the bash completion script and expect empty stderr."""
    command = ['bash', '-n', 'misc/completion/ocrmypdf.bash']
    try:
        proc = run(command, check=True, encoding='utf-8', capture_output=True)
    except FileNotFoundError:
        # The bash executable itself could not be launched
        pytest.xfail('bash is not installed')
    else:
        assert proc.stderr == '', proc.stderr


================================================
FILE: tests/test_concurrency.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import os
import platform

import pytest

from ocrmypdf import ExitCode

from .conftest import run_ocrmypdf_api


@pytest.mark.skipif(os.name == 'nt', reason="Windows doesn't have SIGKILL")
@pytest.mark.skipif(
    # platform.python_version_tuple() returns strings; compare major/minor as
    # integers, because as strings ('3', '9') >= ('3', '12') evaluates true.
    tuple(int(part) for part in platform.python_version_tuple()[:2]) >= (3, 12),
    reason="can deadlock due to fork",
)
def test_simulate_oom_killer(multipage, no_outpdf):
    """Test that a killed worker process surfaces as ``child_process_error``.

    Runs with ``--no-use-threads`` and the ``tesseract_simulate_oom_killer``
    test plugin, then checks the exit code reported by the API runner.
    """
    exitcode = run_ocrmypdf_api(
        multipage,
        no_outpdf,
        '--force-ocr',
        '--no-use-threads',
        '--plugin',
        'tests/plugins/tesseract_simulate_oom_killer.py',
    )
    assert exitcode == ExitCode.child_process_error


================================================
FILE: tests/test_fpdf_renderer.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Tests for fpdf2-based PDF renderer."""

from __future__ import annotations

from pathlib import Path

import pytest

from ocrmypdf.font import MultiFontManager
from ocrmypdf.fpdf_renderer import (
    DebugRenderOptions,
    Fpdf2MultiPageRenderer,
    Fpdf2PdfRenderer,
)
from ocrmypdf.hocrtransform.hocr_parser import HocrParser
from ocrmypdf.models.ocr_element import OcrClass


@pytest.fixture
def font_dir():
    """Return the ``src/ocrmypdf/data`` directory, resolved from this file."""
    repo_root = Path(__file__).parents[1]
    return repo_root.joinpath("src", "ocrmypdf", "data")


@pytest.fixture
def multi_font_manager(font_dir):
    """Return a MultiFontManager constructed from the ``font_dir`` fixture."""
    manager = MultiFontManager(font_dir)
    return manager


@pytest.fixture
def resources():
    """Return the ``resources`` directory that sits next to this test module."""
    tests_dir = Path(__file__).parent
    return tests_dir.joinpath("resources")


class TestFpdf2RendererImports:
    """Check that the public names of ``ocrmypdf.fpdf_renderer`` import cleanly."""

    def test_imports(self):
        """Import each public renderer name and confirm it is bound to something."""
        from ocrmypdf.fpdf_renderer import (
            DebugRenderOptions as debug_options_cls,
            Fpdf2MultiPageRenderer as multi_page_cls,
            Fpdf2PdfRenderer as single_page_cls,
        )

        for imported in (debug_options_cls, single_page_cls, multi_page_cls):
            assert imported is not None


class TestDebugRenderOptions:
    """Check the flags carried by DebugRenderOptions."""

    # The three boolean switches these tests inspect
    _FLAGS = ('render_baseline', 'render_line_bbox', 'render_word_bbox')

    def test_defaults(self):
        """With no arguments, every flag is False."""
        opts = DebugRenderOptions()
        for flag in self._FLAGS:
            assert getattr(opts, flag) is False

    def test_custom_values(self):
        """Flags passed as True are stored as True."""
        all_enabled = dict.fromkeys(self._FLAGS, True)
        opts = DebugRenderOptions(**all_enabled)
        for flag in self._FLAGS:
            assert getattr(opts, flag) is True


class TestFpdf2PdfRenderer:
    """Exercise Fpdf2PdfRenderer input validation and single-page rendering."""

    @staticmethod
    def _one_word_page(text, word_right):
        """Build a 612x792 page with one line containing one word of *text*."""
        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement

        def shared_box():
            # The word and its line use identical coordinates
            return BoundingBox(left=100, top=100, right=word_right, bottom=130)

        word = OcrElement(ocr_class=OcrClass.WORD, text=text, bbox=shared_box())
        line = OcrElement(
            ocr_class=OcrClass.LINE, bbox=shared_box(), children=[word]
        )
        return OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=612, bottom=792),
            children=[line],
        )

    @staticmethod
    def _render_and_check(page, multi_font_manager, output_path, invisible_text):
        """Render *page* at 72 dpi and assert a non-empty file was written."""
        renderer = Fpdf2PdfRenderer(
            page=page,
            dpi=72,  # 1:1 mapping to PDF points
            multi_font_manager=multi_font_manager,
            invisible_text=invisible_text,
        )
        renderer.render(output_path)

        assert output_path.exists()
        assert output_path.stat().st_size > 0

    def test_requires_page_element(self, multi_font_manager):
        """A root element that is not an ocr_page is rejected with ValueError."""
        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement

        not_a_page = OcrElement(
            ocr_class=OcrClass.WORD,
            text="test",
            bbox=BoundingBox(left=0, top=0, right=100, bottom=20),
        )
        renderer_kwargs = {
            'page': not_a_page,
            'dpi': 300,
            'multi_font_manager': multi_font_manager,
        }

        with pytest.raises(ValueError, match="Root element must be ocr_page"):
            Fpdf2PdfRenderer(**renderer_kwargs)

    def test_requires_bbox(self, multi_font_manager):
        """A page element without a bounding box is rejected with ValueError."""
        from ocrmypdf.models.ocr_element import OcrElement

        boxless_page = OcrElement(ocr_class=OcrClass.PAGE)
        renderer_kwargs = {
            'page': boxless_page,
            'dpi': 300,
            'multi_font_manager': multi_font_manager,
        }

        with pytest.raises(ValueError, match="Page must have bounding box"):
            Fpdf2PdfRenderer(**renderer_kwargs)

    def test_render_simple_page(self, multi_font_manager, tmp_path):
        """A one-word page rendered with visible text yields a non-empty PDF."""
        page = self._one_word_page("Hello", word_right=200)
        self._render_and_check(
            page,
            multi_font_manager,
            tmp_path / "test_simple.pdf",
            invisible_text=False,
        )

    def test_render_invisible_text(self, multi_font_manager, tmp_path):
        """A one-word page rendered as an invisible OCR layer yields a non-empty PDF."""
        page = self._one_word_page("Invisible", word_right=250)
        self._render_and_check(
            page,
            multi_font_manager,
            tmp_path / "test_invisible.pdf",
            invisible_text=True,  # This is the default
        )


class TestFpdf2MultiPageRenderer:
    """Test Fpdf2MultiPageRenderer."""

    def test_requires_pages(self, multi_font_manager, tmp_path):
        """Test that renderer requires at least one page."""
        # Construction and rendering both sit inside the raises block, so the
        # test passes whichever of the two rejects the empty page list.
        with pytest.raises(ValueError, match="No pages to render"):
            renderer = Fpdf2MultiPageRenderer(
                pages_data=[],
                multi_font_manager=multi_font_manager,
            )
            # Target a per-test temporary directory rather than a hard-coded
            # /tmp path: portable across platforms and never left behind.
            renderer.render(tmp_path / "test.pdf")

    def test_render_multiple_pages(self, multi_font_manager, tmp_path):
        """Test rendering multiple pages."""
        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement

        # Three pages, each holding a single word "Page1".."Page3"
        pages_data = []
        for i in range(3):
            word = OcrElement(
                ocr_class=OcrClass.WORD,
                text=f"Page{i+1}",
                bbox=BoundingBox(left=100, top=100, right=200, bottom=130),
            )
            line = OcrElement(
                ocr_class=OcrClass.LINE,
                bbox=BoundingBox(left=100, top=100, right=200, bottom=130),
                children=[word],
            )
            page = OcrElement(
                ocr_class=OcrClass.PAGE,
                bbox=BoundingBox(left=0, top=0, right=612, bottom=792),
                children=[line],
            )
            # Tuple layout: (page_number, page_element, dpi)
            pages_data.append((i + 1, page, 72))

        renderer = Fpdf2MultiPageRenderer(
            pages_data=pages_data,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )

        output_path = tmp_path / "test_multipage.pdf"
        renderer.render(output_path)

        assert output_path.exists()
        assert output_path.stat().st_size > 0


class TestFpdf2RendererWithHocr:
    """Render hOCR fixture files through the fpdf2 renderers."""

    @staticmethod
    def _load_page(resources, filename):
        """Parse ``resources/filename``; skip the test when the file is absent."""
        hocr_path = resources / filename
        if not hocr_path.exists():
            pytest.skip(f"{filename} not found")
        return HocrParser(hocr_path).parse()

    @staticmethod
    def _assert_written(output_path):
        """Assert that *output_path* exists and is not empty."""
        assert output_path.exists()
        assert output_path.stat().st_size > 0

    @classmethod
    def _render_single(cls, page, multi_font_manager, output_path):
        """Render *page* at 300 dpi with visible text and check the output file."""
        renderer = Fpdf2PdfRenderer(
            page=page,
            dpi=300,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )
        renderer.render(output_path)
        cls._assert_written(output_path)

    def test_render_latin_hocr(self, resources, multi_font_manager, tmp_path):
        """Render Latin text from hOCR."""
        page = self._load_page(resources, "latin.hocr")

        # The parser must have produced a page element with a bounding box
        assert page.ocr_class == OcrClass.PAGE
        assert page.bbox is not None

        self._render_single(page, multi_font_manager, tmp_path / "latin_fpdf2.pdf")

    def test_render_cjk_hocr(self, resources, multi_font_manager, tmp_path):
        """Render CJK text from hOCR."""
        page = self._load_page(resources, "cjk.hocr")
        self._render_single(page, multi_font_manager, tmp_path / "cjk_fpdf2.pdf")

    def test_render_arabic_hocr(self, resources, multi_font_manager, tmp_path):
        """Render Arabic text from hOCR."""
        page = self._load_page(resources, "arabic.hocr")
        self._render_single(page, multi_font_manager, tmp_path / "arabic_fpdf2.pdf")

    def test_render_hello_world_scripts_hocr(
        self, resources, multi_font_manager, tmp_path
    ):
        """Render the comprehensive multilingual 'Hello!' hOCR file.

        The fixture covers the major scripts:
        - Latin (English, Spanish, French, German, Italian, Polish, Portuguese, Turkish)
        - Cyrillic (Russian)
        - Greek
        - CJK (Chinese Simplified, Chinese Traditional, Japanese, Korean)
        - Devanagari (Hindi)
        - Arabic (RTL)
        - Hebrew (RTL)

        It also contains rotated baselines to exercise skew handling.
        """
        page = self._load_page(resources, "hello_world_scripts.hocr")

        # Sanity-check the parse result: a page of 2550x3300 (at 300 DPI)
        assert page.ocr_class == OcrClass.PAGE
        assert page.bbox is not None
        assert page.bbox.right == 2550
        assert page.bbox.bottom == 3300

        # Visible text makes the output usable for visual inspection
        self._render_single(
            page, multi_font_manager, tmp_path / "hello_world_scripts_fpdf2.pdf"
        )

    def test_render_hello_world_scripts_multipage(
        self, resources, multi_font_manager, tmp_path
    ):
        """Render hello_world_scripts.hocr through Fpdf2MultiPageRenderer.

        Feeds the multilingual fixture to the multi-page renderer as a
        one-page document, exercising font handling across writing systems.
        """
        page = self._load_page(resources, "hello_world_scripts.hocr")

        # Each entry is (page_number, page_element, dpi)
        single_entry = (1, page, 300)
        renderer = Fpdf2MultiPageRenderer(
            pages_data=[single_entry],
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )

        output_path = tmp_path / "hello_world_scripts_multipage.pdf"
        renderer.render(output_path)
        self._assert_written(output_path)


class TestWordSegmentation:
    """Check that text extracted by pdfminer.six from rendered PDFs is segmented."""

    @staticmethod
    def _single_line_page(words):
        """Build a 612x792 page with one line of words.

        *words* is a sequence of ``(text, left, right)`` tuples; every word
        spans top=100..bottom=130, and the line box runs from the first
        word's left edge to the last word's right edge.
        """
        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement

        word_elements = [
            OcrElement(
                ocr_class=OcrClass.WORD,
                text=text,
                bbox=BoundingBox(left=left, top=100, right=right, bottom=130),
            )
            for text, left, right in words
        ]
        line_left = words[0][1]
        line_right = words[-1][2]
        line = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(left=line_left, top=100, right=line_right, bottom=130),
            children=word_elements,
        )
        return OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=612, bottom=792),
            children=[line],
        )

    @staticmethod
    def _render_and_extract(page, dpi, multi_font_manager, output_path):
        """Render *page* with visible text and return pdfminer.six's extraction."""
        from pdfminer.high_level import extract_text

        renderer = Fpdf2PdfRenderer(
            page=page,
            dpi=dpi,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )
        renderer.render(output_path)
        return extract_text(str(output_path))

    def test_word_segmentation_with_pdfminer(self, multi_font_manager, tmp_path):
        """pdfminer.six extracts separate words from a rendered line.

        Verifies that the words of a three-word line come back individually,
        i.e. that the renderer emits word separators that pdfminer.six (and
        similar PDF readers) can use during text extraction.
        """
        expected_words = ("Hello", "World", "Test")
        page = self._single_line_page(
            [("Hello", 100, 200), ("World", 220, 320), ("Test", 340, 420)]
        )

        # dpi=72 gives a 1:1 mapping to PDF points
        extracted_text = self._render_and_extract(
            page, 72, multi_font_manager, tmp_path / "test_word_segmentation.pdf"
        )

        # Every word is present somewhere in the extraction
        for word in expected_words:
            assert word in extracted_text

        # Neighbouring words must not be fused, e.g. "HelloWorldTest"
        for fused in ("HelloWorld", "WorldTest"):
            assert fused not in extracted_text

        # Splitting on whitespace yields each word as its own token
        words_found = extracted_text.split()
        for word in expected_words:
            assert word in words_found

    def test_cjk_no_spurious_spaces(self, multi_font_manager, tmp_path):
        """CJK words survive extraction without requiring separators.

        CJK scripts do not put spaces between characters/words, so adjacent
        CJK words should not have spaces inserted between them.
        """
        # 你好 = "Hello" in Chinese, 世界 = "World" in Chinese
        page = self._single_line_page([("你好", 100, 160), ("世界", 170, 230)])

        extracted_text = self._render_and_extract(
            page, 72, multi_font_manager, tmp_path / "test_cjk_segmentation.pdf"
        )

        # Both words must be present
        assert "你好" in extracted_text
        assert "世界" in extracted_text

        # pdfminer may add whitespace of its own, so compare with spaces and
        # newlines stripped out
        extracted_chars = extracted_text.replace(" ", "").replace("\n", "")
        joined_present = "你好世界" in extracted_chars
        both_present = "你好" in extracted_chars and "世界" in extracted_chars
        assert joined_present or both_present

    def test_latin_hocr_word_segmentation(
        self, resources, multi_font_manager, tmp_path
    ):
        """Extraction from the rendered Latin hOCR fixture contains separated words."""
        hocr_path = resources / "latin.hocr"
        if not hocr_path.exists():
            pytest.skip("latin.hocr not found")

        page = HocrParser(hocr_path).parse()

        extracted_text = self._render_and_extract(
            page, 300, multi_font_manager, tmp_path / "latin_segmentation.pdf"
        )

        # At least one whitespace-delimited token must come back
        words = extracted_text.split()
        assert len(words) > 0

        # With newlines folded into spaces there must be at least one space,
        # i.e. the text is not a single unbroken run
        text_no_newlines = extracted_text.replace("\n", " ")
        assert " " in text_no_newlines


================================================
FILE: tests/test_ghostscript.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import logging
import secrets
import subprocess
import sys
from decimal import Decimal
from unittest.mock import patch

import pikepdf
import pytest
from packaging.version import Version
from PIL import Image, UnidentifiedImageError

from ocrmypdf._exec import ghostscript
from ocrmypdf._exec.ghostscript import DuplicateFilter, rasterize_pdf
from ocrmypdf.builtin_plugins.ghostscript import _repair_gs106_jpeg_corruption
from ocrmypdf.exceptions import ColorConversionNeededError, ExitCode, InputFileError
from ocrmypdf.helpers import Resolution
from ocrmypdf.pluginspec import GhostscriptRasterDevice

from .conftest import check_ocrmypdf, run_ocrmypdf_api

# pylint: disable=redefined-outer-name


@pytest.fixture
def francais(resources):
    """Yield ``(path, pdf)`` for ``francais.pdf`` and close the PDF afterwards.

    Yields:
        A tuple of the file path and the opened ``pikepdf.Pdf``. The PDF is
        closed when the requesting test finishes, so the handle is not left
        open for the remainder of the session.
    """
    path = resources / 'francais.pdf'
    with pikepdf.open(path) as pdf:
        yield path, pdf


def test_rasterize_size(francais, outdir):
    """Rasterizing yields the requested pixel size and the forced DPI metadata."""
    path, pdf = francais
    first_page = pdf.pages[0]
    page_size_pts = (first_page.mediabox[2], first_page.mediabox[3])
    # The media box must start at the origin for the size arithmetic to hold
    assert first_page.mediabox[0] == first_page.mediabox[1] == 0

    points_per_inch = Decimal(72)
    width_in = page_size_pts[0] / points_per_inch
    height_in = page_size_pts[1] / points_per_inch

    target_size = Decimal('50.0'), Decimal('30.0')
    forced_dpi = Resolution(42.0, 4242.0)
    # DPI that maps the page dimensions onto the target pixel dimensions
    render_dpi = Resolution(target_size[0] / width_in, target_size[1] / height_in)

    output_png = outdir / 'out.png'
    rasterize_pdf(
        path,
        output_png,
        raster_device=GhostscriptRasterDevice.PNGMONO,
        raster_dpi=render_dpi,
        page_dpi=forced_dpi,
    )

    with Image.open(output_png) as im:
        assert im.size == target_size
        assert im.info['dpi'] == forced_dpi


def test_rasterize_rotated(francais, outdir, caplog):
    """With rotation=90 the output has swapped dimensions and axis-flipped DPI."""
    path, pdf = francais
    first_page = pdf.pages[0]
    page_size_pts = (first_page.mediabox[2], first_page.mediabox[3])
    # The media box must start at the origin for the size arithmetic to hold
    assert first_page.mediabox[0] == first_page.mediabox[1] == 0

    points_per_inch = Decimal(72)
    width_in = page_size_pts[0] / points_per_inch
    height_in = page_size_pts[1] / points_per_inch

    target_size = Decimal('50.0'), Decimal('30.0')
    forced_dpi = Resolution(42.0, 4242.0)

    caplog.set_level(logging.DEBUG)
    # DPI that maps the page dimensions onto the target pixel dimensions
    render_dpi = Resolution(target_size[0] / width_in, target_size[1] / height_in)
    output_png = outdir / 'out.png'
    rasterize_pdf(
        path,
        output_png,
        raster_device=GhostscriptRasterDevice.PNGMONO,
        raster_dpi=render_dpi,
        page_dpi=forced_dpi,
        rotation=90,
    )

    with Image.open(output_png) as im:
        # Width and height trade places
        swapped_size = (target_size[1], target_size[0])
        assert im.size == swapped_size
        assert im.info['dpi'] == forced_dpi.flip_axis()


def test_rasterize_low_dpi(francais, outdir):
    """Check that DPI values below 10 on both axes still give the right size.

    Ghostscript may fail with DPI values below 10. The workaround renders at
    a minimum of 10 DPI and resizes the output to match the expected dimensions.
    """
    path, pdf = francais
    mediabox = pdf.pages[0].mediabox
    assert mediabox[0] == mediabox[1] == 0
    page_width_in = float(mediabox[2]) / 72
    page_height_in = float(mediabox[3]) / 72

    # A tiny target forces the requested DPI below 10 on both axes.
    target_size = (5, 3)
    target_width, target_height = target_size
    forced_dpi = Resolution(72.0, 72.0)
    output_png = outdir / 'out_low_dpi.png'

    rasterize_pdf(
        path,
        output_png,
        raster_device=GhostscriptRasterDevice.PNGMONO,
        raster_dpi=Resolution(
            target_width / page_width_in, target_height / page_height_in
        ),
        page_dpi=forced_dpi,
    )

    with Image.open(output_png) as im:
        assert im.size == target_size
        assert im.info['dpi'] == forced_dpi


def test_rasterize_low_dpi_one_axis(francais, outdir):
    """Check output size when only the X axis is requested at a low DPI."""
    path, pdf = francais
    mediabox = pdf.pages[0].mediabox
    assert mediabox[0] == mediabox[1] == 0
    page_width_in = float(mediabox[2]) / 72
    page_height_in = float(mediabox[3]) / 72

    # X axis DPI falls below 10 while the Y axis stays at a normal value.
    target_size = (5, 50)
    target_width, target_height = target_size
    forced_dpi = Resolution(72.0, 72.0)
    output_png = outdir / 'out_low_dpi_x.png'

    rasterize_pdf(
        path,
        output_png,
        raster_device=GhostscriptRasterDevice.PNGMONO,
        raster_dpi=Resolution(
            target_width / page_width_in, target_height / page_height_in
        ),
        page_dpi=forced_dpi,
    )

    with Image.open(output_png) as im:
        assert im.size == target_size
        assert im.info['dpi'] == forced_dpi


def test_gs_render_failure(resources, outpdf, caplog):
    """Run with the gs_render_failure plugin and expect a child process error."""
    cli_args = [
        '--output-type',
        'pdfa',  # Required to trigger Ghostscript PDF/A generation
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_render_failure.py',
    ]
    exitcode = run_ocrmypdf_api(resources / 'blank.pdf', outpdf, *cli_args)

    assert 'TEST ERROR: gs_render_failure.py' in caplog.text
    assert exitcode == ExitCode.child_process_error


def test_gs_raster_failure(resources, outpdf, caplog):
    """Run with the gs_raster_failure plugin and expect a child process error."""
    cli_args = [
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_raster_failure.py',
    ]
    exitcode = run_ocrmypdf_api(resources / 'francais.pdf', outpdf, *cli_args)

    assert 'TEST ERROR: gs_raster_failure.py' in caplog.text
    assert exitcode == ExitCode.child_process_error


def test_ghostscript_pdfa_failure(resources, outpdf, caplog):
    """Run with the gs_pdfa_failure plugin and expect the PDF/A failure exit code."""
    cli_args = [
        '--output-type',
        'pdfa',  # Required to trigger Ghostscript PDF/A generation
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_pdfa_failure.py',
    ]
    exitcode = run_ocrmypdf_api(resources / 'francais.pdf', outpdf, *cli_args)

    assert (
        exitcode == ExitCode.pdfa_conversion_failed
    ), "Unexpected return when PDF/A fails"


def test_ghostscript_feature_elision(resources, outpdf):
    """Run the pipeline with the gs_feature_elision plugin loaded."""
    cli_args = [
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_feature_elision.py',
    ]
    check_ocrmypdf(resources / 'francais.pdf', outpdf, *cli_args)


def test_ghostscript_mandatory_color_conversion(resources, outpdf):
    """PDF/A output of jbig2_baddevicen.pdf must raise ColorConversionNeededError."""
    cli_args = [
        '--output-type',
        'pdfa',  # Required to trigger Ghostscript PDF/A generation
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    with pytest.raises(ColorConversionNeededError):
        check_ocrmypdf(resources / 'jbig2_baddevicen.pdf', outpdf, *cli_args)


def test_rasterize_pdf_errors(resources, no_outpdf, caplog):
    """A zero-exit Ghostscript run without a usable image must raise and be logged."""
    # ghostscript can produce empty files with return code 0
    fake_completed = subprocess.CompletedProcess(
        ['fakegs'], returncode=0, stdout=b'', stderr=b'error this is an error'
    )
    with patch('ocrmypdf._exec.ghostscript.run', return_value=fake_completed):
        with pytest.raises(UnidentifiedImageError):
            rasterize_pdf(
                resources / 'francais.pdf',
                no_outpdf,
                raster_device=GhostscriptRasterDevice.PNGMONO,
                raster_dpi=Resolution(100, 100),
            )
        # Both the captured stderr and the invalid-image diagnosis are logged.
        assert "this is an error" in caplog.text
        assert "invalid page image file" in caplog.text


class TestDuplicateFilter:
    """Tests for the ``DuplicateFilter`` logging filter."""

    @pytest.fixture(scope='function')
    def duplicate_filter_logger(self):
        """Return a uniquely named DEBUG logger with a DuplicateFilter attached."""
        # token_urlsafe: a random suffix gives each test its own logger so
        # tests stay isolated from one another.
        unique_name = __name__ + secrets.token_urlsafe(8)
        filtered_logger = logging.getLogger(unique_name)
        filtered_logger.setLevel(logging.DEBUG)
        filtered_logger.addFilter(DuplicateFilter(filtered_logger))
        return filtered_logger

    @pytest.mark.xfail(
        (3, 13, 3) <= sys.version_info[:3] <= (3, 13, 5),
        reason="https://github.com/python/cpython/pull/135858",
    )
    def test_filter_duplicate_messages(self, duplicate_filter_logger, caplog):
        """Consecutive repeats collapse into a single suppression notice."""
        logger = duplicate_filter_logger
        logger.error("test error message")
        logger.error("test error message")
        logger.error("test error message")
        logger.error("another error message")
        logger.error("another error message")
        logger.error("yet another error message")

        emitted = [record.msg for record in caplog.records]
        assert emitted == [
            "test error message",
            "(suppressed 2 repeated lines)",
            "another error message",
            "(suppressed 1 repeated lines)",
            "yet another error message",
        ]

    def test_filter_does_not_affect_unique_messages(
        self, duplicate_filter_logger, caplog
    ):
        """Distinct messages pass through the filter unchanged."""
        logger = duplicate_filter_logger
        logger.error("test error message")
        logger.error("another error message")
        logger.error("yet another error message")

        emitted = [record.msg for record in caplog.records]
        assert emitted == [
            "test error message",
            "another error message",
            "yet another error message",
        ]

    @pytest.mark.xfail(
        (3, 13, 3) <= sys.version_info[:3] <= (3, 13, 5),
        reason="https://github.com/python/cpython/pull/135858",
    )
    def test_filter_alt_messages(self, duplicate_filter_logger, caplog):
        """Alternating repeats are counted together in one suppression notice."""
        logger = duplicate_filter_logger
        logger.error("test error message")
        logger.error("another error message")
        logger.error("test error message")
        logger.error("another error message")
        logger.error("test error message")
        logger.error("test error message")
        logger.error("another error message")
        logger.error("yet another error message")

        emitted = [record.msg for record in caplog.records]
        assert emitted == [
            "test error message",
            "another error message",
            "(suppressed 5 repeated lines)",
            "yet another error message",
        ]


@pytest.fixture
def pdf_with_invalid_image(outdir):
    """Create a one-page PDF whose image sets both ColorSpace and ImageMask.

    Reproduces the input from issue 1451. The file is written only into
    ``outdir`` so the test run does not leave artifacts in the current
    working directory.

    Returns:
        Path to the generated ``invalid_image.pdf``.
    """
    Name = pikepdf.Name
    target = outdir / 'invalid_image.pdf'
    with pikepdf.new() as pdf:
        pdf.add_blank_page()
        pdf.pages[0].Contents = pdf.make_stream(b'612 0 0 612 0 0 cm /Image Do')
        # Create an invalid image object that has both ColorSpace and ImageMask set
        pdf.pages[0].Resources = pikepdf.Dictionary(
            XObject=pdf.make_indirect(
                pikepdf.Dictionary(
                    Image=pdf.make_stream(
                        b"\xf0\x0f" * 8,
                        ColorSpace=Name.DeviceGray,
                        BitsPerComponent=1,
                        Width=8,
                        Height=8,
                        ImageMask=True,
                        Subtype=Name.Image,
                        Type=Name.XObject,
                    )
                )
            )
        )
        pdf.save(target)
    return target


@pytest.mark.xfail(
    ghostscript.version() < Version('10.04.0'),
    reason="Older Ghostscript behavior is different",
)
def test_recoverable_image_error(pdf_with_invalid_image, outdir, caplog):
    """With ``stop_on_error=False`` the invalid image is reported in the log."""
    # Ghostscript is expected to print an error for the bad image but keep
    # going, so rasterization itself must not raise.
    invalid_pdf = pdf_with_invalid_image
    rendered_png = outdir / 'out.png'
    rasterize_pdf(
        invalid_pdf,
        rendered_png,
        raster_device=GhostscriptRasterDevice.PNGMONO,
        raster_dpi=Resolution(10, 10),
        stop_on_error=False,
    )
    assert 'Image has both ImageMask and ColorSpace' in caplog.text


@pytest.mark.xfail(
    ghostscript.version() < Version('10.04.0'),
    reason="Older Ghostscript behavior is different",
)
def test_recoverable_image_error_with_stop(pdf_with_invalid_image, outdir, caplog):
    """With ``stop_on_error=True`` the invalid image raises InputFileError."""
    # Ghostscript prints an error and exits but would still produce a viable
    # image of the whole page minus the offending image. That case is
    # intercepted and reported as InputFileError.
    invalid_pdf = pdf_with_invalid_image
    rendered_png = outdir / 'out.png'
    expected_hint = "Try using --continue-on-soft-render-error"
    with pytest.raises(InputFileError, match=expected_hint):
        rasterize_pdf(
            invalid_pdf,
            rendered_png,
            raster_device=GhostscriptRasterDevice.PNGMONO,
            raster_dpi=Resolution(100, 100),
            stop_on_error=True,
        )
    # out.png is not inspected: it will not be created, and if it were it
    # would be blank.


class TestGs106JpegCorruptionRepair:
    """Test the Ghostscript 10.6 JPEG corruption repair function."""

    @staticmethod
    def _iter_dct_images(pdf):
        """Yield every DCTDecode image XObject in the page resources of *pdf*.

        Pages without a ``/Resources`` or ``/XObject`` entry are skipped, as are
        XObjects that are not images or do not use the DCTDecode filter.
        """
        Name = pikepdf.Name
        for page in pdf.pages:
            if Name.Resources not in page:
                continue
            resources_dict = page[Name.Resources]
            if Name.XObject not in resources_dict:
                continue
            xobjects = resources_dict[Name.XObject]
            for key in xobjects.keys():
                obj = xobjects[key]
                if obj.get(Name.Subtype) != Name.Image:
                    continue
                if obj.get(Name.Filter) != Name.DCTDecode:
                    continue
                yield obj

    @pytest.fixture
    def create_damaged_pdf(self, resources, outdir):
        """Return a factory that writes a copy of a PDF with truncated JPEG data.

        The factory returns ``(source_path, damaged_path, damaged_count)`` where
        ``damaged_count`` is the number of DCTDecode images that were truncated.
        """

        def _create_damaged(source_pdf_name='francais.pdf', truncate_bytes=2):
            source_path = resources / source_pdf_name
            damaged_path = outdir / 'damaged.pdf'

            with pikepdf.open(source_path) as pdf:
                damaged_count = 0
                for obj in self._iter_dct_images(pdf):
                    # Drop the trailing bytes of the raw JPEG data while
                    # keeping the stream declared as DCTDecode.
                    original_bytes = obj.read_raw_bytes()
                    truncated_bytes = original_bytes[:-truncate_bytes]
                    obj.write(truncated_bytes, filter=pikepdf.Name.DCTDecode)
                    damaged_count += 1

                pdf.save(damaged_path)
                return source_path, damaged_path, damaged_count

        return _create_damaged

    def test_repair_truncated_jpeg(self, create_damaged_pdf, caplog):
        """Test that truncated JPEG images are repaired."""
        caplog.set_level(logging.DEBUG)
        source_path, damaged_path, damaged_count = create_damaged_pdf()

        assert damaged_count > 0, "Test PDF should have DCTDecode images"

        # Get original image bytes for comparison
        with pikepdf.open(source_path) as pdf:
            original_bytes_list = [
                obj.read_raw_bytes() for obj in self._iter_dct_images(pdf)
            ]

        # Run the repair function
        repaired = _repair_gs106_jpeg_corruption(source_path, damaged_path)
        assert repaired is True, "Repair should have been performed"

        # Verify the repaired PDF has correct image bytes
        with pikepdf.open(damaged_path) as pdf:
            repaired_bytes_list = [
                obj.read_raw_bytes() for obj in self._iter_dct_images(pdf)
            ]

        assert len(repaired_bytes_list) == len(original_bytes_list)
        # strict=True: any length mismatch is an error, never silent truncation.
        for orig, repaired_bytes in zip(
            original_bytes_list, repaired_bytes_list, strict=True
        ):
            assert orig == repaired_bytes, "Repaired bytes should match original"

        # Check that error/warning was logged
        assert "JPEG corruption detected" in caplog.text

    def test_no_repair_when_not_truncated(self, resources, outdir, caplog):
        """Test that no repair is done when images are not truncated."""
        caplog.set_level(logging.DEBUG)
        source_path = resources / 'francais.pdf'

        # Copy source to output (no damage)
        output_path = outdir / 'undamaged.pdf'
        with pikepdf.open(source_path) as pdf:
            pdf.save(output_path)

        # Run the repair function - should not repair anything
        repaired = _repair_gs106_jpeg_corruption(source_path, output_path)
        assert repaired is False, "No repair should have been performed"
        assert "JPEG corruption detected" not in caplog.text

    def test_no_repair_when_truncation_too_large(self, create_damaged_pdf, caplog):
        """Test that images truncated by more than 15 bytes are not repaired."""
        caplog.set_level(logging.DEBUG)
        source_path, damaged_path, damaged_count = create_damaged_pdf(
            truncate_bytes=20
        )

        # Guard against a vacuous pass: something must actually be truncated.
        assert damaged_count > 0, "Test PDF should have DCTDecode images"

        repaired = _repair_gs106_jpeg_corruption(source_path, damaged_path)
        assert repaired is False, "Should not repair truncation > 15 bytes"
        assert "JPEG corruption detected" not in caplog.text


================================================
FILE: tests/test_graft.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

from unittest.mock import patch

import pikepdf

import ocrmypdf


def test_no_glyphless_graft(resources, outdir):
    """OCR a merged multi-page PDF with a reduced ``MAX_REPLACE_PAGES`` limit.

    Three resource PDFs are concatenated and processed with
    ``ocrmypdf._graft.MAX_REPLACE_PAGES`` patched to 2. The output must contain
    the same number of pages as the merged input.
    """
    merged_input = outdir / 'test.pdf'
    ocr_output = outdir / 'out.pdf'

    with (
        pikepdf.open(resources / 'francais.pdf') as pdf,
        pikepdf.open(resources / 'aspect.pdf') as pdf_aspect,
        pikepdf.open(resources / 'cmyk.pdf') as pdf_cmyk,
    ):
        pdf.pages.extend(pdf_aspect.pages)
        pdf.pages.extend(pdf_cmyk.pages)
        expected_page_count = len(pdf.pages)
        pdf.save(merged_input)

    with patch('ocrmypdf._graft.MAX_REPLACE_PAGES', 2):
        ocrmypdf.ocr(
            merged_input,
            ocr_output,
            deskew=True,
            tesseract_timeout=0,
            force_ocr=True,
        )

    # Minimal sanity check: no page may be lost or duplicated in the output.
    with pikepdf.open(ocr_output) as result:
        assert len(result.pages) == expected_page_count


def test_links(resources, outpdf):
    """After redo-OCR, each page's first link must still point at the other page."""
    ocrmypdf.ocr(
        resources / 'link.pdf', outpdf, redo_ocr=True, oversample=200, output_type='pdf'
    )

    def first_link_target(page):
        # Object/generation pair of the first annotation's destination page.
        return page.Annots[0].A.D[0].objgen

    with pikepdf.open(outpdf) as pdf:
        first_page, second_page = pdf.pages[0], pdf.pages[1]
        assert first_link_target(first_page) == second_page.objgen
        assert first_link_target(second_page) == first_page.objgen


def test_redo_ocr_with_offset_mediabox(resources, outdir):
    """Check that --redo-ocr copes with a MediaBox whose origin is not zero.

    Regression test for issue #1630: PDFs with mediabox origins such as
    [0, 100, width, height+100] (common in cropped/JSTOR-style PDFs) had
    their OCR text shifted vertically because the text layer CTM did not
    account for the page origin offset.
    """
    y_offset = 100
    shifted_input = outdir / 'offset_mediabox_input.pdf'
    redone_output = outdir / 'offset_redo_ocr.pdf'

    # Build the input: move the MediaBox up by y_offset to simulate
    # cropped/JSTOR-style PDFs.
    with pikepdf.open(resources / 'graph_ocred.pdf') as source:
        first_page = source.pages[0]
        llx, lly, urx, ury = list(first_page.MediaBox)
        first_page.MediaBox = [llx, lly + y_offset, urx, ury + y_offset]
        source.save(shifted_input)

    # Run --redo-ocr (this is where the bug occurred)
    ocrmypdf.ocr(shifted_input, redone_output, redo_ocr=True)

    with pikepdf.open(redone_output) as result:
        first_page = result.pages[0]
        origin_y = list(first_page.MediaBox)[1]

        # MediaBox origin should be preserved
        assert (
            float(origin_y) == y_offset
        ), f"MediaBox Y origin should be preserved at {y_offset}, got {origin_y}"

        # The content stream should include a CTM with the Y origin translation.
        # Without the fix, the CTM was omitted for rotation==0, causing a shift.
        stream_bytes = first_page.Contents.read_bytes()
        assert b'cm' in stream_bytes, (
            "Content stream should include a CTM to translate by the page origin"
        )


def test_strip_invisble_text():
    """Check that strip_invisible_text removes invisible text and keeps visible text.

    The page content mixes one visible text object with three invisible ones:
    one drawn under a ``3 Tr`` set inside a ``q``/``Q`` pair, one that sets
    ``3 Tr`` inside its own ``BT``/``ET``, and one that follows it.
    """
    pdf = pikepdf.Pdf.new()
    page = pdf.add_blank_page()

    Instruction = pikepdf.ContentStreamInstruction
    Operator = pikepdf.Operator

    def text_object(string, *, set_render_mode=False):
        """Build a BT..ET text object showing *string*.

        When *set_render_mode* is true, a ``3 Tr`` instruction is placed right
        after ``BT``.
        """
        instructions = [Instruction((), Operator('BT'))]
        if set_render_mode:
            instructions.append(Instruction([3], Operator('Tr')))
        instructions.extend(
            [
                Instruction((pikepdf.Name('/F0'), 12), Operator('Tf')),
                Instruction((288, 720), Operator('Td')),
                Instruction((pikepdf.String(string),), Operator('Tj')),
                Instruction((), Operator('ET')),
            ]
        )
        return instructions

    visible_text = text_object('visible')
    invisible_text = text_object('invisible')
    invisible_text_setting_tr = text_object('invisible', set_render_mode=True)

    stream = [
        Instruction([], Operator('q')),
        Instruction([3], Operator('Tr')),
        *invisible_text,
        Instruction([], Operator('Q')),
        *visible_text,
        *invisible_text_setting_tr,
        *invisible_text,
    ]
    content_stream = pikepdf.unparse_content_stream(stream)
    page.Contents = pikepdf.Stream(pdf, content_stream)

    def count(string, page):
        """Count the ``Tj`` instructions on *page* whose operand equals *string*."""
        return sum(
            1
            for operands, operator in pikepdf.parse_content_stream(page)
            if operator == Operator('Tj') and operands[0] == pikepdf.String(string)
        )

    nr_visible_pre = count('visible', page)
    ocrmypdf._graft.strip_invisible_text(pdf, page)
    nr_visible_post = count('visible', page)
    assert (
        nr_visible_pre == nr_visible_post
    ), 'Number of visible text elements did not change'
    assert count('invisible', page) == 0, 'No invisible elems left'


================================================
FILE: tests/test_helpers.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import logging
import multiprocessing
import os
from pathlib import Path
from unittest.mock import MagicMock

import pytest
from packaging.version import Version

from ocrmypdf import helpers
from ocrmypdf.helpers import running_in_docker

# Skip marker for tests that create POSIX symlinks; skips when os.name is 'nt'.
needs_symlink = pytest.mark.skipif(os.name == 'nt', reason='needs posix symlink')
# Skip marker for Windows-only tests; skips whenever os.name is not 'nt'.
windows_only = pytest.mark.skipif(os.name != 'nt', reason="Windows test")


class TestSafeSymlink:
    """Tests for ``helpers.safe_symlink``."""

    def test_safe_symlink_link_self(self, tmp_path, caplog):
        """Linking a path to itself produces a WARNING as the first log record."""
        helpers.safe_symlink(tmp_path / 'self', tmp_path / 'self')
        _logger_name, level, _message = caplog.record_tuples[0]
        assert level == logging.WARNING

    def test_safe_symlink_overwrite(self, tmp_path):
        """An existing regular file at the link location must not be replaced."""
        existing = tmp_path / 'regular_file'
        existing.touch()
        with pytest.raises(FileExistsError):
            helpers.safe_symlink(tmp_path / 'input', existing)

    @needs_symlink
    def test_safe_symlink_relink(self, tmp_path):
        """An existing symlink can be re-pointed at a different file."""
        old_target = tmp_path / 'regular_file_a'
        new_target = tmp_path / 'regular_file_b'
        link = tmp_path / 'link'

        old_target.touch()
        new_target.write_bytes(b'ABC')
        link.symlink_to(old_target)

        helpers.safe_symlink(new_target, link)
        # Accept either a link to the new target or matching content.
        assert link.samefile(new_target) or link.read_bytes() == b'ABC'


def test_no_cpu_count(monkeypatch):
    """available_cpu_count warns and returns 1 when cpu_count is unavailable."""
    calls = []

    def cpu_count_raises():
        # Record the call so the test can prove the patch was exercised.
        calls.append(True)
        raise NotImplementedError()

    monkeypatch.setattr(multiprocessing, 'cpu_count', cpu_count_raises)
    with pytest.warns(expected_warning=UserWarning):
        assert helpers.available_cpu_count() == 1
    assert calls, "Patched function called during test"


# Skip marker for tests that fail on Docker; running_in_docker() is evaluated
# once, when this module is imported.
skipif_docker = pytest.mark.skipif(running_in_docker(), reason="fails on Docker")


class TestFileIsWritable:
    """Tests for ``helpers.is_file_writable``."""

    @pytest.fixture
    def non_existent(self, tmp_path):
        """Path inside ``tmp_path`` at which no file has been created."""
        return tmp_path / 'nofile'

    @pytest.fixture
    def basic_file(self, tmp_path):
        """Empty regular file created inside ``tmp_path``."""
        created = tmp_path / 'basic'
        created.touch()
        return created

    def test_plain(self, non_existent):
        """A path that does not exist yet counts as writable."""
        assert helpers.is_file_writable(non_existent)

    @needs_symlink
    def test_symlink_loop(self, tmp_path):
        """A symlink that points at itself is not writable."""
        self_link = tmp_path / 'loop'
        self_link.symlink_to(self_link)
        assert not helpers.is_file_writable(self_link)

    @skipif_docker
    def test_chmod(self, basic_file):
        """Removing write permission makes the file non-writable."""
        assert helpers.is_file_writable(basic_file)
        for mode in (0o400, 0o000):
            basic_file.chmod(mode)
            assert not helpers.is_file_writable(basic_file)

    def test_permission_error(self, basic_file):
        """A PermissionError raised by ``is_file`` is reported as not writable."""
        fake_path = MagicMock(spec_set=basic_file)
        fake_path.is_symlink.return_value = False
        fake_path.exists.return_value = True
        fake_path.is_file.side_effect = PermissionError
        assert not helpers.is_file_writable(fake_path)


@windows_only
def test_gs_install_locations():
    """The sort key of a Ghostscript bin directory is ``('gs', Version)``."""
    # pylint: disable=import-outside-toplevel
    from ocrmypdf.subprocess._windows import _gs_version_in_path_key

    gs_bin_dir = Path("C:\\Program Files\\gs\\gs9.52\\bin")
    expected_key = ('gs', Version('9.52'))
    assert _gs_version_in_path_key(gs_bin_dir) == expected_key


@windows_only
def test_shim_paths(tmp_path):
    """shim_env_path keeps PATH first and appends tesseract and Ghostscript dirs."""
    # pylint: disable=import-outside-toplevel
    from ocrmypdf.subprocess._windows import shim_env_path

    # Fake "Program Files" tree with one tesseract and two Ghostscript installs.
    program_files = tmp_path / 'Program Files'
    program_files.mkdir()
    (program_files / 'tesseract-ocr').mkdir()
    for gs_version_dir in ('9.51', 'gs9.52.3'):
        (program_files / 'gs' / gs_version_dir / 'bin').mkdir(parents=True)

    syspath = tmp_path / 'bin'
    env = {'PROGRAMFILES': str(program_files), 'PATH': str(syspath)}

    path_entries = shim_env_path(env=env).split(os.pathsep)
    assert path_entries[0] == str(syspath), path_entries
    assert path_entries[-3].endswith('tesseract-ocr'), path_entries
    assert path_entries[-2].endswith(os.path.join('gs9.52.3', 'bin')), path_entries
    assert path_entries[-1].endswith(os.path.join('gs', '9.51', 'bin')), path_entries


def test_resolution():
    """Exercise squareness, equality, string form and take_max of Resolution."""
    Resolution = helpers.Resolution
    square_100 = Resolution(100, 100)
    square_200 = Resolution(200, 200)
    non_square = Resolution(100, 200)

    assert square_100.is_square
    assert not non_square.is_square
    assert square_100 == Resolution(100, 100)
    assert str(square_100) != str(square_200)
    # take_max yields the largest value seen for each axis.
    assert square_100.take_max([200, 300], [400]) == Resolution(300, 400)


================================================
FILE: tests/test_hocr_parser.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Unit tests for HocrParser class."""

from __future__ import annotations

from pathlib import Path
from textwrap import dedent

import pytest

from ocrmypdf.hocrtransform import (
    HocrParseError,
    HocrParser,
    OcrClass,
)


@pytest.fixture
def simple_hocr(tmp_path) -> Path:
    """Write a minimal valid hOCR document (one line, two words) and return its path."""
    markup = dedent("""\
        <?xml version="1.0" encoding="UTF-8"?>
        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
            "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
        <head>
            <title>Test</title>
        </head>
        <body>
            <div class='ocr_page' title='bbox 0 0 1000 500; ppageno 0'>
                <p class='ocr_par' lang='eng' dir='ltr'>
                    <span class='ocr_line' title='bbox 100 100 900 150; baseline 0.01 -5'>
                        <span class='ocrx_word' title='bbox 100 100 200 150; x_wconf 95'>Hello</span>
                        <span class='ocrx_word' title='bbox 250 100 350 150; x_wconf 90'>World</span>
                    </span>
                </p>
            </div>
        </body>
        </html>
    """)
    destination = tmp_path / "simple.hocr"
    destination.write_text(markup, encoding='utf-8')
    return destination


@pytest.fixture
def multiline_hocr(tmp_path) -> Path:
    """Write an hOCR document with two paragraphs and three lines; return its path."""
    markup = dedent("""\
        <?xml version="1.0" encoding="UTF-8"?>
        <html>
        <body>
            <div class='ocr_page' title='bbox 0 0 1000 1000'>
                <p class='ocr_par' lang='eng'>
                    <span class='ocr_line' title='bbox 100 100 900 150'>
                        <span class='ocrx_word' title='bbox 100 100 200 150'>Line</span>
                        <span class='ocrx_word' title='bbox 210 100 280 150'>one</span>
                    </span>
                    <span class='ocr_line' title='bbox 100 200 900 250'>
                        <span class='ocrx_word' title='bbox 100 200 200 250'>Line</span>
                        <span class='ocrx_word' title='bbox 210 200 280 250'>two</span>
                    </span>
                </p>
                <p class='ocr_par' lang='deu'>
                    <span class='ocr_line' title='bbox 100 400 900 450'>
                        <span class='ocrx_word' title='bbox 100 400 200 450'>German</span>
                        <span class='ocrx_word' title='bbox 210 400 280 450'>text</span>
                    </span>
                </p>
            </div>
        </body>
        </html>
    """)
    destination = tmp_path / "multiline.hocr"
    destination.write_text(markup, encoding='utf-8')
    return destination


@pytest.fixture
def rtl_hocr(tmp_path) -> Path:
    """Write an hOCR document with one ``dir='rtl'`` paragraph; return its path."""
    markup = dedent("""\
        <?xml version="1.0" encoding="UTF-8"?>
        <html>
        <body>
            <div class='ocr_page' title='bbox 0 0 1000 500'>
                <p class='ocr_par' lang='ara' dir='rtl'>
                    <span class='ocr_line' title='bbox 100 100 900 150'>
                        <span class='ocrx_word' title='bbox 100 100 200 150'>مرحبا</span>
                    </span>
                </p>
            </div>
        </body>
        </html>
    """)
    destination = tmp_path / "rtl.hocr"
    destination.write_text(markup, encoding='utf-8')
    return destination


@pytest.fixture
def rotated_hocr(tmp_path) -> Path:
    """Write an hOCR document whose line carries ``textangle 5.5``; return its path."""
    markup = dedent("""\
        <?xml version="1.0" encoding="UTF-8"?>
        <html>
        <body>
            <div class='ocr_page' title='bbox 0 0 1000 500'>
                <p class='ocr_par' lang='eng'>
                    <span class='ocr_line' title='bbox 100 100 900 150; textangle 5.5'>
                        <span class='ocrx_word' title='bbox 100 100 200 150'>Rotated</span>
                    </span>
                </p>
            </div>
        </body>
        </html>
    """)
    destination = tmp_path / "rotated.hocr"
    destination.write_text(markup, encoding='utf-8')
    return destination


@pytest.fixture
def header_hocr(tmp_path) -> Path:
    """Write an hOCR document mixing ocr_header, ocr_line and ocr_caption spans."""
    markup = dedent("""\
        <?xml version="1.0" encoding="UTF-8"?>
        <html>
        <body>
            <div class='ocr_page' title='bbox 0 0 1000 500'>
                <p class='ocr_par' lang='eng'>
                    <span class='ocr_header' title='bbox 100 50 900 100'>
                        <span class='ocrx_word' title='bbox 100 50 300 100'>Chapter</span>
                        <span class='ocrx_word' title='bbox 310 50 400 100'>One</span>
                    </span>
                    <span class='ocr_line' title='bbox 100 150 900 200'>
                        <span class='ocrx_word' title='bbox 100 150 200 200'>Body</span>
                        <span class='ocrx_word' title='bbox 210 150 280 200'>text</span>
                    </span>
                    <span class='ocr_caption' title='bbox 100 300 900 350'>
                        <span class='ocrx_word' title='bbox 100 300 200 350'>Figure</span>
                        <span class='ocrx_word' title='bbox 210 300 250 350'>1</span>
                    </span>
                </p>
            </div>
        </body>
        </html>
    """)
    destination = tmp_path / "header.hocr"
    destination.write_text(markup, encoding='utf-8')
    return destination


@pytest.fixture
def font_info_hocr(tmp_path) -> Path:
    """Write an hOCR document whose word carries x_font and x_fsize properties."""
    markup = dedent("""\
        <?xml version="1.0" encoding="UTF-8"?>
        <html>
        <body>
            <div class='ocr_page' title='bbox 0 0 1000 500'>
                <p class='ocr_par' lang='eng'>
                    <span class='ocr_line' title='bbox 100 100 900 150'>
                        <span class='ocrx_word' title='bbox 100 100 200 150; x_font Arial; x_fsize 12.5'>Styled</span>
                    </span>
                </p>
            </div>
        </body>
        </html>
    """)
    destination = tmp_path / "font_info.hocr"
    destination.write_text(markup, encoding='utf-8')
    return destination


class TestHocrParserBasic:
    """Core HocrParser behaviour, exercised against the ``simple_hocr`` fixture."""

    def test_parse_simple_hocr(self, simple_hocr):
        """The parsed root is a page element with a 1000x500 bounding box."""
        root = HocrParser(simple_hocr).parse()

        assert root.ocr_class == OcrClass.PAGE
        assert root.bbox is not None
        assert (root.bbox.width, root.bbox.height) == (1000, 500)

    def test_parse_page_number(self, simple_hocr):
        """The parsed page reports page number 0."""
        root = HocrParser(simple_hocr).parse()

        assert root.page_number == 0

    def test_parse_paragraphs(self, simple_hocr):
        """Exactly one paragraph is found, with language and direction set."""
        paragraphs = HocrParser(simple_hocr).parse().paragraphs

        assert len(paragraphs) == 1
        only_paragraph = paragraphs[0]
        assert only_paragraph.ocr_class == OcrClass.PARAGRAPH
        assert only_paragraph.language == "eng"
        assert only_paragraph.direction == "ltr"

    def test_parse_lines(self, simple_hocr):
        """Exactly one line is found, with its bbox and baseline parsed."""
        found_lines = HocrParser(simple_hocr).parse().lines

        assert len(found_lines) == 1
        only_line = found_lines[0]
        assert only_line.ocr_class == OcrClass.LINE
        assert only_line.bbox is not None
        baseline = only_line.baseline
        assert baseline is not None
        assert baseline.slope == pytest.approx(0.01)
        assert baseline.intercept == -5

    def test_parse_words(self, simple_hocr):
        """The two words are returned in document order."""
        found_words = HocrParser(simple_hocr).parse().words

        assert len(found_words) == 2
        assert [w.text for w in found_words] == ["Hello", "World"]

    def test_parse_word_confidence(self, simple_hocr):
        """Word confidences are exposed as 0.95 and 0.90."""
        found_words = HocrParser(simple_hocr).parse().words

        first, second = found_words[0], found_words[1]
        assert first.confidence == pytest.approx(0.95)
        assert second.confidence == pytest.approx(0.90)

    def test_parse_word_bbox(self, simple_hocr):
        """The first word's bounding box edges match the fixture values."""
        first_word = HocrParser(simple_hocr).parse().words[0]

        box = first_word.bbox
        assert box is not None
        assert (box.left, box.top, box.right, box.bottom) == (100, 100, 200, 150)


class TestHocrParserMultiline:
    """Parsing of hOCR containing several paragraphs and lines."""

    def test_multiple_lines(self, multiline_hocr):
        """Two paragraphs and three lines are found."""
        root = HocrParser(multiline_hocr).parse()

        paragraph_count = len(root.paragraphs)
        line_count = len(root.lines)
        assert paragraph_count == 2
        assert line_count == 3  # 2 in first par, 1 in second

    def test_multiple_paragraphs_languages(self, multiline_hocr):
        """Each paragraph keeps its own language tag."""
        found = HocrParser(multiline_hocr).parse().paragraphs

        assert found[0].language == "eng"
        assert found[1].language == "deu"

    def test_word_count(self, multiline_hocr):
        """All six words across the page are collected."""
        all_words = HocrParser(multiline_hocr).parse().words

        assert len(all_words) == 6  # 2 + 2 + 2


class TestHocrParserRTL:
    """Parsing of right-to-left text."""

    def test_rtl_direction(self, rtl_hocr):
        """The first paragraph is reported as RTL Arabic."""
        first_paragraph = HocrParser(rtl_hocr).parse().paragraphs[0]

        assert first_paragraph.direction == "rtl"
        assert first_paragraph.language == "ara"

    def test_rtl_line_inherits_direction(self, rtl_hocr):
        """The first line reports the RTL direction as well."""
        first_line = HocrParser(rtl_hocr).parse().lines[0]

        assert first_line.direction == "rtl"


class TestHocrParserRotation:
    """Parsing of rotated text."""

    def test_textangle(self, rotated_hocr):
        """The first line's text angle is parsed as 5.5."""
        first_line = HocrParser(rotated_hocr).parse().lines[0]

        assert first_line.textangle == pytest.approx(5.5)


class TestHocrParserLineTypes:
    """Parsing of the different line-level element classes."""

    def test_header_line(self, header_hocr):
        """Header, plain and caption lines are all returned as lines."""
        found_lines = HocrParser(header_hocr).parse().lines

        assert len(found_lines) == 3

        # Every expected class must appear among the parsed lines.
        seen_classes = [entry.ocr_class for entry in found_lines]
        for wanted in (OcrClass.HEADER, OcrClass.LINE, OcrClass.CAPTION):
            assert wanted in seen_classes

    def test_all_line_types_have_words(self, header_hocr):
        """Every parsed line has at least one child element."""
        root = HocrParser(header_hocr).parse()

        assert all(len(entry.children) > 0 for entry in root.lines)


class TestHocrParserFontInfo:
    """Parsing of per-word font information."""

    def test_font_name_and_size(self, font_info_hocr):
        """The first word exposes the font name and size from the fixture."""
        first_word = HocrParser(font_info_hocr).parse().words[0]

        font = first_word.font
        assert font is not None
        assert font.name == "Arial"
        assert font.size == pytest.approx(12.5)


class TestHocrParserErrors:
    """Error reporting of HocrParser for bad or missing input."""

    def test_missing_file(self, tmp_path):
        """Constructing a parser for a nonexistent path raises FileNotFoundError."""
        absent = tmp_path / "nonexistent.hocr"
        with pytest.raises(FileNotFoundError):
            HocrParser(absent)

    def test_invalid_xml(self, tmp_path):
        """Unclosed markup is rejected when the parser is constructed."""
        broken = tmp_path / "invalid.hocr"
        broken.write_text("<html><body>not closed", encoding='utf-8')

        with pytest.raises(HocrParseError):
            HocrParser(broken)

    def test_missing_ocr_page(self, tmp_path):
        """A document without an ocr_page element fails in ``parse()``."""
        markup = "<html><body><p>No ocr_page</p></body></html>"
        source = tmp_path / "no_page.hocr"
        source.write_text(markup, encoding='utf-8')

        hocr_parser = HocrParser(source)
        with pytest.raises(HocrParseError, match="No ocr_page"):
            hocr_parser.parse()

    def test_missing_page_bbox(self, tmp_path):
        """An ocr_page element without a bbox fails in ``parse()``."""
        markup = "<html><body><div class='ocr_page'>No bbox</div></body></html>"
        source = tmp_path / "no_bbox.hocr"
        source.write_text(markup, encoding='utf-8')

        hocr_parser = HocrParser(source)
        with pytest.raises(HocrParseError, match="bbox"):
            hocr_parser.parse()


class TestHocrParserEdgeCases:
    """Edge cases in HocrParser."""

    @staticmethod
    def _parse_content(tmp_path, filename, content):
        """Write *content* to *filename* under *tmp_path* and return the parsed page."""
        hocr_file = tmp_path / filename
        hocr_file.write_text(content, encoding='utf-8')
        return HocrParser(hocr_file).parse()

    def test_empty_word_text(self, tmp_path):
        """Words with empty text should be skipped."""
        content = dedent("""\
            <?xml version="1.0" encoding="UTF-8"?>
            <html>
            <body>
                <div class='ocr_page' title='bbox 0 0 1000 500'>
                    <p class='ocr_par'>
                        <span class='ocr_line' title='bbox 100 100 900 150'>
                            <span class='ocrx_word' title='bbox 100 100 200 150'></span>
                            <span class='ocrx_word' title='bbox 210 100 300 150'>Valid</span>
                        </span>
                    </p>
                </div>
            </body>
            </html>
        """)
        root = self._parse_content(tmp_path, "empty_word.hocr", content)

        # Only the non-empty word should be parsed
        assert len(root.words) == 1
        assert root.words[0].text == "Valid"

    def test_whitespace_only_word(self, tmp_path):
        """Words with only whitespace should be skipped."""
        content = dedent("""\
            <?xml version="1.0" encoding="UTF-8"?>
            <html>
            <body>
                <div class='ocr_page' title='bbox 0 0 1000 500'>
                    <p class='ocr_par'>
                        <span class='ocr_line' title='bbox 100 100 900 150'>
                            <span class='ocrx_word' title='bbox 100 100 200 150'>   </span>
                            <span class='ocrx_word' title='bbox 210 100 300 150'>Valid</span>
                        </span>
                    </p>
                </div>
            </body>
            </html>
        """)
        root = self._parse_content(tmp_path, "whitespace_word.hocr", content)

        assert len(root.words) == 1
        assert root.words[0].text == "Valid"

    def test_line_without_bbox(self, tmp_path):
        """Lines without bbox should be skipped."""
        content = dedent("""\
            <?xml version="1.0" encoding="UTF-8"?>
            <html>
            <body>
                <div class='ocr_page' title='bbox 0 0 1000 500'>
                    <p class='ocr_par'>
                        <span class='ocr_line'>
                            <span class='ocrx_word' title='bbox 100 100 200 150'>Word</span>
                        </span>
                        <span class='ocr_line' title='bbox 100 200 900 250'>
                            <span class='ocrx_word' title='bbox 100 200 200 250'>Valid</span>
                        </span>
                    </p>
                </div>
            </body>
            </html>
        """)
        root = self._parse_content(tmp_path, "no_line_bbox.hocr", content)

        # Only line with bbox should be parsed
        assert len(root.lines) == 1
        assert root.words[0].text == "Valid"

    def test_unicode_normalization(self, tmp_path):
        """Text should be NFKC normalized."""
        # The word is the single-codepoint "fi" ligature (a compatibility
        # character), which the test expects to come back as two letters.
        content = dedent("""\
            <?xml version="1.0" encoding="UTF-8"?>
            <html>
            <body>
                <div class='ocr_page' title='bbox 0 0 1000 500'>
                    <p class='ocr_par'>
                        <span class='ocr_line' title='bbox 100 100 900 150'>
                            <span class='ocrx_word' title='bbox 100 100 200 150'>ﬁ</span>
                        </span>
                    </p>
                </div>
            </body>
            </html>
        """)
        root = self._parse_content(tmp_path, "unicode.hocr", content)

        # fi ligature should be normalized to "fi"
        assert root.words[0].text == "fi"

    def test_words_directly_under_page(self, tmp_path):
        """Test fallback for words directly under page (no paragraph structure)."""
        content = dedent("""\
            <?xml version="1.0" encoding="UTF-8"?>
            <html>
            <body>
                <div class='ocr_page' title='bbox 0 0 1000 500'>
                    <span class='ocrx_word' title='bbox 100 100 200 150'>Direct</span>
                    <span class='ocrx_word' title='bbox 210 100 300 150'>Word</span>
                </div>
            </body>
            </html>
        """)
        root = self._parse_content(tmp_path, "direct_words.hocr", content)

        # Words should be parsed as direct children
        assert len(root.children) == 2
        first_child, second_child = root.children[0], root.children[1]
        assert first_child.text == "Direct"
        assert second_child.text == "Word"

    def test_no_namespace(self, tmp_path):
        """Test parsing hOCR without XHTML namespace."""
        content = dedent("""\
            <html>
            <body>
                <div class='ocr_page' title='bbox 0 0 1000 500'>
                    <p class='ocr_par'>
                        <span class='ocr_line' title='bbox 100 100 900 150'>
                            <span class='ocrx_word' title='bbox 100 100 200 150'>NoNS</span>
                        </span>
                    </p>
                </div>
            </body>
            </html>
        """)
        root = self._parse_content(tmp_path, "no_namespace.hocr", content)

        assert len(root.words) == 1
        assert root.words[0].text == "NoNS"


================================================
FILE: tests/test_hocrtransform.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import re
from io import StringIO
from pathlib import Path

import pytest
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from PIL import Image

from ocrmypdf._exec.tesseract import generate_hocr
from ocrmypdf.font import MultiFontManager
from ocrmypdf.fpdf_renderer import Fpdf2PdfRenderer
from ocrmypdf.helpers import check_pdf
from ocrmypdf.hocrtransform import HocrParser

from .conftest import check_ocrmypdf


def text_from_pdf(filename):
    """Return the text pdfminer extracts from every page of *filename*.

    Each page is processed through a ``TextConverter`` that writes into an
    in-memory buffer; the buffer's accumulated contents are returned.
    """
    extracted = StringIO()
    resource_manager = PDFResourceManager()
    converter = TextConverter(resource_manager, extracted, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    with open(filename, 'rb') as stream:
        document = PDFDocument(PDFParser(stream))
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)
    return extracted.getvalue()


# pylint: disable=redefined-outer-name


@pytest.fixture
def font_dir():
    """Return ``src/ocrmypdf/data`` resolved relative to this test file."""
    repo_root = Path(__file__).parents[1]
    return repo_root.joinpath("src", "ocrmypdf", "data")


@pytest.fixture
def multi_font_manager(font_dir):
    """Build a MultiFontManager over the ``font_dir`` fixture's directory."""
    manager = MultiFontManager(font_dir)
    return manager


@pytest.fixture
def blank_hocr(tmp_path):
    """Generate hOCR for an all-zero 8x8 1-bit TIFF and return the hOCR path."""
    tif_path = tmp_path / 'blank.tif'
    hocr_path = tmp_path / 'blank.hocr'
    text_path = tmp_path / 'blank.txt'

    Image.new('1', (8, 8), 0).save(tif_path, format='TIFF')
    generate_hocr(
        input_file=tif_path,
        output_hocr=hocr_path,
        output_text=text_path,
        languages=['eng'],
        engine_mode=1,
        tessconfig=[],
        pagesegmode=3,
        thresholding=0,
        user_words=None,
        user_patterns=None,
        timeout=None,
    )
    return hocr_path


def test_mono_image(blank_hocr, outdir, multi_font_manager):
    """Render the blank hOCR page with the fpdf2 renderer and validate the PDF."""
    # 8x8 1-bit image with the main diagonal set.
    diagonal = Image.new('1', (8, 8), 0)
    for idx in range(8):
        diagonal.putpixel((idx, idx), 1)
    diagonal.save(outdir / 'mono.tif', format='TIFF')

    ocr_page = HocrParser(str(blank_hocr)).parse()

    # Prefer the DPI recorded in the hOCR; otherwise fall back to 8.
    dpi = ocr_page.dpi or 8

    pdf_path = outdir / 'mono.pdf'
    Fpdf2PdfRenderer(
        page=ocr_page,
        dpi=dpi,
        multi_font_manager=multi_font_manager,
        invisible_text=True,
    ).render(pdf_path)

    check_pdf(outdir / 'mono.pdf')


@pytest.mark.slow
def test_fpdf2_matches_sandwich(resources, outdir):
    """The fpdf2 and sandwich renderers yield nearly the same set of words."""
    # Note: hocr renderer now redirects to fpdf2
    for stem, renderer in (('fpdf2', 'fpdf2'), ('tess', 'sandwich')):
        check_ocrmypdf(
            resources / 'ccitt.pdf',
            outdir / f'{stem}.pdf',
            f'--pdf-renderer={renderer}',
        )

    # Spacing and word order may differ slightly between renderers, so compare
    # the sets of words rather than the raw text.
    def clean(s):
        collapsed = re.sub(r'\s+', ' ', s)
        return set(collapsed.split(' '))

    fpdf2_words = clean(text_from_pdf(outdir / 'fpdf2.pdf'))
    tess_words = clean(text_from_pdf(outdir / 'tess.pdf'))

    shared = fpdf2_words & tess_words
    combined = fpdf2_words | tess_words
    similarity = len(shared) / len(combined)

    assert similarity > 0.99


================================================
FILE: tests/test_image_input.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

from unittest.mock import patch

import img2pdf
import pikepdf
import pytest
from PIL import Image

import ocrmypdf

from .conftest import check_ocrmypdf, run_ocrmypdf_api

# pylint: disable=redefined-outer-name


@pytest.fixture
def baiona(resources):
    """Open ``baiona_gray.png`` from the test resources as a PIL image."""
    image_path = resources / 'baiona_gray.png'
    return Image.open(image_path)


def test_image_to_pdf(resources, outpdf):
    """Run a PNG input with ``--image-dpi 200`` and the no-op Tesseract plugin."""
    cli_args = ['--image-dpi', '200', '--plugin', 'tests/plugins/tesseract_noop.py']
    check_ocrmypdf(resources / 'crom.png', outpdf, *cli_args)


def test_no_dpi_info(caplog, baiona, outdir, no_outpdf):
    """An image saved without DPI is rejected and the log mentions ``--image-dpi``."""
    assert 'dpi' not in baiona.info
    saved_copy = outdir / 'baiona_no_dpi.png'
    baiona.save(saved_copy)

    exit_code = run_ocrmypdf_api(saved_copy, no_outpdf)
    assert exit_code == ocrmypdf.ExitCode.input_file
    assert "--image-dpi" in caplog.text


def test_dpi_not_credible(caplog, baiona, outdir, no_outpdf):
    """An image saved with a DPI of 30 is rejected with a "not credible" log."""
    assert 'dpi' not in baiona.info
    saved_copy = outdir / 'baiona_no_dpi.png'
    baiona.save(saved_copy, dpi=(30, 30))

    exit_code = run_ocrmypdf_api(saved_copy, no_outpdf)
    assert exit_code == ocrmypdf.ExitCode.input_file
    assert "not credible" in caplog.text


def test_cmyk_no_icc(caplog, resources, no_outpdf):
    """``baiona_cmyk.jpg`` is rejected and the log mentions the missing ICC profile."""
    cmyk_input = resources / 'baiona_cmyk.jpg'
    exit_code = run_ocrmypdf_api(cmyk_input, no_outpdf)
    assert exit_code == ocrmypdf.ExitCode.input_file
    assert "no ICC profile" in caplog.text


def test_img2pdf_fails(resources, no_outpdf):
    """When ``img2pdf.convert`` raises ImageOpenError, the exit code is input_file."""
    failing_convert = patch(
        'ocrmypdf._pipeline.img2pdf.convert', side_effect=img2pdf.ImageOpenError()
    )
    with failing_convert as mock:
        exit_code = run_ocrmypdf_api(
            resources / 'baiona_gray.png', no_outpdf, '--image-dpi', '200'
        )
        assert exit_code == ocrmypdf.ExitCode.input_file
        mock.assert_called()


@pytest.mark.xfail(reason="remove background disabled")
def test_jpeg_in_jpeg_out(resources, outpdf):
    """A JPEG input should still be DCT-encoded in the first page's image."""
    cli_args = [
        '--image-dpi',
        '100',
        '--output-type',
        'pdf',  # specifically check pdf because Ghostscript may convert to JPEG
        '--remove-background',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'baiona_color.jpg', outpdf, *cli_args)

    with pikepdf.open(outpdf) as pdf:
        first_image = next(iter(pdf.pages[0].images.values()))
        assert first_image.Filter == pikepdf.Name.DCTDecode


================================================
FILE: tests/test_imageops.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import hypothesis.strategies as st
from hypothesis import given
from PIL import Image

from ocrmypdf.imageops import (
    _calculate_downsample,
    bytes_per_pixel,
    calculate_downsample,
    downsample_image,
)


def test_bytes_per_pixel():
    """``bytes_per_pixel`` returns the expected size for each checked mode."""
    expected_sizes = {'RGB': 4, 'RGBA': 4, 'LA': 2, 'L': 1}
    for mode, size in expected_sizes.items():
        assert bytes_per_pixel(mode) == size


def test_calculate_downsample():
    """Each limit type gives the expected target size for a 100x100 RGB image."""
    im = Image.new('RGB', (100, 100))
    cases = [
        ({'max_size': (50, 50)}, (50, 50)),
        ({'max_pixels': 2500}, (50, 50)),
        ({'max_bytes': 10000}, (50, 50)),
        ({'max_bytes': 100000}, (100, 100)),
    ]
    for limits, expected_size in cases:
        assert calculate_downsample(im, **limits) == expected_size


@given(
    st.one_of(st.just("RGB"), st.just('L')),
    st.integers(min_value=1, max_value=100000),
    st.integers(min_value=1, max_value=100000),
    st.integers(min_value=64, max_value=100000),
    st.integers(min_value=64, max_value=100000),
    st.integers(min_value=64 * 64, max_value=1000000),
)
def test_calculate_downsample_hypothesis(mode, im_w, im_h, max_x, max_y, max_bytes):
    """The computed size never exceeds the size limits or the byte budget."""
    pixel_bytes = bytes_per_pixel(mode)
    out_w, out_h = _calculate_downsample(
        (im_w, im_h),
        pixel_bytes,
        max_size=(max_x, max_y),
        max_bytes=max_bytes,
    )[:2]
    assert out_w <= max_x
    assert out_h <= max_y
    assert out_w * out_h * bytes_per_pixel(mode) <= max_bytes


def test_downsample_image():
    """Halving a 300 DPI image halves both its size and its recorded DPI."""
    original = Image.new('RGB', (100, 100))
    original.info['dpi'] = (300, 300)

    reduced = downsample_image(original, (50, 50))

    assert reduced.size == (50, 50)
    assert reduced.info['dpi'] == (150, 150)


================================================
FILE: tests/test_json_serialization.py
================================================
"""Test JSON serialization of OcrOptions for multiprocessing compatibility."""
from __future__ import annotations

import multiprocessing
from io import BytesIO
from pathlib import Path, PurePath

import pytest

from ocrmypdf._options import OcrOptions
from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOptions


@pytest.fixture(autouse=True)
def register_plugin_models():
    """Register the Tesseract options model on OcrOptions before each test."""
    plugin_models = {'tesseract': TesseractOptions}
    OcrOptions.register_plugin_models(plugin_models)
    yield
    # No teardown is performed; the registration is left in place.


def worker_function(options_json: str) -> str:
    """Rebuild OcrOptions from JSON and report selected fields as a JSON string.

    Intended to run in a pool worker: it registers the Tesseract plugin model
    itself before validating *options_json*.
    """
    import json

    # Register plugin models in the worker process
    from ocrmypdf._options import OcrOptions
    from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOptions

    OcrOptions.register_plugin_models({'tesseract': TesseractOptions})

    options = OcrOptions.model_validate_json_safe(options_json)

    # Only user-added extra_attrs count; keys starting with '_' are plugin
    # cache entries and are excluded.
    user_attrs_count = sum(
        1 for key in options.extra_attrs.keys() if not key.startswith('_')
    )

    summary = {
        'input_file': str(options.input_file),
        'output_file': str(options.output_file),
        'languages': options.languages,
        'optimize': options.optimize,
        'tesseract_timeout': options.tesseract.timeout,
        'fast_web_view': options.fast_web_view,
        'extra_attrs_count': user_attrs_count,
    }
    return json.dumps(summary)


def test_json_serialization_multiprocessing():
    """OcrOptions survive a JSON round trip, both in-process and in pool workers."""
    import json

    def without_private(attrs):
        # Keys starting with '_' are plugin cache entries, not user data.
        return {k: v for k, v in attrs.items() if not k.startswith('_')}

    # Options covering several field types
    options = OcrOptions(
        input_file=Path('/test/input.pdf'),
        output_file=Path('/test/output.pdf'),
        languages=['eng', 'deu'],
        optimize=2,
        tesseract_timeout=120.0,
        fast_web_view=2.5,
        deskew=True,
        clean=False,
    )
    options.extra_attrs['custom_field'] = 'test_value'
    options.extra_attrs['numeric_field'] = 42

    options_json = options.model_dump_json_safe()

    # Round trip in the main process
    reconstructed = OcrOptions.model_validate_json_safe(options_json)
    assert reconstructed.input_file == options.input_file
    assert reconstructed.output_file == options.output_file
    assert reconstructed.languages == options.languages
    assert reconstructed.optimize == options.optimize
    assert reconstructed.tesseract_timeout == options.tesseract.timeout
    assert reconstructed.fast_web_view == options.fast_web_view
    assert reconstructed.deskew == options.deskew
    assert reconstructed.clean == options.clean
    user_attrs = without_private(options.extra_attrs)
    reconstructed_attrs = without_private(reconstructed.extra_attrs)
    assert reconstructed_attrs == user_attrs

    # Round trip through worker processes, passing only the JSON string
    with multiprocessing.Pool(processes=2) as pool:
        results = pool.map(worker_function, [options_json] * 2)

    for payload in map(json.loads, results):
        assert PurePath(payload['input_file']) == PurePath('/test/input.pdf')
        assert PurePath(payload['output_file']) == PurePath('/test/output.pdf')
        assert payload['languages'] == ['eng', 'deu']
        assert payload['optimize'] == 2
        assert payload['tesseract_timeout'] == 120.0
        assert payload['fast_web_view'] == 2.5
        assert payload['extra_attrs_count'] == 2  # custom_field and numeric_field


def test_json_serialization_with_streams():
    """Stream inputs and outputs come back as the placeholder string 'stream'."""
    options = OcrOptions(
        input_file=BytesIO(b'fake pdf data'),
        output_file=BytesIO(),
        languages=['eng'],
        optimize=1,
    )

    # Round trip through the JSON-safe dump and validate helpers.
    serialized = options.model_dump_json_safe()
    reconstructed = OcrOptions.model_validate_json_safe(serialized)

    # Both stream fields are expected back as placeholder strings.
    assert reconstructed.input_file == 'stream'
    assert reconstructed.output_file == 'stream'
    assert reconstructed.languages == ['eng']
    assert reconstructed.optimize == 1


def test_json_serialization_with_none_values():
    """Defaults, including None-valued fields, survive a JSON round trip."""
    # Only the required fields are given; everything else keeps its default.
    options = OcrOptions(
        input_file=Path('/test/input.pdf'),
        output_file=Path('/test/output.pdf'),
        languages=['eng'],
    )

    serialized = options.model_dump_json_safe()
    reconstructed = OcrOptions.model_validate_json_safe(serialized)

    # Defaults as asserted by this test: some are None, some are not.
    assert reconstructed.tesseract_timeout is None
    assert reconstructed.fast_web_view == 1.0
    assert reconstructed.color_conversion_strategy == "LeaveColorUnchanged"
    assert reconstructed.pdfa_image_compression is None

    # Explicitly supplied values are preserved.
    assert reconstructed.input_file == options.input_file
    assert reconstructed.output_file == options.output_file
    assert reconstructed.languages == options.languages


================================================
FILE: tests/test_logging.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import logging

from ocrmypdf._pipelines._common import configure_debug_logging


def test_debug_logging(tmp_path):
    """Exercise ``configure_debug_logging`` without validating its output."""
    # See https://github.com/pytest-dev/pytest/issues/5502 for pytest logging quirks
    logger_name = 'test_debug_logging'
    logger = logging.getLogger(logger_name)
    log_path = tmp_path / 'test.log'
    _handler, remove_handler = configure_debug_logging(log_path, logger_name)
    logger.info("test message")
    remove_handler()


================================================
FILE: tests/test_main.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import os
import shutil
import sys
from math import isclose
from pathlib import Path
from subprocess import run
from unittest.mock import patch

import pikepdf
import pytest
from PIL import Image

import ocrmypdf
from ocrmypdf._exec import tesseract
from ocrmypdf.exceptions import ExitCode, MissingDependencyError
from ocrmypdf.helpers import running_in_docker
from ocrmypdf.pdfa import file_claims_pdfa
from ocrmypdf.pdfinfo import Colorspace, Encoding, PdfInfo
from ocrmypdf.subprocess import get_version

from .conftest import (
    check_ocrmypdf,
    first_page_dimensions,
    have_unpaper,
    is_macos,
    run_ocrmypdf,
    run_ocrmypdf_api,
)

# pylint: disable=redefined-outer-name


# Renderer names used to parametrize tests; each is passed to ``--pdf-renderer``.
RENDERERS = ['fpdf2', 'sandwich']


def test_quick(resources, outpdf):
    """Run ``ccitt.pdf`` through check_ocrmypdf with the tesseract_cache plugin."""
    plugin_args = ('--plugin', 'tests/plugins/tesseract_cache.py')
    check_ocrmypdf(resources / 'ccitt.pdf', outpdf, *plugin_args)


@pytest.mark.parametrize('renderer', RENDERERS)
def test_oversample(renderer, resources, outpdf):
    """With ``--oversample 350`` the first page's horizontal DPI is within 1 of 350."""
    cli_args = [
        '--oversample',
        '350',
        '-f',
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    oversampled_pdf = check_ocrmypdf(resources / 'skew.pdf', outpdf, *cli_args)

    first_page = PdfInfo(oversampled_pdf)[0]

    print(first_page.dpi.x)
    assert abs(first_page.dpi.x - 350) < 1


def test_repeat_ocr(resources, no_outpdf):
    """``graph_ocred.pdf`` without extra flags exits with already_done_ocr."""
    exit_code = run_ocrmypdf_api(resources / 'graph_ocred.pdf', no_outpdf)
    assert exit_code == ExitCode.already_done_ocr


def test_force_ocr(resources, outpdf):
    """With ``-f`` the first output page of ``graph_ocred.pdf`` has text."""
    cli_args = ['-f', '--plugin', 'tests/plugins/tesseract_cache.py']
    produced = check_ocrmypdf(resources / 'graph_ocred.pdf', outpdf, *cli_args)
    first_page = PdfInfo(produced)[0]
    assert first_page.has_text


def test_skip_ocr(resources, outpdf):
    """With ``-s`` the first output page of ``graph_ocred.pdf`` has text."""
    cli_args = ['-s', '--plugin', 'tests/plugins/tesseract_cache.py']
    produced = check_ocrmypdf(resources / 'graph_ocred.pdf', outpdf, *cli_args)
    first_page = PdfInfo(produced)[0]
    assert first_page.has_text


def test_redo_ocr(resources, outpdf):
    """``--redo-ocr`` keeps text on the page but changes its text areas."""
    source = resources / 'graph_ocred.pdf'
    before = PdfInfo(source, detailed_analysis=True)
    produced = check_ocrmypdf(source, outpdf, '--redo-ocr')
    after = PdfInfo(produced, detailed_analysis=True)

    assert before[0].has_text and after[0].has_text
    areas_before = before[0].get_textareas()
    areas_after = after[0].get_textareas()
    assert areas_before != areas_after, "Expected text to be different after re-OCR"


def test_argsfile(resources, outdir):
    """Pass options to check_ocrmypdf through an ``@``-prefixed arguments file.

    The arguments file holds one token per line (title, author and the no-op
    Tesseract plugin). The output is written to its own ``.pdf`` path so that
    it does not overwrite the arguments file that is being read.
    """
    path_argsfile = outdir / 'test_argsfile.txt'
    tokens = [
        '--title',
        'ArgsFile Test',
        '--author',
        'Test Cases',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    # One token per line, with a trailing newline after the last token.
    path_argsfile.write_text('\n'.join(tokens) + '\n', encoding='utf-8')

    check_ocrmypdf(
        resources / 'graph.pdf',
        outdir / 'test_argsfile.pdf',
        '@' + str(path_argsfile),
    )


@pytest.mark.parametrize('renderer', RENDERERS)
def test_ocr_timeout(renderer, resources, outpdf):
    """With ``--tesseract-timeout 0`` the first output page has no text."""
    cli_args = ['--tesseract-timeout', '0', '--pdf-renderer', renderer]
    produced = check_ocrmypdf(resources / 'skew.pdf', outpdf, *cli_args)
    first_page = PdfInfo(produced)[0]
    assert not first_page.has_text


def test_skip_big(resources, outpdf):
    """With ``--skip-big 1`` the first output page of ``jbig2.pdf`` has no text."""
    cli_args = ['--skip-big', '1', '--plugin', 'tests/plugins/tesseract_cache.py']
    produced = check_ocrmypdf(resources / 'jbig2.pdf', outpdf, *cli_args)
    first_page = PdfInfo(produced)[0]
    assert not first_page.has_text


@pytest.mark.parametrize('renderer', RENDERERS)
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_maximum_options(renderer, output_type, multipage, outpdf):
    """Run the multipage input with many options for each renderer/output type."""
    # '-ci' is only passed when unpaper is available; otherwise None takes its
    # place in the argument list.
    unpaper_flag = '-ci' if have_unpaper() else None
    cli_args = [
        '-d',
        unpaper_flag,
        '-f',
        '-k',
        '--oversample',
        '300',
        '--skip-big',
        '10',
        '--title',
        'Too Many Weird Files',
        '--author',
        'py.test',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(multipage, outpdf, *cli_args)


@pytest.mark.skipif(
    tesseract.version() >= tesseract.TesseractVersion('5'),
    reason="tess 5 tries harder to find its files",
)
def test_tesseract_missing_tessdata(monkeypatch, resources, no_outpdf, tmpdir):
    """Pointing TESSDATA_PREFIX at an empty directory raises MissingDependencyError."""
    empty_prefix = os.fspath(tmpdir)
    monkeypatch.setenv("TESSDATA_PREFIX", empty_prefix)
    cli_args = ['-v', '1', '--skip-text']
    with pytest.raises(MissingDependencyError):
        run_ocrmypdf_api(resources / 'graph.pdf', no_outpdf, *cli_args)


def test_invalid_input_pdf(resources, no_outpdf):
    """``invalid.pdf`` exits with the input_file code."""
    exit_code = run_ocrmypdf_api(resources / 'invalid.pdf', no_outpdf)
    assert exit_code == ExitCode.input_file


def test_blank_input_pdf(resources, outpdf):
    """``blank.pdf`` is processed with an ok exit code."""
    exit_code = run_ocrmypdf_api(resources / 'blank.pdf', outpdf)
    assert exit_code == ExitCode.ok


def test_force_ocr_on_pdf_with_no_images(resources, no_outpdf):
    """Check that --force-ocr invokes tesseract even on a PDF with no content.

    The crashing tesseract plugin is loaded, so a child process error shows
    that tesseract was actually called.
    """
    options = ['--force-ocr', '--plugin', 'tests/plugins/tesseract_crash.py']
    result = run_ocrmypdf_api(resources / 'blank.pdf', no_outpdf, *options)
    assert result == ExitCode.child_process_error
    assert not no_outpdf.exists()


@pytest.mark.skipif(
    is_macos(),
    reason="takes too long to install language packs in macOS homebrew",
)
def test_german(resources, outdir):
    """OCR a French file with `-l deu` and a sidecar.

    Writing the sidecar is an implicit check that the system locale is set up
    properly. Using German on a French file is fine: this exercises the
    functionality and does not aim for accuracy.
    """
    sidecar = outdir / 'francais.txt'
    options = [
        '-l',
        'deu',  # more commonly installed
        '--sidecar',
        sidecar,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    try:
        check_ocrmypdf(
            resources / 'francais.pdf', outdir / 'francais.pdf', *options
        )
    except MissingDependencyError:
        # Only tolerate the failure when the language pack is truly absent.
        if 'deu' in tesseract.get_languages():
            raise
        pytest.xfail(reason="tesseract-deu language pack not installed")
        raise


def test_klingon(resources, outpdf):
    """Requesting language `klz` must raise MissingDependencyError."""
    options = ['-l', 'klz']
    with pytest.raises(MissingDependencyError):
        run_ocrmypdf_api(resources / 'francais.pdf', outpdf, *options)


def test_missing_docinfo(resources, outpdf):
    """missing_docinfo.pdf must be processed with the ok exit code."""
    noop_plugin = Path('tests/plugins/tesseract_noop.py')
    options = ['-l', 'eng', '--skip-text', '--plugin', noop_plugin]
    exitcode = run_ocrmypdf_api(
        resources / 'missing_docinfo.pdf', outpdf, *options
    )
    assert exitcode == ExitCode.ok


def test_uppercase_extension(resources, outdir):
    """Input and output files named with an uppercase .PDF extension must work."""
    upper_input = outdir / "UPPERCASE.PDF"
    upper_output = outdir / "UPPERCASE_OUT.PDF"
    shutil.copy(str(resources / "skew.pdf"), str(upper_input))

    options = ['--plugin', 'tests/plugins/tesseract_noop.py']
    check_ocrmypdf(upper_input, upper_output, *options)


def test_input_file_not_found(caplog, no_outpdf):
    """A nonexistent input must give input_file and be named in the log."""
    input_file = "does not exist.pdf"
    exitcode = run_ocrmypdf_api(input_file, no_outpdf)
    assert exitcode == ExitCode.input_file
    logged = caplog.text
    assert input_file in logged


@pytest.mark.skipif(os.name == 'nt' or running_in_docker(), reason="chmod")
def test_input_file_not_readable(caplog, resources, outdir, no_outpdf):
    """An input with all permission bits cleared must give input_file."""
    input_file = outdir / 'trivial.pdf'
    shutil.copy(resources / 'trivial.pdf', input_file)
    # Remove every permission bit so the copy cannot be read.
    input_file.chmod(0o000)

    exitcode = run_ocrmypdf_api(input_file, no_outpdf)

    assert exitcode == ExitCode.input_file
    logged = caplog.text
    assert str(input_file) in logged


def test_input_file_not_a_pdf(caplog, no_outpdf):
    """Feeding this Python source file as input must give input_file."""
    input_file = __file__  # Try to OCR this file
    exitcode = run_ocrmypdf_api(input_file, no_outpdf)
    assert exitcode == ExitCode.input_file
    if os.name == 'nt':  # name will be mangled with \\'s on nt
        return
    assert input_file in caplog.text


@pytest.mark.parametrize('renderer', RENDERERS)
def test_pagesegmode(renderer, resources, outpdf):
    """Run with `--tesseract-pagesegmode 7` for each renderer."""
    options = [
        '--tesseract-pagesegmode',
        '7',
        '-v',
        '1',
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(resources / 'skew.pdf', outpdf, *options)


def test_tesseract_oem(resources, outpdf):
    """Run with `--tesseract-oem 1`."""
    options = [
        '--tesseract-oem',
        '1',
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(resources / 'trivial.pdf', outpdf, *options)


@pytest.mark.parametrize('value', ['auto', 'otsu', 'adaptive-otsu', 'sauvola'])
def test_tesseract_thresholding(value, resources, outpdf):
    """Run with each accepted `--tesseract-thresholding` value."""
    options = [
        '--tesseract-thresholding',
        value,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(resources / 'trivial.pdf', outpdf, *options)


@pytest.mark.parametrize('value', ['abcxyz'])
def test_tesseract_thresholding_invalid(value, resources, no_outpdf):
    """An unknown thresholding value must raise SystemExit matching '2'."""
    options = [
        '--tesseract-thresholding',
        value,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    with pytest.raises(SystemExit, match='2'):
        run_ocrmypdf_api(resources / 'trivial.pdf', no_outpdf, *options)


@pytest.mark.parametrize('renderer', RENDERERS)
def test_tesseract_crash(renderer, resources, no_outpdf, caplog):
    """With the crashing tesseract plugin, expect a child process error.

    No output file may be left behind, and the log must mention
    SubprocessOutputError.
    """
    options = [
        '-v',
        '1',
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_crash.py',
    ]
    result = run_ocrmypdf_api(resources / 'ccitt.pdf', no_outpdf, *options)
    assert result == ExitCode.child_process_error
    assert not no_outpdf.exists()
    assert "SubprocessOutputError" in caplog.text


def test_tesseract_crash_autorotate(resources, no_outpdf, caplog):
    """With `-r` and the crashing plugin, expect a child process error.

    No output file may be left behind, and the log must mention an uncaught
    exception.
    """
    options = ['-r', '--plugin', 'tests/plugins/tesseract_crash.py']
    result = run_ocrmypdf_api(resources / 'ccitt.pdf', no_outpdf, *options)
    assert result == ExitCode.child_process_error
    assert not no_outpdf.exists()
    assert "uncaught exception" in caplog.text


@pytest.mark.parametrize('renderer', RENDERERS)
@pytest.mark.slow
def test_tesseract_image_too_big(renderer, resources, outpdf):
    """Process hugemono.pdf with the big-image-error plugin loaded."""
    options = [
        '-r',
        '--pdf-renderer',
        renderer,
        '--max-image-mpixels',
        '0',
        '--plugin',
        'tests/plugins/tesseract_big_image_error.py',
    ]
    check_ocrmypdf(resources / 'hugemono.pdf', outpdf, *options)


@pytest.mark.parametrize('encryption_level', [2, 3, 4, 6])
def test_encrypted(resources, outpdf, encryption_level, caplog):
    """Check that an encrypted input is refused with the encrypted_pdf exit code.

    An encrypted copy of jbig2.pdf is written with pikepdf at the given
    encryption revision and then passed to OCRmyPDF as both input and output.
    """
    # os.name is never 'darwin' (macOS reports 'posix'), so the platform has to
    # be detected through sys.platform for this skip to take effect.
    if (
        sys.platform == 'darwin'
        and sys.version_info >= (3, 12)
        and encryption_level <= 4
    ):
        # Error is: RuntimeError: unable to load openssl legacy provider
        # pikepdf obtains encryption from qpdf, which gets it from openssl among other
        # providers.
        # Error message itself comes from here:
        # https://github.com/qpdf/qpdf/blob/da3eae39c8e5261196bbc1b460e5b556c6836dbf/libqpdf/QPDFCrypto_openssl.cc#L56
        # Somehow pikepdf + Python 3.12 + macOS does not have this problem, despite
        # using Homebrew's qpdf. Possibly the difference is that pikepdf's Python 3.12
        # comes from cibuildwheel, and our macOS Python 3.12 comes from GitHub Actions
        # setup-python. It may be necessary to build a custom qpdf for macOS.
        # In any case, OCRmyPDF doesn't support loading encrypted files at all, it
        # just complains about encryption, and it's using pikepdf to generate encrypted
        # files for testing.
        pytest.skip("GitHub Python 3.12 on macOS does not have openssl legacy support")
    encryption = pikepdf.models.encryption.Encryption(
        owner='ocrmypdf',
        user='ocrmypdf',
        R=encryption_level,
        aes=(encryption_level >= 4),
        metadata=(encryption_level == 6),
    )

    with pikepdf.open(resources / 'jbig2.pdf') as pdf:
        pdf.save(outpdf, encryption=encryption)

    exitcode = run_ocrmypdf_api(
        outpdf,
        outpdf,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )
    assert exitcode == ExitCode.encrypted_pdf
    assert 'encryption must be removed' in caplog.text


def test_jbig2_passthrough(resources, outpdf):
    """The first image of the output for jbig2.pdf must still be JBIG2 encoded."""
    options = [
        '--output-type',
        'pdf',
        '--pdf-renderer',
        'fpdf2',
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    produced = check_ocrmypdf(resources / 'jbig2.pdf', outpdf, *options)
    first_image = PdfInfo(produced)[0].images[0]
    assert first_image.enc == Encoding.jbig2


def test_masks(resources, outpdf):
    """Calling ocrmypdf.ocr on masks.pdf must return the ok exit code."""
    noop_plugins = ['tests/plugins/tesseract_noop.py']
    result = ocrmypdf.ocr(resources / 'masks.pdf', outpdf, plugins=noop_plugins)
    assert result == ExitCode.ok


def test_linearized_pdf_and_indirect_object(resources, outpdf):
    """Process epson.pdf with the no-op tesseract plugin."""
    options = ['--plugin', 'tests/plugins/tesseract_noop.py']
    check_ocrmypdf(resources / 'epson.pdf', outpdf, *options)


def test_very_high_dpi(resources, outpdf):
    """Checks for a Decimal quantize error with high DPI, etc."""
    options = ['--plugin', 'tests/plugins/tesseract_cache.py']
    check_ocrmypdf(resources / '2400dpi.pdf', outpdf, *options)

    first_image = PdfInfo(outpdf)[0].images[0]
    dpi = first_image.dpi
    # Horizontal and vertical resolution must agree, and both must be 2400.
    assert isclose(dpi.x, dpi.y)
    assert isclose(dpi.x, 2400)


def test_overlay(resources, outpdf):
    """Process overlay.pdf with --skip-text and the no-op tesseract plugin."""
    options = ['--skip-text', '--plugin', 'tests/plugins/tesseract_noop.py']
    check_ocrmypdf(resources / 'overlay.pdf', outpdf, *options)


@pytest.fixture
def protected_file(outdir):
    """Yield the path of an empty file in outdir whose mode is set to 0o400."""
    target = outdir / 'protected.pdf'
    target.touch()
    read_only = 0o400
    target.chmod(read_only)
    yield target


@pytest.mark.skipif(
    os.name == 'nt' or os.geteuid() == 0, reason="root can write to anything"
)
def test_destination_not_writable(resources, protected_file):
    """Writing to the protected output file must give file_access_error."""
    options = ['--plugin', 'tests/plugins/tesseract_noop.py']
    result = run_ocrmypdf_api(resources / 'jbig2.pdf', protected_file, *options)
    assert result == ExitCode.file_access_error


@pytest.fixture
def valid_tess_config(outdir):
    """Write a three-line Tesseract config file into outdir and yield its path."""
    settings = (
        'load_system_dawg 0\n',
        'language_model_penalty_non_dict_word 0\n',
        'language_model_penalty_non_freq_dict_word 0\n',
    )
    cfg_file = outdir / 'test.cfg'
    with cfg_file.open('w') as stream:
        stream.write(''.join(settings))
    yield cfg_file


def test_tesseract_config_valid(resources, valid_tess_config, outpdf):
    """Run page 1 of 3small.pdf with the valid Tesseract config file."""
    options = ['--tesseract-config', valid_tess_config, '--pages', '1']
    check_ocrmypdf(resources / '3small.pdf', outpdf, *options)


@pytest.fixture
def invalid_tess_config(outdir):
    """Write a config file with one nonsense line into outdir and yield its path."""
    cfg_file = outdir / 'test.cfg'
    contents = 'THIS FILE IS INVALID\n'
    with cfg_file.open('w') as stream:
        stream.write(contents)
    yield cfg_file


@pytest.mark.slow  # This test sometimes times out in CI
@pytest.mark.parametrize('renderer', RENDERERS)
def test_tesseract_config_invalid(renderer, resources, invalid_tess_config, outpdf):
    """An invalid Tesseract config must report an error and give invalid_config."""
    options = [
        '--pdf-renderer',
        renderer,
        '--tesseract-config',
        invalid_tess_config,
    ]
    p = run_ocrmypdf(resources / 'ccitt.pdf', outpdf, *options)

    # Either of these messages on stderr counts as the expected error report.
    stderr_lower = p.stderr.lower()
    expected_messages = ("parameter not found", "error occurred while parsing")
    assert any(msg in stderr_lower for msg in expected_messages), "No error message"
    assert p.returncode == ExitCode.invalid_config


def test_user_words_ocr(resources, outdir):
    """Run OCR on crom.png with a --user-words list and a sidecar.

    Does not actually test if --user-words causes output to differ.
    """
    word_list = outdir / 'wordlist.txt'
    sidecar_after = outdir / 'sidecar.txt'

    word_list.write_text('cromulent\n')  # a perfectly cromulent word

    options = [
        '--image-dpi',
        150,
        '--sidecar',
        sidecar_after,
        '--user-words',
        word_list,
    ]
    check_ocrmypdf(resources / 'crom.png', outdir / 'out.pdf', *options)


def test_form_xobject(resources, outpdf):
    """Process formxobject.pdf with --force-ocr and the no-op tesseract plugin."""
    options = ['--force-ocr', '--plugin', 'tests/plugins/tesseract_noop.py']
    check_ocrmypdf(resources / 'formxobject.pdf', outpdf, *options)


@pytest.mark.parametrize('renderer', RENDERERS)
def test_pagesize_consistency(renderer, resources, outpdf):
    """First-page dimensions must match before and after processing (rel_tol 1e-4)."""
    infile = resources / '3small.pdf'
    before_dims = first_page_dimensions(infile)

    options = [
        '--pdf-renderer',
        renderer,
        '--clean' if have_unpaper() else None,
        '--deskew',
        # '--remove-background',
        '--clean-final' if have_unpaper() else None,
        '-k',
        '--pages',
        '1',
    ]
    check_ocrmypdf(infile, outpdf, *options)

    after_dims = first_page_dimensions(outpdf)

    # Compare the first two entries of each dimensions result, in order.
    for axis in (0, 1):
        assert isclose(before_dims[axis], after_dims[axis], rel_tol=1e-4)


def test_skip_big_with_no_images(resources, outpdf):
    """Process blank.pdf with `--skip-big 5` and --force-ocr."""
    options = [
        '--skip-big',
        '5',
        '--force-ocr',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'blank.pdf', outpdf, *options)


def test_no_contents(resources, outpdf):
    """Process no_contents.pdf with --force-ocr and the no-op tesseract plugin."""
    options = ['--force-ocr', '--plugin', 'tests/plugins/tesseract_noop.py']
    check_ocrmypdf(resources / 'no_contents.pdf', outpdf, *options)


@pytest.mark.parametrize(
    'image', ['baiona.png', 'baiona_gray.png', 'baiona_alpha.png', 'baiona_color.jpg']
)
def test_compression_preserved(ocrmypdf_exec, resources, image, outpdf):
    """Check that image compression kind and colorspace survive conversion to PDF.

    The image is piped to the ocrmypdf executable on stdin. Images whose mode
    is RGBA or LA are expected to fail with a message mentioning 'alpha'. For
    the others, the first image of the output's first page is inspected.
    """
    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the mode is needed later. Reading it inside a context manager
    # closes the image on every exit path, including the early return for
    # alpha images and any failing assertion.
    with Image.open(input_file) as im:
        mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--optimize',
            '0',
            '--image-dpi',
            '150',
            '--output-type',
            'pdf',
            '--plugin',
            'tests/plugins/tesseract_noop.py',
            '-',
            output_file,
        ]
        p = run(
            p_args,
            capture_output=True,
            stdin=input_stream,
            text=True,
            check=False,
        )

    if mode in ('RGBA', 'LA'):
        # If alpha image is input, expect an error
        assert p.returncode != ExitCode.ok and 'alpha' in p.stderr
        return

    assert p.returncode == ExitCode.ok, p.stderr

    pdfinfo = PdfInfo(output_file)

    pdfimage = pdfinfo[0].images[0]

    if input_file.endswith('.png'):
        assert pdfimage.enc != Encoding.jpeg, "Lossless compression changed to lossy!"
    elif input_file.endswith('.jpg'):
        assert pdfimage.enc == Encoding.jpeg, "Lossy compression changed to lossless!"
    if mode.startswith('RGB') or mode.startswith('BGR'):
        assert pdfimage.color == Colorspace.rgb, "Colorspace changed"
    elif mode.startswith('L'):
        assert pdfimage.color == Colorspace.gray, "Colorspace changed"


@pytest.mark.parametrize(
    'image,compression',
    [
        ('baiona.png', 'jpeg'),
        ('baiona_gray.png', 'lossless'),
        ('baiona_color.jpg', 'lossless'),
    ],
)
def test_compression_changed(ocrmypdf_exec, resources, image, compression, outpdf):
    """Check the effect of --pdfa-image-compression on the output image encoding.

    The image is piped to the ocrmypdf executable on stdin with PDF/A output.
    The first image of the output's first page is then checked for the
    expected encoding and for an unchanged colorspace.
    """
    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the mode is needed later. Reading it inside a context manager
    # closes the image even when an assertion below fails.
    with Image.open(input_file) as im:
        mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--image-dpi',
            '150',
            '--output-type',
            'pdfa',
            '--optimize',
            '0',
            '--pdfa-image-compression',
            compression,
            '--plugin',
            'tests/plugins/tesseract_noop.py',
            '-',
            output_file,
        ]
        p = run(
            p_args,
            capture_output=True,
            stdin=input_stream,
            text=True,
            check=False,
        )
    assert p.returncode == ExitCode.ok, p.stderr

    pdfinfo = PdfInfo(output_file)

    pdfimage = pdfinfo[0].images[0]

    if compression == "jpeg":
        assert pdfimage.enc == Encoding.jpeg
    elif image.endswith('jpg'):
        # Ghostscript JPEG passthrough - no issue
        assert pdfimage.enc == Encoding.jpeg
    else:
        assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000)

    if mode.startswith('RGB') or mode.startswith('BGR'):
        assert pdfimage.color == Colorspace.rgb, "Colorspace changed"
    elif mode.startswith('L'):
        assert pdfimage.color == Colorspace.gray, "Colorspace changed"


def test_sidecar_pagecount(resources, outpdf):
    """The sidecar text must hold one form feed fewer than the input has pages."""
    source_pdf = resources / '3small.pdf'
    sidecar = outpdf.with_suffix('.txt')
    options = [
        '--skip-text',
        '--sidecar',
        sidecar,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(source_pdf, outpdf, *options)

    num_pages = len(PdfInfo(source_pdf))

    with open(sidecar, encoding='utf-8') as stream:
        ocr_text = stream.read()

    # There should be a formfeed between each pair of pages, so the count of
    # formfeeds is the page count less one
    formfeeds = ocr_text.count('\f')
    assert (
        formfeeds == num_pages - 1
    ), "Sidecar page count does not match PDF page count"


def test_sidecar_nonempty(resources, outpdf):
    """The sidecar written for ccitt.pdf must contain the word 'the'."""
    sidecar = outpdf.with_suffix('.txt')
    options = [
        '--sidecar',
        sidecar,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(resources / 'ccitt.pdf', outpdf, *options)

    with open(sidecar, encoding='utf-8') as stream:
        ocr_text = stream.read()
    assert 'the' in ocr_text


@pytest.mark.parametrize('pdfa_level', ['1', '2', '3'])
def test_pdfa_n(pdfa_level, resources, outpdf):
    """Output type pdfa-N must yield a file claiming PDF/A-Nb conformance."""
    options = [
        '--output-type',
        f'pdfa-{pdfa_level}',
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(resources / 'ccitt.pdf', outpdf, *options)

    claimed = file_claims_pdfa(outpdf)['conformance']
    assert claimed == f'PDF/A-{pdfa_level}b'


def test_decompression_bomb_error(resources, outpdf, caplog):
    """hugemono.pdf with default limits must log both expected phrases."""
    run_ocrmypdf_api(resources / 'hugemono.pdf', outpdf)
    logged = caplog.text
    assert 'decompression bomb' in logged
    assert 'max-image-mpixels' in logged


@pytest.mark.slow
def test_decompression_bomb_succeeds(resources, outpdf):
    """With `--max-image-mpixels 2000`, hugemono.pdf must give exit code 0."""
    options = ['--max-image-mpixels', '2000']
    result = run_ocrmypdf_api(resources / 'hugemono.pdf', outpdf, *options)
    assert result == 0


def test_text_curves(resources, outpdf):
    """Without --force-ocr, vector.pdf must not gain any images on page 0."""
    options = ['--plugin', 'tests/plugins/tesseract_noop.py']
    with patch('ocrmypdf._pipeline.VECTOR_PAGE_DPI', 100):
        check_ocrmypdf(resources / 'vector.pdf', outpdf, *options)

        first_page = PdfInfo(outpdf).pages[0]
        assert len(first_page.images) == 0, "added images to the vector PDF"


def test_text_curves_force(resources, outpdf):
    """With --force-ocr, page 0 of the output for vector.pdf must hold images."""
    options = ['--force-ocr', '--plugin', 'tests/plugins/tesseract_noop.py']
    with patch('ocrmypdf._pipeline.VECTOR_PAGE_DPI', 100):
        check_ocrmypdf(resources / 'vector.pdf', outpdf, *options)

        first_page = PdfInfo(outpdf).pages[0]
        assert len(first_page.images) != 0, "force did not rasterize"


def test_output_is_dir(resources, outdir, caplog):
    """Passing a directory as the output must give file_access_error."""
    options = ['--force-ocr', '--plugin', 'tests/plugins/tesseract_noop.py']
    result = run_ocrmypdf_api(resources / 'trivial.pdf', outdir, *options)
    assert result == ExitCode.file_access_error
    assert 'is not a writable file' in caplog.text


@pytest.mark.skipif(os.name == 'nt', reason="symlink needs admin permissions")
def test_output_is_symlink(resources, outdir):
    """Writing through a symlink must create a non-empty link target."""
    link_target = outdir / 'out.pdf'
    sym = Path(outdir / 'this_is_a_symlink')
    sym.symlink_to(link_target)

    options = ['--force-ocr', '--plugin', 'tests/plugins/tesseract_noop.py']
    result = run_ocrmypdf_api(resources / 'trivial.pdf', sym, *options)

    assert result == ExitCode.ok
    assert (outdir / 'out.pdf').stat().st_size > 0, 'target file not created'


def test_livecycle(resources, no_outpdf, caplog):
    """livecycle.pdf must be rejected with the input_file exit code."""
    livecycle_pdf = resources / 'livecycle.pdf'
    result = run_ocrmypdf_api(livecycle_pdf, no_outpdf)
    assert result == ExitCode.input_file, caplog.text


def test_version_check():
    """get_version must raise MissingDependencyError for each failing call."""
    # Each entry is (positional args, keyword args) for one get_version call.
    failing_calls = [
        (('NOT_FOUND_UNLIKELY_ON_PATH',), {}),
        (('sh',), {'version_arg': '-c'}),
        (('echo',), {}),
    ]
    for args, kwargs in failing_calls:
        with pytest.raises(MissingDependencyError):
            get_version(*args, **kwargs)


@pytest.mark.parametrize(
    'threshold, optimize, output_type, expected',
    [
        [1.0, 0, 'pdfa', False],
        [1.0, 0, 'pdf', False],
        [0.0, 0, 'pdfa', True],
        [0.0, 0, 'pdf', True],
        [1.0, 1, 'pdfa', False],
        [1.0, 1, 'pdf', False],
        [0.0, 1, 'pdfa', True],
        [0.0, 1, 'pdf', True],
    ],
)
def test_fast_web_view(resources, outpdf, threshold, optimize, output_type, expected):
    """The output's is_linearized flag must equal `expected` for each case."""
    options = [
        '--fast-web-view',
        threshold,
        '--optimize',
        optimize,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'trivial.pdf', outpdf, *options)
    with pikepdf.open(outpdf) as produced:
        assert produced.is_linearized == expected


def test_image_dpi_not_image(caplog, resources, outpdf):
    """Passing --image-dpi with a PDF input must log that it is ignored."""
    options = [
        '--image-dpi',
        '100',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'trivial.pdf', outpdf, *options)
    assert '--image-dpi is being ignored' in caplog.text


def test_outputtype_none_bad_setup(resources, outpdf):
    """`--output-type=none` with a real output file must give bad_args."""
    options = [
        '--output-type=none',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    proc = run_ocrmypdf(resources / 'trivial.pdf', outpdf, *options)
    assert proc.returncode == ExitCode.bad_args
    assert 'Set the output file to' in proc.stderr


def test_outputtype_none(resources, outtxt):
    """`--output-type=none` with output '-' must still write the sidecar."""
    options = [
        '--output-type=none',
        '--sidecar',
        outtxt,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    result = run_ocrmypdf_api(resources / 'trivial.pdf', '-', *options)
    assert result == ExitCode.ok
    assert outtxt.exists()


@pytest.fixture
def graph_bad_icc(resources, outdir):
    """Yield a copy of graph.pdf whose /Im0 image uses an invalid ICC profile.

    The path is yielded while the source PDF is still open.
    """
    synth_input_file = outdir / 'graph-bad-icc.pdf'
    with pikepdf.open(resources / 'graph.pdf') as pdf:
        bogus_profile = pdf.make_stream(
            b'invalid icc profile', N=3, Alternate=pikepdf.Name.DeviceRGB
        )
        icc_colorspace = pikepdf.Array([pikepdf.Name.ICCBased, bogus_profile])
        pdf.pages[0].Resources.XObject['/Im0'].ColorSpace = icc_colorspace
        pdf.save(synth_input_file)
        yield synth_input_file


def test_corrupt_icc(graph_bad_icc, outpdf, caplog):
    """A corrupt ICC profile must be logged while the run still succeeds."""
    exitcode = run_ocrmypdf_api(graph_bad_icc, outpdf)
    assert exitcode == ExitCode.ok

    needle = 'corrupt or unreadable ICC profile'
    assert any(needle in record.message for record in caplog.records)


================================================
FILE: tests/test_metadata.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import datetime as dt
import warnings
from shutil import copyfile

import pikepdf
import pytest
from pikepdf.models.metadata import decode_pdf_date

from ocrmypdf._jobcontext import PdfContext
from ocrmypdf._metadata import metadata_fixup
from ocrmypdf._pipeline import convert_to_pdfa
from ocrmypdf.api import setup_plugin_infrastructure
from ocrmypdf.cli import get_options_and_plugins
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.pdfa import file_claims_pdfa, generate_pdfa_ps
from ocrmypdf.pdfinfo import PdfInfo

from .conftest import check_ocrmypdf, run_ocrmypdf, run_ocrmypdf_api


@pytest.mark.parametrize("output_type", ['pdfa', 'pdf'])
def test_preserve_docinfo(output_type, resources, outpdf):
    """/Title and /Author from graph.pdf must carry over to the output docinfo."""
    source_pdf = resources / 'graph.pdf'
    options = [
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    output = check_ocrmypdf(source_pdf, outpdf, *options)
    with (
        pikepdf.open(source_pdf) as pdf_before,
        pikepdf.open(output) as pdf_after,
    ):
        info_before = pdf_before.docinfo
        info_after = pdf_after.docinfo
        for key in ('/Title', '/Author'):
            assert info_before[key] == info_after[key]
        claimed = file_claims_pdfa(str(output))
        assert claimed['output'] == output_type


@pytest.mark.parametrize("output_type", ['pdfa', 'pdf'])
def test_override_metadata(output_type, resources, outpdf, caplog):
    """--title and --author must replace docinfo values, keeping the creation date."""
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    options = [
        '--title',
        german,
        '--author',
        chinese,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    exitcode = run_ocrmypdf_api(input_file, outpdf, *options)

    assert exitcode == ExitCode.ok, caplog.text

    with pikepdf.open(input_file) as before, pikepdf.open(outpdf) as after:
        assert after.docinfo.Title == german, after.docinfo
        assert after.docinfo.Author == chinese, after.docinfo
        assert after.docinfo.get('/Keywords', '') == ''

        # The creation date must be carried over unchanged.
        created_before = decode_pdf_date(str(before.docinfo.CreationDate))
        created_after = decode_pdf_date(str(after.docinfo.CreationDate))
        assert created_before == created_after

        claimed = file_claims_pdfa(outpdf)
        assert claimed['output'] == output_type


@pytest.mark.parametrize('output_type', ['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'])
@pytest.mark.parametrize('field', ['title', 'author', 'subject', 'keywords'])
def test_unset_metadata(output_type, field, resources, outpdf, caplog):
    """Passing '' for a metadata field must remove that field from the output."""
    input_file = resources / 'meta.pdf'

    # magic strings contained in the input pdf metadata
    meta = {
        'title': b'NFY5f7Ft2DWMkxLhXwxvFf7eWR2KeK3vEDcd',
        'author': b'yXaryipxyRk9dVjWjSSaVaNCKeLRgEVzPRMp',
        'subject': b't49vimctvnuH7ZeAjAkv52ACvWFjcnm5MPJr',
        'keywords': b's9EeALwUg7urA7fnnhm5EtUyC54sW2WPUzqh',
    }

    options = [
        f'--{field}',
        '',
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    exitcode = run_ocrmypdf_api(input_file, outpdf, *options)

    assert exitcode == ExitCode.ok, caplog.text

    # We mainly want to ensure that when '' is passed, the corresponding
    # metadata is unset in the output pdf. Since metadata is not compressed,
    # the best way to guarantee the metadata of interest didn't carry
    # forward is to check that the corresponding magic string isn't contained
    # anywhere in the output pdf. We also check that every magic string is in
    # the input pdf and that values which were not unset are still in the
    # output pdf.
    with open(input_file, 'rb') as before, open(outpdf, 'rb') as after:
        before_data = before.read()
        after_data = after.read()

    for name, magic in meta.items():
        assert magic in before_data
        should_survive = name != field
        assert (magic in after_data) == should_survive


def test_high_unicode(resources, no_outpdf):
    """A --subject containing high Unicode must be rejected with bad_args.

    Ghostscript doesn't support high Unicode, so neither do we, to be safe.
    """
    input_file = resources / 'c02-22.pdf'
    high_unicode = 'U+1030C is: 𐌌'

    options = [
        '--subject',
        high_unicode,
        '--output-type',
        'pdfa',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    proc = run_ocrmypdf(input_file, no_outpdf, *options)

    assert proc.returncode == ExitCode.bad_args, proc.stderr


@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_bookmarks_preserved(output_type, ocr_option, resources, outpdf):
    """The table of contents reported by fitz must match before and after."""
    fitz = pytest.importorskip('fitz')

    def read_toc(pdf_path):
        # Open the document with fitz and return its table of contents.
        return fitz.Document(str(pdf_path)).get_toc()

    input_file = resources / 'toc.pdf'
    before_toc = read_toc(input_file)

    options = [
        ocr_option,
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(input_file, outpdf, *options)

    after_toc = read_toc(outpdf)
    print(before_toc)
    print(after_toc)
    assert before_toc == after_toc


def seconds_between_dates(date1, date2):
    """Return the signed number of seconds elapsed from date1 to date2."""
    elapsed = date2 - date1
    return elapsed.total_seconds()


@pytest.mark.parametrize('infile', ['trivial.pdf', 'jbig2.pdf'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_creation_date_preserved(output_type, resources, infile, outpdf):
    """The creation date must be kept (or added) and the modified date recent."""
    input_file = resources / infile

    options = [
        '--output-type',
        output_type,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(input_file, outpdf, *options)

    with pikepdf.open(input_file) as pdf_before, pikepdf.open(outpdf) as pdf_after:
        before = pdf_before.trailer.get('/Info', {})
        after = pdf_after.trailer.get('/Info', {})

        if before:
            # We expect that the creation date stayed the same
            created_before = decode_pdf_date(str(before['/CreationDate']))
            created_after = decode_pdf_date(str(after['/CreationDate']))
            assert seconds_between_dates(created_before, created_after) < 1000
        else:
            # The input has no /Info, so the output must have gained a date.
            assert after.get('/CreationDate', '') != ''

        # We expect that the modified date is quite recent
        modified_after = decode_pdf_date(str(after['/ModDate']))
        assert seconds_between_dates(modified_after, dt.datetime.now(dt.UTC)) < 1000


@pytest.fixture
def libxmp_file_to_dict():
    """Provide ``libxmp.utils.file_to_dict``, skipping the test if it cannot be imported.

    The import happens lazily inside the fixture. Any exception raised while
    importing causes the requesting test to be skipped instead of failing.

    Returns:
        The ``file_to_dict`` function imported from ``libxmp.utils``.
    """
    try:
        with warnings.catch_warnings():
            # libxmp imports distutils.Version, which is deprecated
            warnings.filterwarnings(
                "ignore",
                category=DeprecationWarning,
                message=r".*distutils Version classes are deprecated.*",
            )
            from libxmp.utils import (
                file_to_dict,  # pylint: disable=import-outside-toplevel
            )
    except Exception:  # pylint: disable=broad-except
        # Deliberately broad: any import failure is treated as "not available".
        pytest.skip("libxmp not available or libexempi3 not installed")
    return file_to_dict


@pytest.mark.parametrize(
    'test_file,output_type',
    [
        ('graph.pdf', 'pdf'),  # PDF with full metadata
        ('graph.pdf', 'pdfa'),  # PDF/A with full metadata
        ('overlay.pdf', 'pdfa'),  # /Title()
        ('3small.pdf', 'pdfa'),
    ],
)
def test_xml_metadata_preserved(
    libxmp_file_to_dict, test_file, output_type, resources, outpdf
):
    """Selected XMP properties read by libxmp must survive processing unchanged."""

    def flatten(xmpdict):
        # The top level maps namespaces to lists of (key, value, {infodict})
        # entries. Merge all namespaces into one mapping and drop the infodict.
        return {
            key: value
            for entries in xmpdict.values()
            for key, value, *_ in entries
        }

    input_file = resources / test_file

    before = libxmp_file_to_dict(str(input_file))

    options = [
        '--output-type',
        output_type,
        '--skip-text',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(input_file, outpdf, *options)

    after = libxmp_file_to_dict(str(outpdf))

    equal_properties = [
        'dc:contributor',
        'dc:coverage',
        'dc:creator',
        'dc:description',
        'dc:format',
        'dc:identifier',
        'dc:language',
        'dc:publisher',
        'dc:relation',
        'dc:rights',
        'dc:source',
        'dc:subject',
        'dc:title',
        'dc:type',
        'pdf:keywords',
    ]
    acquired_properties = ['dc:format']

    before = flatten(before)
    after = flatten(after)

    for prop in equal_properties:
        if prop in before:
            assert prop in after, f'{prop} dropped from xmp'
            assert before[prop] == after[prop]

        # libxmp presents multivalued entries (e.g. dc:title) as:
        # 'dc:title': '' <- there's a title
        # 'dc:title[1]: 'The Title' <- the actual title
        # 'dc:title[1]/?xml:lang': 'x-default' <- language info
        propidx = f'{prop}[1]'
        if propidx in before:
            wanted = before[propidx]
            assert after.get(propidx) == wanted or after.get(prop) == wanted

        newly_present = prop in after and prop not in before
        if newly_present:
            assert prop in acquired_properties, (
                f"acquired unexpected property {prop} with value "
                f"{after.get(propidx) or after.get(prop)}"
            )


def test_kodak_toc(resources, outpdf):
    """Process kcs.pdf to plain PDF output and inspect its outline root.

    If the output's /Outlines dictionary has a /First entry, that entry must
    be a dictionary.
    """
    ocrmypdf_args = [
        '--output-type',
        'pdf',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'kcs.pdf', outpdf, *ocrmypdf_args)

    with pikepdf.open(outpdf) as pdf:
        # Nothing to verify when the outline has no /First entry.
        if pikepdf.Name.First not in pdf.Root.Outlines:
            return
        assert isinstance(pdf.Root.Outlines.First, pikepdf.Dictionary)


def test_metadata_fixup_warning(resources, outdir, caplog):
    """Check when metadata_fixup logs a WARNING.

    First pass: graph.pdf is both the origin and the working file, and no
    WARNING record may be logged. Second pass: the origin is a copy that
    carries an extra ``prism2:publicationName`` XMP property, and at least
    one WARNING record must be logged.
    """
    options, _pm = get_options_and_plugins(
        ['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
    )

    working_pdf = outdir / 'graph.pdf'
    modified_pdf = outdir / 'graph_mod.pdf'
    copyfile(resources / 'graph.pdf', working_pdf)

    # Use the new setup function instead of get_plugin_manager directly
    plugin_manager = setup_plugin_infrastructure([])

    context = PdfContext(options, outdir, working_pdf, None, plugin_manager)
    metadata_fixup(working_file=working_pdf, context=context, pdf_save_settings={})
    assert all(
        record.levelname != 'WARNING' for record in caplog.records
    ), "Unexpected warning"

    # Now add some metadata that will not be copyable
    with pikepdf.open(working_pdf) as graph:
        with graph.open_metadata() as meta:
            meta['prism2:publicationName'] = 'OCRmyPDF Test'
        graph.save(modified_pdf)

    context = PdfContext(options, outdir, modified_pdf, None, plugin_manager)
    metadata_fixup(working_file=working_pdf, context=context, pdf_save_settings={})
    logged_levels = {record.levelname for record in caplog.records}
    assert 'WARNING' in logged_levels


# Packet id that appears in the XMP ``<?xpacket begin=... id="..."?>`` header.
# test_prevent_gs_invalid_xml searches the raw PDF bytes for it to find where
# the XMP packet starts.
XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'


def test_prevent_gs_invalid_xml(resources, outdir):
    """A trailing NUL in the input's Title must not show up in the output XMP.

    The XMP region of ``pdfa.pdf`` is scanned as raw bytes for both a literal
    NUL byte and its escaped form ``&#0;``.
    """
    generate_pdfa_ps(outdir / 'pdfa.ps')

    rendered_pdf = outdir / 'layers.rendered.pdf'

    # Inject a string with a trailing nul character into the DocumentInfo
    # dictionary of this PDF, as often occurs in practice.
    with pikepdf.open(resources / 'trivial.pdf') as pdf:
        pdf.Root.DocumentInfo = pikepdf.Dictionary(
            Title=b'String with trailing nul\x00'
        )
        pdf.save(rendered_pdf, fix_metadata_version=False)

    cli_args = ['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    options, _ = get_options_and_plugins(args=cli_args)
    pdfinfo = PdfInfo(rendered_pdf)

    # Use the new setup function
    plugin_manager = setup_plugin_infrastructure([])
    context = PdfContext(options, outdir, rendered_pdf, pdfinfo, plugin_manager)

    convert_to_pdfa(str(rendered_pdf), str(outdir / 'pdfa.ps'), context)

    contents = (outdir / 'pdfa.pdf').read_bytes()
    # Since the XML may be invalid, we scan instead of actually feeding it
    # to a parser.
    xmp_start = contents.find(XMP_MAGIC)
    xmp_end = contents.rfind(b'<?xpacket end', xmp_start)
    assert 0 < xmp_start < xmp_end

    # Ensure we did not carry the nul forward.
    xmp_region = contents[xmp_start:xmp_end]
    assert b'&#0;' not in xmp_region, "found escaped nul"
    assert b'\x00' not in xmp_region


================================================
FILE: tests/test_multi_font_manager.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Unit tests for MultiFontManager and FontProvider."""

from __future__ import annotations

import logging
from pathlib import Path

import pytest

from ocrmypdf.font import BuiltinFontProvider, FontManager, MultiFontManager


@pytest.fixture
def font_dir():
    """Provide ``src/ocrmypdf/data``, resolved two levels up from this file."""
    base_dir = Path(__file__).parent.parent
    return base_dir.joinpath("src", "ocrmypdf", "data")


@pytest.fixture
def multi_font_manager(font_dir):
    """Provide a MultiFontManager constructed from the ``font_dir`` fixture."""
    manager = MultiFontManager(font_dir)
    return manager


def has_cjk_font(manager: MultiFontManager) -> bool:
    """Report whether 'NotoSansCJK-Regular' is among ``manager.fonts``."""
    loaded_fonts = manager.fonts
    return 'NotoSansCJK-Regular' in loaded_fonts


def has_arabic_font(manager: MultiFontManager) -> bool:
    """Report whether 'NotoSansArabic-Regular' is among ``manager.fonts``."""
    loaded_fonts = manager.fonts
    return 'NotoSansArabic-Regular' in loaded_fonts


def has_devanagari_font(manager: MultiFontManager) -> bool:
    """Report whether 'NotoSansDevanagari-Regular' is among ``manager.fonts``."""
    loaded_fonts = manager.fonts
    return 'NotoSansDevanagari-Regular' in loaded_fonts


# Marker for tests that require CJK fonts.
# The condition is passed as a string, so nothing is evaluated on this line.
# NOTE(review): pytest is expected to evaluate the string later, using this
# module's globals, for each test that carries the marker - confirm.
# No test in this file applies the marker; the CJK tests call has_cjk_font()
# and pytest.skip() directly instead.
requires_cjk = pytest.mark.skipif(
    "not has_cjk_font(MultiFontManager())",
    reason="CJK font not available (not installed on system)"
)


# --- MultiFontManager Initialization Tests ---


def test_init_loads_builtin_fonts(multi_font_manager):
    """The manager built from the data directory has both bundled fonts."""
    # Only NotoSans-Regular and Occulta are bundled
    for bundled_name in ('NotoSans-Regular', 'Occulta'):
        assert bundled_name in multi_font_manager.fonts

    # Arabic, Devanagari, CJK are optional (system fonts), so only a lower
    # bound on the number of loaded fonts is checked.
    font_count = len(multi_font_manager.fonts)
    assert font_count >= 2


def test_missing_font_directory():
    """Building a manager from a nonexistent directory raises FileNotFoundError."""
    bogus_dir = Path("/nonexistent/path")
    with pytest.raises(FileNotFoundError):
        MultiFontManager(bogus_dir)


# --- Arabic Script Language Tests ---
# These tests require Arabic fonts to be installed on the system


def test_select_font_for_arabic_language(multi_font_manager):
    """The 'ara' hint selects NotoSansArabic-Regular for an Arabic word."""
    if not has_arabic_font(multi_font_manager):
        pytest.skip("Arabic font not available")
    chosen = multi_font_manager.select_font_for_word("مرحبا", "ara")
    expected = multi_font_manager.fonts['NotoSansArabic-Regular']
    assert chosen == expected


def test_select_font_for_persian_language(multi_font_manager):
    """The 'per' hint selects NotoSansArabic-Regular for a Persian word."""
    if not has_arabic_font(multi_font_manager):
        pytest.skip("Arabic font not available")
    chosen = multi_font_manager.select_font_for_word("سلام", "per")
    expected = multi_font_manager.fonts['NotoSansArabic-Regular']
    assert chosen == expected


def test_select_font_for_urdu_language(multi_font_manager):
    """The 'urd' hint selects NotoSansArabic-Regular for an Urdu word."""
    if not has_arabic_font(multi_font_manager):
        pytest.skip("Arabic font not available")
    chosen = multi_font_manager.select_font_for_word("ہیلو", "urd")
    expected = multi_font_manager.fonts['NotoSansArabic-Regular']
    assert chosen == expected


def test_farsi_language_code(multi_font_manager):
    """The alternative Farsi code 'fas' also selects NotoSansArabic-Regular."""
    if not has_arabic_font(multi_font_manager):
        pytest.skip("Arabic font not available")
    chosen = multi_font_manager.select_font_for_word("سلام", "fas")
    expected = multi_font_manager.fonts['NotoSansArabic-Regular']
    assert chosen == expected


# --- Devanagari Script Language Tests ---
# These tests require Devanagari fonts to be installed on the system


def test_select_font_for_hindi_language(multi_font_manager):
    """The 'hin' hint selects NotoSansDevanagari-Regular for a Hindi word."""
    if not has_devanagari_font(multi_font_manager):
        pytest.skip("Devanagari font not available")
    chosen = multi_font_manager.select_font_for_word("नमस्ते", "hin")
    expected = multi_font_manager.fonts['NotoSansDevanagari-Regular']
    assert chosen == expected


def test_select_font_for_sanskrit_language(multi_font_manager):
    """The 'san' hint selects NotoSansDevanagari-Regular for a Sanskrit word."""
    if not has_devanagari_font(multi_font_manager):
        pytest.skip("Devanagari font not available")
    chosen = multi_font_manager.select_font_for_word("संस्कृतम्", "san")
    expected = multi_font_manager.fonts['NotoSansDevanagari-Regular']
    assert chosen == expected


def test_select_font_for_marathi_language(multi_font_manager):
    """The 'mar' hint selects NotoSansDevanagari-Regular for a Marathi word."""
    if not has_devanagari_font(multi_font_manager):
        pytest.skip("Devanagari font not available")
    chosen = multi_font_manager.select_font_for_word("मराठी", "mar")
    expected = multi_font_manager.fonts['NotoSansDevanagari-Regular']
    assert chosen == expected


def test_select_font_for_nepali_language(multi_font_manager):
    """The 'nep' hint selects NotoSansDevanagari-Regular for a Nepali word."""
    if not has_devanagari_font(multi_font_manager):
        pytest.skip("Devanagari font not available")
    chosen = multi_font_manager.select_font_for_word("नेपाली", "nep")
    expected = multi_font_manager.fonts['NotoSansDevanagari-Regular']
    assert chosen == expected


# --- CJK Language Tests ---
# These tests require CJK fonts to be installed on the system


def test_select_font_for_chinese_language(multi_font_manager):
    """The ISO 639-3 code 'zho' selects NotoSansCJK-Regular for Chinese text."""
    if not has_cjk_font(multi_font_manager):
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("你好", "zho")
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected


def test_select_font_for_chinese_generic(multi_font_manager):
    """The generic code 'chi' selects NotoSansCJK-Regular for Chinese text."""
    if not has_cjk_font(multi_font_manager):
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("中文", "chi")
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected


def test_select_font_for_chinese_simplified(multi_font_manager):
    """Tesseract's 'chi_sim' code selects NotoSansCJK-Regular."""
    if not has_cjk_font(multi_font_manager):
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("简体字", "chi_sim")
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected


def test_select_font_for_chinese_traditional(multi_font_manager):
    """Tesseract's 'chi_tra' code selects NotoSansCJK-Regular."""
    if not has_cjk_font(multi_font_manager):
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("漢字", "chi_tra")
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected


def test_select_font_for_japanese_language(multi_font_manager):
    """The 'jpn' hint selects NotoSansCJK-Regular for Japanese text."""
    if not has_cjk_font(multi_font_manager):
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("こんにちは", "jpn")
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected


def test_select_font_for_korean_language(multi_font_manager):
    """The 'kor' hint selects NotoSansCJK-Regular for Korean text."""
    if not has_cjk_font(multi_font_manager):
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("안녕하세요", "kor")
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected


# --- Latin/English Tests ---


def test_select_font_for_english_text(multi_font_manager):
    """English text with the 'eng' hint is served by NotoSans-Regular."""
    chosen = multi_font_manager.select_font_for_word("Hello World", "eng")
    expected = multi_font_manager.fonts['NotoSans-Regular']
    assert chosen == expected


def test_select_font_without_language_hint(multi_font_manager):
    """With ``None`` as the language, a Latin word still gets NotoSans-Regular."""
    chosen = multi_font_manager.select_font_for_word("Hello", None)
    expected = multi_font_manager.fonts['NotoSans-Regular']
    assert chosen == expected


# --- Fallback Behavior Tests ---


def test_select_font_arabic_text_without_language_hint(multi_font_manager):
    """An Arabic word with no language hint resolves to NotoSansArabic-Regular."""
    if not has_arabic_font(multi_font_manager):
        pytest.skip("Arabic font not available")
    chosen = multi_font_manager.select_font_for_word("مرحبا", None)
    # Should get NotoSansArabic-Regular via fallback chain glyph checking
    expected = multi_font_manager.fonts['NotoSansArabic-Regular']
    assert chosen == expected


def test_devanagari_text_without_language_hint(multi_font_manager):
    """A Devanagari word with no language hint still yields some font.

    Only non-None-ness is asserted; which font is returned depends on what is
    available (the original author notes NotoSans-Regular or
    NotoSansDevanagari-Regular as candidates).
    """
    chosen = multi_font_manager.select_font_for_word("नमस्ते", None)
    assert chosen is not None


def test_cjk_text_without_language_hint(multi_font_manager):
    """A CJK word with no language hint resolves to NotoSansCJK-Regular."""
    if not has_cjk_font(multi_font_manager):
        pytest.skip("CJK font not available")
    chosen = multi_font_manager.select_font_for_word("你好", None)
    expected = multi_font_manager.fonts['NotoSansCJK-Regular']
    assert chosen == expected


def test_fallback_to_occulta_font(multi_font_manager):
    """An unrecognised language code still yields one of the loaded fonts.

    The input is the ASCII word "test" with the language code "xyz"; the
    check is only that the result is a member of ``fonts``, not that it is
    Occulta specifically.
    """
    chosen = multi_font_manager.select_font_for_word("test", "xyz")
    loaded_fonts = multi_font_manager.fonts.values()
    assert chosen in loaded_fonts


def test_fallback_fonts_constant(multi_font_manager):
    """FALLBACK_FONTS names the Latin, Arabic, Devanagari and CJK fonts."""
    expected_names = (
        'NotoSans-Regular',
        'NotoSansArabic-Regular',
        'NotoSansDevanagari-Regular',
        'NotoSansCJK-Regular',
    )
    for name in expected_names:
        assert name in MultiFontManager.FALLBACK_FONTS

    # Only NotoSans-Regular is bundled; other scripts are system fonts
    loaded_fonts = multi_font_manager.fonts
    assert 'NotoSans-Regular' in loaded_fonts


# --- Glyph Coverage Tests ---


def test_has_all_glyphs_for_english(multi_font_manager):
    """NotoSans-Regular covers plain ASCII text and an accented Latin word."""
    for sample in ("Hello World", "café"):
        assert multi_font_manager.has_all_glyphs('NotoSans-Regular', sample)


def test_has_all_glyphs_for_arabic(multi_font_manager):
    """NotoSansArabic-Regular covers every glyph of an Arabic sample word."""
    if not has_arabic_font(multi_font_manager):
        pytest.skip("Arabic font not available")
    covered = multi_font_manager.has_all_glyphs('NotoSansArabic-Regular', "مرحبا")
    assert covered


def test_has_all_glyphs_for_devanagari(multi_font_manager):
    """NotoSansDevanagari-Regular covers every glyph of a Hindi sample word."""
    if not has_devanagari_font(multi_font_manager):
        pytest.skip("Devanagari font not available")
    covered = multi_font_manager.has_all_glyphs(
        'NotoSansDevanagari-Regular', "नमस्ते"
    )
    assert covered


def test_has_all_glyphs_for_cjk(multi_font_manager):
    """NotoSansCJK-Regular covers every glyph of a Chinese sample word."""
    if not has_cjk_font(multi_font_manager):
        pytest.skip("CJK font not available")
    covered = multi_font_manager.has_all_glyphs('NotoSansCJK-Regular', "你好")
    assert covered


def test_empty_text_has_all_glyphs(multi_font_manager):
    """The empty string counts as fully covered by NotoSans-Regular."""
    covered = multi_font_manager.has_all_glyphs('NotoSans-Regular', "")
    assert covered


def test_has_all_glyphs_missing_font(multi_font_manager):
    """Asking about a font name that was never loaded gives a falsy result."""
    covered = multi_font_manager.has_all_glyphs('NonExistentFont', "test")
    assert not covered


# --- Caching Tests ---


def test_font_selection_caching(multi_font_manager):
    """A selection is recorded in ``_selection_cache`` under (word, language).

    A second lookup with the same arguments must compare equal to the first.
    """
    word, language = "Hello", "eng"

    first = multi_font_manager.select_font_for_word(word, language)
    assert (word, language) in multi_font_manager._selection_cache

    second = multi_font_manager.select_font_for_word(word, language)
    assert first == second


# --- Language Font Map Tests ---


def test_language_font_map_coverage():
    """Every value in LANGUAGE_FONT_MAP is a string starting with 'NotoSans'.

    This is a structural check only; it does not require the mapped fonts to
    be installed.
    """
    mapped_names = list(MultiFontManager.LANGUAGE_FONT_MAP.values())
    # Type check first so the prefix check never runs on a non-string.
    assert all(isinstance(name, str) for name in mapped_names)
    assert all(name.startswith('NotoSans') for name in mapped_names)


# --- get_all_fonts Tests ---


def test_get_all_fonts(multi_font_manager):
    """get_all_fonts returns a dict that includes both bundled fonts."""
    loaded = multi_font_manager.get_all_fonts()
    assert isinstance(loaded, dict)
    # NotoSans-Regular and Occulta are bundled; Arabic, Devanagari and CJK
    # are optional system fonts, hence the lower bound rather than equality.
    assert len(loaded) >= 2
    for bundled_name in ('NotoSans-Regular', 'Occulta'):
        assert bundled_name in loaded


# --- FontProvider Tests ---


class MockFontProvider:
    """Dictionary-backed font provider used to simulate missing fonts.

    Tests hand it exactly the fonts that should resolve; any other name
    looks up as ``None``.
    """

    def __init__(
        self, available_fonts: dict[str, FontManager], fallback: FontManager
    ):
        """Keep the name-to-font mapping and the font to report as fallback."""
        self._fallback = fallback
        self._fonts = available_fonts

    def get_font(self, font_name: str) -> FontManager | None:
        """Return the font stored under ``font_name``, or ``None`` if absent."""
        return self._fonts.get(font_name, None)

    def get_available_fonts(self) -> list[str]:
        """Return the stored font names as a new list."""
        return [*self._fonts.keys()]

    def get_fallback_font(self) -> FontManager:
        """Return the fallback font given at construction."""
        return self._fallback


def test_custom_font_provider(font_dir):
    """A manager given a custom provider exposes exactly that provider's fonts."""
    bundled = {
        name: FontManager(font_dir / f'{name}.ttf')
        for name in ('NotoSans-Regular', 'Occulta')
    }
    provider = MockFontProvider(bundled, bundled['Occulta'])

    manager = MultiFontManager(font_provider=provider)

    # Should only have the fonts we provided
    assert len(manager.fonts) == 2
    for name in ('NotoSans-Regular', 'Occulta'):
        assert name in manager.fonts


def test_missing_font_uses_fallback(font_dir):
    """Arabic text resolves to the provider's fallback when no Arabic font exists."""
    bundled = {
        name: FontManager(font_dir / f'{name}.ttf')
        for name in ('NotoSans-Regular', 'Occulta')
    }
    occulta = bundled['Occulta']
    manager = MultiFontManager(font_provider=MockFontProvider(bundled, occulta))

    # Arabic text should fall back to Occulta since NotoSansArabic is missing
    chosen = manager.select_font_for_word("مرحبا", "ara")
    assert chosen == bundled['Occulta']


def test_builtin_font_provider_loads_expected_fonts(font_dir):
    """BuiltinFontProvider offers exactly NotoSans-Regular and Occulta."""
    font_names = BuiltinFontProvider(font_dir).get_available_fonts()

    for bundled_name in ('NotoSans-Regular', 'Occulta'):
        assert bundled_name in font_names

    # Only Latin (NotoSans) and glyphless fallback (Occulta) are bundled.
    # All other scripts (Arabic, Devanagari, CJK, etc.) are discovered
    # from system fonts by SystemFontProvider to reduce package size.
    assert len(font_names) == 2


def test_builtin_font_provider_get_font(font_dir):
    """get_font yields a FontManager for a bundled name and None otherwise."""
    provider = BuiltinFontProvider(font_dir)

    bundled = provider.get_font('NotoSans-Regular')
    assert bundled is not None
    assert isinstance(bundled, FontManager)

    unknown = provider.get_font('NonExistent')
    assert unknown is None


def test_builtin_font_provider_get_fallback(font_dir):
    """The provider's fallback font equals the font it serves as 'Occulta'."""
    provider = BuiltinFontProvider(font_dir)

    fallback_font = provider.get_fallback_font()
    assert fallback_font is not None

    occulta = provider.get_font('Occulta')
    assert fallback_font == occulta


def test_builtin_font_provider_missing_font_logs_warning(tmp_path, font_dir, caplog):
    """A directory holding only Occulta.ttf loads, but logs the missing font."""
    # Create minimal font directory with only Occulta.ttf
    target = tmp_path / 'Occulta.ttf'
    target.write_bytes((font_dir / 'Occulta.ttf').read_bytes())

    with caplog.at_level(logging.WARNING):
        provider = BuiltinFontProvider(tmp_path)

    # Should have logged warnings for missing fonts
    for fragment in ('NotoSans-Regular', 'not found'):
        assert fragment in caplog.text

    # But Occulta should be loaded
    fallback_font = provider.get_fallback_font()
    assert fallback_font is not None


def test_builtin_font_provider_missing_occulta_raises(tmp_path):
    """An empty font directory makes BuiltinFontProvider raise FileNotFoundError."""
    expected_message = "Required fallback font"
    with pytest.raises(FileNotFoundError, match=expected_message):
        BuiltinFontProvider(tmp_path)


================================================
FILE: tests/test_multilingual_direct.py
================================================
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Direct tests for multilingual text rendering with fpdf2 renderer.

This tests the fpdf2 renderer with various language groups:
- Latin (English, French, German with diacritics)
- Arabic (Arabic, Persian - RTL scripts)
- CJK (Chinese Simplified/Traditional, Japanese, Korean)
- Devanagari (Hindi, Sanskrit)
"""
from __future__ import annotations

import shutil
import subprocess
from pathlib import Path

import pytest

from ocrmypdf.font import MultiFontManager
from ocrmypdf.fpdf_renderer import DebugRenderOptions, Fpdf2PdfRenderer
from ocrmypdf.hocrtransform.hocr_parser import HocrParser

RESOURCES = Path(__file__).parent / "resources"


@pytest.fixture
def pdftotext():
    """Return a function to extract text from PDF using pdftotext.

    Skips the test if pdftotext is not available.

    Returns:
        A callable taking the path of a PDF and returning the text that
        ``pdftotext -enc UTF-8`` writes to stdout, decoded as UTF-8.
    """
    pdftotext_path = shutil.which('pdftotext')
    if pdftotext_path is None:
        pytest.skip("pdftotext not available")

    def extract_text(pdf_path: Path) -> str:
        # Invoke the exact executable that shutil.which() resolved, so the
        # availability check above and the process launched here cannot
        # disagree about which program is meant.
        return subprocess.check_output(
            [pdftotext_path, '-enc', 'UTF-8', str(pdf_path), '-'],
            text=True,
            encoding='utf-8',
        )

    return extract_text


@pytest.fixture
def font_dir():
    """Provide ``src/ocrmypdf/data``, resolved two levels up from this file."""
    base_dir = Path(__file__).parent.parent
    return base_dir.joinpath("src", "ocrmypdf", "data")


@pytest.fixture
def multi_font_manager(font_dir):
    """Provide a MultiFontManager constructed from the ``font_dir`` fixture."""
    manager = MultiFontManager(font_dir)
    return manager


@pytest.fixture
def multi_font_manager_arabic(font_dir):
    """Provide a MultiFontManager, skipping the test if it lacks the Arabic font."""
    manager = MultiFontManager(font_dir)
    if manager.has_font("NotoSansArabic-Regular"):
        return manager
    pytest.skip("NotoSansArabic font not available")


# =============================================================================
# Latin Script Tests
# =============================================================================


class TestLatinScript:
    """Rendering and font-selection checks driven by the Latin hOCR sample."""

    @pytest.fixture
    def latin_hocr(self):
        """Provide the path of ``latin.hocr`` in the test resources directory."""
        return RESOURCES.joinpath("latin.hocr")

    def test_render_latin_basic(
        self, latin_hocr, multi_font_manager, tmp_path, pdftotext
    ):
        """Render the Latin sample visibly and look for its words in the PDF text."""
        page = HocrParser(latin_hocr).parse()
        assert page is not None

        paragraphs = list(page.paragraphs)
        assert len(paragraphs) == 3  # English, French, German

        # Check languages, in paragraph order
        for paragraph, language in zip(paragraphs, ('eng', 'fra', 'deu')):
            assert paragraph.language == language

        # Render to PDF
        output_pdf = tmp_path / "latin_output.pdf"
        Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        ).render(output_pdf)

        assert output_pdf.exists()
        assert output_pdf.stat().st_size > 0

        # Extract text and verify: one hit per language group is enough
        extracted = pdftotext(output_pdf)

        # English words
        assert any(word in extracted for word in ('quick', 'brown', 'fox'))

        # French with diacritics
        assert any(word in extracted for word in ('Café', 'résumé', 'naïve'))

        # German with umlauts and eszett
        assert any(word in extracted for word in ('Größe', 'Zürich', 'Ärger'))

    def test_latin_font_selection(self, latin_hocr, multi_font_manager):
        """Every non-empty Latin word gets a font and is covered by NotoSans."""
        page = HocrParser(latin_hocr).parse()

        for line in page.lines:
            for word in line.children:
                if not word.text:
                    continue
                selected = multi_font_manager.select_font_for_word(
                    word.text, line.language
                )
                assert selected is not None
                # Latin text should use NotoSans-Regular
                assert multi_font_manager.has_all_glyphs(
                    'NotoSans-Regular', word.text
                )


# =============================================================================
# Arabic Script Tests
# =============================================================================


class TestArabicScript:
    """Rendering, font-selection and direction checks for the Arabic hOCR sample."""

    @pytest.fixture
    def arabic_hocr(self):
        """Provide the path of ``arabic.hocr`` in the test resources directory."""
        return RESOURCES.joinpath("arabic.hocr")

    def test_render_arabic_basic(
        self, arabic_hocr, multi_font_manager_arabic, tmp_path, pdftotext
    ):
        """Render the Arabic sample visibly and look for its words in the PDF text."""
        page = HocrParser(arabic_hocr).parse()
        assert page is not None

        paragraphs = list(page.paragraphs)
        assert len(paragraphs) == 3  # Arabic paragraphs and Persian

        # Render to PDF
        output_pdf = tmp_path / "arabic_output.pdf"
        Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager_arabic,
            invisible_text=False,
        ).render(output_pdf)

        assert output_pdf.exists()
        assert output_pdf.stat().st_size > 0

        # Extract text and verify Arabic content
        extracted = pdftotext(output_pdf)

        # Arabic words: مرحبا بالعالم (Hello world)
        assert any(word in extracted for word in ('مرحبا', 'بالعالم'))
        # هذا نص عربي (This is Arabic text)
        assert any(word in extracted for word in ('عربي', 'نص'))

    def test_arabic_font_selection(self, arabic_hocr, multi_font_manager_arabic):
        """Words on 'ara'/'per' lines get a font and are covered by NotoSansArabic."""
        page = HocrParser(arabic_hocr).parse()

        for line in page.lines:
            for word in line.children:
                if not word.text or line.language not in ('ara', 'per'):
                    continue
                selected = multi_font_manager_arabic.select_font_for_word(
                    word.text, line.language
                )
                assert selected is not None
                # Arabic text should use NotoSansArabic
                assert multi_font_manager_arabic.has_all_glyphs(
                    'NotoSansArabic-Regular', word.text
                ), f"NotoSansArabic cannot render '{word.text}'"

    def test_arabic_rtl_handling(self, arabic_hocr):
        """Paragraphs tagged 'ara' or 'per' are parsed with direction 'rtl'."""
        page = HocrParser(arabic_hocr).parse()

        for paragraph in page.paragraphs:
            if paragraph.language not in ('ara', 'per'):
                continue
            # Arabic paragraphs should have RTL direction
            assert (
                paragraph.direction == 'rtl'
            ), "Arabic paragraph should have RTL direction"


# =============================================================================
# CJK Script Tests
# =============================================================================


def _latin_font_works(multi_font_manager) -> bool:
    """Check if Latin font is available."""
    return multi_font_manager.has_font('NotoSans-Regular')


def _arabic_font_works(multi_font_manager) -> bool:
    """Check if Arabic font is available."""
    return multi_font_manager.has_font('NotoSansArabic-Regular')


def _devanagari_font_works(multi_font_manager) -> bool:
    """Check if Devanagari font is available."""
    return multi_font_manager.has_font('NotoSansDevanagari-Regular')


def _cjk_font_works(multi_font_manager) -> bool:
    """Check if CJK font is working (not corrupted)."""
    return multi_font_manager.has_font('NotoSansCJK-Regular')


class TestCJKScript:
    """Rendering and font-selection checks driven by the CJK hOCR sample."""

    @pytest.fixture
    def cjk_hocr(self):
        """Provide the path of ``cjk.hocr`` in the test resources directory."""
        return RESOURCES.joinpath("cjk.hocr")

    def test_render_cjk_basic(self, cjk_hocr, multi_font_manager, tmp_path, pdftotext):
        """Render the CJK sample visibly and look for its words in the PDF text."""
        if not _cjk_font_works(multi_font_manager):
            pytest.skip("CJK font not available or corrupted")

        page = HocrParser(cjk_hocr).parse()
        assert page is not None

        paragraphs = list(page.paragraphs)
        # Chinese Simplified, Traditional, Japanese, Korean
        assert len(paragraphs) == 4

        # Check languages: each expected code appears on some paragraph
        seen_languages = [paragraph.language for paragraph in paragraphs]
        for code in ('chi_sim', 'chi_tra', 'jpn', 'kor'):
            assert code in seen_languages

        # Render to PDF
        output_pdf = tmp_path / "cjk_output.pdf"
        Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        ).render(output_pdf)

        assert output_pdf.exists()
        assert output_pdf.stat().st_size > 0

        # Extract text and verify CJK content
        extracted = pdftotext(output_pdf)

        # Chinese: 你好 世界 (Hello world)
        assert any(word in extracted for word in ('你好', '世界'))
        # Japanese: こんにちは (Hello)
        assert any(word in extracted for word in ('こんにちは', '世界'))
        # Korean: 안녕하세요 (Hello)
        assert any(word in extracted for word in ('안녕하세요', '세계'))

    def test_cjk_font_selection(self, cjk_hocr, multi_font_manager):
        """Words on CJK-language lines get a font and are covered by NotoSansCJK."""
        if not _cjk_font_works(multi_font_manager):
            pytest.skip("CJK font not available or corrupted")

        page = HocrParser(cjk_hocr).parse()

        cjk_languages = {'chi_sim', 'chi_tra', 'jpn', 'kor', 'zho', 'chi'}

        for line in page.lines:
            for word in line.children:
                if not word.text or line.language not in cjk_languages:
                    continue
                selected = multi_font_manager.select_font_for_word(
                    word.text, line.language
                )
                assert selected is not None
                # CJK text should use NotoSansCJK
                assert multi_font_manager.has_all_glyphs(
                    'NotoSansCJK-Regular', word.text
                ), f"NotoSansCJK cannot render '{word.text}'"


# =============================================================================
# Devanagari Script Tests
# =============================================================================


class TestDevanagariScript:
    """Tests for Devanagari script (Hindi, Sanskrit, etc.)."""

    @pytest.fixture
    def devanagari_hocr(self):
        """Return path to Devanagari HOCR test file."""
        return RESOURCES / "devanagari.hocr"

    def test_render_devanagari_basic(
        self, devanagari_hocr, multi_font_manager, tmp_path, pdftotext
    ):
        """Render the Devanagari hOCR page to PDF and check extracted text.

        Parses the fixture file, renders it with visible text, and verifies
        that the text extracted by ``pdftotext`` contains at least one of the
        expected Hindi words from each checked phrase.
        """
        page = HocrParser(devanagari_hocr).parse()

        assert page is not None
        paras = list(page.paragraphs)
        assert len(paras) == 3  # Hindi paragraphs and Sanskrit

        # Render to PDF
        output_pdf = tmp_path / "devanagari_output.pdf"
        renderer = Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )
        renderer.render(output_pdf)

        assert output_pdf.exists()
        assert output_pdf.stat().st_size > 0

        # Extract text and verify Devanagari content
        text = pdftotext(output_pdf)

        # Hindi: नमस्ते दुनिया (Hello world)
        assert 'नमस्ते' in text or 'दुनिया' in text
        # यह हिंदी पाठ है (This is Hindi text)
        assert 'हिंदी' in text or 'पाठ' in text

    def test_devanagari_font_selection(self, devanagari_hocr, multi_font_manager):
        """Test font selection and NotoSansDevanagari coverage for each word.

        For every non-empty word on a line tagged with a Devanagari-script
        language code, the font manager must select some font, and
        ``NotoSansDevanagari-Regular`` must report glyphs for the whole word.
        The test is skipped when that font is not registered.
        """
        if not multi_font_manager.has_font('NotoSansDevanagari-Regular'):
            pytest.skip("Devanagari font not available")
        page = HocrParser(devanagari_hocr).parse()

        devanagari_languages = {'hin', 'san', 'mar', 'nep'}

        checked_words = 0
        for line in page.lines:
            if line.language not in devanagari_languages:
                continue
            for word in line.children:
                if not word.text:
                    continue
                checked_words += 1
                font = multi_font_manager.select_font_for_word(
                    word.text, line.language
                )
                assert font is not None
                # Devanagari text should be renderable with NotoSansDevanagari
                assert multi_font_manager.has_all_glyphs(
                    'NotoSansDevanagari-Regular', word.text
                ), f"NotoSansDevanagari cannot render '{word.text}'"

        # Guard against a vacuous pass: if no line carried one of the
        # expected language tags, no assertion above would have executed.
        assert checked_words > 0, "Should have Devanagari words"


# =============================================================================
# Mixed Language / Multilingual Tests
# =============================================================================


class TestMultilingual:
    """Rendering and font-selection tests for a mixed-language hOCR page."""

    @pytest.fixture
    def multilingual_hocr(self):
        """Provide the path of the multilingual hOCR resource file."""
        return RESOURCES / "multilingual.hocr"

    def test_render_multilingual_hocr_basic(
        self, multilingual_hocr, multi_font_manager_arabic, tmp_path, pdftotext
    ):
        """Render the English/Arabic page and check both scripts survive."""
        page = HocrParser(multilingual_hocr).parse()
        assert page is not None

        # One English paragraph followed by one Arabic paragraph
        paragraphs = list(page.paragraphs)
        assert len(paragraphs) == 2
        first, second = paragraphs
        assert first.language == 'eng'
        assert second.language == 'ara'

        # Produce a PDF with visible text
        pdf_path = tmp_path / "multilingual_output.pdf"
        Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager_arabic,
            invisible_text=False,
        ).render(pdf_path)

        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

        extracted = pdftotext(pdf_path)

        # At least one English token must be extractable
        assert any(token in extracted for token in ('English', 'Text', 'Here'))
        # Arabic text: مرحبا بك
        assert any(token in extracted for token in ('مرحبا', 'بك'))

    def test_render_multilingual_with_debug_options(
        self, multilingual_hocr, multi_font_manager, tmp_path
    ):
        """Render with baseline, line-box and word-box debug overlays on."""
        page = HocrParser(multilingual_hocr).parse()

        pdf_path = tmp_path / "multilingual_debug.pdf"
        Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
            debug_render_options=DebugRenderOptions(
                render_baseline=True,
                render_line_bbox=True,
                render_word_bbox=True,
            ),
        ).render(pdf_path)

        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

    def test_multilingual_invisible_text(
        self, multilingual_hocr, multi_font_manager, tmp_path, pdftotext
    ):
        """Render in invisible-text mode and confirm text is extractable."""
        page = HocrParser(multilingual_hocr).parse()

        # Invisible text is the standard mode for an OCR layer
        pdf_path = tmp_path / "multilingual_invisible.pdf"
        Fpdf2PdfRenderer(
            page=page,
            dpi=300.0,
            multi_font_manager=multi_font_manager,
            invisible_text=True,
        ).render(pdf_path)

        assert pdf_path.exists()

        # Invisible text must still come out of text extraction
        extracted = pdftotext(pdf_path)
        assert len(extracted.strip()) > 0

    def test_multilingual_font_selection(
        self, multilingual_hocr, multi_font_manager_arabic
    ):
        """Check a font is chosen for every word, and Arabic is covered."""
        page = HocrParser(multilingual_hocr).parse()

        # Pair every non-empty word with the language of its line
        tagged_words = [
            (word.text, line.language)
            for line in page.lines
            for word in line.children
            if word.text
        ]

        english = [token for token, code in tagged_words if code == 'eng']
        arabic = [token for token, code in tagged_words if code == 'ara']

        assert len(english) > 0, "Should have English words"
        assert len(arabic) > 0, "Should have Arabic words"

        for text, lang in tagged_words:
            selected = multi_font_manager_arabic.select_font_for_word(text, lang)
            assert selected is not None, f"No font selected for '{text}' ({lang})"

            if lang != 'ara':
                continue
            assert multi_font_manager_arabic.has_all_glyphs(
                'NotoSansArabic-Regular', text
            ), f"NotoSansArabic cannot render '{text}'"


# =============================================================================
# Baseline and Structure Tests
# =============================================================================


class TestBaselineHandling:
    """Checks on baseline data parsed from hOCR."""

    @pytest.fixture
    def multilingual_hocr(self):
        """Provide the path of the multilingual hOCR resource file."""
        return RESOURCES / "multilingual.hocr"

    def test_multilingual_baseline_handling(self, multilingual_hocr):
        """Every parsed line baseline must have a slope within [-1, 1]."""
        page = HocrParser(multilingual_hocr).parse()

        # Lines without a (truthy) baseline are ignored
        lines_with_baseline = (ln for ln in page.lines if ln.baseline)
        for ln in lines_with_baseline:
            slope = ln.baseline.slope
            assert -1.0 <= slope <= 1.0, "Baseline slope should be reasonable"


# =============================================================================
# Font Coverage Tests
# =============================================================================


class TestFontCoverage:
    """Glyph-coverage checks for the Noto fonts across several scripts."""

    def test_noto_sans_latin_coverage(self, multi_font_manager):
        """NotoSans must cover plain Latin text and accented letters."""
        if not _latin_font_works(multi_font_manager):
            pytest.skip("NotoSans font not available")

        samples = (
            "Hello World",
            "Café résumé naïve",
            "Größe Zürich Ärger",
            "ÀÁÂÃÄÅÆÇÈÉÊË",
            "àáâãäåæçèéêë",
        )
        for phrase in samples:
            covered = multi_font_manager.has_all_glyphs('NotoSans-Regular', phrase)
            assert covered, f"NotoSans should cover: {phrase}"

    def test_noto_sans_arabic_coverage(self, multi_font_manager_arabic):
        """NotoSansArabic must cover a few common Arabic words."""
        samples = (
            "مرحبا",  # Hello
            "بالعالم",  # World
            "العربية",  # Arabic
        )
        for phrase in samples:
            covered = multi_font_manager_arabic.has_all_glyphs(
                'NotoSansArabic-Regular', phrase
            )
            assert covered, f"NotoSansArabic should cover: {phrase}"

    def test_noto_sans_devanagari_coverage(self, multi_font_manager):
        """NotoSansDevanagari must cover a few common Devanagari words."""
        if not _devanagari_font_works(multi_font_manager):
            pytest.skip("NotoSansDevanagari font not available")

        samples = (
            "नमस्ते",  # Hello
            "हिंदी",  # Hindi
            "संस्कृत",  # Sanskrit
        )
        for phrase in samples:
            covered = multi_font_manager.has_all_glyphs(
                'NotoSansDevanagari-Regular', phrase
            )
            assert covered, f"NotoSansDevanagari should cover: {phrase}"

    def test_noto_sans_cjk_coverage(self, multi_font_manager):
        """NotoSansCJK must cover Chinese, Japanese and Korean samples."""
        if not _cjk_font_works(multi_font_manager):
            pytest.skip("CJK font not available or corrupted")

        samples = (
            "你好",  # Chinese: Hello
            "世界",  # Chinese: World
            "こんにちは",  # Japanese: Hello
            "안녕하세요",  # Korean: Hello
        )
        for phrase in samples:
            covered = multi_font_manager.has_all_glyphs(
                'NotoSansCJK-Regular', phrase
            )
            assert covered, f"NotoSansCJK should cover: {phrase}"


if __name__ == "__main__":
    # Direct-execution hook for quick iteration: run only this module under
    # pytest with "-v -s" and exit with pytest's return code.
    import sys

    pytest_args = [__file__, "-v", "-s"]
    sys.exit(pytest.main(pytest_args))


================================================
FILE: tests/test_null_ocr_engine.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Unit tests for NullOcrEngine (--ocr-engine none).

Tests verify that the Null OCR engine exists and functions correctly
for scenarios where users want PDF processing without OCR.
"""

from __future__ import annotations

from pathlib import Path
from unittest.mock import MagicMock

import pytest


class TestNullOcrEngineExists:
    """Smoke tests: the null OCR plugin and its engine class can be imported."""

    def test_null_ocr_module_importable(self):
        """Importing the null_ocr plugin module must succeed."""
        import ocrmypdf.builtin_plugins.null_ocr as plugin_module

        assert plugin_module is not None

    def test_null_ocr_engine_class_exists(self):
        """The plugin module must expose a NullOcrEngine class."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as engine_cls

        assert engine_cls is not None


class TestNullOcrEngineInterface:
    """Checks the values NullOcrEngine returns from the OcrEngine methods."""

    def test_version_returns_none(self):
        """version() must report the literal string 'none'."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        reported = NullOcrEngine.version()
        assert reported == "none"

    def test_creator_tag(self):
        """creator_tag() must mention that no OCR was performed."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        # Case-insensitive match against any of the accepted markers
        lowered = NullOcrEngine.creator_tag(MagicMock()).lower()
        assert any(marker in lowered for marker in ("no ocr", "null", "none"))

    def test_languages_returns_empty_set(self):
        """languages() must yield an empty set."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        assert NullOcrEngine.languages(MagicMock()) == set()

    def test_supports_generate_ocr_returns_true(self):
        """supports_generate_ocr() must be exactly True."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        supported = NullOcrEngine.supports_generate_ocr()
        assert supported is True

    def test_get_orientation_returns_zero(self):
        """get_orientation() must report an angle of 0."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        orientation = NullOcrEngine.get_orientation(Path("test.png"), MagicMock())
        assert orientation.angle == 0

    def test_get_deskew_returns_zero(self):
        """get_deskew() must report 0.0."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        skew = NullOcrEngine.get_deskew(Path("test.png"), MagicMock())
        assert skew == 0.0


class TestNullOcrEngineGenerateOcr:
    """Checks the (tree, text) pair produced by NullOcrEngine.generate_ocr()."""

    @pytest.fixture
    def sample_image(self, tmp_path):
        """Write a 100x100 white RGB PNG tagged 300 DPI and return its path."""
        from PIL import Image

        target = tmp_path / "test.png"
        Image.new('RGB', (100, 100), color='white').save(target, dpi=(300, 300))
        return target

    def test_generate_ocr_returns_tuple(self, sample_image):
        """The result must be a 2-tuple of (OcrElement, str)."""
        from ocrmypdf import OcrElement
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        outcome = NullOcrEngine.generate_ocr(sample_image, MagicMock(), 0)

        assert isinstance(outcome, tuple)
        assert len(outcome) == 2
        tree, text = outcome
        assert isinstance(tree, OcrElement)
        assert isinstance(text, str)

    def test_generate_ocr_returns_empty_text(self, sample_image):
        """The text half of the result must be the empty string."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        outcome = NullOcrEngine.generate_ocr(sample_image, MagicMock(), 0)
        _, text = outcome

        assert text == ""

    def test_generate_ocr_returns_page_element(self, sample_image):
        """The tree half of the result must be a PAGE-class element."""
        from ocrmypdf import OcrClass
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        outcome = NullOcrEngine.generate_ocr(sample_image, MagicMock(), 0)
        tree, _ = outcome

        assert tree.ocr_class == OcrClass.PAGE

    def test_generate_ocr_page_has_correct_dimensions(self, sample_image):
        """The page bounding box must extend to the image's size."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        outcome = NullOcrEngine.generate_ocr(sample_image, MagicMock(), 0)
        tree, _ = outcome

        # The fixture image is 100x100 pixels
        page_box = tree.bbox
        assert page_box.right == 100
        assert page_box.bottom == 100


class TestOcrEngineOption:
    """Test --ocr-engine CLI option."""

    @staticmethod
    def _ocr_engine_action():
        """Return the argparse action registered for ``--ocr-engine``.

        Fails the calling test if the parser has no such option, so that
        tests inspecting the action can never pass vacuously.
        """
        from ocrmypdf.cli import get_parser

        for action in get_parser()._actions:
            if '--ocr-engine' in action.option_strings:
                return action
        pytest.fail("--ocr-engine option not found")

    def test_ocr_engine_option_accepted(self):
        """CLI should accept --ocr-engine option."""
        from ocrmypdf.cli import get_parser

        parser = get_parser()

        # Should not raise
        args = parser.parse_args(['--ocr-engine', 'none', 'in.pdf', 'out.pdf'])
        assert args.ocr_engine == 'none'

    def test_ocr_engine_choices_include_none(self):
        """--ocr-engine should include 'none' as a choice."""
        action = self._ocr_engine_action()
        assert 'none' in action.choices

    def test_ocr_engine_choices_include_auto(self):
        """--ocr-engine should include 'auto' as a choice and as default."""
        # The helper fails the test when the option is missing; previously
        # this test silently passed in that case.
        action = self._ocr_engine_action()
        assert 'auto' in action.choices
        assert action.default == 'auto'


================================================
FILE: tests/test_ocr_element.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Unit tests for OcrElement dataclass and related classes."""

from __future__ import annotations

import pytest

from ocrmypdf.hocrtransform import (
    Baseline,
    BoundingBox,
    FontInfo,
    OcrClass,
    OcrElement,
)


class TestBoundingBox:
    """Construction, derived size, and validation of BoundingBox."""

    def test_basic_creation(self):
        """Each coordinate is stored under its own attribute."""
        box = BoundingBox(left=10, top=20, right=100, bottom=50)
        edges = (box.left, box.top, box.right, box.bottom)
        assert edges == (10, 20, 100, 50)

    def test_width_height(self):
        """Width and height are the differences of opposite edges."""
        box = BoundingBox(left=10, top=20, right=110, bottom=70)
        assert box.width == 100
        assert box.height == 50

    def test_zero_size_box(self):
        """A degenerate box with coincident edges is allowed."""
        box = BoundingBox(left=10, top=20, right=10, bottom=20)
        assert box.width == 0
        assert box.height == 0

    def test_invalid_left_right(self):
        """right < left is rejected with a ValueError."""
        bad_coords = dict(left=100, top=20, right=10, bottom=50)
        with pytest.raises(ValueError, match="right.*left"):
            BoundingBox(**bad_coords)

    def test_invalid_top_bottom(self):
        """bottom < top is rejected with a ValueError."""
        bad_coords = dict(left=10, top=50, right=100, bottom=20)
        with pytest.raises(ValueError, match="bottom.*top"):
            BoundingBox(**bad_coords)


class TestBaseline:
    """Default and explicit values of the Baseline dataclass."""

    def test_defaults(self):
        """With no arguments, slope and intercept are both 0.0."""
        flat = Baseline()
        assert flat.slope == 0.0
        assert flat.intercept == 0.0

    def test_with_values(self):
        """Explicit slope and intercept are stored unchanged."""
        tilted = Baseline(slope=0.01, intercept=-5)
        assert tilted.slope == 0.01
        assert tilted.intercept == -5


class TestFontInfo:
    """Default and explicit values of the FontInfo dataclass."""

    def test_defaults(self):
        """Name and size default to None; bold and italic default to False."""
        info = FontInfo()
        assert info.name is None
        assert info.size is None
        assert info.bold is False
        assert info.italic is False

    def test_with_values(self):
        """Supplied fields are stored; unspecified italic stays False."""
        info = FontInfo(name="Arial", size=12.0, bold=True)
        assert info.name == "Arial"
        assert info.size == 12.0
        assert info.bold is True
        assert info.italic is False


class TestOcrElement:
    """Construction, tree traversal, and attribute tests for OcrElement."""

    def test_minimal_element(self):
        """A word with only text has no bbox and an empty child list."""
        node = OcrElement(ocr_class=OcrClass.WORD, text="hello")
        assert node.ocr_class == "ocrx_word"
        assert node.text == "hello"
        assert node.bbox is None
        assert node.children == []

    def test_element_with_bbox(self):
        """A bbox passed at construction is stored and usable."""
        box = BoundingBox(left=0, top=0, right=100, bottom=50)
        node = OcrElement(ocr_class=OcrClass.LINE, bbox=box)
        assert node.bbox == box
        assert node.bbox.width == 100

    def test_element_hierarchy(self):
        """Children nest page -> paragraph -> line -> words."""
        leaves = [
            OcrElement(ocr_class=OcrClass.WORD, text="Hello"),
            OcrElement(ocr_class=OcrClass.WORD, text="World"),
        ]
        line = OcrElement(ocr_class=OcrClass.LINE, children=leaves)
        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, children=[line])
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[paragraph])

        assert len(page.children) == 1
        first_par = page.children[0]
        assert len(first_par.children) == 1
        first_line = first_par.children[0]
        assert len(first_line.children) == 2

    def test_iter_by_class_single(self):
        """iter_by_class finds a single nested word."""
        leaf = OcrElement(ocr_class=OcrClass.WORD, text="test")
        line = OcrElement(ocr_class=OcrClass.LINE, children=[leaf])
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        found = page.iter_by_class(OcrClass.WORD)
        assert len(found) == 1
        assert found[0].text == "test"

    def test_iter_by_class_multiple(self):
        """iter_by_class returns all matching words in order."""
        leaves = [
            OcrElement(ocr_class=OcrClass.WORD, text=label)
            for label in ("one", "two", "three")
        ]
        line = OcrElement(ocr_class=OcrClass.LINE, children=leaves)
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        found = page.iter_by_class(OcrClass.WORD)
        assert len(found) == 3
        assert [leaf.text for leaf in found] == ["one", "two", "three"]

    def test_iter_by_class_multiple_types(self):
        """iter_by_class accepts several classes and ignores the rest."""
        siblings = [
            OcrElement(ocr_class=OcrClass.LINE),
            OcrElement(ocr_class=OcrClass.HEADER),
            OcrElement(ocr_class=OcrClass.CAPTION),
        ]
        page = OcrElement(ocr_class=OcrClass.PAGE, children=siblings)

        found = page.iter_by_class(OcrClass.LINE, OcrClass.HEADER)
        assert len(found) == 2

    def test_find_by_class(self):
        """find_by_class returns the nested word element."""
        leaf = OcrElement(ocr_class=OcrClass.WORD, text="found")
        line = OcrElement(ocr_class=OcrClass.LINE, children=[leaf])
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        hit = page.find_by_class(OcrClass.WORD)
        assert hit is not None
        assert hit.text == "found"

    def test_find_by_class_not_found(self):
        """find_by_class returns None when nothing matches."""
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            children=[OcrElement(ocr_class=OcrClass.LINE)],
        )

        hit = page.find_by_class(OcrClass.WORD)
        assert hit is None

    def test_get_text_recursive_leaf(self):
        """A leaf word yields its own text."""
        leaf = OcrElement(ocr_class=OcrClass.WORD, text="hello")
        assert leaf.get_text_recursive() == "hello"

    def test_get_text_recursive_nested(self):
        """A line yields its words' text separated by a space."""
        leaves = [
            OcrElement(ocr_class=OcrClass.WORD, text="Hello"),
            OcrElement(ocr_class=OcrClass.WORD, text="World"),
        ]
        line = OcrElement(ocr_class=OcrClass.LINE, children=leaves)

        assert line.get_text_recursive() == "Hello World"

    def test_words_property(self):
        """The words property exposes nested word elements in order."""
        leaves = [
            OcrElement(ocr_class=OcrClass.WORD, text="a"),
            OcrElement(ocr_class=OcrClass.WORD, text="b"),
        ]
        line = OcrElement(ocr_class=OcrClass.LINE, children=leaves)
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        assert len(page.words) == 2
        assert page.words[0].text == "a"

    def test_lines_property(self):
        """The lines property counts headers as lines too."""
        plain_line = OcrElement(ocr_class=OcrClass.LINE)
        header_line = OcrElement(ocr_class=OcrClass.HEADER)  # Also a line type
        paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH, children=[plain_line, header_line]
        )
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[paragraph])

        assert len(page.lines) == 2

    def test_paragraphs_property(self):
        """The paragraphs property exposes both paragraph children."""
        paragraphs = [
            OcrElement(ocr_class=OcrClass.PARAGRAPH),
            OcrElement(ocr_class=OcrClass.PARAGRAPH),
        ]
        page = OcrElement(ocr_class=OcrClass.PAGE, children=paragraphs)

        assert len(page.paragraphs) == 2

    def test_direction_ltr(self):
        """A left-to-right direction is stored as given."""
        node = OcrElement(ocr_class=OcrClass.PARAGRAPH, direction="ltr")
        assert node.direction == "ltr"

    def test_direction_rtl(self):
        """A right-to-left direction is stored as given."""
        node = OcrElement(ocr_class=OcrClass.PARAGRAPH, direction="rtl")
        assert node.direction == "rtl"

    def test_language(self):
        """The language code is stored as given."""
        node = OcrElement(ocr_class=OcrClass.PARAGRAPH, language="eng")
        assert node.language == "eng"

    def test_baseline(self):
        """A Baseline passed at construction is reachable on the element."""
        node = OcrElement(
            ocr_class=OcrClass.LINE,
            baseline=Baseline(slope=0.01, intercept=-3),
        )
        assert node.baseline.slope == 0.01
        assert node.baseline.intercept == -3

    def test_textangle(self):
        """The text angle is stored as given."""
        node = OcrElement(ocr_class=OcrClass.LINE, textangle=5.0)
        assert node.textangle == 5.0

    def test_confidence(self):
        """The confidence value is stored as given."""
        node = OcrElement(ocr_class=OcrClass.WORD, confidence=0.95)
        assert node.confidence == 0.95

    def test_page_properties(self):
        """Page-level dpi and page numbers are stored as given."""
        page_fields = dict(dpi=300.0, page_number=0, logical_page_number=1)
        node = OcrElement(ocr_class=OcrClass.PAGE, **page_fields)
        assert node.dpi == 300.0
        assert node.page_number == 0
        assert node.logical_page_number == 1


class TestOcrClass:
    """Checks the string constants and LINE_TYPES grouping on OcrClass."""

    def test_class_values(self):
        """Each constant equals its hOCR class-name string."""
        expected = {
            "PAGE": "ocr_page",
            "PARAGRAPH": "ocr_par",
            "LINE": "ocr_line",
            "WORD": "ocrx_word",
            "HEADER": "ocr_header",
            "CAPTION": "ocr_caption",
        }
        for attr, value in expected.items():
            assert getattr(OcrClass, attr) == value

    def test_line_types_frozenset(self):
        """LINE_TYPES holds LINE, HEADER and CAPTION, but not WORD."""
        for line_like in (OcrClass.LINE, OcrClass.HEADER, OcrClass.CAPTION):
            assert line_like in OcrClass.LINE_TYPES
        assert OcrClass.WORD not in OcrClass.LINE_TYPES


================================================
FILE: tests/test_ocr_engine_interface.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Unit tests for OcrEngine interface extensions.

These tests verify that the OcrEngine ABC has the new generate_ocr() method
and that OcrElement classes are exported from the public API.
"""

from __future__ import annotations

from pathlib import Path
from unittest.mock import MagicMock

import pytest

from ocrmypdf.pluginspec import OcrEngine


class TestOcrEngineInterface:
    """Test that OcrEngine ABC has required methods."""

    @staticmethod
    def _make_minimal_engine():
        """Return an instance of a bare-bones concrete OcrEngine subclass.

        The subclass overrides version, creator_tag, __str__, languages,
        get_orientation, get_deskew, generate_hocr and generate_pdf with
        trivial bodies. generate_ocr() and supports_generate_ocr() are
        deliberately left inherited so tests observe the base-class defaults.
        """
        from ocrmypdf.pluginspec import OrientationConfidence

        class MinimalEngine(OcrEngine):
            @staticmethod
            def version():
                return "1.0"

            @staticmethod
            def creator_tag(options):
                return "test"

            def __str__(self):
                return "test"

            @staticmethod
            def languages(options):
                return set()

            @staticmethod
            def get_orientation(input_file, options):
                return OrientationConfidence(0, 0.0)

            @staticmethod
            def get_deskew(input_file, options):
                return 0.0

            @staticmethod
            def generate_hocr(input_file, output_hocr, output_text, options):
                pass

            @staticmethod
            def generate_pdf(input_file, output_pdf, output_text, options):
                pass

        return MinimalEngine()

    def test_generate_ocr_method_exists(self):
        """OcrEngine must have generate_ocr() method signature."""
        assert hasattr(OcrEngine, 'generate_ocr')

    def test_supports_generate_ocr_method_exists(self):
        """OcrEngine must have supports_generate_ocr() method."""
        assert hasattr(OcrEngine, 'supports_generate_ocr')

    def test_supports_generate_ocr_default_false(self):
        """Default supports_generate_ocr() should return False."""
        engine = self._make_minimal_engine()
        assert engine.supports_generate_ocr() is False

    def test_generate_ocr_raises_not_implemented_by_default(self):
        """Default generate_ocr() should raise NotImplementedError."""
        engine = self._make_minimal_engine()
        with pytest.raises(NotImplementedError):
            engine.generate_ocr(Path("test.png"), MagicMock(), 0)


class TestOcrElementExport:
    """Checks that the OCR element types are importable from ``ocrmypdf``."""

    def test_ocrelement_importable_from_ocrmypdf(self):
        """``from ocrmypdf import OcrElement`` must succeed."""
        from ocrmypdf import OcrElement as exported

        assert exported is not None

    def test_ocrclass_importable_from_ocrmypdf(self):
        """``from ocrmypdf import OcrClass`` must succeed."""
        from ocrmypdf import OcrClass as exported

        assert exported is not None

    def test_boundingbox_importable_from_ocrmypdf(self):
        """``from ocrmypdf import BoundingBox`` must succeed."""
        from ocrmypdf import BoundingBox as exported

        assert exported is not None


================================================
FILE: tests/test_ocr_engine_selection.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Unit tests for OCR engine selection mechanism.

Tests verify that the --ocr-engine option works correctly and that
engine-specific options are available.
"""

from __future__ import annotations

import pytest


class TestOcrEngineCliOption:
    """Argument-parser behaviour of the --ocr-engine option."""

    def test_ocr_engine_option_exists(self):
        """The parser must register an --ocr-engine option string."""
        from ocrmypdf.cli import get_parser

        # Flatten the option strings of every registered action
        registered = [
            flag
            for action in get_parser()._actions
            for flag in action.option_strings
        ]

        assert '--ocr-engine' in registered

    def test_ocr_engine_accepts_tesseract(self):
        """'tesseract' must be an accepted value."""
        from ocrmypdf.cli import get_parser

        argv = ['--ocr-engine', 'tesseract', 'in.pdf', 'out.pdf']
        parsed = get_parser().parse_args(argv)
        assert parsed.ocr_engine == 'tesseract'

    def test_ocr_engine_accepts_auto(self):
        """'auto' must be an accepted value."""
        from ocrmypdf.cli import get_parser

        argv = ['--ocr-engine', 'auto', 'in.pdf', 'out.pdf']
        parsed = get_parser().parse_args(argv)
        assert parsed.ocr_engine == 'auto'

    def test_ocr_engine_accepts_none(self):
        """'none' must be an accepted value."""
        from ocrmypdf.cli import get_parser

        argv = ['--ocr-engine', 'none', 'in.pdf', 'out.pdf']
        parsed = get_parser().parse_args(argv)
        assert parsed.ocr_engine == 'none'

    def test_ocr_engine_default_is_auto(self):
        """Omitting the option must yield 'auto'."""
        from ocrmypdf.cli import get_parser

        parsed = get_parser().parse_args(['in.pdf', 'out.pdf'])
        assert parsed.ocr_engine == 'auto'

    def test_ocr_engine_rejects_invalid(self):
        """An unknown engine name must make the parser exit."""
        from ocrmypdf.cli import get_parser

        # Build the parser outside the raises block so only parsing is checked
        cli_parser = get_parser()
        argv = ['--ocr-engine', 'invalid_engine', 'in.pdf', 'out.pdf']

        with pytest.raises(SystemExit):
            cli_parser.parse_args(argv)


class TestOcrEngineOptionsModel:
    """Checks that the OcrOptions model declares an ocr_engine field."""

    def test_ocr_options_has_ocr_engine_field(self):
        """'ocr_engine' must be among OcrOptions.model_fields."""
        from ocrmypdf._options import OcrOptions

        declared_fields = OcrOptions.model_fields
        assert 'ocr_engine' in declared_fields


class TestOcrEnginePluginSelection:
    """Check what each built-in plugin's get_ocr_engine() hook returns.

    Every test passes a mock options object whose ``ocr_engine`` attribute is
    set to the value under test.
    """

    @staticmethod
    def _mock_options(engine_name):
        """Return a MagicMock whose ``ocr_engine`` attribute is *engine_name*."""
        from unittest.mock import MagicMock

        fake_options = MagicMock()
        fake_options.ocr_engine = engine_name
        return fake_options

    def test_tesseract_selected_when_auto(self):
        """tesseract_ocr yields a TesseractOcrEngine for ocr_engine='auto'."""
        fake_options = self._mock_options('auto')

        from ocrmypdf.builtin_plugins import tesseract_ocr
        from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine

        selected = tesseract_ocr.get_ocr_engine(options=fake_options)
        assert isinstance(selected, TesseractOcrEngine)

    def test_tesseract_selected_when_tesseract(self):
        """tesseract_ocr yields a TesseractOcrEngine for ocr_engine='tesseract'."""
        fake_options = self._mock_options('tesseract')

        from ocrmypdf.builtin_plugins import tesseract_ocr
        from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine

        selected = tesseract_ocr.get_ocr_engine(options=fake_options)
        assert isinstance(selected, TesseractOcrEngine)

    def test_null_selected_when_none(self):
        """null_ocr yields a NullOcrEngine for ocr_engine='none'."""
        fake_options = self._mock_options('none')

        from ocrmypdf.builtin_plugins import null_ocr
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine

        selected = null_ocr.get_ocr_engine(options=fake_options)
        assert isinstance(selected, NullOcrEngine)

    def test_null_returns_none_when_auto(self):
        """null_ocr.get_ocr_engine() returns None for ocr_engine='auto'."""
        fake_options = self._mock_options('auto')

        from ocrmypdf.builtin_plugins import null_ocr

        selected = null_ocr.get_ocr_engine(options=fake_options)
        assert selected is None


================================================
FILE: tests/test_optimize.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

from io import BytesIO
from os import fspath
from pathlib import Path
from unittest.mock import patch

import img2pdf
import pikepdf
import pytest
from pikepdf import Array, Dictionary, Name
from PIL import Image, ImageDraw

from ocrmypdf import optimize as opt
from ocrmypdf._exec import jbig2enc, pngquant
from ocrmypdf._exec.ghostscript import rasterize_pdf
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution
from ocrmypdf.optimize import PdfImage, extract_image_filter
from ocrmypdf.pluginspec import GhostscriptRasterDevice
from tests.conftest import check_ocrmypdf

# Skip markers for tests that depend on pngquant / jbig2enc. Each condition is
# evaluated once, when this module is imported.
needs_pngquant = pytest.mark.skipif(
    not pngquant.available(),
    reason="pngquant not installed",
)
needs_jbig2enc = pytest.mark.skipif(
    not jbig2enc.available(),
    reason="jbig2enc not installed",
)


# pylint:disable=redefined-outer-name


@pytest.fixture(scope="session")
def palette(resources):
    """Session-scoped fixture returning ``resources / 'palette.pdf'``."""
    palette_pdf = resources / 'palette.pdf'
    return palette_pdf


@needs_pngquant
@pytest.mark.parametrize('pdf', ['multipage', 'palette'])
def test_basic(multipage, palette, pdf, outpdf):
    """Optimize a sample PDF at level 3 and bound the size of the output.

    The output size scaled by 0.98 must not exceed the size of the input.
    """
    source = multipage if pdf == 'multipage' else palette
    opt.main(source, outpdf, level=3)

    optimized_size = Path(outpdf).stat().st_size
    original_size = Path(source).stat().st_size
    assert 0.98 * optimized_size <= original_size


@needs_pngquant
def test_mono_not_inverted(resources, outdir):
    """Optimize ``2400dpi.pdf`` and check that its background stays white.

    The optimized file is rasterized with the PNGGRAY device at 10x10 DPI and
    the pixel at (0, 0) must be brighter than 240.
    """
    optimized_pdf = outdir / 'out.pdf'
    raster_png = outdir / 'im.png'

    opt.main(resources / '2400dpi.pdf', optimized_pdf, level=3)
    rasterize_pdf(
        optimized_pdf,
        raster_png,
        raster_device=GhostscriptRasterDevice.PNGGRAY,
        raster_dpi=Resolution(10, 10),
    )

    with Image.open(fspath(raster_png)) as im:
        corner_pixel = im.getpixel((0, 0))
    assert corner_pixel > 240, "Expected white background"


@needs_pngquant
def test_jpg_png_params(resources, outpdf):
    """Run check_ocrmypdf on ``crom.png`` with explicit JPEG/PNG quality options."""
    cli_options = [
        '--image-dpi',
        '200',
        '--optimize',
        '3',
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'crom.png', outpdf, *cli_options)


@needs_jbig2enc
def test_jbig2_lossless(resources, outpdf):
    """Check that the output image is JBIG2-encoded with no decode parameters."""
    check_ocrmypdf(
        resources / 'ccitt.pdf',
        outpdf,
        '--image-dpi',
        '200',
        '--optimize',
        '3',
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--jbig2-threshold',
        '0.7',
    )

    with pikepdf.open(outpdf) as pdf:
        first_image = next(iter(pdf.pages[0].images.values()))
        pim = pikepdf.PdfImage(first_image)
        assert pim.filters[0] == '/JBIG2Decode'
        # Lossless JBIG2 has no JBIG2Globals (no shared symbol dictionary),
        # so the decode parameters are expected to be empty.
        assert len(pim.decode_parms) == 0


@needs_pngquant
@needs_jbig2enc
def test_flate_to_jbig2(resources, outdir):
    """Check that an 8-bit grayscale PNG ends up JBIG2-encoded in the output."""
    # This test requires an image that pngquant is capable of converting to
    # 1bpp - so use an existing 1bpp image, convert up, confirm it can
    # convert down
    gray_png = outdir / 'type8.png'
    result_pdf = outdir / 'out.pdf'

    with Image.open(fspath(resources / 'typewriter.png')) as im:
        assert im.mode in ('1', 'P')
        im.convert('L').save(fspath(gray_png))

    cli_options = [
        '--image-dpi',
        '100',
        '--png-quality',
        '50',
        '--optimize',
        '3',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(gray_png, result_pdf, *cli_options)

    with pikepdf.open(result_pdf) as pdf:
        first_image = next(iter(pdf.pages[0].images.values()))
        pim = pikepdf.PdfImage(first_image)
        assert pim.filters[0] == '/JBIG2Decode'


@needs_pngquant
def test_multiple_pngs(resources, outdir):
    """Optimize a two-image PDF with pngquant.quantize replaced by a stand-in.

    The stand-in overwrites each image with a flat fill of 128. Afterwards the
    first image on every output page must have fewer raw bytes than the first
    image on the matching input page.
    """
    input_pdf = outdir / 'in.pdf'
    output_pdf = outdir / 'out.pdf'

    with input_pdf.open('wb') as stream:
        img2pdf.convert(
            fspath(resources / 'baiona_colormapped.png'),
            fspath(resources / 'baiona_gray.png'),
            outputstream=stream,
            **IMG2PDF_KWARGS,
        )

    def flat_fill_quantize(input_file, output_file, *_args):
        # Paint the whole image with a single value and save it as the result.
        with Image.open(input_file) as im:
            ImageDraw.Draw(im).rectangle((0, 0, im.width, im.height), fill=128)
            im.save(output_file)

    with patch(
        'ocrmypdf.optimize.pngquant.quantize', side_effect=flat_fill_quantize
    ) as mock:
        check_ocrmypdf(
            input_pdf,
            output_pdf,
            '--optimize',
            '3',
            '--jobs',
            '1',
            '--use-threads',
            '--output-type',
            'pdf',
            '--plugin',
            'tests/plugins/tesseract_noop.py',
        )
        mock.assert_called()

    with (
        pikepdf.open(input_pdf) as inpdf,
        pikepdf.open(output_pdf) as outpdf,
    ):
        for n, inpage in enumerate(inpdf.pages):
            inim = next(iter(inpage.images.values()))
            outim = next(iter(outpdf.pages[n].images.values()))
            # The page index is the assertion message, to identify the page.
            assert len(outim.read_raw_bytes()) < len(inim.read_raw_bytes()), n


def test_optimize_off(resources, outpdf):
    """Run check_ocrmypdf on ``trivial.pdf`` with ``--optimize=0``."""
    cli_options = [
        '--optimize=0',
        '--output-type',
        'pdf',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'trivial.pdf', outpdf, *cli_options)


def test_group3(resources):
    """extract_image_filter accepts the CCITT image as-is but not with /K = 0."""
    with pikepdf.open(resources / 'ccitt.pdf') as pdf:
        im = pdf.pages[0].Resources.XObject['/Im1']

        unmodified_result = opt.extract_image_filter(im, im.objgen[0])
        assert unmodified_result is not None, "Group 4 should be allowed"

        # Rewrite the decode parameters so the image is treated as Group 3.
        im.DecodeParms['/K'] = 0
        group3_result = opt.extract_image_filter(im, im.objgen[0])
        assert group3_result is None, "Group 3 should be disallowed"


def test_find_formx(resources):
    """_find_image_xrefs reports exactly one xref in ``formxobject.pdf``, page 0."""
    with pikepdf.open(resources / 'formxobject.pdf') as pdf:
        found_xrefs, page_of_xref = opt._find_image_xrefs(pdf)

        assert len(found_xrefs) == 1
        only_xref = next(iter(found_xrefs))
        assert page_of_xref[only_xref] == 0


def test_extract_image_filter_with_pdf_image():
    """extract_image_filter returns the image paired with its second filter entry.

    The image dictionary declares two filters (FlateDecode, then DCTDecode);
    the expected result is ``(pdf_image, pdf_image.filter_decodeparms[1])``.
    """
    image = Dictionary()
    image.Subtype = Name.Image
    image.Length = 200
    image.Width = 10
    image.Height = 10
    image.Filter = [Name.FlateDecode, Name.DCTDecode]
    # NOTE: the PdfImage is constructed before BitsPerComponent is assigned;
    # statement order is kept as-is.
    pdf_image = PdfImage(image)
    image.BitsPerComponent = 8
    assert extract_image_filter(image, None) == (
        pdf_image,
        pdf_image.filter_decodeparms[1],
    )


def test_extract_image_filter_with_non_image():
    """A dictionary whose Subtype is Form yields None."""
    form_xobject = Dictionary(Subtype=Name.Form)
    assert extract_image_filter(form_xobject, None) is None


def test_extract_image_filter_with_small_stream_size():
    """An image dictionary with Length 50 yields None."""
    short_image = Dictionary(Subtype=Name.Image, Length=50)
    assert extract_image_filter(short_image, None) is None


def test_extract_image_filter_with_small_dimensions():
    """A 5x5 image dictionary with Length 200 yields None."""
    tiny_image = Dictionary(
        Subtype=Name.Image,
        Length=200,
        Width=5,
        Height=5,
    )
    assert extract_image_filter(tiny_image, None) is None


def test_extract_image_filter_with_multiple_compression_filters():
    """An image with ASCII85, Flate and DCT filters stacked yields None."""
    stacked_filters = [Name.ASCII85Decode, Name.FlateDecode, Name.DCTDecode]
    image = Dictionary(
        Subtype=Name.Image,
        Length=200,
        Width=10,
        Height=10,
        BitsPerComponent=8,
        Filter=stacked_filters,
    )
    assert extract_image_filter(image, None) is None


def test_extract_image_filter_with_wide_gamut_image():
    """A Flate image with 16 bits per component yields None."""
    image = Dictionary(
        Subtype=Name.Image,
        Length=200,
        Width=10,
        Height=10,
        BitsPerComponent=16,
        Filter=Name.FlateDecode,
    )
    assert extract_image_filter(image, None) is None


def test_extract_image_filter_with_jpeg2000_image():
    """A stream holding JPEG 2000 data with the JPXDecode filter yields None."""
    # Encode a 10x10 RGB image as JPEG 2000 in memory.
    encoded = BytesIO()
    Image.new('RGB', (10, 10)).save(encoded, format='JPEG2000')

    jpx_stream = pikepdf.new().make_stream(
        data=encoded.getvalue(),
        Subtype=Name.Image,
        Length=200,
        Width=10,
        Height=10,
        BitsPerComponent=8,
        Filter=Name.JPXDecode,
    )
    assert extract_image_filter(jpx_stream, None) is None


def test_extract_image_filter_with_ccitt_group_3_image():
    """A 1-bit CCITTFaxDecode image whose DecodeParms set K=1 yields None."""
    decode_parms = Array([Dictionary(K=1)])
    image = Dictionary(
        Subtype=Name.Image,
        Length=200,
        Width=10,
        Height=10,
        BitsPerComponent=1,
        Filter=Name.CCITTFaxDecode,
        DecodeParms=decode_parms,
    )
    assert extract_image_filter(image, None) is None


# Disabled: the test below triggers a pikepdf bug
# def test_extract_image_filter_with_decode_table():
#     image = Dictionary()
#     image.Subtype = Name.Image
#     image.Length = 200
#     image.Width = 10
#     image.Height = 10
#     image.Filter = Name.FlateDecode
#     image.BitsPerComponent = 8
#     image.ColorSpace = Name.DeviceGray
#     image.Decode = [42, 0]
#     assert extract_image_filter(image, None) is None


def test_extract_image_filter_with_rgb_smask_matte():
    """An RGB image whose SMask carries a Matte entry yields None."""
    soft_mask = Dictionary(
        Type=Name.Image,
        Subtype=Name.Image,
        Length=200,
        Width=10,
        Height=10,
        Filter=Name.FlateDecode,
        BitsPerComponent=8,
        ColorSpace=Name.DeviceGray,
        Matte=Array([1, 2, 3]),
    )
    image = Dictionary(
        Subtype=Name.Image,
        Length=200,
        Width=10,
        Height=10,
        Filter=Name.FlateDecode,
        BitsPerComponent=8,
        ColorSpace=Name.DeviceRGB,
        SMask=soft_mask,
    )
    assert extract_image_filter(image, None) is None


================================================
FILE: tests/test_page_boxes.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import pikepdf
import pytest

from ocrmypdf._exec import verapdf

from .conftest import check_ocrmypdf

# Rectangles used as page boxes by the tests in this module.
page_rect = [0, 0, 612, 792]
# Same far corner as page_rect, but starting at (200, 200) instead of (0, 0).
inset_rect = [200, 200, 612, 792]
# The 412 x 592 extent of inset_rect, moved to start at (0, 0).
wh_rect = [0, 0, 412, 592]

# The 612 x 792 extent of page_rect, starting at (-100, -100).
neg_rect = [-100, -100, 512, 692]

# When speculative PDF/A succeeds (verapdf available), MediaBox is preserved.
# Ghostscript would normalize MediaBox to start at origin, but speculative
# conversion bypasses Ghostscript.
if verapdf.available():
    _pdfa_inset_expected = inset_rect
else:
    _pdfa_inset_expected = wh_rect

# Each row: (renderer, output_type, in_pdf, mode, crop_to, crop_expected)
mediabox_testdata = [
    # No extra mode argument.
    ('fpdf2', 'pdfa', 'ccitt.pdf', None, inset_rect, _pdfa_inset_expected),
    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, _pdfa_inset_expected),
    ('fpdf2', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    # --force-ocr with the inset box.
    ('fpdf2', 'pdfa', 'ccitt.pdf', '--force-ocr', inset_rect, wh_rect),
    ('fpdf2', 'pdf', 'ccitt.pdf', '--force-ocr', inset_rect, wh_rect),
    # --force-ocr with the negative-origin box.
    ('fpdf2', 'pdfa', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
    ('fpdf2', 'pdf', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
]


@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', mediabox_testdata
)
def test_media_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Set a MediaBox on page 1, process the file, and compare the output MediaBox.

    ``crop_to`` is written to the first page of ``in_pdf``. After
    check_ocrmypdf runs with the given renderer, output type and optional
    ``mode`` flag, the first page's mediabox must equal ``crop_expected``.
    """
    cropped_pdf = outdir / 'cropped.pdf'
    processed_pdf = outdir / 'processed.pdf'

    with pikepdf.open(resources / in_pdf) as source:
        source.pages[0].MediaBox = crop_to
        source.save(cropped_pdf)

    cli_args = [
        '--jobs',
        '1',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
    ]
    if mode:
        cli_args.append(mode)
    check_ocrmypdf(cropped_pdf, processed_pdf, *cli_args)

    with pikepdf.open(processed_pdf) as result:
        actual_box = [float(value) for value in result.pages[0].mediabox]
    assert actual_box == crop_expected


# Each row: (renderer, output_type, in_pdf, mode, crop_to, crop_expected).
# In every row the expected crop box is the same as the one that was set.
cropbox_testdata = [
    # No extra mode argument.
    ('fpdf2', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('fpdf2', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    # --force-ocr.
    ('fpdf2', 'pdfa', 'ccitt.pdf', '--force-ocr', inset_rect, inset_rect),
    ('fpdf2', 'pdf', 'ccitt.pdf', '--force-ocr', inset_rect, inset_rect),
]


@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', cropbox_testdata
)
def test_crop_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Set a CropBox on page 1, process the file, and compare the output CropBox.

    ``crop_to`` is written to the first page of ``in_pdf``. After
    check_ocrmypdf runs (with ``--optimize 0``) for the given renderer, output
    type and optional ``mode`` flag, the first page's cropbox must equal
    ``crop_expected``.
    """
    cropped_pdf = outdir / 'cropped.pdf'
    processed_pdf = outdir / 'processed.pdf'

    with pikepdf.open(resources / in_pdf) as source:
        source.pages[0].CropBox = crop_to
        source.save(cropped_pdf)

    cli_args = [
        '--jobs',
        '1',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
        '--optimize',
        '0',
    ]
    if mode:
        cli_args.append(mode)
    check_ocrmypdf(cropped_pdf, processed_pdf, *cli_args)

    with pikepdf.open(processed_pdf) as result:
        actual_box = [float(value) for value in result.pages[0].cropbox]
    assert actual_box == crop_expected


================================================
FILE: tests/test_page_numbers.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import pytest

import ocrmypdf
from ocrmypdf._options import _pages_from_ranges
from ocrmypdf.exceptions import BadArgsError
from ocrmypdf.pdfinfo import PdfInfo


@pytest.mark.parametrize(
    'pages, result',
    [
        # Valid specifications: 1-based input, 0-based set of pages out.
        ('1', {0}),
        ('1,2', {0, 1}),
        ('1-3', {0, 1, 2}),
        ('2,5,6', {1, 4, 5}),
        ('11-15, 18, ', {10, 11, 12, 13, 14, 17}),
        (',,3', {2}),
        ('3, 3, 3, 3,', {2}),
        ('3, 2, 1, 42', {0, 1, 2, 41}),
        # Invalid specifications: an exception type is expected instead.
        ('-1', BadArgsError),
        ('1,3,-11', BadArgsError),
        ('1-,', BadArgsError),
        ('start-end', BadArgsError),
        ('1-0', BadArgsError),
        ('99-98', BadArgsError),
        ('0-0', BadArgsError),
        ('1-0,3-4', BadArgsError),
        (',', BadArgsError),
        ('', BadArgsError),
    ],
)
def test_pages(pages, result):
    """Check _pages_from_ranges against a set of pages or an expected exception.

    When ``result`` is a class, the call must raise it; otherwise the returned
    value must equal ``result``.
    """
    expects_exception = isinstance(result, type)
    if not expects_exception:
        assert _pages_from_ranges(pages) == result
        return

    with pytest.raises(result):
        _pages_from_ranges(pages)


def test_nonmonotonic_warning(caplog):
    """Out-of-order page numbers are still parsed, with 'out of order' logged."""
    selected = _pages_from_ranges('1, 3, 2')

    assert selected == {0, 1, 2}
    assert 'out of order' in caplog.text


def test_limited_pages(multipage, outpdf):
    """Run OCR with ``pages='5-6'`` and check which output pages have text.

    Page index 0 must have no text; page indexes 4 and 5 must have text.
    """
    ocr_settings = {
        'pages': '5-6',
        'optimize': 0,
        'output_type': 'pdf',
        'plugins': ['tests/plugins/tesseract_cache.py'],
    }
    ocrmypdf.ocr(multipage, outpdf, **ocr_settings)

    info = PdfInfo(outpdf)
    assert not info.pages[0].has_text
    assert info.pages[4].has_text
    assert info.pages[5].has_text


================================================
FILE: tests/test_pdf_renderer.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Unit tests for Fpdf2PdfRenderer class."""

from __future__ import annotations

from io import StringIO
from pathlib import Path

import pytest
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

from ocrmypdf.font import MultiFontManager
from ocrmypdf.fpdf_renderer import DebugRenderOptions, Fpdf2PdfRenderer
from ocrmypdf.helpers import check_pdf
from ocrmypdf.hocrtransform import (
    Baseline,
    BoundingBox,
    OcrClass,
    OcrElement,
)


def text_from_pdf(filename: Path) -> str:
    """Return the text pdfminer extracts from every page of *filename*.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        Everything the TextConverter wrote while the pages were processed.
    """
    text_buffer = StringIO()
    resource_manager = PDFResourceManager()
    converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    with open(filename, 'rb') as pdf_stream:
        document = PDFDocument(PDFParser(pdf_stream))
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)

    return text_buffer.getvalue()


@pytest.fixture
def font_dir():
    """Return ``src/ocrmypdf/data`` located two levels above this test file."""
    repo_root = Path(__file__).parents[1]
    return repo_root.joinpath("src", "ocrmypdf", "data")


@pytest.fixture
def multi_font_manager(font_dir):
    """Return a MultiFontManager constructed from the ``font_dir`` fixture."""
    manager = MultiFontManager(font_dir)
    return manager


def create_simple_page(
    width: float = 1000,
    height: float = 500,
    words: list[tuple[str, tuple[float, float, float, float]]] | None = None,
) -> OcrElement:
    """Build a page holding one paragraph with one line of words.

    The line and paragraph always use the box (100, 100, 900, 150); the
    paragraph is marked direction "ltr" and language "eng".

    Args:
        width: Right edge of the page bounding box.
        height: Bottom edge of the page bounding box.
        words: ``(text, (left, top, right, bottom))`` tuples, one per word.
            When None, the words "Hello" and "World" are used.

    Returns:
        The PAGE element at the root of the page > paragraph > line > words
        tree.
    """
    if words is None:
        words = [("Hello", (100, 100, 200, 150)), ("World", (250, 100, 350, 150))]

    word_elements = []
    for word_text, word_box in words:
        element = OcrElement(
            ocr_class=OcrClass.WORD,
            text=word_text,
            bbox=BoundingBox(
                left=word_box[0],
                top=word_box[1],
                right=word_box[2],
                bottom=word_box[3],
            ),
        )
        word_elements.append(element)

    text_line = OcrElement(
        ocr_class=OcrClass.LINE,
        bbox=BoundingBox(left=100, top=100, right=900, bottom=150),
        baseline=Baseline(slope=0.0, intercept=0),
        children=word_elements,
    )
    text_paragraph = OcrElement(
        ocr_class=OcrClass.PARAGRAPH,
        bbox=BoundingBox(left=100, top=100, right=900, bottom=150),
        direction="ltr",
        language="eng",
        children=[text_line],
    )

    return OcrElement(
        ocr_class=OcrClass.PAGE,
        bbox=BoundingBox(left=0, top=0, right=width, bottom=height),
        children=[text_paragraph],
    )


class TestFpdf2PdfRendererBasic:
    """Smoke tests that render the default two-word sample page."""

    @staticmethod
    def _render_sample(output_pdf, multi_font_manager, **renderer_options):
        """Render ``create_simple_page()`` at 72 dpi and return *output_pdf*."""
        sample_page = create_simple_page()
        renderer = Fpdf2PdfRenderer(
            page=sample_page,
            dpi=72.0,
            multi_font_manager=multi_font_manager,
            **renderer_options,
        )
        renderer.render(output_pdf)
        return output_pdf

    def test_render_simple_page(self, tmp_path, multi_font_manager):
        """Rendering creates the output file, which is then run through check_pdf."""
        rendered = self._render_sample(tmp_path / "simple.pdf", multi_font_manager)

        assert rendered.exists()
        check_pdf(str(rendered))

    def test_rendered_text_extractable(self, tmp_path, multi_font_manager):
        """Both sample words can be extracted from the rendered PDF."""
        rendered = self._render_sample(
            tmp_path / "extractable.pdf", multi_font_manager
        )

        extracted_text = text_from_pdf(rendered)
        assert "Hello" in extracted_text
        assert "World" in extracted_text

    def test_invisible_text_mode(self, tmp_path, multi_font_manager):
        """With invisible_text=True the text can still be extracted."""
        rendered = self._render_sample(
            tmp_path / "invisible.pdf", multi_font_manager, invisible_text=True
        )

        # Text should still be extractable even when invisible
        assert "Hello" in text_from_pdf(rendered)

    def test_visible_text_mode(self, tmp_path, multi_font_manager):
        """With invisible_text=False the text can be extracted."""
        rendered = self._render_sample(
            tmp_path / "visible.pdf", multi_font_manager, invisible_text=False
        )

        # Text should be extractable
        assert "Hello" in text_from_pdf(rendered)


class TestFpdf2PdfRendererPageSize:
    """Test page size calculations.

    Each test checks the point dimensions exposed by
    ``renderer.coord_transform``, then renders the page and runs check_pdf on
    the output file.
    """

    def test_page_dimensions(self, tmp_path, multi_font_manager):
        """Test that page dimensions are calculated correctly."""
        # 1000x500 pixels at 72 dpi = 1000x500 points
        page = create_simple_page(width=1000, height=500)
        output_pdf = tmp_path / "dimensions.pdf"

        renderer = Fpdf2PdfRenderer(
            page=page, dpi=72.0, multi_font_manager=multi_font_manager
        )
        assert renderer.coord_transform.page_width_pt == pytest.approx(1000.0)
        assert renderer.coord_transform.page_height_pt == pytest.approx(500.0)

        renderer.render(output_pdf)
        # Run the same check on the rendered file as test_high_dpi_page does,
        # so both page-size tests treat their output the same way.
        check_pdf(str(output_pdf))

    def test_high_dpi_page(self, tmp_path, multi_font_manager):
        """Test page dimensions at higher DPI."""
        # 720x360 pixels at 144 dpi = 360x180 points
        page = create_simple_page(width=720, height=360)
        output_pdf = tmp_path / "high_dpi.pdf"

        renderer = Fpdf2PdfRenderer(
            page=page, dpi=144.0, multi_font_manager=multi_font_manager
        )
        assert renderer.coord_transform.page_width_pt == pytest.approx(360.0)
        assert renderer.coord_transform.page_height_pt == pytest.approx(180.0)

        renderer.render(output_pdf)
        check_pdf(str(output_pdf))


class TestFpdf2PdfRendererMultiLine:
    """Rendering of a paragraph that contains more than one line."""

    @staticmethod
    def _two_word_line(second_word, top, bottom):
        """Build a LINE holding the word "Line" followed by *second_word*.

        Both words and the line itself span *top* to *bottom* vertically.
        """
        line_words = [
            OcrElement(
                ocr_class=OcrClass.WORD,
                text="Line",
                bbox=BoundingBox(left=100, top=top, right=180, bottom=bottom),
            ),
            OcrElement(
                ocr_class=OcrClass.WORD,
                text=second_word,
                bbox=BoundingBox(left=190, top=top, right=250, bottom=bottom),
            ),
        ]
        return OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(left=100, top=top, right=900, bottom=bottom),
            baseline=Baseline(slope=0.0, intercept=0),
            children=line_words,
        )

    def test_multiple_lines(self, tmp_path, multi_font_manager):
        """The words of both lines can be extracted from the rendered PDF."""
        first_line = self._two_word_line("one", top=100, bottom=150)
        second_line = self._two_word_line("two", top=200, bottom=250)

        paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH,
            bbox=BoundingBox(left=100, top=100, right=900, bottom=250),
            direction="ltr",
            language="eng",
            children=[first_line, second_line],
        )
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),
            children=[paragraph],
        )

        output_pdf = tmp_path / "multiline.pdf"
        Fpdf2PdfRenderer(
            page=page, dpi=72.0, multi_font_manager=multi_font_manager
        ).render(output_pdf)

        extracted_text = text_from_pdf(output_pdf)
        for expected_word in ("Line", "one", "two"):
            assert expected_word in extracted_text


class TestFpdf2PdfRendererTextDirection:
    """Rendering of left-to-right and right-to-left pages."""

    def test_ltr_text(self, tmp_path, multi_font_manager):
        """Render the default sample page and run check_pdf on the output."""
        output_pdf = tmp_path / "ltr.pdf"

        Fpdf2PdfRenderer(
            page=create_simple_page(),
            dpi=72.0,
            multi_font_manager=multi_font_manager,
        ).render(output_pdf)

        check_pdf(str(output_pdf))

    def test_rtl_text(self, tmp_path, multi_font_manager):
        """Render a one-word Arabic "rtl" page and run check_pdf on the output."""
        rtl_line = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),
            baseline=Baseline(slope=0.0, intercept=0),
            direction="rtl",
            children=[
                OcrElement(
                    ocr_class=OcrClass.WORD,
                    text="مرحبا",
                    bbox=BoundingBox(left=100, top=100, right=200, bottom=150),
                )
            ],
        )
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),
            children=[
                OcrElement(
                    ocr_class=OcrClass.PARAGRAPH,
                    bbox=BoundingBox(left=100, top=100, right=900, bottom=150),
                    direction="rtl",
                    language="ara",
                    children=[rtl_line],
                )
            ],
        )

        output_pdf = tmp_path / "rtl.pdf"
        Fpdf2PdfRenderer(
            page=page, dpi=72.0, multi_font_manager=multi_font_manager
        ).render(output_pdf)

        check_pdf(str(output_pdf))


class TestFpdf2PdfRendererBaseline:
    """Rendering of a line whose baseline is not flat."""

    def test_sloped_baseline(self, tmp_path, multi_font_manager):
        """A word on a line with slope 0.02 / intercept -5 stays extractable."""
        sloped_line = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),
            baseline=Baseline(slope=0.02, intercept=-5),
            children=[
                OcrElement(
                    ocr_class=OcrClass.WORD,
                    text="Sloped",
                    bbox=BoundingBox(left=100, top=100, right=200, bottom=150),
                )
            ],
        )
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),
            children=[
                OcrElement(
                    ocr_class=OcrClass.PARAGRAPH,
                    bbox=BoundingBox(left=100, top=100, right=900, bottom=150),
                    direction="ltr",
                    language="eng",
                    children=[sloped_line],
                )
            ],
        )

        output_pdf = tmp_path / "sloped.pdf"
        Fpdf2PdfRenderer(
            page=page, dpi=72.0, multi_font_manager=multi_font_manager
        ).render(output_pdf)

        check_pdf(str(output_pdf))
        assert "Sloped" in text_from_pdf(output_pdf)


class TestFpdf2PdfRendererTextangle:
    """Rendering of a line that carries a ``textangle`` value."""

    def test_rotated_text(self, tmp_path, multi_font_manager):
        """A word on a line with textangle=5.0 stays extractable."""
        angled_line = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),
            baseline=Baseline(slope=0.0, intercept=0),
            textangle=5.0,
            children=[
                OcrElement(
                    ocr_class=OcrClass.WORD,
                    text="Rotated",
                    bbox=BoundingBox(left=100, top=100, right=200, bottom=150),
                )
            ],
        )
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),
            children=[
                OcrElement(
                    ocr_class=OcrClass.PARAGRAPH,
                    bbox=BoundingBox(left=100, top=100, right=900, bottom=150),
                    direction="ltr",
                    language="eng",
                    children=[angled_line],
                )
            ],
        )

        output_pdf = tmp_path / "rotated.pdf"
        Fpdf2PdfRenderer(
            page=page, dpi=72.0, multi_font_manager=multi_font_manager
        ).render(output_pdf)

        check_pdf(str(output_pdf))
        assert "Rotated" in text_from_pdf(output_pdf)


class TestFpdf2PdfRendererWordBreaks:
    """Rendering of individual words, in English and in Chinese."""

    def test_word_breaks_english(self, tmp_path, multi_font_manager):
        """Both words of the default sample page appear in the extracted text."""
        output_pdf = tmp_path / "english.pdf"

        Fpdf2PdfRenderer(
            page=create_simple_page(),
            dpi=72.0,
            multi_font_manager=multi_font_manager,
        ).render(output_pdf)

        extracted_text = text_from_pdf(output_pdf)
        # Words should be present
        for expected_word in ("Hello", "World"):
            assert expected_word in extracted_text

    def test_cjk_text(self, tmp_path, multi_font_manager):
        """Render two Chinese words and run check_pdf on the output."""
        cjk_words = [
            OcrElement(
                ocr_class=OcrClass.WORD,
                text=word_text,
                bbox=BoundingBox(left=left, top=100, right=right, bottom=150),
            )
            for word_text, left, right in (("你好", 100, 150), ("世界", 160, 210))
        ]
        cjk_line = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),
            baseline=Baseline(slope=0.0, intercept=0),
            children=cjk_words,
        )
        cjk_paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH,
            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),
            direction="ltr",
            language="chi_sim",  # Simplified Chinese
            children=[cjk_line],
        )
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),
            children=[cjk_paragraph],
        )

        output_pdf = tmp_path / "chinese.pdf"
        Fpdf2PdfRenderer(
            page=page, dpi=72.0, multi_font_manager=multi_font_manager
        ).render(output_pdf)

        check_pdf(str(output_pdf))


class TestFpdf2PdfRendererDebugOptions:
    """Cover the renderer's debug rendering switches."""

    def test_debug_render_options_default(self, multi_font_manager):
        """Without explicit debug options, every debug flag is False."""
        renderer = Fpdf2PdfRenderer(
            page=create_simple_page(),
            dpi=72.0,
            multi_font_manager=multi_font_manager,
        )

        flags = renderer.debug_options
        assert flags.render_baseline is False
        assert flags.render_word_bbox is False
        assert flags.render_line_bbox is False

    def test_debug_render_options_enabled(self, tmp_path, multi_font_manager):
        """Render visible text with all debug flags on; text stays extractable."""
        simple_page = create_simple_page()
        target = tmp_path / "debug.pdf"

        everything_on = DebugRenderOptions(
            render_baseline=True, render_word_bbox=True, render_line_bbox=True
        )
        Fpdf2PdfRenderer(
            page=simple_page,
            dpi=72.0,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
            debug_render_options=everything_on,
        ).render(target)

        check_pdf(str(target))
        # The word must still come back out of the PDF
        recovered = text_from_pdf(target)
        assert "Hello" in recovered


class TestFpdf2PdfRendererErrors:
    """Check that Fpdf2PdfRenderer rejects unusable input at construction."""

    def test_invalid_ocr_class(self, multi_font_manager):
        """A LINE element given as the page raises ValueError matching 'ocr_page'."""
        not_a_page = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(left=0, top=0, right=100, bottom=50),
        )

        with pytest.raises(ValueError, match="ocr_page"):
            Fpdf2PdfRenderer(
                page=not_a_page, dpi=72.0, multi_font_manager=multi_font_manager
            )

    def test_page_without_bbox(self, multi_font_manager):
        """A PAGE element with no bbox raises ValueError matching 'bounding box'."""
        boxless_page = OcrElement(ocr_class=OcrClass.PAGE)

        with pytest.raises(ValueError, match="bounding box"):
            Fpdf2PdfRenderer(
                page=boxless_page, dpi=72.0, multi_font_manager=multi_font_manager
            )


class TestFpdf2PdfRendererLineTypes:
    """Render line-like elements other than plain lines."""

    @staticmethod
    def _single_word_page(line_class, text, top, bottom):
        """Build a page with one paragraph, one line-like element and one word.

        The word spans x=100..200 and the line/paragraph span x=100..900; all
        three share the given top and bottom coordinates.
        """
        word = OcrElement(
            ocr_class=OcrClass.WORD,
            text=text,
            bbox=BoundingBox(left=100, top=top, right=200, bottom=bottom),
        )
        line_like = OcrElement(
            ocr_class=line_class,
            bbox=BoundingBox(left=100, top=top, right=900, bottom=bottom),
            baseline=Baseline(slope=0.0, intercept=0),
            children=[word],
        )
        para = OcrElement(
            ocr_class=OcrClass.PARAGRAPH,
            bbox=BoundingBox(left=100, top=top, right=900, bottom=bottom),
            direction="ltr",
            language="eng",
            children=[line_like],
        )
        return OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),
            children=[para],
        )

    def test_header_line(self, tmp_path, multi_font_manager):
        """A HEADER element's word is present in the rendered PDF's text."""
        header_page = self._single_word_page(OcrClass.HEADER, "Header", 50, 100)

        target = tmp_path / "header.pdf"
        Fpdf2PdfRenderer(
            page=header_page, dpi=72.0, multi_font_manager=multi_font_manager
        ).render(target)

        check_pdf(str(target))
        recovered = text_from_pdf(target)
        assert "Header" in recovered

    def test_caption_line(self, tmp_path, multi_font_manager):
        """A CAPTION element's word is present in the rendered PDF's text."""
        caption_page = self._single_word_page(OcrClass.CAPTION, "Caption", 300, 350)

        target = tmp_path / "caption.pdf"
        Fpdf2PdfRenderer(
            page=caption_page, dpi=72.0, multi_font_manager=multi_font_manager
        ).render(target)

        check_pdf(str(target))
        recovered = text_from_pdf(target)
        assert "Caption" in recovered


================================================
FILE: tests/test_pdfa.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import os

import pikepdf
import pytest

from ocrmypdf.exceptions import MissingDependencyError

from .conftest import check_ocrmypdf


@pytest.mark.parametrize('optimize', (0, 3))
@pytest.mark.parametrize('pdfa_level', (1, 2, 3))
def test_pdfa(resources, outpdf, optimize, pdfa_level):
    """Convert francais.pdf to each PDF/A level and inspect the output.

    Checks whether the output contains object streams (expected for PDF/A-2
    and PDF/A-3, not for PDF/A-1) and that the XMP metadata reports the
    requested conformance level.
    """
    try:
        check_ocrmypdf(
            resources / 'francais.pdf',
            outpdf,
            '--plugin',
            'tests/plugins/tesseract_noop.py',
            f'--output-type=pdfa-{pdfa_level}',
            f'--optimize={optimize}',
        )
    except MissingDependencyError as e:
        if 'pngquant' in str(e) and optimize in (2, 3) and os.name == 'nt':
            pytest.xfail("pngquant currently not available on Windows")
        # Any other missing dependency is a real failure. Swallowing it would
        # let the test continue and fail later with a misleading error about
        # the output file instead of the actual cause.
        raise

    output_bytes = outpdf.read_bytes()
    if pdfa_level in (2, 3):
        # PDF/A-2 allows ObjStm
        assert b'/ObjStm' in output_bytes
    elif pdfa_level == 1:
        # PDF/A-1 might allow ObjStm, but Acrobat does not approve it, so
        # we don't use it
        assert b'/ObjStm' not in output_bytes

    with pikepdf.open(outpdf) as pdf, pdf.open_metadata() as m:
        assert m.pdfa_status == f'{pdfa_level}B'


================================================
FILE: tests/test_pdfinfo.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import pickle
import warnings
from io import BytesIO
from math import isclose

import img2pdf
import pikepdf
import pytest
from PIL import Image
from reportlab.lib.units import inch
from reportlab.pdfgen.canvas import Canvas

from ocrmypdf import pdfinfo
from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution
from ocrmypdf.pdfinfo import Colorspace, Encoding
from ocrmypdf.pdfinfo._contentstream import _interpret_contents
from ocrmypdf.pdfinfo.layout import PDFPage

# Ignore DeprecationWarning attributed to the reportlab.lib.rl_safe_eval module.
warnings.filterwarnings(
    "ignore", category=DeprecationWarning, module="reportlab.lib.rl_safe_eval"
)

# pylint: disable=protected-access


@pytest.fixture
def single_page_text(outdir):
    """Write a one-page 8in x 6in PDF with a single line of Helvetica text.

    Returns the path of the generated file inside ``outdir``.
    """
    target = outdir / 'text.pdf'
    canvas = Canvas(str(target), pagesize=(8 * inch, 6 * inch))

    text_object = canvas.beginText()
    text_object.setFont('Helvetica', 12)
    text_object.setTextOrigin(1 * inch, 3 * inch)
    text_object.textLine(
        "Methink'st thou art a general offence and every man should beat thee."
    )
    canvas.drawText(text_object)

    canvas.showPage()
    canvas.save()
    return target


def test_single_page_text(single_page_text):
    """A text-only page is reported as one page with text and no images."""
    document = pdfinfo.PdfInfo(single_page_text)
    assert len(document) == 1

    first_page = document[0]
    assert first_page.has_text
    assert len(first_page.images) == 0


@pytest.fixture(scope='session')
def eight_by_eight():
    """Return an 8x8 mode-'1' image filled with 0 and a diagonal of 1 pixels."""
    bitmap = Image.new('1', (8, 8), 0)
    for diagonal_xy in zip(range(8), range(8)):
        bitmap.putpixel(diagonal_xy, 1)
    return bitmap


@pytest.fixture
def eight_by_eight_regular_image(eight_by_eight, outpdf):
    """Wrap the 8x8 bitmap in a PDF via img2pdf, declared at 8 dpi on both axes.

    Returns ``outpdf`` after writing the converted document to it.
    """
    png_buffer = BytesIO()
    eight_by_eight.save(png_buffer, format='PNG')
    png_buffer.seek(0)

    eight_dpi = (img2pdf.ImgSize.dpi, 8)
    layout_fun = img2pdf.get_layout_fun(None, (eight_dpi, eight_dpi), None, None, None)

    with outpdf.open('wb') as pdf_stream:
        img2pdf.convert(
            png_buffer,
            producer="img2pdf",
            layout_fun=layout_fun,
            outputstream=pdf_stream,
            **IMG2PDF_KWARGS,
        )
    return outpdf


def test_single_page_image(eight_by_eight_regular_image):
    """An image-only page reports one gray, 8-pixel-wide image at 8 dpi."""
    document = pdfinfo.PdfInfo(eight_by_eight_regular_image)
    assert len(document) == 1

    first_page = document[0]
    assert not first_page.has_text
    assert len(first_page.images) == 1

    found = first_page.images[0]
    assert found.width == 8
    assert found.color == Colorspace.gray

    # An 8-pixel image laid out over 1"x1" works out to 8 dpi on each axis
    resolution = found.dpi
    assert isclose(resolution.x, 8)
    assert isclose(resolution.y, 8)


@pytest.fixture
def eight_by_eight_inline_image(eight_by_eight, outpdf):
    """Write a PDF that draws the 8x8 bitmap as an inline image; return its path."""
    canvas = Canvas(str(outpdf), pagesize=(8 * 72, 6 * 72))
    # 72pt x 72pt placement, i.e. a 1"x1" area at the page origin
    canvas.drawInlineImage(eight_by_eight, 0, 0, width=72, height=72)
    canvas.showPage()
    canvas.save()
    return outpdf


def test_single_page_inline_image(eight_by_eight_inline_image):
    """The inline image is reported as gray, 8 pixels wide, at 8 dpi in x."""
    document = pdfinfo.PdfInfo(eight_by_eight_inline_image)
    print(document)
    inline = document[0].images[0]
    assert isclose(inline.dpi.x, 8)
    assert inline.color == Colorspace.gray
    assert inline.width == 8


def test_jpeg(resources):
    """The first image of c02-22.pdf is JPEG-encoded at 150 dpi in x."""
    document = pdfinfo.PdfInfo(resources / 'c02-22.pdf')

    first_image = document[0].images[0]
    assert first_image.enc == Encoding.jpeg
    assert isclose(first_image.dpi.x, 150)


@pytest.fixture
def flate_jpeg_pdf(outpdf):
    """Create a PDF with a FlateDecode+DCTDecode (flate+jpeg) encoded image.

    This simulates what OCRmyPDF's optimizer does when it deflates JPEGs.
    The document is saved to ``outpdf``, which is also the return value.
    """
    from zlib import compress

    # Encode a solid-colour 64x64 RGB image as JPEG, in memory
    jpeg_buffer = BytesIO()
    Image.new('RGB', (64, 64), color=(128, 64, 192)).save(jpeg_buffer, format='JPEG')

    # Deflate the JPEG bytes so the stream needs both filters to decode
    deflated_jpeg = compress(jpeg_buffer.getvalue())

    with pikepdf.Pdf.new() as doc:
        doc.add_blank_page(page_size=(72, 72))
        only_page = doc.pages[0]

        image_xobject = pikepdf.Stream(
            doc,
            deflated_jpeg,
            BitsPerComponent=8,
            ColorSpace=pikepdf.Name.DeviceRGB,
            Filter=[pikepdf.Name.FlateDecode, pikepdf.Name.DCTDecode],
            Height=64,
            Subtype=pikepdf.Name.Image,
            Type=pikepdf.Name.XObject,
            Width=64,
        )
        resource_name = only_page.add_resource(
            image_xobject, pikepdf.Name.XObject, pikepdf.Name.Im0
        )
        # Content stream: scale to 72x72 and paint the image resource
        draw_ops = b"q 72 0 0 72 0 0 cm %s Do Q" % bytes(resource_name)
        only_page.Contents = pikepdf.Stream(doc, draw_ops)
        doc.save(outpdf)
    return outpdf


def test_flate_jpeg(flate_jpeg_pdf):
    """A FlateDecode+DCTDecode image is classified as Encoding.flate_jpeg."""
    document = pdfinfo.PdfInfo(flate_jpeg_pdf)

    first_image = document[0].images[0]
    assert first_image.enc == Encoding.flate_jpeg


def test_form_xobject(resources):
    """The first image found on formxobject.pdf's first page is 50 pixels wide."""
    document = pdfinfo.PdfInfo(resources / 'formxobject.pdf')
    first_image = document[0].images[0]
    assert first_image.width == 50


def test_no_contents(resources):
    """no_contents.pdf's first page reports neither images nor text."""
    document = pdfinfo.PdfInfo(resources / 'no_contents.pdf')
    first_page = document[0]
    assert len(first_page.images) == 0
    assert not document[0].has_text


def test_oversized_page(resources):
    """The first image of poster.pdf has width * dpi.x greater than 200."""
    document = pdfinfo.PdfInfo(resources / 'poster.pdf')
    poster_image = document[0].images[0]
    width_times_dpi = poster_image.width * poster_image.dpi.x
    assert width_times_dpi > 200, "this is supposed to be oversized"


def test_pickle(resources):
    """A PdfInfo built from graph_ocred.pdf can be pickled without error."""
    # Multiprocessing requires this information to be picklable. A failure
    # here probably means some unpicklable pikepdf object (or other external
    # data) is being kept around.
    document = pdfinfo.PdfInfo(resources / 'graph_ocred.pdf')
    pickle.dumps(document)


def test_vector(resources):
    """vector.pdf's first page reports vector content and no text."""
    document = pdfinfo.PdfInfo(resources / 'vector.pdf')
    assert document[0].has_vector
    assert not document[0].has_text


def test_ocr_detection(resources):
    """graph_ocred.pdf's first page reports text and no vector content."""
    document = pdfinfo.PdfInfo(resources / 'graph_ocred.pdf')
    assert not document[0].has_vector
    assert document[0].has_text


@pytest.mark.parametrize(
    'testfile', ('truetype_font_nomapping.pdf', 'type3_font_nomapping.pdf')
)
def test_corrupt_font_detection(resources, testfile):
    """With detailed analysis on, each test file's first page has corrupt text."""
    document = pdfinfo.PdfInfo(resources / testfile, detailed_analysis=True)
    assert document[0].has_corrupt_text


def test_stack_abuse():
    """Unbalanced q/Q operators make _interpret_contents warn or raise."""
    owner = pikepdf.Pdf.new()

    # 35 unmatched "q" operators: a warning about overflow
    too_many_saves = pikepdf.Stream(owner, b'q ' * 35)
    with pytest.warns(UserWarning, match="overflowed"):
        _interpret_contents(too_many_saves)

    # More "Q" than "q": a warning about underflow
    too_many_restores = pikepdf.Stream(owner, b'q Q Q Q Q')
    with pytest.warns(UserWarning, match="underflowed"):
        _interpret_contents(too_many_restores)

    # 135 unmatched "q" operators: a warning and a RuntimeError
    far_too_many_saves = pikepdf.Stream(owner, b'q ' * 135)
    with pytest.warns(UserWarning), pytest.raises(RuntimeError):
        _interpret_contents(far_too_many_saves)


def test_pages_issue700(monkeypatch, resources):
    """If PDFPage.get_pages yields nothing, an InputFileError naming pdfminer is raised."""
    # Replace get_pages with a stub that accepts anything and yields no pages
    monkeypatch.setattr(PDFPage, 'get_pages', lambda *args, **kwargs: iter([]))

    with pytest.raises(InputFileError, match="pdfminer"):
        document = pdfinfo.PdfInfo(
            resources / 'cardinal.pdf',
            detailed_analysis=True,
            progbar=False,
            max_workers=1,
        )
        document._miner_state.get_page_analysis(0)


@pytest.fixture
def image_scale0(resources, outpdf):
    """Build a PDF that draws a form XObject through an all-zero matrix.

    The first page of ``cmyk.pdf`` is turned into a form XObject, copied into
    a new single-page 72x72 document and painted with the transformation
    ``0 0 0 0 0 0 cm``. The result is saved to ``outpdf``, which is returned.

    Both documents are opened as context managers so they are closed once the
    output has been saved. The source must stay open until then because the
    new document holds an object copied from it.
    """
    with pikepdf.open(resources / 'cmyk.pdf') as cmyk, pikepdf.Pdf.new() as p:
        xobj = cmyk.pages[0].as_form_xobject()

        p.add_blank_page(page_size=(72, 72))
        objname = p.pages[0].add_resource(
            p.copy_foreign(xobj), pikepdf.Name.XObject, pikepdf.Name.Im0
        )
        # Zero scale on both axes: the object is painted with no area
        p.pages[0].Contents = pikepdf.Stream(
            p, b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(objname)
        )
        p.save(outpdf)
    return outpdf


def test_image_scale0(image_scale0):
    """A zero-scaled image has non-finite dpi and the page dpi is Resolution(0, 0)."""
    document = pdfinfo.PdfInfo(
        image_scale0,
        detailed_analysis=True,
        progbar=False,
        max_workers=1,
    )
    first_page = document.pages[0]
    assert not first_page._images[0].dpi.is_finite
    assert document.pages[0].dpi == Resolution(0, 0)


================================================
FILE: tests/test_pipeline.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import warnings
from unittest.mock import Mock

import pytest
from PIL import Image
from reportlab.lib.units import inch
from reportlab.lib.utils import ImageReader
from reportlab.pdfgen.canvas import Canvas

from ocrmypdf import _pipeline, pdfinfo
from ocrmypdf.helpers import Resolution
from ocrmypdf.pdfinfo import Encoding

# Ignore DeprecationWarning attributed to the reportlab.lib.rl_safe_eval module.
warnings.filterwarnings(
    "ignore", category=DeprecationWarning, module="reportlab.lib.rl_safe_eval"
)


@pytest.fixture(scope='session')
def rgb_image():
    """Return an ImageReader over an 8x8 RGB image with three coloured pixels."""
    picture = Image.new('RGB', (8, 8))
    coloured_pixels = (
        ((4, 4), (255, 0, 0)),
        ((5, 5), (0, 255, 0)),
        ((6, 6), (0, 0, 255)),
    )
    for position, colour in coloured_pixels:
        picture.putpixel(position, colour)
    return ImageReader(picture)


# Placeholder resolution; test_dpi_needed feeds its first component to the
# mocked context as the oversample option.
DUMMY_OVERSAMPLE_RESOLUTION = Resolution(42.0, 42.0)
# Square resolution built from the pipeline's VECTOR_PAGE_DPI constant.
VECTOR_RESOLUTION = Resolution(_pipeline.VECTOR_PAGE_DPI, _pipeline.VECTOR_PAGE_DPI)


@pytest.mark.parametrize(
    # Columns: draw an image? / draw text? / draw a vector shape? / expected dpi
    'image, text, vector, result',
    [
        (False, False, False, VECTOR_RESOLUTION),
        (False, True, False, VECTOR_RESOLUTION),
        (True, False, False, DUMMY_OVERSAMPLE_RESOLUTION),
        (True, True, False, VECTOR_RESOLUTION),
        (False, False, True, VECTOR_RESOLUTION),
        (False, True, True, VECTOR_RESOLUTION),
        (True, False, True, VECTOR_RESOLUTION),
        (True, True, True, VECTOR_RESOLUTION),
    ],
)
def test_dpi_needed(image, text, vector, result, rgb_image, outdir):
    """Check both square-dpi helpers against a page with the requested content.

    A 5in x 5in page is generated containing any combination of an image, a
    text string and an ellipse, then analysed and handed to the pipeline via a
    mocked context.
    """
    pdf_path = outdir / 'dpi.pdf'
    canvas = Canvas(str(pdf_path), pagesize=(5 * inch, 5 * inch))
    if image:
        canvas.drawImage(
            rgb_image, 1 * inch, 1 * inch, width=1 * inch, height=1 * inch
        )
    if text:
        canvas.drawString(1 * inch, 4 * inch, "Actual text")
    if vector:
        canvas.ellipse(3 * inch, 3 * inch, 4 * inch, 4 * inch)
    canvas.showPage()
    canvas.save()

    analysed = pdfinfo.PdfInfo(outdir / 'dpi.pdf')
    context = Mock()
    context.options.oversample = DUMMY_OVERSAMPLE_RESOLUTION[0]
    context.pageinfo = analysed[0]

    assert _pipeline.get_canvas_square_dpi(context) == result
    assert _pipeline.get_page_square_dpi(context) == result


@pytest.mark.parametrize(
    # Each case is (name, input, output); the name only makes -v output nicer
    'name,input,output',
    (
        ('empty_input', (), ()),
        ('no_values', ('', '', '', '', ''), (((1, 5), None),)),
        (
            'no_empty_values',
            ('v', 'w', 'x', 'y', 'z'),
            (((1, 1), 'v'), ((2, 2), 'w'), ((3, 3), 'x'), ((4, 4), 'y'), ((5, 5), 'z')),
        ),
        (
            'skip_head',
            ('', '', 'x', 'y', 'z'),
            (((1, 2), None), ((3, 3), 'x'), ((4, 4), 'y'), ((5, 5), 'z')),
        ),
        (
            'skip_tail',
            ('x', 'y', 'z', '', ''),
            (((1, 1), 'x'), ((2, 2), 'y'), ((3, 3), 'z'), ((4, 5), None)),
        ),
        (
            'range_in_middle',
            ('x', '', '', '', 'y'),
            (((1, 1), 'x'), ((2, 4), None), ((5, 5), 'y')),
        ),
        (
            'range_in_middle_2',
            ('x', '', '', 'y', '', '', '', 'z'),
            (((1, 1), 'x'), ((2, 3), None), ((4, 4), 'y'), ((5, 7), None), ((8, 8), 'z')),
        ),
    ),
)
def test_enumerate_compress_ranges(name, input, output):
    """enumerate_compress_ranges yields the expected ((start, end), value) pairs."""
    produced = tuple(_pipeline.enumerate_compress_ranges(input))
    assert output == produced


@pytest.mark.parametrize(
    'encodings, expected',
    [
        # No images at all: False
        ([], False),
        # Only JPEG-family encodings, alone or mixed: True
        ([Encoding.jpeg], True),
        ([Encoding.flate_jpeg], True),
        ([Encoding.jpeg, Encoding.flate_jpeg], True),
        # Any non-JPEG encoding present: False
        ([Encoding.flate], False),
        ([Encoding.jpeg, Encoding.flate], False),
        ([Encoding.flate_jpeg, Encoding.flate], False),
    ],
)
def test_should_visible_page_image_use_jpg(encodings, expected):
    """should_visible_page_image_use_jpg treats flate_jpeg the same as jpeg."""
    fake_images = [Mock(enc=encoding) for encoding in encodings]
    fake_pageinfo = Mock()
    fake_pageinfo.images = fake_images
    assert _pipeline.should_visible_page_image_use_jpg(fake_pageinfo) == expected


================================================
FILE: tests/test_pipeline_generate_ocr.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Unit tests for pipeline support of generate_ocr().

These tests verify that the pipeline supports the new generate_ocr() API
alongside the existing hOCR path.
"""

from __future__ import annotations

import dataclasses
from pathlib import Path
from unittest.mock import MagicMock, patch

from ocrmypdf import BoundingBox, OcrElement


class TestOcrEngineDirect:
    """Exercise the ocr_engine_direct() function of the pipeline."""

    def test_ocr_engine_direct_function_exists(self):
        """The _pipeline module exposes an ocr_engine_direct attribute."""
        from ocrmypdf import _pipeline as pipeline_module

        assert hasattr(pipeline_module, 'ocr_engine_direct')

    def test_ocr_engine_direct_returns_tuple(self, tmp_path):
        """With mocked collaborators, ocr_engine_direct returns a 2-tuple."""
        from ocrmypdf._pipeline import ocr_engine_direct

        # Fake engine: claims generate_ocr support and returns a tiny page tree
        fake_page = OcrElement(ocr_class='ocr_page', bbox=BoundingBox(0, 0, 100, 100))
        engine = MagicMock()
        engine.supports_generate_ocr.return_value = True
        engine.generate_ocr.return_value = (fake_page, "test text")

        # Fake page context that hands out the engine above
        context = MagicMock()
        context.plugin_manager.get_ocr_engine.return_value = engine
        context.get_path.return_value = tmp_path / Path("test.txt")
        context.pageno = 0

        # builtins.open is replaced by a mock for the duration of the call
        with patch('builtins.open', MagicMock()):
            outcome = ocr_engine_direct(Path("test.png"), context)

        assert isinstance(outcome, tuple)
        assert len(outcome) == 2


class TestPageResultExtension:
    """Check the ocr_tree field added to the PageResult NamedTuple."""

    def test_page_result_has_ocr_tree_field(self):
        """'ocr_tree' is listed among PageResult's fields."""
        from ocrmypdf._pipelines._common import PageResult

        # NamedTuple classes list their field names in _fields
        field_names = PageResult._fields
        assert 'ocr_tree' in field_names

    def test_page_result_ocr_tree_default_none(self):
        """A PageResult built with only pageno has ocr_tree set to None."""
        from ocrmypdf._pipelines._common import PageResult

        minimal = PageResult(pageno=0)
        assert minimal.ocr_tree is None


class TestFpdf2DirectPage:
    """Check the Fpdf2DirectPage dataclass used for direct OcrElement input."""

    def test_fpdf2_direct_page_exists(self):
        """Fpdf2DirectPage can be imported from ocrmypdf._graft."""
        from ocrmypdf._graft import Fpdf2DirectPage

        assert Fpdf2DirectPage is not None

    def test_fpdf2_direct_page_has_ocr_tree(self):
        """'ocr_tree' is one of Fpdf2DirectPage's dataclass fields."""
        from ocrmypdf._graft import Fpdf2DirectPage

        field_names = {field.name for field in dataclasses.fields(Fpdf2DirectPage)}
        assert 'ocr_tree' in field_names


class TestHOCRResultExtension:
    """Check the ocr_tree field added to the HOCRResult dataclass."""

    def test_hocr_result_has_ocr_tree_field(self):
        """'ocr_tree' is one of HOCRResult's dataclass fields."""
        from ocrmypdf._pipelines._common import HOCRResult

        field_names = {field.name for field in dataclasses.fields(HOCRResult)}
        assert 'ocr_tree' in field_names

    def test_hocr_result_ocr_tree_default_none(self):
        """An HOCRResult built with only pageno has ocr_tree set to None."""
        from ocrmypdf._pipelines._common import HOCRResult

        minimal = HOCRResult(pageno=0)
        assert minimal.ocr_tree is None


================================================
FILE: tests/test_preprocessing.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

from math import isclose

import pytest
from PIL import Image

from ocrmypdf._exec import ghostscript, tesseract
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.helpers import Resolution
from ocrmypdf.pdfinfo import PdfInfo
from ocrmypdf.pluginspec import GhostscriptRasterDevice

from .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf

# Values passed to --pdf-renderer by the parametrized tests in this module.
RENDERERS = ['fpdf2', 'sandwich']


def test_deskew(resources, outdir):
    """After running with -d, the measured skew of skew.pdf is within ±0.5."""
    # Step 1: process the skewed input with deskewing turned on
    straightened_pdf = check_ocrmypdf(
        resources / 'skew.pdf', outdir / 'skew.pdf', '-d'
    )

    # Step 2: rasterize the first page of the result to a mono PNG
    straightened_png = outdir / 'deskewed.png'
    ghostscript.rasterize_pdf(
        straightened_pdf,
        straightened_png,
        raster_device=GhostscriptRasterDevice.PNGMONO,
        raster_dpi=Resolution(150, 150),
        pageno=1,
    )

    # Step 3: measure the skew angle that remains, via the tesseract wrapper
    residual_angle = tesseract.get_deskew(straightened_png, [], None, 5.0)
    print(residual_angle)
    assert -0.5 < residual_angle < 0.5, "Deskewing failed"


def test_deskew_blank_page(resources, outpdf):
    """Running --deskew on blank.pdf completes successfully."""
    # Blank pages are awkward for Tesseract; the run must still get through
    blank_input = resources / 'blank.pdf'
    check_ocrmypdf(blank_input, outpdf, '--deskew')


@pytest.mark.xfail(reason="remove background disabled")
def test_remove_background(resources, outdir):
    """With --remove-background, the output spans pure black to pure white."""
    full_range = ((0, 255), (0, 255), (0, 255))
    colour_input = resources / 'baiona_color.jpg'

    # Precondition: the input does not already span the full range per channel
    with Image.open(colour_input) as original:
        assert original.getextrema() != full_range

    cleaned_pdf = check_ocrmypdf(
        colour_input,
        outdir / 'test_remove_bg.pdf',
        '--remove-background',
        '--image-dpi',
        '150',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    cleaned_png = outdir / 'remove_bg.png'
    ghostscript.rasterize_pdf(
        cleaned_pdf,
        cleaned_png,
        raster_device=GhostscriptRasterDevice.PNG16M,
        raster_dpi=Resolution(100, 100),
        pageno=1,
    )

    # Every channel of the result must now reach both 0 and 255
    with Image.open(cleaned_png) as rendered:
        assert rendered.getextrema() == full_range


# 5 files x 2 renderers x 2 output types = 20 test cases
@pytest.mark.parametrize(
    "pdf", ['palette.pdf', 'cmyk.pdf', 'ccitt.pdf', 'jbig2.pdf', 'lichtenstein.pdf']
)
@pytest.mark.parametrize("renderer", ['sandwich', 'fpdf2'])
@pytest.mark.parametrize("output_type", ['pdf', 'pdfa'])
def test_exotic_image(pdf, renderer, output_type, resources, outdir):
    """Process each exotic input file and confirm a sidecar text file is written."""
    outfile = outdir / f'test_{pdf}_{renderer}.pdf'
    # Add -c to the deskew flag only when have_unpaper() reports it is available
    preprocess_flag = '-dc' if have_unpaper() else '-d'
    cli_args = (
        preprocess_flag,
        '-v',
        '1',
        '--output-type',
        output_type,
        '--sidecar',
        '--skip-text',
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    )
    check_ocrmypdf(resources / pdf, outfile, *cli_args)

    sidecar = outfile.with_suffix('.pdf.txt')
    assert sidecar.exists()


@pytest.mark.parametrize('renderer', RENDERERS)
def test_non_square_resolution(renderer, resources, outpdf):
    """Processing aspect.pdf without --force-ocr leaves its page dpi unchanged."""
    # Precondition: the input's x and y resolutions differ
    before = PdfInfo(resources / 'aspect.pdf')
    assert before[0].dpi.x != before[0].dpi.y

    completed = run_ocrmypdf(
        resources / 'aspect.pdf',
        outpdf,
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    )
    # With Ghostscript >= 10.3 PDF/A conversion can fail for this file, so
    # that one exit code is tolerated; anything else is checked as usual.
    pdfa_failed = completed.returncode == ExitCode.pdfa_conversion_failed
    if not pdfa_failed:
        completed.check_returncode()

    after = PdfInfo(outpdf)

    # The page resolution must come through unchanged
    assert before[0].dpi == after[0].dpi


@pytest.mark.parametrize('renderer', RENDERERS)
def test_convert_to_square_resolution(renderer, resources, outpdf):
    """With --force-ocr, aspect.pdf comes out at square dpi with the page size kept."""
    # Precondition: the input's x and y resolutions differ
    before = PdfInfo(resources / 'aspect.pdf')
    assert before[0].dpi.x != before[0].dpi.y

    # --force-ocr forces conversion to a square resolution
    check_ocrmypdf(
        resources / 'aspect.pdf',
        outpdf,
        '--force-ocr',
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    )

    after = PdfInfo(outpdf)
    page_in = before[0]
    page_out = after[0]

    # The two axes should now have equal resolution
    assert page_out.dpi.x == page_out.dpi.y

    # The physical page size must be the same as the input's
    assert isclose(page_in.width_inches, page_out.width_inches)
    assert isclose(page_in.height_inches, page_out.height_inches)

    # The page was rasterized into a new image, so that image should cover
    # the whole page
    image_width_inches = page_out.images[0].width / page_out.images[0].dpi.x
    image_height_inches = page_out.images[0].height / page_out.images[0].dpi.y
    assert isclose(page_out.width_inches, image_width_inches)
    assert isclose(page_out.height_inches, image_height_inches)


================================================
FILE: tests/test_quality.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

from ocrmypdf import quality as qual


def test_quality_measurement():
    """OcrQualityDictionary de-duplicates its word list and scores sample text."""
    vocabulary = ["words", "words", "quick", "brown", "fox", "dog", "lazy"]
    scorer = qual.OcrQualityDictionary(wordlist=vocabulary)
    assert len(scorer.dictionary) == 6  # "words" appears twice in the list

    pangram = "The quick brown fox jumps quickly over the lazy dog"
    assert scorer.measure_words_matched(pangram) == 0.5

    noisy_text = "12345 10% _f  7fox -brown   | words"
    assert scorer.measure_words_matched(noisy_text) == 1.0

    repeated_word = "quick quick quick"
    assert scorer.measure_words_matched(repeated_word) == 1.0


================================================
FILE: tests/test_rasterizer.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Tests for the --rasterizer CLI option."""

from __future__ import annotations

from io import BytesIO

import img2pdf
import pikepdf
import pytest
from PIL import Image

from ocrmypdf._options import OcrOptions
from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution

from .conftest import check_ocrmypdf

# Probe for the pypdfium2 package: PYPDFIUM_AVAILABLE records whether the
# import succeeded.
try:
    import pypdfium2  # noqa: F401

    PYPDFIUM_AVAILABLE = True
except ImportError:
    PYPDFIUM_AVAILABLE = False


class TestRasterizerOption:
    """Exercise the --rasterizer command line option."""

    @staticmethod
    def _ocr_graph_with(rasterizer, resources, outpdf):
        """Process graph.pdf with the given --rasterizer value and the noop plugin."""
        check_ocrmypdf(
            resources / 'graph.pdf',
            outpdf,
            '--rasterizer',
            rasterizer,
            '--plugin',
            'tests/plugins/tesseract_noop.py',
        )

    def test_rasterizer_auto_default(self, resources, outpdf):
        """--rasterizer auto (the default) completes successfully."""
        self._ocr_graph_with('auto', resources, outpdf)

    def test_rasterizer_ghostscript(self, resources, outpdf):
        """--rasterizer ghostscript completes successfully."""
        self._ocr_graph_with('ghostscript', resources, outpdf)

    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason="pypdfium2 not installed")
    def test_rasterizer_pypdfium(self, resources, outpdf):
        """--rasterizer pypdfium completes successfully when pypdfium2 is installed."""
        self._ocr_graph_with('pypdfium', resources, outpdf)

    def test_rasterizer_invalid(self):
        """OcrOptions raises ValueError for an unknown rasterizer value."""
        with pytest.raises(ValueError, match="rasterizer must be one of"):
            OcrOptions(
                input_file='test.pdf',
                output_file='out.pdf',
                rasterizer='invalid',
            )


class TestRasterizerWithRotation:
    """Combine each --rasterizer choice with --rotate-pages."""

    def test_ghostscript_with_rotation(self, resources, outpdf):
        """Rotate pages while rasterizing with Ghostscript."""
        rotate = ['--rotate-pages', '--rotate-pages-threshold', '0.1']
        plugin = ['--plugin', 'tests/plugins/tesseract_cache.py']
        check_ocrmypdf(
            resources / 'cardinal.pdf',
            outpdf,
            '--rasterizer',
            'ghostscript',
            *rotate,
            *plugin,
        )

    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason="pypdfium2 not installed")
    def test_pypdfium_with_rotation(self, resources, outpdf):
        """Rotate pages while rasterizing with pypdfium."""
        rotate = ['--rotate-pages', '--rotate-pages-threshold', '0.1']
        plugin = ['--plugin', 'tests/plugins/tesseract_cache.py']
        check_ocrmypdf(
            resources / 'cardinal.pdf',
            outpdf,
            '--rasterizer',
            'pypdfium',
            *rotate,
            *plugin,
        )

    def test_auto_with_rotation(self, resources, outpdf):
        """Rotate pages while letting the rasterizer be chosen automatically."""
        rotate = ['--rotate-pages', '--rotate-pages-threshold', '0.1']
        plugin = ['--plugin', 'tests/plugins/tesseract_cache.py']
        check_ocrmypdf(
            resources / 'cardinal.pdf',
            outpdf,
            '--rasterizer',
            'auto',
            *rotate,
            *plugin,
        )


class TestRasterizerHookDirect:
    """Invoke the rasterize_pdf_page hook directly under each rasterizer setting."""

    def test_ghostscript_hook_respects_option(self, resources, tmp_path):
        """Check the hook result when pypdfium is explicitly requested."""
        manager = get_plugin_manager([])
        source = resources / 'graph.pdf'

        # Options that explicitly ask for pypdfium
        requested = OcrOptions(
            input_file=source,
            output_file=tmp_path / 'out.pdf',
            rasterizer='pypdfium',
        )

        target = tmp_path / 'ghostscript_test.png'
        hook_kwargs = {
            'input_file': source,
            'output_file': target,
            'raster_device': 'pngmono',
            'raster_dpi': Resolution(50, 50),
            'page_dpi': Resolution(50, 50),
            'pageno': 1,
            'rotation': 0,
            'filter_vector': False,
            'stop_on_soft_error': True,
            'options': requested,
            'use_cropbox': False,
        }
        outcome = manager.rasterize_pdf_page(**hook_kwargs)

        # With pypdfium requested there are two possible outcomes:
        # - pypdfium2 missing: both plugins decline (ghostscript because
        #   pypdfium was requested, pypdfium because it is not installed),
        #   so the hook yields None
        # - pypdfium2 installed: pypdfium handles the page and returns the path
        if not PYPDFIUM_AVAILABLE:
            assert outcome is None
        else:
            assert outcome == target

    def test_pypdfium_hook_respects_option(self, resources, tmp_path):
        """Check that the page is still rasterized when ghostscript is requested."""
        manager = get_plugin_manager([])
        source = resources / 'graph.pdf'

        # Options that explicitly ask for ghostscript
        requested = OcrOptions(
            input_file=source,
            output_file=tmp_path / 'out.pdf',
            rasterizer='ghostscript',
        )

        target = tmp_path / 'pypdfium_test.png'
        hook_kwargs = {
            'input_file': source,
            'output_file': target,
            'raster_device': 'pngmono',
            'raster_dpi': Resolution(50, 50),
            'page_dpi': Resolution(50, 50),
            'pageno': 1,
            'rotation': 0,
            'filter_vector': False,
            'stop_on_soft_error': True,
            'options': requested,
            'use_cropbox': False,
        }
        outcome = manager.rasterize_pdf_page(**hook_kwargs)

        # Ghostscript is expected to produce the image
        assert outcome == target
        assert target.exists()

    def test_auto_uses_pypdfium_when_available(self, resources, tmp_path):
        """Check that auto mode produces an image at the requested path."""
        manager = get_plugin_manager([])
        source = resources / 'graph.pdf'

        requested = OcrOptions(
            input_file=source,
            output_file=tmp_path / 'out.pdf',
            rasterizer='auto',
        )

        target = tmp_path / 'auto_test.png'
        hook_kwargs = {
            'input_file': source,
            'output_file': target,
            'raster_device': 'pngmono',
            'raster_dpi': Resolution(50, 50),
            'page_dpi': Resolution(50, 50),
            'pageno': 1,
            'rotation': 0,
            'filter_vector': False,
            'stop_on_soft_error': True,
            'options': requested,
            'use_cropbox': False,
        }
        outcome = manager.rasterize_pdf_page(**hook_kwargs)

        assert outcome == target
        assert target.exists()


def _create_gradient_image(width: int, height: int) -> Image.Image:
    """Build an RGB test pattern that makes rasterization errors visible.

    The pattern combines:
    - red falling and blue rising from left to right
    - green rising from top to bottom
    - diagonal bands, 20 pixels wide, brightened by 40 on every channel
      (clamped at 255)
    """
    canvas = Image.new('RGB', (width, height))
    px = canvas.load()

    for row in range(height):
        # Green depends only on the row
        green = int(255 * (row / height))

        for col in range(width):
            # Red and blue depend only on the column
            red = int(255 * (1 - col / width))
            blue = int(255 * (col / width))

            # Every other diagonal band gets the brightened colour
            if ((col + row) // 20) % 2:
                px[col, row] = (
                    min(255, red + 40),
                    min(255, green + 40),
                    min(255, blue + 40),
                )
            else:
                px[col, row] = (red, green, blue)

    return canvas


@pytest.fixture
def pdf_with_nonstandard_boxes(tmp_path):
    """Write a one-page PDF whose MediaBox, CropBox and TrimBox all differ.

    Returns the path of the saved PDF inside tmp_path.
    """
    # Render the gradient pattern to PNG bytes held in memory
    png_buffer = BytesIO()
    _create_gradient_image(200, 300).save(png_buffer, format='PNG')

    # Wrap the PNG in a PDF using a fixed (72, 72) DPI layout
    pdf_buffer = BytesIO()
    img2pdf.convert(
        png_buffer.getvalue(),
        layout_fun=img2pdf.get_fixed_dpi_layout_fun((72, 72)),
        outputstream=pdf_buffer,
        **IMG2PDF_KWARGS,
    )
    pdf_buffer.seek(0)

    # Override the page boxes, each nested inside the previous one
    destination = tmp_path / 'nonstandard_boxes.pdf'
    with pikepdf.open(pdf_buffer) as document:
        first_page = document.pages[0]
        # MediaBox: larger than the image content
        first_page.MediaBox = pikepdf.Array([0, 0, 400, 500])
        # CropBox: smaller; this is what viewers typically show
        first_page.CropBox = pikepdf.Array([50, 50, 350, 450])
        # TrimBox: smaller still; marks the intended trim area
        first_page.TrimBox = pikepdf.Array([75, 75, 325, 425])
        document.save(destination)

    return destination


@pytest.fixture
def pdf_with_negative_mediabox(tmp_path):
    """Write a one-page PDF whose MediaBox starts at negative coordinates.

    Returns the path of the saved PDF inside tmp_path.
    """
    # Render the gradient pattern to PNG bytes held in memory
    png_buffer = BytesIO()
    _create_gradient_image(200, 300).save(png_buffer, format='PNG')

    # Wrap the PNG in a PDF using a fixed (72, 72) DPI layout
    pdf_buffer = BytesIO()
    img2pdf.convert(
        png_buffer.getvalue(),
        layout_fun=img2pdf.get_fixed_dpi_layout_fun((72, 72)),
        outputstream=pdf_buffer,
        **IMG2PDF_KWARGS,
    )
    pdf_buffer.seek(0)

    destination = tmp_path / 'negative_mediabox.pdf'
    with pikepdf.open(pdf_buffer) as document:
        # A negative origin is valid in PDF but unusual
        document.pages[0].MediaBox = pikepdf.Array([-100, -100, 300, 400])
        document.save(destination)

    return destination


class TestRasterizerWithNonStandardBoxes:
    """Test rasterizers with PDFs having nonstandard MediaBox/TrimBox/CropBox."""

    def test_ghostscript_nonstandard_boxes(self, pdf_with_nonstandard_boxes, outpdf):
        """Run a full conversion with Ghostscript on a PDF with differing boxes."""
        check_ocrmypdf(
            pdf_with_nonstandard_boxes,
            outpdf,
            '--rasterizer',
            'ghostscript',
            '--plugin',
            'tests/plugins/tesseract_noop.py',
        )

    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason="pypdfium2 not installed")
    def test_pypdfium_nonstandard_boxes(self, pdf_with_nonstandard_boxes, outpdf):
        """Run a full conversion with pypdfium on a PDF with differing boxes."""
        check_ocrmypdf(
            pdf_with_nonstandard_boxes,
            outpdf,
            '--rasterizer',
            'pypdfium',
            '--plugin',
            'tests/plugins/tesseract_noop.py',
        )

    def test_ghostscript_negative_mediabox(self, pdf_with_negative_mediabox, outpdf):
        """Run a full conversion with Ghostscript on a negative-origin MediaBox."""
        check_ocrmypdf(
            pdf_with_negative_mediabox,
            outpdf,
            '--rasterizer',
            'ghostscript',
            '--plugin',
            'tests/plugins/tesseract_noop.py',
        )

    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason="pypdfium2 not installed")
    def test_pypdfium_negative_mediabox(self, pdf_with_negative_mediabox, outpdf):
        """Run a full conversion with pypdfium on a negative-origin MediaBox."""
        check_ocrmypdf(
            pdf_with_negative_mediabox,
            outpdf,
            '--rasterizer',
            'pypdfium',
            '--plugin',
            'tests/plugins/tesseract_noop.py',
        )

    def test_compare_rasterizers_nonstandard_boxes(
        self, pdf_with_nonstandard_boxes, tmp_path
    ):
        """Check rasterized dimensions for a PDF with nonstandard boxes.

        The page is rasterized at 72 DPI with use_cropbox=False and the image
        size is expected to be 400x500. Ghostscript is always checked;
        pypdfium is checked as well when pypdfium2 is installed.
        """
        pm = get_plugin_manager([])

        def rasterized_size(rasterizer, stem):
            """Rasterize page 1 with the given rasterizer; return the image size."""
            options = OcrOptions(
                input_file=pdf_with_nonstandard_boxes,
                output_file=tmp_path / f'out_{stem}.pdf',
                rasterizer=rasterizer,
            )
            img_path = tmp_path / f'{stem}.png'
            pm.rasterize_pdf_page(
                input_file=pdf_with_nonstandard_boxes,
                output_file=img_path,
                raster_device='png16m',
                raster_dpi=Resolution(72, 72),
                page_dpi=Resolution(72, 72),
                pageno=1,
                rotation=0,
                filter_vector=False,
                stop_on_soft_error=True,
                options=options,
                use_cropbox=False,
            )
            with Image.open(img_path) as im:
                return im.size

        # Assert the Ghostscript result unconditionally, so this test still
        # verifies something when pypdfium2 is not installed.
        gs_size = rasterized_size('ghostscript', 'gs')
        assert gs_size == (400, 500), f"Ghostscript size: {gs_size}"

        if not PYPDFIUM_AVAILABLE:
            return

        pdfium_size = rasterized_size('pypdfium', 'pdfium')
        assert pdfium_size == (400, 500), f"pypdfium size: {pdfium_size}"


class TestRasterizerWithRotationAndBoxes:
    """Test rasterizer + rotation + nonstandard boxes combinations."""

    # The pdf_with_nonstandard_boxes fixture creates a PDF with:
    # - MediaBox: [0, 0, 400, 500] → 400x500 points
    # - CropBox: [50, 50, 350, 450] → 300x400 points
    # - TrimBox: [75, 75, 325, 425] → 250x350 points
    #
    # The tests here rasterize with use_cropbox=False and expect the
    # MediaBox dimensions.
    MEDIABOX_WIDTH = 400
    MEDIABOX_HEIGHT = 500

    # Rotations exercised by every test in this class
    ROTATIONS = (0, 90, 180, 270)

    def _get_expected_size(self, rotation: int) -> tuple[int, int]:
        """Return the expected (width, height) in pixels for a given rotation.

        0 and 180 keep the MediaBox orientation; any other value (the tests
        use 90 and 270) swaps width and height.
        """
        width, height = self.MEDIABOX_WIDTH, self.MEDIABOX_HEIGHT

        if rotation in (0, 180):
            return (width, height)
        else:  # 90, 270
            return (height, width)

    def _rasterize_size(
        self, pm, pdf_path, img_path, rotation: int, options
    ) -> tuple[int, int]:
        """Rasterize page 1 of pdf_path to img_path and return the image size.

        Uses the png16m device at 72 DPI with use_cropbox=False, and asserts
        that the output file was actually written.
        """
        pm.rasterize_pdf_page(
            input_file=pdf_path,
            output_file=img_path,
            raster_device='png16m',
            raster_dpi=Resolution(72, 72),
            page_dpi=Resolution(72, 72),
            pageno=1,
            rotation=rotation,
            filter_vector=False,
            stop_on_soft_error=True,
            options=options,
            use_cropbox=False,
        )
        assert img_path.exists(), f"Failed to rasterize with rotation {rotation}"

        with Image.open(img_path) as img:
            return img.size

    def test_ghostscript_rotation_dimensions(
        self, pdf_with_nonstandard_boxes, tmp_path
    ):
        """Test Ghostscript produces correct dimensions with rotation."""
        pm = get_plugin_manager([])

        options = OcrOptions(
            input_file=pdf_with_nonstandard_boxes,
            output_file=tmp_path / 'out.pdf',
            rasterizer='ghostscript',
        )

        for rotation in self.ROTATIONS:
            size = self._rasterize_size(
                pm,
                pdf_with_nonstandard_boxes,
                tmp_path / f'gs_rot{rotation}.png',
                rotation,
                options,
            )
            expected = self._get_expected_size(rotation)
            # Allow small tolerance for rounding
            assert abs(size[0] - expected[0]) <= 2, (
                f"Width mismatch at {rotation}°: got {size[0]}, "
                f"expected {expected[0]}"
            )
            assert abs(size[1] - expected[1]) <= 2, (
                f"Height mismatch at {rotation}°: got {size[1]}, "
                f"expected {expected[1]}"
            )

    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason="pypdfium2 not installed")
    def test_pypdfium_rotation_dimensions(self, pdf_with_nonstandard_boxes, tmp_path):
        """Test pypdfium produces correct dimensions with rotation."""
        pm = get_plugin_manager([])

        options = OcrOptions(
            input_file=pdf_with_nonstandard_boxes,
            output_file=tmp_path / 'out.pdf',
            rasterizer='pypdfium',
        )

        for rotation in self.ROTATIONS:
            size = self._rasterize_size(
                pm,
                pdf_with_nonstandard_boxes,
                tmp_path / f'pdfium_rot{rotation}.png',
                rotation,
                options,
            )
            expected = self._get_expected_size(rotation)
            # Allow small tolerance for rounding
            assert abs(size[0] - expected[0]) <= 2, (
                f"Width mismatch at {rotation}°: got {size[0]}, "
                f"expected {expected[0]}"
            )
            assert abs(size[1] - expected[1]) <= 2, (
                f"Height mismatch at {rotation}°: got {size[1]}, "
                f"expected {expected[1]}"
            )

    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason="pypdfium2 not installed")
    def test_rasterizers_produce_same_dimensions(
        self, pdf_with_nonstandard_boxes, tmp_path
    ):
        """Verify ghostscript and pypdfium both produce MediaBox dimensions.

        With use_cropbox=False, each rasterizer's output is checked against
        the expected MediaBox size for every rotation, within a 2 pixel
        tolerance. The two outputs are not compared to each other directly.
        """
        pm = get_plugin_manager([])

        # The options do not depend on the rotation, so build them once.
        gs_options = OcrOptions(
            input_file=pdf_with_nonstandard_boxes,
            output_file=tmp_path / 'out.pdf',
            rasterizer='ghostscript',
        )
        pdfium_options = OcrOptions(
            input_file=pdf_with_nonstandard_boxes,
            output_file=tmp_path / 'out.pdf',
            rasterizer='pypdfium',
        )

        for rotation in self.ROTATIONS:
            gs_size = self._rasterize_size(
                pm,
                pdf_with_nonstandard_boxes,
                tmp_path / f'gs_cmp_rot{rotation}.png',
                rotation,
                gs_options,
            )
            pdfium_size = self._rasterize_size(
                pm,
                pdf_with_nonstandard_boxes,
                tmp_path / f'pdfium_cmp_rot{rotation}.png',
                rotation,
                pdfium_options,
            )

            expected = self._get_expected_size(rotation)

            assert abs(gs_size[0] - expected[0]) <= 2, (
                f"GS width at {rotation}°: {gs_size[0]}, "
                f"expected {expected[0]}"
            )
            assert abs(gs_size[1] - expected[1]) <= 2, (
                f"GS height at {rotation}°: {gs_size[1]}, "
                f"expected {expected[1]}"
            )
            assert abs(pdfium_size[0] - expected[0]) <= 2, (
                f"pdfium width at {rotation}°: {pdfium_size[0]}, "
                f"expected {expected[0]}"
            )
            assert abs(pdfium_size[1] - expected[1]) <= 2, (
                f"pdfium height at {rotation}°: {pdfium_size[1]}, "
                f"expected {expected[1]}"
            )


================================================
FILE: tests/test_rotation.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import operator
from io import BytesIO
from math import cos, pi, sin
from os import fspath
from subprocess import run

import img2pdf
import pikepdf
import pytest
from PIL import Image, ImageChops
from reportlab.pdfgen.canvas import Canvas

from ocrmypdf._exec import ghostscript
from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution
from ocrmypdf.pdfinfo import PdfInfo
from ocrmypdf.pluginspec import GhostscriptRasterDevice

from .conftest import check_ocrmypdf, run_ocrmypdf_api

# pylintx: disable=unused-variable

RENDERERS = ['fpdf2', 'sandwich']


def compare_images_monochrome(
    outdir, reference_pdf, reference_pageno, test_pdf, test_pageno
):
    """Return the fraction of pixels that match between two PDF pages.

    Each page is rasterized with Ghostscript's PNGMONO device at 100 DPI into
    a PNG under outdir; a PNG that already exists is reused (its path is
    printed). Both images must be mode '1' and the same size.
    """

    def render_once(pdf, pageno, png):
        # Only rasterize when no cached PNG is present
        if not png.exists():
            ghostscript.rasterize_pdf(
                pdf,
                png,
                raster_device=GhostscriptRasterDevice.PNGMONO,
                raster_dpi=Resolution(100, 100),
                pageno=pageno,
                rotation=0,
            )
        else:
            print(png)

    reference_png = outdir / f'{reference_pdf.name}.ref{reference_pageno:04d}.png'
    test_png = outdir / f'{test_pdf.name}.test{test_pageno:04d}.png'

    render_once(reference_pdf, reference_pageno, reference_png)
    render_once(test_pdf, test_pageno, test_png)

    with Image.open(reference_png) as reference_im, Image.open(test_png) as test_im:
        assert reference_im.mode == test_im.mode == '1'
        assert reference_im.size == test_im.size, "Images must be the same size"

        # XOR leaves 0 where the pixels agree and a set pixel where they differ
        xor_image = ImageChops.logical_xor(reference_im, test_im)

        # getcolors() yields (count, colour) pairs; index the counts by colour
        tally = {}
        for count, color in xor_image.getcolors():
            tally[color] = count

        matching = tally.get(0, 0)  # colour 0: pixels that agree
        differing = tally.get(255, 0)  # colour 255: pixels that differ

        return matching / (matching + differing)


def test_monochrome_comparison(resources, outdir):
    """Self-test of compare_images_monochrome using pages of cardinal.pdf."""
    cardinal = resources / 'cardinal.pdf'

    # A north facing page against a south facing page must score poorly
    mismatch_score = compare_images_monochrome(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=1,
        test_pdf=cardinal,
        test_pageno=3,
    )
    assert mismatch_score < 0.90

    # A page compared against itself must score highly
    identity_score = compare_images_monochrome(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=2,
        test_pdf=cardinal,
        test_pageno=2,
    )
    assert identity_score > 0.95


@pytest.mark.slow
@pytest.mark.parametrize('renderer', RENDERERS)
def test_autorotate(renderer, resources, outdir):
    """After -r, every page of cardinal.pdf should match its north facing page.

    cardinal.pdf holds four copies of one image, one per cardinal direction;
    the rotations are "burned in" rather than tagged with /Rotate.
    """
    source = resources / 'cardinal.pdf'
    result = outdir / 'out.pdf'

    cli_args = ['-r', '-v', '1', '--pdf-renderer', renderer]
    plugin = ['--plugin', 'tests/plugins/tesseract_cache.py']
    check_ocrmypdf(source, result, *cli_args, *plugin)

    for pageno in (1, 2, 3, 4):
        score = compare_images_monochrome(
            outdir,
            reference_pdf=source,
            reference_pageno=1,
            test_pdf=result,
            test_pageno=pageno,
        )
        assert score > 0.95


@pytest.mark.parametrize(
    'threshold, op, comparison_threshold',
    [
        ('1', operator.ge, 0.95),  # Low threshold -> always rotate -> high score
        ('99', operator.le, 0.90),  # High threshold -> never rotate -> low score
    ],
)
def test_autorotate_threshold(threshold, op, comparison_threshold, resources, outdir):
    """Check that --rotate-pages-threshold controls whether page 3 is rotated."""
    source = resources / 'cardinal.pdf'
    result = outdir / 'out.pdf'

    cli_args = ['--rotate-pages-threshold', threshold, '-r']
    plugin = ['--plugin', 'tests/plugins/tesseract_cache.py']
    check_ocrmypdf(source, result, *cli_args, *plugin)

    # Compare output page 3 against the input's page 1
    score = compare_images_monochrome(
        outdir,
        reference_pdf=source,
        reference_pageno=1,
        test_pdf=result,
        test_pageno=3,
    )

    assert op(score, comparison_threshold)


@pytest.mark.parametrize('rasterizer', ['pypdfium', 'ghostscript'])
def test_rotated_skew_timeout(resources, outpdf, rasterizer):
    """Check rotated skew timeout.

    This document contains an image that is rotated 90 into place with a
    /Rotate tag and intentionally skewed by altering the transformation matrix.

    This tests for a bug where the combination of preprocessing and a tesseract
    timeout produced a page whose dimensions did not match the original's.

    The pypdfium case is skipped when pypdfium2 cannot be imported.
    """
    if rasterizer == 'pypdfium':
        # Skip instead of failing when pypdfium2 is not installed
        pytest.importorskip('pypdfium2')

    input_file = resources / 'rotated_skew.pdf'
    in_pageinfo = PdfInfo(input_file)[0]

    assert (
        in_pageinfo.height_pixels < in_pageinfo.width_pixels
    ), "Expected the input page to be landscape"
    assert in_pageinfo.rotation == 90, "Expected a rotated page"

    out = check_ocrmypdf(
        input_file,
        outpdf,
        '--pdf-renderer',
        'fpdf2',
        '--deskew',
        '--tesseract-timeout',
        '0',
        '--rasterizer',
        rasterizer,
    )

    out_pageinfo = PdfInfo(out)[0]
    w, h = out_pageinfo.width_pixels, out_pageinfo.height_pixels

    assert h > w, "Expected the output page to be portrait"

    assert out_pageinfo.rotation == 0, "Expected no page rotation for output"

    # The output's width/height must be the input's height/width
    assert (
        in_pageinfo.width_pixels == h and in_pageinfo.height_pixels == w
    ), "Expected page rotation to be baked in"


@pytest.mark.parametrize('rasterizer', ['pypdfium', 'ghostscript'])
def test_rotate_deskew_ocr_timeout(resources, outdir, rasterizer):
    """Check that deskew still takes effect when the OCR step times out.

    rotated_skew.pdf is processed with --rotate-pages, --deskew and a
    tesseract timeout of 0, then page 1 of the result is compared against
    page 1 of ccitt.pdf.

    The pypdfium case is skipped when pypdfium2 cannot be imported.
    """
    if rasterizer == 'pypdfium':
        # Skip instead of failing when pypdfium2 is not installed
        pytest.importorskip('pypdfium2')

    check_ocrmypdf(
        resources / 'rotated_skew.pdf',
        outdir / 'deskewed.pdf',
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0',
        '--deskew',
        '--tesseract-timeout',
        '0',
        '--pdf-renderer',
        'fpdf2',
        '--rasterizer',
        rasterizer,
    )

    cmp = compare_images_monochrome(
        outdir,
        reference_pdf=resources / 'ccitt.pdf',
        reference_pageno=1,
        test_pdf=outdir / 'deskewed.pdf',
        test_pageno=1,
    )

    # Confirm that the page still got deskewed
    # pypdfium anti-aliases so gets better visual quality, but lower score (0.88)
    # on monochrome comparison; ghostscript looks ugly but gets > 0.95
    assert cmp > 0.85


def make_rotate_test(imagefile, outdir, prefix, image_angle, page_angle, cropbox=None):
    """Build a one-page PDF from an image, with pixel and /Rotate rotations.

    The image is transposed by image_angle (negated and reduced modulo 360
    before being handed to Pillow), converted to a PDF with a fixed (200, 200)
    DPI layout, and the page's /Rotate is set to page_angle. When cropbox is
    truthy it is assigned as the page's CropBox.

    Returns the path of the saved PDF:
    outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'.
    """
    memimg = BytesIO()
    with Image.open(fspath(imagefile)) as im:
        # Pillow's ROTATE_* transposes are counterclockwise, hence the negation.
        ccw_angle = -image_angle % 360
        # Test the normalized angle: Pillow has no ROTATE_0, so any multiple
        # of 360 (not only 0) must skip the transpose.
        if ccw_angle != 0:
            im = im.transpose(getattr(Image.Transpose, f'ROTATE_{ccw_angle}'))
        im.save(memimg, format='PNG')
    memimg.seek(0)
    mempdf = BytesIO()
    img2pdf.convert(
        memimg.read(),
        layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
        outputstream=mempdf,
        **IMG2PDF_KWARGS,
    )
    mempdf.seek(0)
    with pikepdf.open(mempdf) as pdf:
        pdf.pages[0].Rotate = page_angle
        target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
        if cropbox:
            pdf.pages[0].CropBox = cropbox
        pdf.save(target)
        return target


@pytest.mark.slow
@pytest.mark.parametrize('page_angle', (0, 90, 180, 270))
@pytest.mark.parametrize('image_angle', (0, 90, 180, 270))
def test_rotate_page_level(image_angle, page_angle, resources, outdir, caplog):
    """Run --rotate-pages over every image/page angle pairing.

    The output's first page is compared with an unrotated reference PDF.
    """
    typewriter = resources / 'typewriter.png'
    reference = make_rotate_test(typewriter, outdir, 'ref', 0, 0)
    test = make_rotate_test(typewriter, outdir, 'test', image_angle, page_angle)
    out = test.with_suffix('.out.pdf')

    ocr_args = ('-O0', '--rotate-pages', '--rotate-pages-threshold', '0.001')
    exitcode = run_ocrmypdf_api(test, out, *ocr_args)
    assert exitcode == 0, caplog.text

    similarity = compare_images_monochrome(outdir, reference, 1, out, 1)
    assert similarity > 0.2


@pytest.mark.slow
@pytest.mark.parametrize('page_rotate_angle', (0, 90, 180, 270))
def test_page_rotate_tag(page_rotate_angle, resources, outdir, caplog):
    """A misrotated image corrected by /Rotate must still yield OCR text.

    The image is rotated by the negative of the page's /Rotate value, so the
    two cancel out; the output is then searched for expected text.
    """

    def extract_text(filename):
        # pdftotext writes the text to stdout when the output name is '-'
        proc = run(['pdftotext', '-enc', 'UTF-8', filename, '-'], capture_output=True)
        return proc.stdout.strip().decode('utf-8')

    test = make_rotate_test(
        resources / 'crom.png', outdir, 'test', -page_rotate_angle, page_rotate_angle
    )
    out = test.with_suffix('.out.pdf')

    exitcode = run_ocrmypdf_api(test, out, '-O0')
    assert exitcode == 0, caplog.text

    test_text = extract_text(out)
    assert 'is a' in test_text, test_text


@pytest.mark.parametrize('page_rotate_angle', (0, 90, 180, 270))
@pytest.mark.parametrize('renderer', ['sandwich', 'fpdf2'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_rotate_and_crop(
    resources, outdir, page_rotate_angle, renderer, output_type, caplog
):
    """Combine /Rotate with a CropBox across renderers and output types.

    The output's first page must closely match an unrotated reference PDF
    that carries the same CropBox.
    """
    typewriter = resources / 'typewriter.png'
    cropbox = (100, 200, 1000, 800)

    reference = make_rotate_test(typewriter, outdir, 'ref', 0, 0, cropbox)
    test = make_rotate_test(
        typewriter, outdir, 'test', -page_rotate_angle, page_rotate_angle, cropbox
    )
    out = test.with_suffix('.out.pdf')

    rotate_args = ['-O0', '--rotate-pages', '--rotate-pages-threshold', '0']
    output_args = ['--pdf-renderer', renderer, '--output-type', output_type]
    exitcode = run_ocrmypdf_api(
        test, out, *rotate_args, *output_args, '--no-progress-bar'
    )
    assert exitcode == 0, caplog.text

    similarity = compare_images_monochrome(outdir, reference, 1, out, 1)
    assert similarity > 0.9


@pytest.mark.parametrize('rasterizer', ['pypdfium', 'ghostscript'])
def test_rasterize_rotates(resources, tmp_path, rasterizer):
    """Check that the rasterize hook applies the requested rotation.

    graph.pdf page 1 is rasterized at 20 DPI with rotation 90 and 180, and
    the resulting image size is compared with the expected dimensions.

    The pypdfium case is skipped when pypdfium2 cannot be imported.
    """
    from ocrmypdf._options import OcrOptions

    if rasterizer == 'pypdfium':
        # Skip instead of failing when pypdfium2 is not installed
        pytest.importorskip('pypdfium2')

    pm = get_plugin_manager([])

    options = OcrOptions(
        input_file=resources / 'graph.pdf',
        output_file=tmp_path / 'out.pdf',
        rasterizer=rasterizer,
    )

    # (rotation, expected image size) pairs
    cases = [(90, (83, 200)), (180, (200, 83))]

    for rotation, expected_size in cases:
        img = tmp_path / f'img{rotation}.png'
        pm.rasterize_pdf_page(
            input_file=resources / 'graph.pdf',
            output_file=img,
            raster_device=GhostscriptRasterDevice.PNGMONO,
            raster_dpi=Resolution(20, 20),
            page_dpi=Resolution(20, 20),
            pageno=1,
            rotation=rotation,
            filter_vector=False,
            stop_on_soft_error=True,
            options=options,
            use_cropbox=False,
        )
        # Open inside a context manager so the file handle is always closed
        with Image.open(img) as im:
            assert im.size == expected_size, "Image not rotated"


def test_simulated_scan(outdir):
    """Generate a four-page PDF of rotated text and check output orientation.

    Each page carries the text 'Page N' drawn at a different angle. After
    --force-ocr --deskew --rotate-pages, pages 2 and 4 must be landscape and
    pages 1 and 3 must be portrait.
    """
    scan_pdf = outdir / 'fakescan.pdf'
    canvas = Canvas(fspath(scan_pdf), pagesize=(209.8, 297.6))

    # (angle in degrees, x, y) of the text on each page
    page_vars = [(2, 36, 250), (91, 170, 240), (179, 190, 36), (271, 36, 36)]

    for pageno, (angle, x, y) in enumerate(page_vars, start=1):
        text = canvas.beginText()
        text.setFont('Helvetica', 20)

        theta = angle / 180.0 * pi
        cos_a, sin_a = cos(theta), sin(theta)

        text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, x, y)
        text.textOut(f'Page {pageno}')
        canvas.drawText(text)
        canvas.showPage()
    canvas.save()

    check_ocrmypdf(
        scan_pdf,
        outdir / 'out.pdf',
        '--force-ocr',
        '--deskew',
        '--rotate-pages',
        '--plugin',
        'tests/plugins/tesseract_debug_rotate.py',
    )

    landscape_checks = (
        (1, "Wrong orientation: not landscape"),
        (3, "Wrong orientation: Not landscape"),
    )

    with pikepdf.open(outdir / 'out.pdf') as pdf:
        # Landscape: the mediabox's third entry exceeds its fourth
        for index, message in landscape_checks:
            box = pdf.pages[index].mediabox
            assert box[2] > box[3], message

        # Portrait: the mediabox's third entry is below its fourth
        for index in (0, 2):
            box = pdf.pages[index].mediabox
            assert box[2] < box[3], "Wrong orientation: Not portrait"


================================================
FILE: tests/test_semfree.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import sys

import pytest

from ocrmypdf.exceptions import ExitCode

from .conftest import is_linux, run_ocrmypdf_api


@pytest.mark.skipif(not is_linux(), reason='semfree plugin only works on Linux')
@pytest.mark.skipif(
    sys.version_info >= (3, 14),
    reason='semfree plugin only works on Python 3.13 or earlier',
)
def test_semfree(resources, outpdf):
    """Run with the semfree plugin loaded and expect its DeprecationWarning.

    The exit code may be either ``ok`` or ``pdfa_conversion_failed``.
    """
    cli_args = [
        '--skip-text',
        '--skip-big',
        '2',
        '--plugin',
        'ocrmypdf.extra_plugins.semfree',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    accepted = (ExitCode.ok, ExitCode.pdfa_conversion_failed)
    with pytest.warns(DeprecationWarning, match="semfree.py is deprecated"):
        exitcode = run_ocrmypdf_api(resources / 'multipage.pdf', outpdf, *cli_args)
        assert exitcode in accepted


================================================
FILE: tests/test_soft_error.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import os

import pytest

from ocrmypdf.exceptions import ExitCode

from .conftest import run_ocrmypdf_api


def test_raster_continue_on_soft_error(resources, outpdf):
    """Expect ExitCode.ok with --continue-on-soft-render-error.

    Runs with the gs_raster_soft_error test plugin loaded.
    """
    plugin_args = [
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_raster_soft_error.py',
    ]
    result = run_ocrmypdf_api(
        resources / 'francais.pdf',
        outpdf,
        '--continue-on-soft-render-error',
        *plugin_args,
    )
    assert result == ExitCode.ok


def test_raster_stop_on_soft_error(resources, outpdf):
    """Expect ExitCode.child_process_error without the continue flag.

    Runs with the gs_raster_soft_error test plugin loaded.
    """
    plugin_args = [
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_raster_soft_error.py',
    ]
    result = run_ocrmypdf_api(resources / 'francais.pdf', outpdf, *plugin_args)
    assert result == ExitCode.child_process_error


def test_render_continue_on_soft_error(resources, outpdf):
    """Expect ExitCode.ok for PDF/A output with --continue-on-soft-render-error.

    Runs with the gs_render_soft_error test plugin loaded.
    """
    cli_args = [
        '--output-type',
        'pdfa',  # Required to trigger Ghostscript PDF/A generation
        '--continue-on-soft-render-error',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_render_soft_error.py',
    ]
    result = run_ocrmypdf_api(resources / 'francais.pdf', outpdf, *cli_args)
    assert result == ExitCode.ok


@pytest.mark.skipif(os.name == 'nt', reason='Ghostscript on Windows errors out')
def test_render_stop_on_soft_error(resources, outpdf):
    """Expect ExitCode.child_process_error for PDF/A output without the continue flag.

    Runs with the gs_render_soft_error test plugin loaded.
    """
    cli_args = [
        '--output-type',
        'pdfa',  # Required to trigger Ghostscript PDF/A generation
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',
        'tests/plugins/gs_render_soft_error.py',
    ]
    result = run_ocrmypdf_api(resources / 'francais.pdf', outpdf, *cli_args)
    assert result == ExitCode.child_process_error


================================================
FILE: tests/test_stdio.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import os
from subprocess import DEVNULL, PIPE, run

import pytest

from ocrmypdf.helpers import check_pdf

from .conftest import run_ocrmypdf


def test_stdin(ocrmypdf_exec, resources, outpdf):
    """Invoke the ocrmypdf executable with the input PDF supplied on stdin.

    ``check=True`` makes a non-zero exit status fail the test.
    """
    source_path = str(resources / 'francais.pdf')
    target_path = str(outpdf)

    # Equivalent shell command: ocrmypdf - output.pdf < testfile.pdf
    command = ocrmypdf_exec + [
        '-',
        target_path,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    with open(source_path, 'rb') as source:
        run(command, capture_output=True, stdin=source, check=True)


def test_stdout(ocrmypdf_exec, resources, outpdf):
    """Invoke the ocrmypdf executable with '-' as output and validate the result.

    The child's stdout is redirected into the output file, which is then
    passed to ``check_pdf``. Skipped when COV_CORE_DATAFILE is set.
    """
    if 'COV_CORE_DATAFILE' in os.environ:
        pytest.skip("Coverage uses stdout")

    source_path = str(resources / 'francais.pdf')
    target_path = str(outpdf)

    # Equivalent shell command: ocrmypdf francais.pdf - > test_stdout.pdf
    command = ocrmypdf_exec + [
        source_path,
        '-',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    with open(target_path, 'wb') as sink:
        run(command, stdout=sink, stderr=PIPE, stdin=DEVNULL, check=True)

    assert check_pdf(target_path)


@pytest.mark.skipif(os.name == 'nt', reason='Windows does not support /dev/null')
def test_dev_null(resources):
    """Send output to ``os.devnull`` and expect success with empty stdout.

    Skipped when COV_CORE_DATAFILE is set.
    """
    if 'COV_CORE_DATAFILE' in os.environ:
        pytest.skip("Coverage uses stdout")

    cli_args = ['--force-ocr', '--plugin', 'tests/plugins/tesseract_noop.py']
    proc = run_ocrmypdf(resources / 'trivial.pdf', os.devnull, *cli_args)

    assert proc.returncode == 0, "could not send output to /dev/null"
    assert len(proc.stdout) == 0, "wrote to stdout"


================================================
FILE: tests/test_system_font_provider.py
================================================
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Unit tests for SystemFontProvider and ChainedFontProvider."""

from __future__ import annotations

import sys
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from ocrmypdf.font import (
    BuiltinFontProvider,
    ChainedFontProvider,
    SystemFontProvider,
)

# --- SystemFontProvider Platform Detection Tests ---


class TestSystemFontProviderPlatform:
    """Check the platform name SystemFontProvider derives from sys.platform."""

    def test_get_platform_linux(self):
        """sys.platform 'linux' is reported as 'linux'."""
        font_provider = SystemFontProvider()
        with patch.object(sys, 'platform', 'linux'):
            detected = font_provider._get_platform()
            assert detected == 'linux'

    def test_get_platform_darwin(self):
        """sys.platform 'darwin' is reported as 'darwin'."""
        font_provider = SystemFontProvider()
        with patch.object(sys, 'platform', 'darwin'):
            detected = font_provider._get_platform()
            assert detected == 'darwin'

    def test_get_platform_windows(self):
        """sys.platform 'win32' is reported as 'windows'."""
        font_provider = SystemFontProvider()
        with patch.object(sys, 'platform', 'win32'):
            detected = font_provider._get_platform()
            assert detected == 'windows'

    def test_get_platform_freebsd(self):
        """sys.platform 'freebsd13' is reported as 'freebsd'."""
        font_provider = SystemFontProvider()
        with patch.object(sys, 'platform', 'freebsd13'):
            detected = font_provider._get_platform()
            assert detected == 'freebsd'


class TestSystemFontProviderDirectories:
    """Check which font directories SystemFontProvider reports per platform."""

    def test_linux_font_dirs(self):
        """On 'linux' both /usr/share/fonts and /usr/local/share/fonts are listed."""
        font_provider = SystemFontProvider()
        expected = (Path('/usr/share/fonts'), Path('/usr/local/share/fonts'))
        with patch.object(sys, 'platform', 'linux'):
            font_provider._font_dirs = None  # Drop any cached result
            found = font_provider._get_font_dirs()
            for directory in expected:
                assert directory in found

    def test_darwin_font_dirs(self):
        """On 'darwin' both /Library/Fonts and /System/Library/Fonts are listed."""
        font_provider = SystemFontProvider()
        expected = (Path('/Library/Fonts'), Path('/System/Library/Fonts'))
        with patch.object(sys, 'platform', 'darwin'):
            font_provider._font_dirs = None  # Drop any cached result
            found = font_provider._get_font_dirs()
            for directory in expected:
                assert directory in found

    def test_windows_font_dirs_with_windir(self):
        """With WINDIR set, some reported directory contains 'Fonts'."""
        font_provider = SystemFontProvider()
        fake_env = {'WINDIR': r'D:\Windows'}
        with (
            patch.object(sys, 'platform', 'win32'),
            patch.dict('os.environ', fake_env),
        ):
            font_provider._font_dirs = None  # Drop any cached result
            found = font_provider._get_font_dirs()
            # Compare as strings to sidestep Path normalization differences
            # between platforms
            as_text = [str(directory) for directory in found]
            assert any('Fonts' in text for text in as_text)

    def test_windows_font_dirs_default(self):
        """With an empty environment, a Windows...Fonts directory is still listed."""
        font_provider = SystemFontProvider()
        with (
            patch.object(sys, 'platform', 'win32'),
            patch.dict('os.environ', {}, clear=True),
        ):
            font_provider._font_dirs = None  # Drop any cached result
            found = font_provider._get_font_dirs()
            as_text = [str(directory) for directory in found]
            assert any('Windows' in text and 'Fonts' in text for text in as_text)

    def test_windows_font_dirs_with_localappdata(self):
        """With WINDIR and LOCALAPPDATA set, exactly two directories are listed."""
        font_provider = SystemFontProvider()
        fake_env = {
            'WINDIR': r'C:\Windows',
            'LOCALAPPDATA': r'C:\Users\Test\AppData\Local',
        }
        with (
            patch.object(sys, 'platform', 'win32'),
            patch.dict('os.environ', fake_env),
        ):
            font_provider._font_dirs = None  # Drop any cached result
            found = font_provider._get_font_dirs()
            as_text = [str(directory) for directory in found]
            # One directory under WINDIR and one under LOCALAPPDATA
            assert len(found) == 2
            assert any('Windows' in text and 'Fonts' in text for text in as_text)
            assert any(
                'AppData' in text and 'Local' in text and 'Fonts' in text
                for text in as_text
            )

    def test_font_dirs_cached(self):
        """Two consecutive calls return the very same object."""
        font_provider = SystemFontProvider()
        first = font_provider._get_font_dirs()
        second = font_provider._get_font_dirs()
        assert first is second  # Identity, not just equality


class TestSystemFontProviderLazyLoading:
    """Check SystemFontProvider's on-demand lookup and its result caches."""

    def test_no_scanning_on_init(self):
        """A freshly constructed provider has empty caches."""
        font_provider = SystemFontProvider()
        assert len(font_provider._font_cache) == 0
        assert len(font_provider._not_found) == 0

    def test_get_font_unknown_name_returns_none(self):
        """An unknown font name yields None and is recorded in _not_found."""
        font_provider = SystemFontProvider()
        name = 'UnknownFont-Regular'
        assert font_provider.get_font(name) is None
        # The miss is remembered as a negative cache entry
        assert name in font_provider._not_found

    def test_negative_cache(self):
        """A failed lookup is remembered and not searched for again."""
        font_provider = SystemFontProvider()
        name = 'NotoSansCJK-Regular'
        # Make the file search report "nothing found"
        with patch.object(font_provider, '_find_font_file', return_value=None):
            first = font_provider.get_font(name)
            assert first is None
            assert name in font_provider._not_found

            # Install a fresh mock so a second search would be recorded
            second_search = MagicMock(return_value=None)
            font_provider._find_font_file = second_search
            second = font_provider.get_font(name)
            assert second is None
            second_search.assert_not_called()

    def test_positive_cache(self):
        """A successful lookup is cached and returned again without searching."""
        font_provider = SystemFontProvider()
        data_dir = Path(__file__).parent.parent / "src" / "ocrmypdf" / "data"
        font_path = data_dir / "NotoSans-Regular.ttf"

        if not font_path.exists():
            pytest.skip("Test font not available")

        name = 'NotoSans-Regular'
        with patch.object(font_provider, '_find_font_file', return_value=font_path):
            first = font_provider.get_font(name)
            assert first is not None
            assert name in font_provider._font_cache

            # Install a fresh mock so a second search would be recorded
            second_search = MagicMock()
            font_provider._find_font_file = second_search
            second = font_provider.get_font(name)
            assert second is first
            second_search.assert_not_called()


class TestSystemFontProviderAvailableFonts:
    """Check get_available_fonts and get_fallback_font on SystemFontProvider."""

    def test_returns_all_patterns(self):
        """The available-font list contains the expected Noto font names."""
        available = SystemFontProvider().get_available_fonts()
        expected_names = (
            'NotoSans-Regular',
            'NotoSansCJK-Regular',
            'NotoSansArabic-Regular',
            'NotoSansThai-Regular',
        )
        for name in expected_names:
            assert name in available

    def test_fallback_font_raises(self):
        """get_fallback_font raises NotImplementedError."""
        font_provider = SystemFontProvider()
        with pytest.raises(NotImplementedError):
            font_provider.get_fallback_font()


# --- ChainedFontProvider Tests ---


class TestChainedFontProvider:
    """Check ChainedFontProvider against mocked member providers."""

    def test_requires_at_least_one_provider(self):
        """Constructing with an empty list raises ValueError."""
        with pytest.raises(ValueError, match="At least one provider"):
            ChainedFontProvider([])

    def test_get_font_tries_providers_in_order(self):
        """A miss in the first provider falls through to the second."""
        found = MagicMock()
        first = MagicMock(**{'get_font.return_value': None})
        second = MagicMock(**{'get_font.return_value': found})

        result = ChainedFontProvider([first, second]).get_font('TestFont')

        first.get_font.assert_called_once_with('TestFont')
        second.get_font.assert_called_once_with('TestFont')
        assert result is found

    def test_get_font_stops_on_first_match(self):
        """A hit in the first provider means the second is never asked."""
        found = MagicMock()
        first = MagicMock(**{'get_font.return_value': found})
        second = MagicMock()

        result = ChainedFontProvider([first, second]).get_font('TestFont')

        first.get_font.assert_called_once()
        second.get_font.assert_not_called()
        assert result is found

    def test_get_font_returns_none_if_all_fail(self):
        """When every provider misses, the chain returns None."""
        first = MagicMock(**{'get_font.return_value': None})
        second = MagicMock(**{'get_font.return_value': None})

        result = ChainedFontProvider([first, second]).get_font('TestFont')

        assert result is None

    def test_get_available_fonts_combines_providers(self):
        """Font names from all providers are merged without duplicates, in order."""
        first = MagicMock(
            **{'get_available_fonts.return_value': ['Font1', 'Font2']}
        )
        second = MagicMock(
            **{'get_available_fonts.return_value': ['Font2', 'Font3']}
        )

        merged = ChainedFontProvider([first, second]).get_available_fonts()

        # 'Font2' appears once, at its first position
        assert merged == ['Font1', 'Font2', 'Font3']

    def test_get_fallback_font_from_first_provider(self):
        """The first provider's fallback is used and the second is not asked."""
        fallback = MagicMock()
        first = MagicMock(**{'get_fallback_font.return_value': fallback})
        second = MagicMock()

        result = ChainedFontProvider([first, second]).get_fallback_font()

        assert result is fallback
        second.get_fallback_font.assert_not_called()

    def test_get_fallback_font_skips_not_implemented(self):
        """A provider raising NotImplementedError is skipped for the fallback."""
        fallback = MagicMock()
        first = MagicMock(
            **{'get_fallback_font.side_effect': NotImplementedError()}
        )
        second = MagicMock(**{'get_fallback_font.return_value': fallback})

        result = ChainedFontProvider([first, second]).get_fallback_font()

        assert result is fallback

    def test_get_fallback_font_raises_if_none_available(self):
        """RuntimeError is raised when every provider's fallback lookup raises."""
        first = MagicMock(
            **{'get_fallback_font.side_effect': NotImplementedError()}
        )
        second = MagicMock(**{'get_fallback_font.side_effect': KeyError()})

        chain = ChainedFontProvider([first, second])
        with pytest.raises(RuntimeError, match="No fallback font available"):
            chain.get_fallback_font()


class TestChainedFontProviderIntegration:
    """Exercise ChainedFontProvider with real builtin and system providers."""

    @pytest.fixture
    def font_dir(self):
        """Path of src/ocrmypdf/data relative to this test file's parent."""
        tests_parent = Path(__file__).parent.parent
        return tests_parent / "src" / "ocrmypdf" / "data"

    def test_builtin_then_system_chain(self, font_dir):
        """A builtin-then-system chain yields NotoSans-Regular and a fallback."""
        chain = ChainedFontProvider(
            [BuiltinFontProvider(font_dir), SystemFontProvider()]
        )

        # Regular lookup of a named font
        assert chain.get_font('NotoSans-Regular') is not None

        # Fallback lookup
        assert chain.get_fallback_font() is not None

    def test_system_fonts_extend_builtin(self, font_dir):
        """The chain's font names are a superset of the builtin provider's."""
        builtin = BuiltinFontProvider(font_dir)
        chain = ChainedFontProvider([builtin, SystemFontProvider()])

        builtin_names = set(builtin.get_available_fonts())
        chain_names = set(chain.get_available_fonts())

        # Superset comparison: nothing from builtin may be missing
        assert chain_names >= builtin_names


================================================
FILE: tests/test_tagged.py
================================================
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import pytest

import ocrmypdf


def test_block_tagged(resources, outpdf):
    """OCR of tagged.pdf with default options must raise TaggedPDFError.

    The output is directed at the ``outpdf`` fixture (as in the other tests of
    this module) instead of a relative ``'_.pdf'``, so that a regression which
    lets processing continue cannot drop a stray file into the current working
    directory.
    """
    with pytest.raises(ocrmypdf.exceptions.TaggedPDFError):
        ocrmypdf.ocr(resources / 'tagged.pdf', outpdf)


def test_force_tagged_warns(resources, outpdf, caplog):
    """With force_ocr=True a Tagged PDF is processed and a warning is logged."""
    caplog.set_level('WARNING')
    ocr_kwargs = {
        'force_ocr': True,
        'plugins': ['tests/plugins/tesseract_noop.py'],
    }
    ocrmypdf.ocr(resources / 'tagged.pdf', outpdf, **ocr_kwargs)
    assert 'marked as a Tagged PDF' in caplog.text


def test_tagged_pdf_mode_ignore_with_skip_text(resources, outpdf, caplog):
    """tagged_pdf_mode='ignore' with skip_text=True logs the Tagged PDF warning."""
    caplog.set_level('WARNING')
    ocr_kwargs = {
        'tagged_pdf_mode': 'ignore',
        # Tagged PDF has text, so skip pages with text
        'skip_text': True,
        'plugins': ['tests/plugins/tesseract_noop.py'],
    }
    ocrmypdf.ocr(resources / 'tagged.pdf', outpdf, **ocr_kwargs)
    assert 'marked as a Tagged PDF' in caplog.text


def test_tagged_pdf_mode_ignore_with_force(resources, outpdf, caplog):
    """tagged_pdf_mode='ignore' with force_ocr=True logs the Tagged PDF warning."""
    caplog.set_level('WARNING')
    ocr_kwargs = {
        'tagged_pdf_mode': 'ignore',
        'force_ocr': True,
        'plugins': ['tests/plugins/tesseract_noop.py'],
    }
    ocrmypdf.ocr(resources / 'tagged.pdf', outpdf, **ocr_kwargs)
    assert 'marked as a Tagged PDF' in caplog.text


================================================
FILE: tests/test_tesseract.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import logging
import os
import subprocess
from os import fspath
from pathlib import Path

import pytest

from ocrmypdf import pdfinfo
from ocrmypdf._exec import tesseract
from ocrmypdf.exceptions import BadArgsError, MissingDependencyError

from .conftest import check_ocrmypdf, run_ocrmypdf_api

# pylint: disable=redefined-outer-name


@pytest.mark.parametrize('basename', ['graph_ocred.pdf', 'cardinal.pdf'])
def test_skip_pages_does_not_replicate(resources, basename, outdir):
    """Force OCR with a zero tesseract timeout and inspect the output pages.

    Every output page must hold exactly one image and keep the width and
    height (in inches) of the corresponding input page.
    """
    source = resources / basename
    target = outdir / basename

    cli_args = [
        '--pdf-renderer',
        'sandwich',
        '--force-ocr',
        '--tesseract-timeout',
        '0',
    ]
    check_ocrmypdf(source, target, *cli_args)

    source_info = pdfinfo.PdfInfo(source)
    target_info = pdfinfo.PdfInfo(target)

    for target_page in target_info:
        assert len(target_page.images) == 1, "skipped page was replicated"

    for index, target_page in enumerate(target_info):
        source_page = source_info[index]
        assert target_page.width_inches == source_page.width_inches, "output resized"
        assert (
            target_page.height_inches == source_page.height_inches
        ), "output resized"


def test_content_preservation(resources, outpdf):
    """After an fpdf2 run on masks.pdf the first page still has several images."""
    cli_args = ['--pdf-renderer', 'fpdf2', '--tesseract-timeout', '0']
    check_ocrmypdf(resources / 'masks.pdf', outpdf, *cli_args)

    first_page = pdfinfo.PdfInfo(outpdf)[0]
    assert len(first_page.images) > 1, "masks were rasterized"


@pytest.mark.skipif(
    tesseract.version() >= tesseract.TesseractVersion('5'), reason="doesn't fool Tess 5"
)
def test_no_languages(tmp_path, monkeypatch):
    """Point TESSDATA_PREFIX at an empty tessdata folder and expect an error."""
    empty_tessdata = tmp_path / 'tessdata'
    empty_tessdata.mkdir()
    monkeypatch.setenv('TESSDATA_PREFIX', fspath(tmp_path))
    with pytest.raises(MissingDependencyError):
        tesseract.get_languages()


def test_image_too_large_hocr(monkeypatch, resources, outdir):
    """generate_hocr leaves an empty hOCR file when tesseract says 'Image too large'.

    ``tesseract.run`` is replaced by a stub that raises CalledProcessError
    carrying that output.
    """

    def fail_too_large(args, *, env=None, **kwargs):
        raise subprocess.CalledProcessError(1, 'tesseract', output=b'Image too large')

    monkeypatch.setattr(tesseract, 'run', fail_too_large)

    hocr_path = outdir / 'out.hocr'
    tesseract.generate_hocr(
        input_file=resources / 'crom.png',
        output_hocr=hocr_path,
        output_text=outdir / 'out.txt',
        languages=['eng'],
        engine_mode=None,
        tessconfig=[],
        timeout=180.0,
        pagesegmode=None,
        thresholding=0,
        user_words=None,
        user_patterns=None,
    )
    assert Path(hocr_path).read_text() == ''


def test_image_too_large_pdf(monkeypatch, resources, outdir):
    """generate_pdf writes '[skipped page]' text when tesseract says 'Image too large'.

    ``tesseract.run`` is replaced by a stub that raises CalledProcessError
    carrying that output. Except on Windows, the PDF output must be zero bytes.
    """

    def fail_too_large(args, *, env=None, **kwargs):
        raise subprocess.CalledProcessError(1, 'tesseract', output=b'Image too large')

    monkeypatch.setattr(tesseract, 'run', fail_too_large)

    pdf_path = outdir / 'pdf.pdf'
    text_path = outdir / 'txt.txt'
    tesseract.generate_pdf(
        input_file=resources / 'crom.png',
        output_pdf=pdf_path,
        output_text=text_path,
        languages=['eng'],
        engine_mode=None,
        tessconfig=[],
        timeout=180.0,
        pagesegmode=None,
        thresholding=0,
        user_words=None,
        user_patterns=None,
    )
    assert Path(text_path).read_text() == '[skipped page]'
    if os.name != 'nt':  # different semantics
        assert Path(pdf_path).stat().st_size == 0


def test_timeout(caplog):
    """page_timedout logs a message containing 'took too long'."""
    tesseract.page_timedout(5)
    captured = caplog.text
    assert "took too long" in captured


@pytest.mark.parametrize(
    'in_, logged',
    [
        (b'Tesseract Open Source', ''),
        (b'lots of diacritics blah blah', 'diacritics'),
        (b'Warning in pixReadMem', ''),
        (b'OSD: Weak margin', 'unsure about page orientation'),
        (b'Error in pixScanForForeground', ''),
        (b'Error in boxClipToRectangle', ''),
        (b'an unexpected error', 'an unexpected error'),
        (b'a dire warning', 'a dire warning'),
        (b'read_params_file something', 'read_params_file'),
        (b'an innocent message', 'innocent'),
        (b'\x7f\x7f\x80innocent unicode failure', 'innocent'),
    ],
)
def test_tesseract_log_output(caplog, in_, logged):
    """Feed raw tesseract output to tesseract_log_output and check the log.

    An empty ``logged`` means nothing at all may be logged; otherwise
    ``logged`` must appear in the captured log text.
    """
    caplog.set_level(logging.INFO)
    tesseract.tesseract_log_output(in_)
    if logged:
        assert logged in caplog.text
    else:
        assert caplog.text == ''


def test_tesseract_log_output_raises(caplog):
    """'parameter not found' output raises TesseractConfigError and is logged."""
    raw_output = b'parameter not found: moo'
    with pytest.raises(tesseract.TesseractConfigError):
        tesseract.tesseract_log_output(raw_output)
    assert 'not found' in caplog.text


def test_blocked_language(resources, no_outpdf):
    """Requesting the 'osd' or 'equ' language raises BadArgsError."""
    source = resources / 'masks.pdf'
    for rejected in ('osd', 'equ'):
        with pytest.raises(BadArgsError):
            run_ocrmypdf_api(source, no_outpdf, '-l', rejected)


================================================
FILE: tests/test_unpaper.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import logging
from os import fspath
from unittest.mock import Mock, patch

import pytest
from packaging.version import Version
from pydantic import ValidationError

from ocrmypdf._exec import unpaper
from ocrmypdf._validation import check_options
from ocrmypdf.cli import get_options_and_plugins
from ocrmypdf.exceptions import ExitCode, MissingDependencyError

from .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf_api

# pylint: disable=redefined-outer-name

# Decorator for tests in this module: skips the test when have_unpaper()
# returns a false value.
needs_unpaper = pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")


def test_no_unpaper(resources, no_outpdf):
    """--clean fails option checking when the unpaper version lookup raises.

    ``ocrmypdf._exec.unpaper.version`` is patched to raise FileNotFoundError.
    """
    argv = ["--clean", fspath(resources / "c02-22.pdf"), fspath(no_outpdf)]
    options, pm = get_options_and_plugins(argv)

    missing = FileNotFoundError("unpaper")
    with patch("ocrmypdf._exec.unpaper.version", side_effect=missing) as version_mock:
        with pytest.raises(MissingDependencyError):
            check_options(options, pm)
        version_mock.assert_called()


def test_old_unpaper(resources, no_outpdf):
    """--clean fails option checking when unpaper reports version 0.5.

    ``ocrmypdf._exec.unpaper.version`` is patched to return ``Version('0.5')``.
    """
    argv = ["--clean", fspath(resources / "c02-22.pdf"), fspath(no_outpdf)]
    options, pm = get_options_and_plugins(argv)

    reported = Version('0.5')
    with patch("ocrmypdf._exec.unpaper.version", return_value=reported) as version_mock:
        with pytest.raises(MissingDependencyError):
            check_options(options, pm)
        version_mock.assert_called()


def test_unpaper_version_chatter(resources, no_outpdf):
    """Option checking raises MissingDependencyError on chatty version output.

    ``ocrmypdf.subprocess.run`` is patched to return a result whose stdout has
    a warning line ahead of the version number.
    """
    argv = ["--clean", fspath(resources / "c02-22.pdf"), fspath(no_outpdf)]
    options, pm = get_options_and_plugins(argv)

    fake_result = Mock(stdout='Warning: using insecure memory!\n7.0.0\n')
    with patch("ocrmypdf.subprocess.run", return_value=fake_result) as run_mock:
        with pytest.raises(MissingDependencyError):
            check_options(options, pm)
        run_mock.assert_called()


@needs_unpaper
def test_clean(resources, outpdf):
    """Run skew.pdf through ocrmypdf with the -c flag."""
    cli_args = ["-c", '--plugin', 'tests/plugins/tesseract_noop.py']
    check_ocrmypdf(resources / "skew.pdf", outpdf, *cli_args)


@needs_unpaper
def test_unpaper_args_valid(resources, outpdf):
    """Run skew.pdf with -c and a custom --unpaper-args value."""
    cli_args = [
        "-c",
        "--unpaper-args",
        "--layout double",  # Spaces required here
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / "skew.pdf", outpdf, *cli_args)


@needs_unpaper
def test_unpaper_args_invalid_filename(resources, outpdf, caplog):
    """Passing '/etc/passwd' as --unpaper-args raises a ValidationError."""
    cli_args = [
        "-c",
        "--unpaper-args",
        "/etc/passwd",
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    with pytest.raises(ValidationError, match="No filenames allowed"):
        run_ocrmypdf_api(resources / "skew.pdf", outpdf, *cli_args)


@needs_unpaper
def test_unpaper_args_invalid(resources, outpdf):
    """Nonsense --unpaper-args make the run end with child_process_error."""
    cli_args = [
        "-c",
        "--unpaper-args",
        "unpaper is not going to like these arguments",
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    result = run_ocrmypdf_api(resources / "skew.pdf", outpdf, *cli_args)
    # Bad arguments and any other unpaper failure are indistinguishable here,
    # so only the generic child-process error code can be checked
    assert result == ExitCode.child_process_error


@needs_unpaper
def test_unpaper_image_too_big(resources, outdir, caplog):
    """With the pixel limit patched to 42, clean() returns its input path.

    A WARNING record mentioning 'too large for cleaning' must also be logged.
    """
    with patch('ocrmypdf._exec.unpaper.UNPAPER_IMAGE_PIXEL_LIMIT', 42):
        source = resources / 'crom.png'
        cleaned = unpaper.clean(source, outdir / 'out.png', dpi=300)
        assert cleaned == source

        warning_messages = [
            record.message
            for record in caplog.get_records('call')
            if record.levelno == logging.WARNING
        ]
        assert any(
            'too large for cleaning' in message for message in warning_messages
        )


@needs_unpaper
def test_palette_image(resources, outpdf):
    """Run palette.pdf through ocrmypdf with the -c flag."""
    cli_args = ["-c", '--plugin', 'tests/plugins/tesseract_noop.py']
    check_ocrmypdf(resources / "palette.pdf", outpdf, *cli_args)


================================================
FILE: tests/test_userunit.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

from math import isclose

import pytest

from ocrmypdf.pdfinfo import PdfInfo

from .conftest import check_ocrmypdf

# pylint: disable=redefined-outer-name


@pytest.fixture
def poster(resources):
    """Path to poster.pdf inside the resources directory."""
    poster_path = resources / 'poster.pdf'
    return poster_path


@pytest.mark.parametrize("mode", ['pdf', 'pdfa'])
def test_userunit_pdf_passes(mode, poster, outpdf):
    """Page width in inches is unchanged after processing the poster."""
    original = PdfInfo(poster)

    cli_args = [
        f'--output-type={mode}',
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(poster, outpdf, *cli_args)

    processed = PdfInfo(outpdf)
    assert isclose(original[0].width_inches, processed[0].width_inches)


def test_rotate_interaction(poster, outpdf):
    """Process the poster with --rotate-pages and plain PDF output."""
    cli_args = [
        '--output-type=pdf',
        '--rotate-pages',
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(poster, outpdf, *cli_args)


================================================
FILE: tests/test_validation.py
================================================
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import logging
import os
from unittest.mock import patch

import pikepdf
import pytest

from ocrmypdf import _validation as vd
from ocrmypdf._concurrent import NullProgressBar, SerialExecutor
from ocrmypdf._exec.tesseract import TesseractVersion
from ocrmypdf._options import OcrOptions
from ocrmypdf.api import create_options, setup_plugin_infrastructure
from ocrmypdf.cli import get_parser
from ocrmypdf.exceptions import BadArgsError, MissingDependencyError
from ocrmypdf.pdfinfo import PdfInfo

from .conftest import run_ocrmypdf_api


def make_opts_pm(input_file='a.pdf', output_file='b.pdf', language='eng', **kwargs):
    """Build an options object together with its plugin manager.

    ``language`` is forwarded as a keyword option unless it is None. Plugins
    named in ``kwargs['plugins']`` (default: none) are loaded and asked to add
    their options to the parser before the options are created.

    Returns:
        A ``(options, plugin_manager)`` tuple.
    """
    if language is not None:
        kwargs['language'] = language

    parser = get_parser()
    plugin_manager = setup_plugin_infrastructure(plugins=kwargs.get('plugins', []))
    plugin_manager.add_options(parser=parser)

    options = create_options(
        input_file=input_file, output_file=output_file, parser=parser, **kwargs
    )
    return options, plugin_manager


def make_opts(*args, **kwargs):
    """Return only the options part of ``make_opts_pm(*args, **kwargs)``."""
    return make_opts_pm(*args, **kwargs)[0]


def make_ocr_opts(input_file='a.pdf', output_file='b.pdf', **kwargs):
    """Construct OcrOptions directly, for tests of its Pydantic validation."""
    fields = {'input_file': input_file, 'output_file': output_file}
    return OcrOptions(**fields, **kwargs)


def test_old_tesseract_error():
    """check_options raises MissingDependencyError for tesseract 4.00.00alpha.

    ``ocrmypdf._exec.tesseract.version`` is patched to report that version.
    """
    reported = TesseractVersion('4.00.00alpha')
    with patch('ocrmypdf._exec.tesseract.version', return_value=reported):
        with pytest.raises(MissingDependencyError):
            vd.check_options(*make_opts_pm(pdf_renderer='sandwich', language='eng'))


def test_tesseract_not_installed(caplog):
    """Check that option validation raises MissingDependencyError.

    ``ocrmypdf.subprocess.run`` is patched so that calling it raises
    ``FileNotFoundError('tesseract')``; ``check_options`` must then raise
    MissingDependencyError matching "Could not find program", and the patched
    ``run`` must have been called.
    """
    with patch('ocrmypdf.subprocess.run') as not_found:
        not_found.side_effect = FileNotFoundError('tesseract')
        with pytest.raises(MissingDependencyError, match="Could not find program"):
            vd.check_options(*make_opts_pm())
            # NOTE(review): the two assertions below sit inside the
            # pytest.raises block, after the call that is expected to raise,
            # so they are skipped whenever this test passes. Moving them below
            # the block would make them effective -- confirm first that both
            # messages actually reach caplog.
            assert (
                "'tesseract' could not be executed" in caplog.text
            ), "Error message not printed"
            assert 'install' in caplog.text, "Install advice not printed"
        not_found.assert_called()


def test_lossless_redo():
    """Combining redo_ocr with deskew is rejected with a ValueError."""
    conflicting = {'redo_ocr': True, 'deskew': True}
    with pytest.raises(ValueError, match="--redo-ocr.*is not currently compatible"):
        make_ocr_opts(**conflicting)


def test_mutex_options():
    """Any pair of force_ocr / skip_text / redo_ocr is rejected as conflicting."""
    expected_message = "Choose only one of --force-ocr, --skip-text, --redo-ocr"
    conflicting_pairs = (
        {'force_ocr': True, 'skip_text': True},
        {'redo_ocr': True, 'skip_text': True},
        {'redo_ocr': True, 'force_ocr': True},
    )
    for flags in conflicting_pairs:
        with pytest.raises(ValueError, match=expected_message):
            make_ocr_opts(**flags)


def test_optimizing(caplog):
    """Quality settings given alongside ``optimize=0`` trigger an 'ignored' log."""
    opts, pm = make_opts_pm(optimize=0, png_quality=18, jpeg_quality=10)
    vd.check_options(opts, pm)
    assert 'will be ignored because' in caplog.text


def test_pillow_options():
    """``max_image_mpixels`` accepts zero and rejects negative values."""
    # Zero is a valid setting and is stored as given (validated in OcrOptions)
    zero_limit = make_ocr_opts(max_image_mpixels=0)
    assert zero_limit.max_image_mpixels == 0

    # Anything below zero fails validation
    expected_message = "max_image_mpixels must be non-negative"
    with pytest.raises(ValueError, match=expected_message):
        make_ocr_opts(max_image_mpixels=-1)


def test_output_tty():
    """Output file '-' is refused while stdout reports itself as a TTY."""
    with patch('sys.stdout.isatty', return_value=True):
        with pytest.raises(BadArgsError):
            opts = make_opts(output_file='-')
            vd.check_requested_output_file(opts)


def test_report_file_size(tmp_path, caplog):
    """Check what ``report_output_file_size`` logs in several situations."""
    # Suppress logging
    logging.getLogger('pikepdf._qpdf').setLevel(logging.CRITICAL)

    input_pdf = tmp_path / 'a.pdf'
    output_pdf = tmp_path / 'b.pdf'
    pdf = pikepdf.new()
    pdf.save(input_pdf)
    pdf.save(output_pdf)
    opts = make_opts(output_type='pdf')

    # Input and output were saved from the same document: nothing is logged.
    vd.report_output_file_size(opts, input_pdf, output_pdf)
    assert caplog.text == ''
    caplog.clear()

    # The output receives an extra, larger filler entry on top of the input's.
    filler = b'Dummy' * 5000
    pdf.Root.Dummy = filler
    pdf.save(input_pdf)
    pdf.Root.Dummy2 = filler + filler
    pdf.save(output_pdf)

    # A supplied reason shows up in the log output.
    reasons = ['The optional dependency...']
    vd.report_output_file_size(opts, input_pdf, output_pdf, reasons)
    assert 'optional dependency' in caplog.text
    caplog.clear()

    # With an empty reason list a 'No reason' message is logged.
    vd.report_output_file_size(opts, input_pdf, output_pdf, [])
    assert 'No reason' in caplog.text
    caplog.clear()

    # Same check with options built for optimize=0 and a matching reason.
    opts = make_opts(input_pdf, output_pdf, optimize=0, output_type='pdf')
    reasons = ["Optimization was disabled."]
    vd.report_output_file_size(opts, input_pdf, output_pdf, reasons)
    assert 'disabled' in caplog.text
    caplog.clear()


def test_false_action_store_true():
    """``keep_temporary_files`` keeps the truthiness of the value passed in."""
    for requested in (True, False):
        opts = make_opts(keep_temporary_files=requested)
        assert bool(opts.keep_temporary_files) is requested


@pytest.mark.parametrize('progress_bar', [True, False])
def test_no_progress_bar(progress_bar, resources):
    """The progress bar's ``disable`` flag is the inverse of ``progress_bar``."""
    opts, pm = make_opts_pm(
        input_file=(resources / 'trivial.pdf'), progress_bar=progress_bar
    )
    vd.check_options(opts, pm)

    # Every ``disable`` value handed to the progress bar constructor, in order.
    disable_values = []

    class RecordingProgressBar(NullProgressBar):
        def __init__(self, disable, **kwargs):
            disable_values.append(disable)
            super().__init__(disable=disable, **kwargs)

    serial_executor = SerialExecutor(pbar_class=RecordingProgressBar)
    pdfinfo = PdfInfo(
        opts.input_file, progbar=opts.progress_bar, executor=serial_executor
    )

    assert pdfinfo is not None
    # At least one progress bar must have been created, and the most recent
    # one must carry an explicit flag opposite to the requested setting.
    assert disable_values
    last_disable = disable_values[-1]
    assert last_disable is not None and last_disable != progress_bar


def make_version(version):
    """Return a zero-argument callable yielding ``TesseractVersion(version)``.

    The version object is built when the returned callable is invoked, not
    when this factory is called.
    """
    return lambda: TesseractVersion(version)


def test_version_comparison():
    """Exercise ``check_external_program`` with a range of version strings.

    Each case supplies the version "found" and the version needed. Cases
    expected to be rejected must raise ``MissingDependencyError``; all others
    must return without raising.
    """

    def check(program, package, found, needed, *, tesseract_parser=False):
        # Only pass version_parser when the case asks for it, so the other
        # cases keep relying on the function's default parser.
        call_kwargs = {
            'program': program,
            'package': package,
            'version_checker': make_version(found),
            'need_version': needed,
        }
        if tesseract_parser:
            call_kwargs['version_parser'] = TesseractVersion
        vd.check_external_program(**call_kwargs)

    def check_rejected(program, package, found, needed, *, tesseract_parser=False):
        with pytest.raises(MissingDependencyError):
            check(program, package, found, needed, tesseract_parser=tesseract_parser)

    check("dummy_basic", "dummy", '9.0', '8.0.2')
    check("dummy_doubledigit", "dummy", '10.0', '8.0.2')
    check_rejected(
        "tesseract", "tesseract", '4.0.0-beta.1', '4.1.1', tesseract_parser=True
    )
    check(
        "tesseract",
        "tesseract",
        'v5.0.0-alpha.20200201',
        '4.1.1',
        tesseract_parser=True,
    )
    check(
        "tesseract", "tesseract", '5.0.0-rc1.20211030', '4.1.1', tesseract_parser=True
    )
    # Used in some Windows builds
    check("tesseract", "tesseract", 'v4.1.1.20181030', '4.1.1', tesseract_parser=True)
    check("gs", "ghostscript", '10.0', '9.50')
    check_rejected(
        "tesseract", "tesseract", '4.1.1-rc2-25-g9707', '4.1.1', tesseract_parser=True
    )
    check_rejected("dummy_fails", "dummy", '1.0', '2.0')


def test_optional_program_recommended(caplog):
    """A missing program marked ``recommended`` logs a warning and does not raise."""
    caplog.clear()

    def missing_program():
        raise FileNotFoundError('jbig2')

    with caplog.at_level(logging.WARNING):
        vd.check_external_program(
            program="jbig2",
            package="jbig2enc",
            version_checker=missing_program,
            need_version='42',
            required_for='this test case',
            recommended=True,
        )
        recommendation_warnings = [
            message
            for _logger_name, level, message in caplog.record_tuples
            if level == logging.WARNING and "recommended" in message
        ]
        assert recommendation_warnings


def test_pagesegmode_warning(caplog):
    """Requesting ``tesseract_pagesegmode='0'`` logs a 'disable OCR' message."""
    opts = make_opts(tesseract_pagesegmode='0')
    requested_plugins = opts.plugins or []
    pm = setup_plugin_infrastructure(plugins=requested_plugins)
    vd.check_options(opts, pm)
    assert 'disable OCR' in caplog.text


def test_two_languages():
    """``check_options_languages`` accepts two languages listed as available."""
    language_names = ('fakelang1', 'fakelang2')
    opts = create_options(
        input_file='a.pdf',
        output_file='b.pdf',
        parser=get_parser(),
        languages=list(language_names),
    )
    # Hand over a separate list object for the second argument.
    vd.check_options_languages(opts, list(language_names))


def test_sidecar_equals_output(resources, no_outpdf):
    """Using the same path for the output and ``--sidecar`` is a bad-args error."""
    shared_path = no_outpdf
    with pytest.raises(BadArgsError, match=r'--sidecar'):
        run_ocrmypdf_api(
            resources / 'trivial.pdf', shared_path, '--sidecar', shared_path
        )


def test_devnull_sidecar(resources):
    """A bare ``--sidecar`` with output sent to ``os.devnull`` is a bad-args error."""
    expected_message = r'--sidecar.*NUL'
    with pytest.raises(BadArgsError, match=expected_message):
        run_ocrmypdf_api(resources / 'trivial.pdf', os.devnull, '--sidecar')


================================================
FILE: tests/test_verapdf.py
================================================
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: CC-BY-SA-4.0

"""Tests for verapdf wrapper and speculative PDF/A conversion."""

from __future__ import annotations

import pikepdf
import pytest
from pikepdf import Name

from ocrmypdf._exec import verapdf
from ocrmypdf.pdfa import (
    _pdfa_part_conformance,
    add_pdfa_metadata,
    add_srgb_output_intent,
    speculative_pdfa_conversion,
)


class TestVerapdfModule:
    """Checks for the functions exposed by the verapdf wrapper module."""

    def test_output_type_to_flavour(self):
        """Each output type maps to the expected verapdf flavour string."""
        expected_flavours = {
            'pdfa': '2b',
            'pdfa-1': '1b',
            'pdfa-2': '2b',
            'pdfa-3': '3b',
            # Unknown should default to 2b
            'unknown': '2b',
        }
        for output_type, flavour in expected_flavours.items():
            assert verapdf.output_type_to_flavour(output_type) == flavour

    @pytest.mark.skipif(not verapdf.available(), reason='verapdf not installed')
    def test_version(self):
        """The installed verapdf reports a major version of at least 1."""
        assert verapdf.version().major >= 1

    @pytest.mark.skipif(not verapdf.available(), reason='verapdf not installed')
    def test_validate_non_pdfa(self, tmp_path):
        """A blank PDF written by pikepdf fails validation against flavour 2b."""
        blank_pdf = tmp_path / 'test.pdf'
        with pikepdf.new() as pdf:
            pdf.add_blank_page()
            pdf.save(blank_pdf)

        report = verapdf.validate(blank_pdf, '2b')
        assert not report.valid
        assert report.failed_rules > 0


class TestPdfaPartConformance:
    """Checks for the ``_pdfa_part_conformance`` helper."""

    def test_pdfa_part_conformance(self):
        """Each output type maps to the expected (part, conformance) tuple."""
        expected_pairs = {
            'pdfa': ('2', 'B'),
            'pdfa-1': ('1', 'B'),
            'pdfa-2': ('2', 'B'),
            'pdfa-3': ('3', 'B'),
            # Unknown should default to 2B
            'unknown': ('2', 'B'),
        }
        for output_type, part_and_conformance in expected_pairs.items():
            assert _pdfa_part_conformance(output_type) == part_and_conformance


class TestAddPdfaMetadata:
    """Checks for the ``add_pdfa_metadata`` function."""

    def test_add_pdfa_metadata(self, tmp_path):
        """The PDF/A status is set in metadata and survives a save and reopen."""
        pdf_path = tmp_path / 'test.pdf'
        with pikepdf.new() as blank:
            blank.add_blank_page()
            blank.save(pdf_path)

        with pikepdf.open(pdf_path, allow_overwriting_input=True) as pdf:
            add_pdfa_metadata(pdf, '2', 'B')
            # Visible immediately on the in-memory document...
            with pdf.open_metadata() as meta:
                assert meta.pdfa_status == '2B'
            pdf.save(pdf_path)

        # ...and still present once the file is read back from disk.
        with pikepdf.open(pdf_path) as reopened:
            with reopened.open_metadata() as meta:
                assert meta.pdfa_status == '2B'


class TestAddSrgbOutputIntent:
    """Checks for the ``add_srgb_output_intent`` function."""

    def test_add_srgb_output_intent(self, tmp_path):
        """One call yields a single OutputIntent identified as 'sRGB'."""
        pdf_path = tmp_path / 'test.pdf'
        with pikepdf.new() as blank:
            blank.add_blank_page()
            blank.save(pdf_path)

        with pikepdf.open(pdf_path, allow_overwriting_input=True) as pdf:
            add_srgb_output_intent(pdf)
            assert Name.OutputIntents in pdf.Root
            assert len(pdf.Root.OutputIntents) == 1
            first_intent = pdf.Root.OutputIntents[0]
            identifier = first_intent.get(Name.OutputConditionIdentifier)
            assert str(identifier) == 'sRGB'
            pdf.save(pdf_path)

    def test_add_srgb_output_intent_idempotent(self, tmp_path):
        """Calling the function twice still leaves exactly one OutputIntent."""
        pdf_path = tmp_path / 'test.pdf'
        with pikepdf.new() as blank:
            blank.add_blank_page()
            blank.save(pdf_path)

        with pikepdf.open(pdf_path, allow_overwriting_input=True) as pdf:
            # The second call should be a no-op
            for _attempt in range(2):
                add_srgb_output_intent(pdf)
            assert len(pdf.Root.OutputIntents) == 1
            pdf.save(pdf_path)


class TestSpeculativePdfaConversion:
    """Checks for ``speculative_pdfa_conversion``."""

    def test_speculative_conversion_creates_pdfa_structures(self, tmp_path, resources):
        """The converted file has OutputIntents and a '2B' PDF/A status."""
        source = resources / 'graph.pdf'
        destination = tmp_path / 'output.pdf'

        converted = speculative_pdfa_conversion(source, destination, 'pdfa-2')

        assert converted.exists()
        with pikepdf.open(converted) as pdf:
            assert Name.OutputIntents in pdf.Root
            with pdf.open_metadata() as meta:
                assert meta.pdfa_status == '2B'

    def test_speculative_conversion_different_parts(self, tmp_path, resources):
        """Each requested output type yields the matching PDF/A status."""
        source = resources / 'graph.pdf'
        status_by_output_type = {
            'pdfa-1': '1B',
            'pdfa-2': '2B',
            'pdfa-3': '3B',
        }

        for output_type, expected_status in status_by_output_type.items():
            destination = tmp_path / f'output_{output_type}.pdf'
            speculative_pdfa_conversion(source, destination, output_type)

            with pikepdf.open(destination) as pdf:
                with pdf.open_metadata() as meta:
                    assert meta.pdfa_status == expected_status


@pytest.mark.skipif(not verapdf.available(), reason='verapdf not installed')
class TestVerapdfIntegration:
    """Integration checks that only run when verapdf is available."""

    def test_speculative_conversion_validation(self, tmp_path, resources):
        """Run verapdf over the result of a speculative conversion.

        Note: Most test PDFs will fail validation because they have issues
        that require Ghostscript to fix (fonts, colorspaces, etc.). This test
        only verifies that the validation pipeline produces a well-typed
        result, not that the PDF passes.
        """
        source = resources / 'graph.pdf'
        destination = tmp_path / 'output.pdf'
        speculative_pdfa_conversion(source, destination, 'pdfa-2')

        # The converted file can be validated (even if it fails)
        report = verapdf.validate(destination, '2b')
        assert isinstance(report.valid, bool)
        assert isinstance(report.failed_rules, int)


================================================
FILE: tests/test_watcher.py
================================================
from __future__ import annotations

import datetime as dt
import os
import shutil
import subprocess
import sys
import time
from pathlib import Path

import pytest

watchdog = pytest.importorskip('watchdog')


@pytest.mark.parametrize('year_month', [True, False])
def test_watcher(tmp_path, resources, year_month):
    """Run ``misc/watcher.py`` as a subprocess and check it emits an output PDF.

    The watcher is started on fresh input/output/processed directories, a PDF
    is copied into the input directory, and the test then checks that a file
    of the same name appears in the output directory. When ``year_month`` is
    true, ``OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1`` is set in the child environment
    and the file is expected under ``<year>/<month>`` subdirectories.
    """
    input_dir = tmp_path / 'input'
    input_dir.mkdir()
    output_dir = tmp_path / 'output'
    output_dir.mkdir()
    processed_dir = tmp_path / 'processed'
    processed_dir.mkdir()

    env_extra = {'OCR_OUTPUT_DIRECTORY_YEAR_MONTH': '1'} if year_month else {}
    proc = subprocess.Popen(
        [
            sys.executable,
            Path(__file__).parent.parent / 'misc' / 'watcher.py',
            str(input_dir),
            str(output_dir),
            str(processed_dir),
        ],
        cwd=str(tmp_path),
        env=os.environ.copy() | env_extra,
    )
    try:
        # Fixed delays; presumably to let the watcher start up and then
        # process the new file.
        time.sleep(5)

        shutil.copy(resources / 'trivial.pdf', input_dir / 'trivial.pdf')
        time.sleep(5)

        if year_month:
            # Read the date once so year and month come from the same day.
            today = dt.date.today()
            expected_output = (
                output_dir / f'{today.year}' / f'{today.month:02d}' / 'trivial.pdf'
            )
        else:
            expected_output = output_dir / 'trivial.pdf'
        assert expected_output.exists()
    finally:
        # Stop the watcher even when the assertion fails; otherwise the child
        # process would be left running after the test.
        proc.terminate()
        proc.wait()