[
  {
    "path": ".docker/Dockerfile",
    "content": "# SPDX-FileCopyrightText: 2024 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nFROM ubuntu:25.04 AS base\n\nENV LANG=C.UTF-8\nENV TZ=UTC\nRUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections\n\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n  python3 \\\n  python-is-python3\n\nFROM base AS builder\n\n# Note we need leptonica here to build jbig2\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n  build-essential autoconf automake libtool \\\n  libleptonica-dev \\\n  zlib1g-dev \\\n  libffi-dev \\\n  ca-certificates \\\n  curl \\\n  git \\\n  libcairo2-dev \\\n  pkg-config\n\n# Compile and install jbig2\n# Needs libleptonica-dev, zlib1g-dev\nRUN \\\n  mkdir jbig2 \\\n  && curl -L https://github.com/agl/jbig2enc/archive/c0141bf.tar.gz | \\\n  tar xz -C jbig2 --strip-components=1 \\\n  && cd jbig2 \\\n  && ./autogen.sh && ./configure && make && make install \\\n  && cd .. \\\n  && rm -rf jbig2\n\n\nWORKDIR /app\n\n# Copy uv from ghcr\nCOPY --from=ghcr.io/astral-sh/uv:0.9.8 /uv /uvx /bin/\n\nENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy\n\n# Install the project's dependencies using the lockfile and settings\nRUN --mount=type=cache,target=/root/.cache/uv \\\n    --mount=type=bind,source=uv.lock,target=uv.lock \\\n    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \\\n    uv sync --frozen --no-install-project --no-dev\n\n# Then, add the rest of the project source code and install it\n# Installing separately from its dependencies allows optimal layer caching\nCOPY . 
/app\nRUN --mount=type=cache,target=/root/.cache/uv \\\n    uv sync --frozen \\\n        --extra webservice --extra watcher --no-dev \\\n        --no-install-package pyarrow\n\nFROM base\n\nRUN apt-get update && apt-get install -y software-properties-common\n\nRUN add-apt-repository -y ppa:alex-p/tesseract-ocr5\n\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n  ghostscript \\\n  fonts-droid-fallback \\\n  fonts-noto-core \\\n  fonts-noto-cjk \\\n  jbig2dec \\\n  pngquant \\\n  tesseract-ocr \\\n  tesseract-ocr-chi-sim \\\n  tesseract-ocr-deu \\\n  tesseract-ocr-eng \\\n  tesseract-ocr-fra \\\n  tesseract-ocr-por \\\n  tesseract-ocr-spa \\\n  unpaper \\\n  && rm -rf /var/lib/apt/lists/*\n\nWORKDIR /app\n\nCOPY --from=builder /usr/local/lib/ /usr/local/lib/\nCOPY --from=builder /usr/local/bin/ /usr/local/bin/\n\nCOPY --from=builder --chown=app:app /app /app\n\nRUN rm -rf /app/.git && \\\nln -s /app/misc/webservice.py /app/webservice.py && \\\nln -s /app/misc/watcher.py /app/watcher.py\n\nENV PATH=\"/app/.venv/bin:${PATH}\"\n\nENTRYPOINT [\"/app/.venv/bin/ocrmypdf\"]\n\n"
  },
  {
    "path": ".docker/Dockerfile.alpine",
    "content": "# SPDX-FileCopyrightText: 2023 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nFROM alpine:3.23 AS base\n\nENV LANG=C.UTF-8\nENV TZ=UTC\n\nRUN apk add --no-cache \\\n    python3 \\\n    zlib\n\nFROM base AS builder\n\n# Yes it really is python3-dev, and py3-package\nRUN apk add --no-cache \\\n    ca-certificates \\\n    git \\\n    python3-dev \\\n    py3-pyarrow \\\n    curl\n\nWORKDIR /app\n\nCOPY --from=ghcr.io/astral-sh/uv:0.9.8 /uv /uvx /bin/\n\nENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy\n\nRUN uv venv --system-site-packages .venv\n\n# Install the project's dependencies using the lockfile and settings\nRUN --mount=type=cache,target=/root/.cache/uv \\\n    --mount=type=bind,source=uv.lock,target=uv.lock \\\n    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \\\n    uv sync --frozen --no-install-project --no-dev\n\n# Then, add the rest of the project source code and install it\n# Installing separately from its dependencies allows optimal layer caching\nCOPY . /app\nRUN --mount=type=cache,target=/root/.cache/uv \\\n    uv sync --frozen \\\n        --extra webservice --extra watcher --no-dev \\\n        --no-install-package pyarrow\n\nFROM base\n\nRUN apk add --no-cache \\\n    ghostscript \\\n    jbig2dec \\\n    jbig2enc \\\n    pngquant \\\n    tesseract-ocr \\\n    tesseract-ocr-data-chi_sim \\\n    tesseract-ocr-data-deu \\\n    tesseract-ocr-data-eng \\\n    tesseract-ocr-data-fra \\\n    tesseract-ocr-data-osd \\\n    tesseract-ocr-data-por \\\n    tesseract-ocr-data-spa \\\n    font-noto \\\n    ttf-droid \\\n    unpaper \\\n    && rm -rf /var/cache/apk/*\n\nWORKDIR /app\n\nCOPY --from=builder --chown=app:app /app /app\n\nRUN rm -rf /app/.git && \\\n    ln -s /app/misc/webservice.py /app/webservice.py && \\\n    ln -s /app/misc/watcher.py /app/watcher.py\n\nENV PATH=\"/app/.venv/bin:${PATH}\"\n\nENTRYPOINT [\"/app/.venv/bin/ocrmypdf\"]\n"
  },
  {
    "path": ".dockerignore",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n# dotfiles\n.*\n!.coveragerc\n!.dockerignore\n!.git_archival.txt\n!.gitattributes\n!.gitignore\n!.pre-commit-config.yaml\n!.readthedocs.yml\n\n# Dev scratch\n*.ipynb\n**/*.pyc\n/*.pdf\n/*.qdf\n/*.png\n/scratch.py\nIDEAS\nlog/\ntests/resources/private/\ntmp/\nvenv*/\n/debug_tests.py\n*.traineddata\n/private\n\n# Package building\n*.egg-info/\nbuild/\ndist/\nwheelhouse/\npip-wheel-metadata/\n\n# Code coverage\nhtmlcov/\n\n# Docker specific\nbin/\ndocs/\ninclude/\nlib/\n\n# Docker include .git/\n!.git/\n"
  },
  {
    "path": ".git_archival.txt",
    "content": "node: $Format:%H$\nnode-date: $Format:%cI$\ndescribe-name: $Format:%(describe:tags=true)$\nref-names: $Format:%D$\n"
  },
  {
    "path": ".gitattributes",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n# Always use Unix convention for new lines\n* text eol=lf\n\n# These files are binary and should be left untouched\n# (binary is a macro for -text -diff)\n*.jar\tbinary\n*.pdf\tbinary\n*.PDF\tbinary\n*.png\tbinary\n*.jpg\tbinary\n*.bin   binary\n*.afdesign  binary\n*.ttf   binary\n\n.git_archival.txt  export-subst\n"
  },
  {
    "path": ".github/FUNDING.yml",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n# These are supported funding model platforms\n\ngithub: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]\npatreon: # Replace with a single Patreon username\nopen_collective: james-barlow\nko_fi: # Replace with a single Ko-fi username\ntidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel\ncommunity_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry\nliberapay: # Replace with a single Liberapay username\nissuehunt: # Replace with a single IssueHunt username\notechie: # Replace with a single Otechie username\ncustom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/1-bug-report-general.yml",
    "content": "name: Installation, packaging, dependencies\ndescription: Installation, packages, dependencies, \"nothing works\", test suite failures...\ntitle: \"[Bug]: \"\nlabels: [\"triage\"]\nassignees:\n  - jbarlow83\nbody:\n  - type: markdown\n    attributes:\n      value: |\n        Thanks for taking the time to fill out this bug report!\n\n        If your issue involves using OCRmyPDF on specific file(s) and not getting\n        good results, this is the *wrong* issue template. Please use the recommended\n        template to ensure we have enough information to help.\n  - type: textarea\n    id: what-happened\n    attributes:\n      label: What were you trying to do?\n      description: Also tell us, what did you expect to happen?\n      placeholder: Tell us what you see!\n    validations:\n      required: true\n  - type: dropdown\n    id: packaging-system\n    attributes:\n      label: Where are you installing/running from?\n      multiple: true\n      options:\n        - PyPI (pip, poetry, pipx, etc.)\n        - Linux package manager (apt, dnf, etc.)\n        - Windows package manager (chocolatey, etc.)\n        - Homebrew\n        - Docker container\n        - Ubuntu snap\n        - Conda\n        - source build\n    validations:\n      required: true\n  - type: input\n    id: version\n    attributes:\n      label: OCRmyPDF version\n      description: Paste \"ocrmypdf --version\" here\n  - type: dropdown\n    id: operating-system\n    attributes:\n      label: What operating system are you working on?\n      multiple: true\n      options:\n        - Linux\n        - Windows\n        - macOS\n        - BSD\n  - type: input\n    id: os_version\n    attributes:\n      label: Operating system details and version\n  - type: checkboxes\n    attributes:\n      label: Simple sanity checks\n      description: Select all that apply\n      options:\n        - label: Operating system is currently supported by its vendor (not end of life)\n        - label: Python 
version is compatible with OCRmyPDF\n        - label: This issue is not about a specific input file\n  - type: textarea\n    id: logs\n    attributes:\n      label: Relevant log output\n      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.\n      render: plain text\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/2-problem-with-specific-file.yml",
    "content": "name: Problem with specific file\ndescription: Something went wrong while trying to OCR a specific file\ntitle: \"[Bug]: \"\nlabels: [\"triage\"]\nassignees:\n  - jbarlow83\nbody:\n  - type: markdown\n    attributes:\n      value: |\n        Thanks for taking the time to describe this issue with a particular file.\n  - type: textarea\n    id: what-happened\n    attributes:\n      label: Describe the bug\n      description: A clear and concise description of what the bug is.\n      placeholder: Tell us what you see!\n    validations:\n      required: true\n  - type: textarea\n    id: reproduce\n    attributes:\n      label: Steps to reproduce\n      description: Please include steps to reproduce.\n      value: |\n        1. Run ocrmypdf -v1 ...arguments... input.pdf output.pdf\n        2. Open output.pdf\n        3. ...\n      render: plain text\n  - type: textarea\n    id: files\n    attributes:\n      label: Files\n      description: |\n        Please attach the input and output files, or any screenshots that may be helpful.\n\n        If you cannot provide a test file, we probably won't be able to help with the issue.\n        PDF is a complex file format, and there may be technical details in the PDF that are\n        causing the issue. There's really no substitute for a test file.\n\n        We understand files may contain personal or sensitive information. Here are some options:\n        - Try reproducing the issue with a file from the OCRmyPDF test suite. 
(See tests/resources)\n        - Try to create another file in the same way as your private file.\n        - Encrypt the file to OCRmyPDF's private GPG key, and then zip the GPG file.\n        - Use ``qpdf --json yourfile.pdf`` to produce a JSON representation of your file that\n          omits personal information.\n      placeholder: |\n        Drag and drop files here.\n  - type: dropdown\n    id: packaging-system\n    attributes:\n      label: How did you download and install the software?\n      multiple: true\n      options:\n        - PyPI (pip, poetry, pipx, etc.)\n        - Linux package manager (apt, dnf, etc.)\n        - Windows package manager (chocolatey, etc.)\n        - Homebrew\n        - Docker container\n        - Ubuntu snap\n        - Conda\n        - source build\n  - type: input\n    id: version\n    attributes:\n      label: OCRmyPDF version\n      description: Paste \"ocrmypdf --version\" here\n      placeholder: ocrmypdf --version\n  - type: textarea\n    id: logs\n    attributes:\n      label: Relevant log output\n      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.\n      placeholder: Run OCRmyPDF with verbosity `-v1` to get more detailed logging output.\n      render: plain text\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/3-app.yml",
    "content": "name: Problem with third party app that uses OCRmyPDF\ndescription: |\n  For PDF generation issues with third party software such as Paperless-ngx that\n  uses OCRmyPDF to perform OCR or generate PDFs.\ntitle: \"[3rdparty]: \"\nlabels: [\"triage\"]\nassignees:\n  - jbarlow83\nbody:\n  - type: markdown\n    attributes:\n      value: |\n        Thanks for taking the time to describe this issue with a particular file\n        and third party app.\n\n        If you are comfortable using OCRmyPDF, please try to install OCRmyPDF,\n        run it on your file, and see if it works. It's easier for everyone\n        if you can confirm that the issue occurs with OCRmyPDF and not with\n        the third party app.\n  - type: checkboxes\n    attributes:\n      label: Simple sanity checks\n      description: Select all that apply\n      options:\n        - label: This is an issue with an app that uses OCRmyPDF for OCR\n        - label: I am using a recent version of the third party app\n        - label: I will include a file that reproduces the issue\n  - type: input\n    id: thirdparty-app-name-version\n    attributes:\n      label: Third party app name and version\n      description: e.g. Paperless-ngx 2.9.0\n  - type: textarea\n    id: what-happened\n    attributes:\n      label: Describe the bug\n      description: A clear and concise description of what the bug is.\n      placeholder: Tell us what you see!\n    validations:\n      required: true\n  - type: textarea\n    id: reproduce\n    attributes:\n      label: Steps to reproduce\n      description: Please include steps to reproduce.\n      value: |\n        1. Import attached file into Paperless-ngx\n        2. Trigger OCR\n        3. Check log file\n        4. 
...\n      render: plain text\n  - type: textarea\n    id: files\n    attributes:\n      label: Files\n      description: |\n        Please attach the input and output files, or any screenshots that may be helpful.\n\n        If you cannot provide a test file, we probably won't be able to help with the issue.\n        PDF is a complex file format, and there may be technical details in the PDF that are\n        causing the issue. There's really no substitute for a test file.\n\n        We understand files may contain personal or sensitive information. Here are some options:\n        - Try reproducing the issue with a file from the test suite. (See tests/resources)\n        - Try to create another file in the same way as your private file.\n        - Encrypt the file to OCRmyPDF's private GPG key, and then zip the GPG file.\n        - Use ``qpdf --json yourfile.pdf`` to produce a JSON representation of your file that\n          omits personal information.\n      placeholder: |\n        Drag and drop files here.\n  - type: input\n    id: version\n    attributes:\n      label: OCRmyPDF version\n      description: Paste \"ocrmypdf --version\" here\n      placeholder: ocrmypdf --version\n  - type: textarea\n    id: logs\n    attributes:\n      label: Relevant log output\n      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.\n      placeholder: Run OCRmyPDF with verbosity `-v1` to get more detailed logging output.\n      render: plain text\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/4-feature-request.yml",
    "content": "name: Feature request\ndescription: Suggest an idea for this project\ntitle: \"[Feature]: \"\nlabels: [\"enhancement\", \"triage\"]\nassignees:\n  - jbarlow83\nbody:\n  - type: textarea\n    id: feature\n    attributes:\n      label: Describe the proposed feature\n      description: A clear and concise description of what the desired feature is.\n"
  },
  {
    "path": ".github/dependabot.yml",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n# To get started with Dependabot version updates, you'll need to specify which\n# package ecosystems to update and where the package manifests are located.\n# Please see the documentation for all configuration options:\n# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates\n\nversion: 2\nupdates:\n  - package-ecosystem: \"github-actions\" # See documentation for possible values\n    directory: \"/\" # Location of package manifests\n    schedule:\n      interval: \"weekly\"\n"
  },
  {
    "path": ".github/workflows/build.yml",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\nname: Test and deploy\n\non:\n  push:\n    branches:\n      - main\n      - ci\n      - release/*\n      - feature/*\n    paths-ignore:\n      - README*\n  pull_request:\n\njobs:\n  test_linux:\n    name: Test ${{ matrix.os }} with Python ${{ matrix.python }}\n    runs-on: ${{ matrix.os }}\n    strategy:\n      matrix:\n        os: [ubuntu-22.04, ubuntu-24.04]\n        python: [\"3.11\", \"3.12\", \"3.13\", \"3.14\"]\n        include:\n          - os: ubuntu-22.04\n            tesseract_ppa: \"ppa\"\n            python: \"3.11\"\n\n    env:\n      OS: ${{ matrix.os }}\n      PYTHON: ${{ matrix.python }}\n\n    steps:\n      - uses: actions/checkout@v6\n\n      - name: Install uv\n        uses: astral-sh/setup-uv@v7\n        with:\n          version: \"0.9.x\"\n\n      - name: \"Set up Python\"\n        uses: actions/setup-python@v6\n        with:\n          python-version: ${{ matrix.python }}\n\n      - name: Install Tesseract from PPA\n        if: matrix.tesseract_ppa == 'ppa'\n        run: |\n          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5\n\n      - name: Install common packages\n        run: |\n          sudo apt-get update\n          sudo apt-get install -y --no-install-recommends \\\n            curl \\\n            fonts-noto-core \\\n            fonts-noto-cjk \\\n            ghostscript \\\n            jbig2dec \\\n            img2pdf \\\n            libexempi8 \\\n            libffi-dev \\\n            libsm6 libxext6 libxrender-dev \\\n            pngquant \\\n            poppler-utils \\\n            tesseract-ocr \\\n            tesseract-ocr-deu \\\n            tesseract-ocr-eng \\\n            tesseract-ocr-osd \\\n            unpaper \\\n            zlib1g\n\n      - name: Install Python packages\n        run: |\n          uv sync --group test\n\n      - name: Report versions\n        run: |\n          tesseract --version\n          gs 
--version\n          pngquant --version\n          unpaper --version\n          uv run --no-dev img2pdf --version\n\n      - name: Test\n        run: |\n          uv run --no-dev pytest --cov-report xml --cov=ocrmypdf --cov=tests/ -n0 tests/\n\n      - name: Upload coverage to Codecov\n        uses: codecov/codecov-action@v5\n        env:\n          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}\n        with:\n          files: ./coverage.xml\n          env_vars: OS,PYTHON\n\n  test_macos:\n    name: Test macOS\n    runs-on: ${{ matrix.os }}\n    strategy:\n      matrix:\n        os: [macos-latest]\n        python: [\"3.11\", \"3.12\", \"3.13\", \"3.14\"]\n\n    env:\n      OS: ${{ matrix.os }}\n      PYTHON: ${{ matrix.python }}\n\n    steps:\n      - uses: actions/checkout@v6\n\n      - name: Install Homebrew deps\n        continue-on-error: true\n        run: |\n          brew update\n          brew install \\\n            exempi \\\n            ghostscript \\\n            jbig2enc \\\n            openjpeg \\\n            pngquant \\\n            poppler \\\n            tesseract \\\n            verapdf\n\n      - name: Install uv\n        uses: astral-sh/setup-uv@v7\n        with:\n          version: \"0.9.x\"\n\n      - name: \"Set up Python\"\n        uses: actions/setup-python@v6\n        with:\n          python-version: ${{ matrix.python }}\n\n      - name: Install Python packages\n        run: |\n          uv sync --group test\n\n      - name: Report versions\n        run: |\n          tesseract --version\n          gs --version\n          pngquant --version\n          uv run --no-dev img2pdf --version\n\n      - name: Test\n        run: |\n          uv run --no-dev pytest --cov-report xml --cov=ocrmypdf --cov=tests/ -n0 tests/\n\n      - name: Upload coverage to Codecov\n        uses: codecov/codecov-action@v5\n        env:\n          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}\n        with:\n          files: ./coverage.xml\n          env_vars: 
OS,PYTHON\n\n  test_windows:\n    name: Test Windows\n    runs-on: ${{ matrix.os }}\n    strategy:\n      matrix:\n        os: [windows-latest]\n        python: [\"3.11\", \"3.12\", \"3.13\", \"3.14\"]\n\n    env:\n      OS: ${{ matrix.os }}\n      PYTHON: ${{ matrix.python }}\n\n    steps:\n      - uses: actions/checkout@v6\n\n      - name: Install uv\n        uses: astral-sh/setup-uv@v7\n        with:\n          version: \"0.9.x\"\n\n      - name: \"Set up Python\"\n        uses: actions/setup-python@v6\n        with:\n          python-version: ${{ matrix.python }}\n\n      - name: Install system packages\n        run: |\n          choco install --yes --no-progress tesseract\n          choco install --yes --no-progress --ignore-checksums ghostscript --version 9.56.1\n          choco install --yes --no-progress poppler --version=25.11.0\n\n      - name: Install Python packages\n        run: |\n          uv sync --group test\n\n      - name: Test\n        run: |\n          uv run --no-dev pytest --cov-report xml --cov=ocrmypdf --cov=tests/ -n0 tests/\n\n      - name: Upload coverage to Codecov\n        uses: codecov/codecov-action@v5\n        env:\n          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}\n        with:\n          files: ./coverage.xml\n          env_vars: OS,PYTHON\n\n  wheel_sdist_linux:\n    name: Build sdist and wheels\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v6\n\n      - name: Install uv\n        uses: astral-sh/setup-uv@v7\n        with:\n          version: \"0.9.x\"\n\n      - name: Make wheels and sdist\n        run: |\n          uv build --sdist --wheel\n\n      - uses: actions/upload-artifact@v6\n        with:\n          name: artifact\n          path: |\n            ./dist/*.whl\n            ./dist/*.tar.gz\n\n  stage_release:\n    name: Stage release artifacts\n    needs: [wheel_sdist_linux, test_linux, test_macos, test_windows]\n    runs-on: ubuntu-latest\n    if: github.ref == 'refs/heads/main'\n    
permissions:\n      contents: write\n    steps:\n      - uses: actions/checkout@v6\n\n      - uses: actions/download-artifact@v7\n        with:\n          name: artifact\n          path: dist\n\n      - name: Read version from source\n        id: version\n        run: |\n          VERSION=$(python3 -c \"exec(open('src/ocrmypdf/_version.py').read()); print(__version__)\")\n          echo \"version=$VERSION\" >> $GITHUB_OUTPUT\n\n      - name: Create or update draft release\n        env:\n          GITHUB_TOKEN: ${{ github.token }}\n        run: |\n          TAG=\"v${{ steps.version.outputs.version }}\"\n\n          # Delete existing draft release if it exists (ignore errors)\n          gh release delete \"$TAG\" --yes 2>/dev/null || true\n\n          # Create new draft release with all artifacts\n          gh release create \"$TAG\" \\\n            --draft \\\n            --title \"$TAG\" \\\n            --notes \"Draft release - will be updated when tag is pushed\" \\\n            dist/*\n\n  docker_ubuntu:\n    name: Build Ubuntu-based Docker image\n    needs: [wheel_sdist_linux, test_linux, test_macos, test_windows]\n    runs-on: ubuntu-latest\n    if: github.event_name != 'pull_request'\n    steps:\n      - name: Set image tag to release or branch\n        run: echo \"DOCKER_IMAGE_TAG=${GITHUB_REF##*/}\" >> $GITHUB_ENV\n\n      - name: If main, set to latest\n        run: echo 'DOCKER_IMAGE_TAG=latest' >> $GITHUB_ENV\n        if: env.DOCKER_IMAGE_TAG == 'main'\n\n      - name: Set Docker Hub repository to username\n        run: echo \"DOCKER_REPOSITORY=jbarlow83\" >> $GITHUB_ENV\n\n      - name: Set image name\n        run: echo \"DOCKER_IMAGE_NAME=ocrmypdf\" >> $GITHUB_ENV\n\n      - uses: actions/checkout@v6\n\n      - name: Login to Docker Hub\n        uses: docker/login-action@v3\n        with:\n          username: jbarlow83\n          password: ${{ secrets.DOCKERHUB_TOKEN }}\n\n      - name: Set up QEMU\n        uses: docker/setup-qemu-action@v3\n\n      - 
name: Set up Docker Buildx\n        id: buildx\n        uses: docker/setup-buildx-action@v3\n\n      - name: Print image tag\n        run: echo \"Building image ${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}\"\n\n      - name: Build\n        run: |\n          docker buildx build \\\n            --push \\\n            --platform linux/arm64/v8,linux/amd64  \\\n            --tag \"${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}\" \\\n            --tag \"${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}-ubuntu:${DOCKER_IMAGE_TAG}\" \\\n            --file .docker/Dockerfile .\n\n  docker_alpine:\n    name: Build Alpine-based Docker images\n    needs: [wheel_sdist_linux, test_linux, test_macos, test_windows]\n    runs-on: ubuntu-latest\n    if: github.event_name != 'pull_request'\n    steps:\n      - name: Set image tag to release or branch\n        run: echo \"DOCKER_IMAGE_TAG=${GITHUB_REF##*/}\" >> $GITHUB_ENV\n\n      - name: If main, set to latest\n        run: echo 'DOCKER_IMAGE_TAG=latest' >> $GITHUB_ENV\n        if: env.DOCKER_IMAGE_TAG == 'main'\n\n      - name: Set Docker Hub repository to username\n        run: echo \"DOCKER_REPOSITORY=jbarlow83\" >> $GITHUB_ENV\n\n      - name: Set image name\n        run: echo \"DOCKER_IMAGE_NAME=ocrmypdf-alpine\" >> $GITHUB_ENV\n\n      - uses: actions/checkout@v6\n\n      - name: Login to Docker Hub\n        uses: docker/login-action@v3\n        with:\n          username: jbarlow83\n          password: ${{ secrets.DOCKERHUB_TOKEN }}\n\n      - name: Set up Docker Buildx\n        id: buildx\n        uses: docker/setup-buildx-action@v3\n\n      - name: Print image tag\n        run: echo \"Building image ${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}\"\n\n      - name: Build\n        run: |\n          docker buildx build \\\n            --push \\\n            --platform linux/amd64,linux/arm64  \\\n            --tag \"${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}\" \\\n      
      --file .docker/Dockerfile.alpine .\n"
  },
  {
    "path": ".github/workflows/release.yml",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nname: Publish Release\n\non:\n  push:\n    tags:\n      - \"v*\"\n\njobs:\n  publish:\n    name: Publish release\n    runs-on: ubuntu-latest\n    environment:\n      name: release\n      url: https://pypi.org/p/ocrmypdf\n    permissions:\n      contents: write\n      id-token: write\n    steps:\n      - uses: actions/checkout@v6\n\n      - name: Download artifacts from draft release\n        env:\n          GITHUB_TOKEN: ${{ github.token }}\n        run: |\n          mkdir -p dist\n          gh release download \"$GITHUB_REF_NAME\" --dir dist --pattern '*.whl'\n          gh release download \"$GITHUB_REF_NAME\" --dir dist --pattern '*.tar.gz'\n\n      - name: Publish to PyPI\n        uses: pypa/gh-action-pypi-publish@release/v1\n\n      # PyPI doesn't support sigstore publishing, so generate after publishing to PyPI\n      - name: Sign the dists with Sigstore\n        uses: sigstore/gh-action-sigstore-python@v3.2.0\n        with:\n          inputs: |\n            ./dist/*.tar.gz\n            ./dist/*.whl\n\n      - name: Extract release notes\n        run: |\n          VERSION=\"${GITHUB_REF_NAME#v}\"\n          MAJOR=\"${VERSION%%.*}\"\n          MAJOR_PADDED=$(printf \"%02d\" \"$MAJOR\")\n          RELEASE_FILE=\"docs/releasenotes/version${MAJOR_PADDED}.md\"\n\n          python3 << EOF\n          import re\n\n          version = \"${VERSION}\"\n          release_file = \"${RELEASE_FILE}\"\n\n          try:\n              with open(release_file) as f:\n                  content = f.read()\n\n              # Find the section for this version\n              # Match from \"## vX.Y.Z\" until the next \"## v\" or end of file\n              pattern = rf\"## v{re.escape(version)}\\n(.*?)(?=\\n## v|\\Z)\"\n              match = re.search(pattern, content, re.DOTALL)\n              notes = match.group(1).strip() if match else \"\"\n          except FileNotFoundError:\n     
         notes = \"\"\n\n          with open(\"release_notes.md\", \"w\") as f:\n              f.write(notes)\n          EOF\n\n      - name: Publish release (convert draft to published)\n        env:\n          GITHUB_TOKEN: ${{ github.token }}\n        run: |\n          # Update release: remove draft status, add release notes\n          gh release edit \"$GITHUB_REF_NAME\" \\\n            --draft=false \\\n            --notes-file release_notes.md\n\n          # Upload signatures to the release\n          gh release upload \"$GITHUB_REF_NAME\" dist/*.sigstore.json --clobber\n\n  docker_tag:\n    name: Tag Docker images with release version\n    needs: [publish]\n    runs-on: ubuntu-latest\n    steps:\n      - name: Login to Docker Hub\n        uses: docker/login-action@v3\n        with:\n          username: jbarlow83\n          password: ${{ secrets.DOCKERHUB_TOKEN }}\n\n      - name: Set up Docker Buildx\n        uses: docker/setup-buildx-action@v3\n\n      - name: Tag ocrmypdf (Ubuntu) image\n        run: |\n          docker buildx imagetools create \\\n            --tag \"jbarlow83/ocrmypdf:$GITHUB_REF_NAME\" \\\n            \"jbarlow83/ocrmypdf:latest\"\n\n      - name: Tag ocrmypdf-ubuntu image\n        run: |\n          docker buildx imagetools create \\\n            --tag \"jbarlow83/ocrmypdf-ubuntu:$GITHUB_REF_NAME\" \\\n            \"jbarlow83/ocrmypdf-ubuntu:latest\"\n\n      - name: Tag ocrmypdf-alpine image\n        run: |\n          docker buildx imagetools create \\\n            --tag \"jbarlow83/ocrmypdf-alpine:$GITHUB_REF_NAME\" \\\n            \"jbarlow83/ocrmypdf-alpine:latest\"\n"
  },
  {
    "path": ".github/workflows/triage.yml",
    "content": "# SPDX-FileCopyrightText: 2024 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nname: Remove Triage Label on Reply\n\non:\n  issue_comment:\n    types:\n      - created\n\njobs:\n  remove-triage-label:\n    runs-on: ubuntu-latest\n\n    steps:\n      - name: Check if comment is by the repository owner\n        id: check_comment\n        run: |\n          echo \"::set-output name=is_owner::$(\n            if [[ '${{ github.event.comment.user.login }}' == 'jbarlow83' ]]; then\n              echo 'true';\n            else\n              echo 'false';\n            fi\n          )\"\n\n      - name: Remove 'triage' label\n        if: ${{ steps.check_comment.outputs.is_owner == 'true' }}\n        uses: actions-ecosystem/action-remove-labels@v1\n        with:\n          github_token: ${{ secrets.GITHUB_TOKEN }}\n          labels: triage\n"
  },
  {
    "path": ".gitignore",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: CC-BY-SA-4.0\n\n# dotfiles\n.coverage\n.venv*/\n.tox/\n.vscode/\n.hypothesis/\n.ipynb_checkpoints/\n.mypy_cache/\n.pytest_cache/\n\n# Dev scratch\n*.ipynb\n**/*.pyc\n/*.pdf\n/*.qdf\n/*.png\n/scratch.py\nIDEAS\nlog/\ntests/resources/private/\ntmp/\nvenv*/\n/debug_tests.py\n*.traineddata\n/private\n/coverage.xml\n/issuepdf\n\n# Package building\n*.egg-info/\nbuild/\ndist/\nwheelhouse/\npip-wheel-metadata/\n\n# Code coverage\nhtmlcov/\n\n# Automatically generated files\ndocs/_build/\ndocs/_static/\ndocs/_templates/\ndocs/Makefile\n\n.idea/\n.aider*\nCLAUDE.md\n.claude/"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nrepos:\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v4.4.0\n    hooks:\n      - id: check-case-conflict\n      - id: check-merge-conflict\n      - id: check-toml\n      - id: check-yaml\n      - id: debug-statements\n  - repo: https://github.com/astral-sh/ruff-pre-commit\n    rev: \"v0.14.11\"\n    hooks:\n      - id: ruff-check\n        args: [--fix]\n      - id: ruff-format\n  - repo: https://github.com/pre-commit/mirrors-mypy\n    rev: v1.2.0\n    hooks:\n      - id: mypy\n        additional_dependencies:\n          - types-toml\n          - types-setuptools\n          - types-requests\n          - types-Pillow\n"
  },
  {
    "path": ".readthedocs.yaml",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n# Read the Docs configuration file\n# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details\n\n# Required\nversion: 2\n\n# Build documentation in the docs/ directory with Sphinx\nsphinx:\n  configuration: docs/conf.py\n\n# Optionally set the version of Python and requirements required to build your docs\nbuild:\n  os: ubuntu-22.04\n  tools:\n    python: \"3.13\"\n  jobs:\n    pre_create_environment:\n      - asdf plugin add uv\n      - asdf install uv latest\n      - asdf global uv latest\n    create_environment:\n      - uv venv \"${READTHEDOCS_VIRTUALENV_PATH}\"\n    install:\n      - UV_PROJECT_ENVIRONMENT=\"${READTHEDOCS_VIRTUALENV_PATH}\" uv sync --frozen --group docs"
  },
  {
    "path": "LICENSE",
    "content": "Mozilla Public License Version 2.0\n==================================\n\n1. Definitions\n--------------\n\n1.1. \"Contributor\"\n    means each individual or legal entity that creates, contributes to\n    the creation of, or owns Covered Software.\n\n1.2. \"Contributor Version\"\n    means the combination of the Contributions of others (if any) used\n    by a Contributor and that particular Contributor's Contribution.\n\n1.3. \"Contribution\"\n    means Covered Software of a particular Contributor.\n\n1.4. \"Covered Software\"\n    means Source Code Form to which the initial Contributor has attached\n    the notice in Exhibit A, the Executable Form of such Source Code\n    Form, and Modifications of such Source Code Form, in each case\n    including portions thereof.\n\n1.5. \"Incompatible With Secondary Licenses\"\n    means\n\n    (a) that the initial Contributor has attached the notice described\n        in Exhibit B to the Covered Software; or\n\n    (b) that the Covered Software was made available under the terms of\n        version 1.1 or earlier of the License, but not also under the\n        terms of a Secondary License.\n\n1.6. \"Executable Form\"\n    means any form of the work other than Source Code Form.\n\n1.7. \"Larger Work\"\n    means a work that combines Covered Software with other material, in\n    a separate file or files, that is not Covered Software.\n\n1.8. \"License\"\n    means this document.\n\n1.9. \"Licensable\"\n    means having the right to grant, to the maximum extent possible,\n    whether at the time of the initial grant or subsequently, any and\n    all of the rights conveyed by this License.\n\n1.10. \"Modifications\"\n    means any of the following:\n\n    (a) any file in Source Code Form that results from an addition to,\n        deletion from, or modification of the contents of Covered\n        Software; or\n\n    (b) any new file in Source Code Form that contains any Covered\n        Software.\n\n1.11. 
\"Patent Claims\" of a Contributor\n    means any patent claim(s), including without limitation, method,\n    process, and apparatus claims, in any patent Licensable by such\n    Contributor that would be infringed, but for the grant of the\n    License, by the making, using, selling, offering for sale, having\n    made, import, or transfer of either its Contributions or its\n    Contributor Version.\n\n1.12. \"Secondary License\"\n    means either the GNU General Public License, Version 2.0, the GNU\n    Lesser General Public License, Version 2.1, the GNU Affero General\n    Public License, Version 3.0, or any later versions of those\n    licenses.\n\n1.13. \"Source Code Form\"\n    means the form of the work preferred for making modifications.\n\n1.14. \"You\" (or \"Your\")\n    means an individual or a legal entity exercising rights under this\n    License. For legal entities, \"You\" includes any entity that\n    controls, is controlled by, or is under common control with You. For\n    purposes of this definition, \"control\" means (a) the power, direct\n    or indirect, to cause the direction or management of such entity,\n    whether by contract or otherwise, or (b) ownership of more than\n    fifty percent (50%) of the outstanding shares or beneficial\n    ownership of such entity.\n\n2. License Grants and Conditions\n--------------------------------\n\n2.1. 
Grants\n\nEach Contributor hereby grants You a world-wide, royalty-free,\nnon-exclusive license:\n\n(a) under intellectual property rights (other than patent or trademark)\n    Licensable by such Contributor to use, reproduce, make available,\n    modify, display, perform, distribute, and otherwise exploit its\n    Contributions, either on an unmodified basis, with Modifications, or\n    as part of a Larger Work; and\n\n(b) under Patent Claims of such Contributor to make, use, sell, offer\n    for sale, have made, import, and otherwise transfer either its\n    Contributions or its Contributor Version.\n\n2.2. Effective Date\n\nThe licenses granted in Section 2.1 with respect to any Contribution\nbecome effective for each Contribution on the date the Contributor first\ndistributes such Contribution.\n\n2.3. Limitations on Grant Scope\n\nThe licenses granted in this Section 2 are the only rights granted under\nthis License. No additional rights or licenses will be implied from the\ndistribution or licensing of Covered Software under this License.\nNotwithstanding Section 2.1(b) above, no patent license is granted by a\nContributor:\n\n(a) for any code that a Contributor has removed from Covered Software;\n    or\n\n(b) for infringements caused by: (i) Your and any other third party's\n    modifications of Covered Software, or (ii) the combination of its\n    Contributions with other software (except as part of its Contributor\n    Version); or\n\n(c) under Patent Claims infringed by Covered Software in the absence of\n    its Contributions.\n\nThis License does not grant any rights in the trademarks, service marks,\nor logos of any Contributor (except as may be necessary to comply with\nthe notice requirements in Section 3.4).\n\n2.4. 
Subsequent Licenses\n\nNo Contributor makes additional grants as a result of Your choice to\ndistribute the Covered Software under a subsequent version of this\nLicense (see Section 10.2) or under the terms of a Secondary License (if\npermitted under the terms of Section 3.3).\n\n2.5. Representation\n\nEach Contributor represents that the Contributor believes its\nContributions are its original creation(s) or it has sufficient rights\nto grant the rights to its Contributions conveyed by this License.\n\n2.6. Fair Use\n\nThis License is not intended to limit any rights You have under\napplicable copyright doctrines of fair use, fair dealing, or other\nequivalents.\n\n2.7. Conditions\n\nSections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted\nin Section 2.1.\n\n3. Responsibilities\n-------------------\n\n3.1. Distribution of Source Form\n\nAll distribution of Covered Software in Source Code Form, including any\nModifications that You create or to which You contribute, must be under\nthe terms of this License. You must inform recipients that the Source\nCode Form of the Covered Software is governed by the terms of this\nLicense, and how they can obtain a copy of this License. You may not\nattempt to alter or restrict the recipients' rights in the Source Code\nForm.\n\n3.2. 
Distribution of Executable Form\n\nIf You distribute Covered Software in Executable Form then:\n\n(a) such Covered Software must also be made available in Source Code\n    Form, as described in Section 3.1, and You must inform recipients of\n    the Executable Form how they can obtain a copy of such Source Code\n    Form by reasonable means in a timely manner, at a charge no more\n    than the cost of distribution to the recipient; and\n\n(b) You may distribute such Executable Form under the terms of this\n    License, or sublicense it under different terms, provided that the\n    license for the Executable Form does not attempt to limit or alter\n    the recipients' rights in the Source Code Form under this License.\n\n3.3. Distribution of a Larger Work\n\nYou may create and distribute a Larger Work under terms of Your choice,\nprovided that You also comply with the requirements of this License for\nthe Covered Software. If the Larger Work is a combination of Covered\nSoftware with a work governed by one or more Secondary Licenses, and the\nCovered Software is not Incompatible With Secondary Licenses, this\nLicense permits You to additionally distribute such Covered Software\nunder the terms of such Secondary License(s), so that the recipient of\nthe Larger Work may, at their option, further distribute the Covered\nSoftware under the terms of either this License or such Secondary\nLicense(s).\n\n3.4. Notices\n\nYou may not remove or alter the substance of any license notices\n(including copyright notices, patent notices, disclaimers of warranty,\nor limitations of liability) contained within the Source Code Form of\nthe Covered Software, except that You may alter any license notices to\nthe extent required to remedy known factual inaccuracies.\n\n3.5. Application of Additional Terms\n\nYou may choose to offer, and to charge a fee for, warranty, support,\nindemnity or liability obligations to one or more recipients of Covered\nSoftware. 
However, You may do so only on Your own behalf, and not on\nbehalf of any Contributor. You must make it absolutely clear that any\nsuch warranty, support, indemnity, or liability obligation is offered by\nYou alone, and You hereby agree to indemnify every Contributor for any\nliability incurred by such Contributor as a result of warranty, support,\nindemnity or liability terms You offer. You may include additional\ndisclaimers of warranty and limitations of liability specific to any\njurisdiction.\n\n4. Inability to Comply Due to Statute or Regulation\n---------------------------------------------------\n\nIf it is impossible for You to comply with any of the terms of this\nLicense with respect to some or all of the Covered Software due to\nstatute, judicial order, or regulation then You must: (a) comply with\nthe terms of this License to the maximum extent possible; and (b)\ndescribe the limitations and the code they affect. Such description must\nbe placed in a text file included with all distributions of the Covered\nSoftware under this License. Except to the extent prohibited by statute\nor regulation, such description must be sufficiently detailed for a\nrecipient of ordinary skill to be able to understand it.\n\n5. Termination\n--------------\n\n5.1. The rights granted under this License will terminate automatically\nif You fail to comply with any of its terms. However, if You become\ncompliant, then the rights granted under this License from a particular\nContributor are reinstated (a) provisionally, unless and until such\nContributor explicitly and finally terminates Your grants, and (b) on an\nongoing basis, if such Contributor fails to notify You of the\nnon-compliance by some reasonable means prior to 60 days after You have\ncome back into compliance. 
Moreover, Your grants from a particular\nContributor are reinstated on an ongoing basis if such Contributor\nnotifies You of the non-compliance by some reasonable means, this is the\nfirst time You have received notice of non-compliance with this License\nfrom such Contributor, and You become compliant prior to 30 days after\nYour receipt of the notice.\n\n5.2. If You initiate litigation against any entity by asserting a patent\ninfringement claim (excluding declaratory judgment actions,\ncounter-claims, and cross-claims) alleging that a Contributor Version\ndirectly or indirectly infringes any patent, then the rights granted to\nYou by any and all Contributors for the Covered Software under Section\n2.1 of this License shall terminate.\n\n5.3. In the event of termination under Sections 5.1 or 5.2 above, all\nend user license agreements (excluding distributors and resellers) which\nhave been validly granted by You or Your distributors under this License\nprior to termination shall survive termination.\n\n************************************************************************\n*                                                                      *\n*  6. Disclaimer of Warranty                                           *\n*  -------------------------                                           *\n*                                                                      *\n*  Covered Software is provided under this License on an \"as is\"       *\n*  basis, without warranty of any kind, either expressed, implied, or  *\n*  statutory, including, without limitation, warranties that the       *\n*  Covered Software is free of defects, merchantable, fit for a        *\n*  particular purpose or non-infringing. The entire risk as to the     *\n*  quality and performance of the Covered Software is with You.        
*\n*  Should any Covered Software prove defective in any respect, You     *\n*  (not any Contributor) assume the cost of any necessary servicing,   *\n*  repair, or correction. This disclaimer of warranty constitutes an   *\n*  essential part of this License. No use of any Covered Software is   *\n*  authorized under this License except under this disclaimer.         *\n*                                                                      *\n************************************************************************\n\n************************************************************************\n*                                                                      *\n*  7. Limitation of Liability                                          *\n*  --------------------------                                          *\n*                                                                      *\n*  Under no circumstances and under no legal theory, whether tort      *\n*  (including negligence), contract, or otherwise, shall any           *\n*  Contributor, or anyone who distributes Covered Software as          *\n*  permitted above, be liable to You for any direct, indirect,         *\n*  special, incidental, or consequential damages of any character      *\n*  including, without limitation, damages for lost profits, loss of    *\n*  goodwill, work stoppage, computer failure or malfunction, or any    *\n*  and all other commercial damages or losses, even if such party      *\n*  shall have been informed of the possibility of such damages. This   *\n*  limitation of liability shall not apply to liability for death or   *\n*  personal injury resulting from such party's negligence to the       *\n*  extent applicable law prohibits such limitation. Some               *\n*  jurisdictions do not allow the exclusion or limitation of           *\n*  incidental or consequential damages, so this exclusion and          *\n*  limitation may not apply to You.                                    
*\n*                                                                      *\n************************************************************************\n\n8. Litigation\n-------------\n\nAny litigation relating to this License may be brought only in the\ncourts of a jurisdiction where the defendant maintains its principal\nplace of business and such litigation shall be governed by laws of that\njurisdiction, without reference to its conflict-of-law provisions.\nNothing in this Section shall prevent a party's ability to bring\ncross-claims or counter-claims.\n\n9. Miscellaneous\n----------------\n\nThis License represents the complete agreement concerning the subject\nmatter hereof. If any provision of this License is held to be\nunenforceable, such provision shall be reformed only to the extent\nnecessary to make it enforceable. Any law or regulation which provides\nthat the language of a contract shall be construed against the drafter\nshall not be used to construe this License against a Contributor.\n\n10. Versions of the License\n---------------------------\n\n10.1. New Versions\n\nMozilla Foundation is the license steward. Except as provided in Section\n10.3, no one other than the license steward has the right to modify or\npublish new versions of this License. Each version will be given a\ndistinguishing version number.\n\n10.2. Effect of New Versions\n\nYou may distribute the Covered Software under the terms of the version\nof the License under which You originally received the Covered Software,\nor under the terms of any subsequent version published by the license\nsteward.\n\n10.3. Modified Versions\n\nIf you create software not governed by this License, and you want to\ncreate a new license for such software, you may create and use a\nmodified version of this License if you rename the license and remove\nany references to the name of the license steward (except to note that\nsuch modified license differs from this License).\n\n10.4. 
Distributing Source Code Form that is Incompatible With Secondary\nLicenses\n\nIf You choose to distribute Source Code Form that is Incompatible With\nSecondary Licenses under the terms of this version of the License, the\nnotice described in Exhibit B of this License must be attached.\n\nExhibit A - Source Code Form License Notice\n-------------------------------------------\n\n  This Source Code Form is subject to the terms of the Mozilla Public\n  License, v. 2.0. If a copy of the MPL was not distributed with this\n  file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\nIf it is not possible or desirable to put the notice in a particular\nfile, then You may include the notice in a location (such as a LICENSE\nfile in a relevant directory) where a recipient would be likely to look\nfor such a notice.\n\nYou may add additional accurate notices of copyright ownership.\n\nExhibit B - \"Incompatible With Secondary Licenses\" Notice\n---------------------------------------------------------\n\n  This Source Code Form is \"Incompatible With Secondary Licenses\", as\n  defined by the Mozilla Public License, v. 2.0.\n"
  },
  {
    "path": "LICENSES/AGPL-3.0-or-later.txt",
    "content": "GNU AFFERO GENERAL PUBLIC LICENSE\nVersion 3, 19 November 2007\n\nCopyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>\n\nEveryone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.\n\n                            Preamble\n\nThe GNU Affero General Public License is a free, copyleft license for software and other kinds of works, specifically designed to ensure cooperation with the community in the case of network server software.\n\nThe licenses for most software and other practical works are designed to take away your freedom to share and change the works.  By contrast, our General Public Licenses are intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users.\n\nWhen we speak of free software, we are referring to freedom, not price.  Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things.\n\nDevelopers that use our General Public Licenses protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License which gives you legal permission to copy, distribute and/or modify the software.\n\nA secondary benefit of defending all users' freedom is that improvements made in alternate versions of the program, if they receive widespread use, become available for other developers to incorporate.  Many developers of free software are heartened and encouraged by the resulting cooperation.  However, in the case of software used on network servers, this result may fail to come about. 
The GNU General Public License permits making a modified version and letting the public access it on a server without ever releasing its source code to the public.\n\nThe GNU Affero General Public License is designed specifically to ensure that, in such cases, the modified source code becomes available to the community.  It requires the operator of a network server to provide the source code of the modified version running there to the users of that server.  Therefore, public use of a modified version, on a publicly accessible server, gives the public access to the source code of the modified version.\n\nAn older license, called the Affero General Public License and published by Affero, was designed to accomplish similar goals.  This is a different license, not a version of the Affero GPL, but Affero has released a new version of the Affero GPL which permits relicensing under this license.\n\nThe precise terms and conditions for copying, distribution and modification follow.\n\n                       TERMS AND CONDITIONS\n\n0. Definitions.\n\n\"This License\" refers to version 3 of the GNU Affero General Public License.\n\n\"Copyright\" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks.\n\n\"The Program\" refers to any copyrightable work licensed under this License.  Each licensee is addressed as \"you\".  \"Licensees\" and \"recipients\" may be individuals or organizations.\n\nTo \"modify\" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy.  
The resulting work is called a \"modified version\" of the earlier work or a work \"based on\" the earlier work.\n\nA \"covered work\" means either the unmodified Program or a work based on the Program.\n\nTo \"propagate\" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy.  Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well.\n\nTo \"convey\" a work means any kind of propagation that enables other parties to make or receive copies.  Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying.\n\nAn interactive user interface displays \"Appropriate Legal Notices\" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License.  If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion.\n\n1. Source Code.\nThe \"source code\" for a work means the preferred form of the work for making modifications to it.  
\"Object code\" means any non-source form of a work.\n\nA \"Standard Interface\" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language.\n\nThe \"System Libraries\" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form.  A \"Major Component\", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it.\n\nThe \"Corresponding Source\" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities.  However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work.  
For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those\nsubprograms and other parts of the work.\n\nThe Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source.\n\nThe Corresponding Source for a work in source code form is that same work.\n\n2. Basic Permissions.\nAll rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met.  This License explicitly affirms your unlimited permission to run the unmodified Program.  The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work.  This License acknowledges your rights of fair use or other equivalent, as provided by copyright law.\n\nYou may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force.  You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright.  Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you.\n\nConveying under any other circumstances is permitted solely under the conditions stated below.  Sublicensing is not allowed; section 10 makes it unnecessary.\n\n3. 
Protecting Users' Legal Rights From Anti-Circumvention Law.\nNo covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures.\n\nWhen you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures.\n\n4. Conveying Verbatim Copies.\nYou may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program.\n\nYou may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee.\n\n5. Conveying Modified Source Versions.\nYou may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions:\n\n    a) The work must carry prominent notices stating that you modified it, and giving a relevant date.\n\n    b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7.  
This requirement modifies the requirement in section 4 to \"keep intact all notices\".\n\n    c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy.  This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged.  This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it.\n\n    d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so.\n\nA compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an \"aggregate\" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit.  Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate.\n\n6. 
Conveying Non-Source Forms.\nYou may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways:\n\n    a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange.\n\n    b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge.\n\n    c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source.  This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b.\n\n    d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge.  You need not require recipients to copy the Corresponding Source along with the object code.  
If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source.  Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements.\n\n    e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d.\n\nA separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work.\n\nA \"User Product\" is either (1) a \"consumer product\", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling.  In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage.  For a particular product received by a particular user, \"normally used\" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product.  A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product.\n\n\"Installation Information\" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source.  
The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made.\n\nIf you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information.  But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM).\n\nThe requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed.  Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network.\n\nCorresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying.\n\n7. Additional Terms.\n\"Additional permissions\" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law.  
If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions.\n\nWhen you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it.  (Additional permissions may be written to require their own removal in certain cases when you modify the work.)  You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission.\n\nNotwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms:\n\n    a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or\n\n    b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or\n\n    c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or\n\n    d) Limiting the use for publicity purposes of names of licensors or authors of the material; or\n\n    e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or\n\n    f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors.\n\nAll other non-permissive additional terms are considered \"further restrictions\" within the meaning of section 10.  
If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term.  If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying.\n\nIf you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms.\n\nAdditional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way.\n\n8. Termination.\n\nYou may not propagate or modify a covered work except as expressly provided under this License.  
Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11).\n\nHowever, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation.\n\nMoreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice.\n\nTermination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License.  If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10.\n\n9. Acceptance Not Required for Having Copies.\n\nYou are not required to accept this License in order to receive or run a copy of the Program.  Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance.  However, nothing other than this License grants you permission to propagate or modify any covered work.  These actions infringe copyright if you do not accept this License.  Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so.\n\n10. 
Automatic Licensing of Downstream Recipients.\n\nEach time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License.  You are not responsible for enforcing compliance by third parties with this License.\n\nAn \"entity transaction\" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations.  If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts.\n\nYou may not impose any further restrictions on the exercise of the rights granted or affirmed under this License.  For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it.\n\n11. Patents.\n\nA \"contributor\" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based.  
The work thus licensed is called the contributor's \"contributor version\".\n\nA contributor's \"essential patent claims\" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version.  For purposes of this definition, \"control\" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License.\n\nEach contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version.\n\nIn the following three paragraphs, a \"patent license\" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement).  To \"grant\" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party.\n\nIf you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent\nlicense to downstream recipients.  
\"Knowingly relying\" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid.\n\nIf, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it.\n\nA patent license is \"discriminatory\" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License.  You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007.\n\nNothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law.\n\n12. 
No Surrender of Others' Freedom.\n\nIf conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License.  If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may\nnot convey it at all.  For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program.\n\n13. Remote Network Interaction; Use with the GNU General Public License.\n\nNotwithstanding any other provision of this License, if you modify the Program, your modified version must prominently offer all users interacting with it remotely through a computer network (if your version supports such interaction) an opportunity to receive the Corresponding Source of your version by providing access to the Corresponding Source from a network server at no charge, through some standard or customary means of facilitating copying of software.  This Corresponding Source shall include the Corresponding Source for any work covered by version 3 of the GNU General Public License that is incorporated pursuant to the following paragraph.\n\nNotwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU General Public License into a single combined work, and to convey the resulting work.  The terms of this License will continue to apply to the part which is the covered work, but the work with which it is combined will remain governed by version 3 of the GNU General Public License.\n\n14. 
Revised Versions of this License.\n\nThe Free Software Foundation may publish revised and/or new versions of the GNU Affero General Public License from time to time.  Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns.\n\nEach version is given a distinguishing version number.  If the Program specifies that a certain numbered version of the GNU Affero General Public License \"or any later version\" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation.  If the Program does not specify a version number of the GNU Affero General Public License, you may choose any version ever published by the Free Software Foundation.\n\nIf the Program specifies that a proxy can decide which future versions of the GNU Affero General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program.\n\nLater license versions may give you additional or different permissions.  However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version.\n\n15. Disclaimer of Warranty.\n\nTHERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM \"AS IS\" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.\n\n16. 
Limitation of Liability.\n\nIN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.\n\n17. Interpretation of Sections 15 and 16.\n\nIf the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee.\n\nEND OF TERMS AND CONDITIONS\n\n            How to Apply These Terms to Your New Programs\n\nIf you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms.\n\nTo do so, attach the following notices to the program.  
It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the \"copyright\" line and a pointer to where the full notice is found.\n\n     <one line to give the program's name and a brief idea of what it does.>\n     Copyright (C) <year>  <name of author>\n\n     This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.\n\n     This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more details.\n\n     You should have received a copy of the GNU Affero General Public License along with this program.  If not, see <http://www.gnu.org/licenses/>.\n\nAlso add information on how to contact you by electronic and paper mail.\n\nIf your software can interact with users remotely through a computer network, you should also make sure that it provides a way for users to get its source.  For example, if your program is a web application, its interface could display a \"Source\" link that leads users to an archive of the code.  There are many ways you could offer source, and different solutions will be better for different programs; see section 13 for the specific requirements.\n\nYou should also get your employer (if you work as a programmer) or school, if any, to sign a \"copyright disclaimer\" for the program, if necessary. For more information on this, and how to apply and follow the GNU AGPL, see <http://www.gnu.org/licenses/>.\n"
  },
  {
    "path": "LICENSES/Apache-2.0.txt",
    "content": "Apache License\nVersion 2.0, January 2004\nhttp://www.apache.org/licenses/\n\nTERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n1. Definitions.\n\n\"License\" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.\n\n\"Licensor\" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.\n\n\"Legal Entity\" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, \"control\" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.\n\n\"You\" (or \"Your\") shall mean an individual or Legal Entity exercising permissions granted by this License.\n\n\"Source\" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.\n\n\"Object\" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.\n\n\"Work\" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).\n\n\"Derivative Works\" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. 
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.\n\n\"Contribution\" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, \"submitted\" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as \"Not a Contribution.\"\n\n\"Contributor\" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.\n\n2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.\n\n3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.\n\n4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:\n\n     (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and\n\n     (b) You must cause any modified files to carry prominent notices stating that You changed the files; and\n\n     (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and\n\n     (d) If the Work includes a \"NOTICE\" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE 
text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.\n\n     You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.\n\n5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.\n\n6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.\n\n7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.\n\n8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.\n\n9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. 
However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.\n\nEND OF TERMS AND CONDITIONS\n\nAPPENDIX: How to apply the Apache License to your work.\n\nTo apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets \"[]\" replaced with your own identifying information. (Don't include the brackets!)  The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same \"printed page\" as the copyright notice for easier identification within third-party archives.\n\nCopyright [yyyy] [name of copyright owner]\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\nhttp://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License.\n"
  },
  {
    "path": "LICENSES/CC-BY-SA-1.0.txt",
    "content": "Creative Commons Attribution-ShareAlike 1.0\n\n CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS DRAFT LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN \"AS-IS\" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.\n\nLicense\n\nTHE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE (\"CCPL\" OR \"LICENSE\"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE IS PROHIBITED.\n\nBY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.\n\n1. Definitions\n\n     a. \"Collective Work\" means a work, such as a periodical issue, anthology or encyclopedia, in which the Work in its entirety in unmodified form, along with a number of other contributions, constituting separate and independent works in themselves, are assembled into a collective whole. A work that constitutes a Collective Work will not be considered a Derivative Work (as defined below) for the purposes of this License.\n\n     b. \"Derivative Work\" means a work based upon the Work or upon the Work and other pre-existing works, such as a translation, musical arrangement, dramatization, fictionalization, motion picture version, sound recording, art reproduction, abridgment, condensation, or any other form in which the Work may be recast, transformed, or adapted, except that a work that constitutes a Collective Work will not be considered a Derivative Work for the purpose of this License.\n\n     c. \"Licensor\" means the individual or entity that offers the Work under the terms of this License.\n\n     d. 
\"Original Author\" means the individual or entity who created the Work.\n\n     e. \"Work\" means the copyrightable work of authorship offered under the terms of this License.\n\n     f. \"You\" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.\n\n2. Fair Use Rights. Nothing in this license is intended to reduce, limit, or restrict any rights arising from fair use, first sale or other limitations on the exclusive rights of the copyright owner under copyright law or other applicable laws.\n\n3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:\n\n     a. to reproduce the Work, to incorporate the Work into one or more Collective Works, and to reproduce the Work as incorporated in the Collective Works;\n\n     b. to create and reproduce Derivative Works;\n\n     c. to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission the Work including as incorporated in Collective Works;\n\n     d. to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission Derivative Works;\n\nThe above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. All rights not expressly granted by Licensor are hereby reserved.\n\n4. Restrictions. 
The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:\n\n     a. You may distribute, publicly display, publicly perform, or publicly digitally perform the Work only under the terms of this License, and You must include a copy of, or the Uniform Resource Identifier for, this License with every copy or phonorecord of the Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Work that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Work itself to be made subject to the terms of this License. If You create a Collective Work, upon notice from any Licensor You must, to the extent practicable, remove from the Collective Work any reference to such Licensor or the Original Author, as requested. If You create a Derivative Work, upon notice from any Licensor You must, to the extent practicable, remove from the Derivative Work any reference to such Licensor or the Original Author, as requested.\n\n     b. You may distribute, publicly display, publicly perform, or publicly digitally perform a Derivative Work only under the terms of this License, and You must include a copy of, or the Uniform Resource Identifier for, this License with every copy or phonorecord of each Derivative Work You distribute, publicly display, publicly perform, or publicly digitally perform. 
You may not offer or impose any terms on the Derivative Works that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder, and You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Derivative Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Derivative Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Derivative Work itself to be made subject to the terms of this License.\n\n     c. If you distribute, publicly display, publicly perform, or publicly digitally perform the Work or any Derivative Works or Collective Works, You must keep intact all copyright notices for the Work and give the Original Author credit reasonable to the medium or means You are utilizing by conveying the name (or pseudonym if applicable) of the Original Author if supplied; the title of the Work if supplied; in the case of a Derivative Work, a credit identifying the use of the Work in the Derivative Work (e.g., \"French translation of the Work by Original Author,\" or \"Screenplay based on original Work by Original Author\"). Such credit may be implemented in any reasonable manner; provided, however, that in the case of a Derivative Work or Collective Work, at a minimum such credit will appear where any other comparable authorship credit appears and in a manner at least as prominent as such other comparable authorship credit.\n\n5. Representations, Warranties and Disclaimer\n\n     a. By offering the Work for public release under this License, Licensor represents and warrants that, to the best of Licensor's knowledge after reasonable inquiry:\n\n           i. 
Licensor has secured all rights in the Work necessary to grant the license rights hereunder and to permit the lawful exercise of the rights granted hereunder without You having any obligation to pay any royalties, compulsory license fees, residuals or any other payments;\n\n          ii. The Work does not infringe the copyright, trademark, publicity rights, common law rights or any other right of any third party or constitute defamation, invasion of privacy or other tortious injury to any third party.\n\n     b. EXCEPT AS EXPRESSLY STATED IN THIS LICENSE OR OTHERWISE AGREED IN WRITING OR REQUIRED BY APPLICABLE LAW, THE WORK IS LICENSED ON AN \"AS IS\" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES REGARDING THE CONTENTS OR ACCURACY OF THE WORK.\n\n6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, AND EXCEPT FOR DAMAGES ARISING FROM LIABILITY TO A THIRD PARTY RESULTING FROM BREACH OF THE WARRANTIES IN SECTION 5, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.\n\n7. Termination\n\n     a. This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Derivative Works or Collective Works from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.\n\n     b. Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). 
Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.\n\n8. Miscellaneous\n\n     a. Each time You distribute or publicly digitally perform the Work or a Collective Work, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.\n\n     b. Each time You distribute or publicly digitally perform a Derivative Work, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.\n\n     c. If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.\n\n     d. No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.\n\n     e. This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. 
This License may not be modified without the mutual written agreement of the Licensor and You.\n\nCreative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor.\n\nExcept for the limited purpose of indicating to the public that the Work is licensed under the CCPL, neither party will use the trademark \"Creative Commons\" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time.\n\nCreative Commons may be contacted at http://creativecommons.org/.\n"
  },
  {
    "path": "LICENSES/CC-BY-SA-2.0.txt",
    "content": "Creative Commons Attribution-ShareAlike 2.0\n\n CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN \"AS-IS\" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.\n\nLicense\n\nTHE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE (\"CCPL\" OR \"LICENSE\"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.\n\nBY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.\n\n1. Definitions\n\n     a. \"Collective Work\" means a work, such as a periodical issue, anthology or encyclopedia, in which the Work in its entirety in unmodified form, along with a number of other contributions, constituting separate and independent works in themselves, are assembled into a collective whole. A work that constitutes a Collective Work will not be considered a Derivative Work (as defined below) for the purposes of this License.\n\n     b. \"Derivative Work\" means a work based upon the Work or upon the Work and other pre-existing works, such as a translation, musical arrangement, dramatization, fictionalization, motion picture version, sound recording, art reproduction, abridgment, condensation, or any other form in which the Work may be recast, transformed, or adapted, except that a work that constitutes a Collective Work will not be considered a Derivative Work for the purpose of this License. 
For the avoidance of doubt, where the Work is a musical composition or sound recording, the synchronization of the Work in timed-relation with a moving image (\"synching\") will be considered a Derivative Work for the purpose of this License.\n\n     c. \"Licensor\" means the individual or entity that offers the Work under the terms of this License.\n\n     d. \"Original Author\" means the individual or entity who created the Work.\n\n     e. \"Work\" means the copyrightable work of authorship offered under the terms of this License.\n\n     f. \"You\" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.\n\n     g. \"License Elements\" means the following high-level license attributes as selected by Licensor and indicated in the title of this License: Attribution, ShareAlike.\n\n2. Fair Use Rights. Nothing in this license is intended to reduce, limit, or restrict any rights arising from fair use, first sale or other limitations on the exclusive rights of the copyright owner under copyright law or other applicable laws.\n\n3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:\n\n     a. to reproduce the Work, to incorporate the Work into one or more Collective Works, and to reproduce the Work as incorporated in the Collective Works;\n\n     b. to create and reproduce Derivative Works;\n\n     c. to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission the Work including as incorporated in Collective Works;\n\n     d. 
to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission Derivative Works.\n\n     e. For the avoidance of doubt, where the work is a musical composition:\n\n          i. Performance Royalties Under Blanket Licenses. Licensor waives the exclusive right to collect, whether individually or via a performance rights society (e.g. ASCAP, BMI, SESAC), royalties for the public performance or public digital performance (e.g. webcast) of the Work.\n\n          ii. Mechanical Rights and Statutory Royalties. Licensor waives the exclusive right to collect, whether individually or via a music rights society or designated agent (e.g. Harry Fox Agency), royalties for any phonorecord You create from the Work (\"cover version\") and distribute, subject to the compulsory license created by 17 USC Section 115 of the US Copyright Act (or the equivalent in other jurisdictions).\n\n     f. Webcasting Rights and Statutory Royalties. For the avoidance of doubt, where the Work is a sound recording, Licensor waives the exclusive right to collect, whether individually or via a performance-rights society (e.g. SoundExchange), royalties for the public digital performance (e.g. webcast) of the Work, subject to the compulsory license created by 17 USC Section 114 of the US Copyright Act (or the equivalent in other jurisdictions).\n\nThe above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. All rights not expressly granted by Licensor are hereby reserved.\n\n4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:\n\n     a. 
You may distribute, publicly display, publicly perform, or publicly digitally perform the Work only under the terms of this License, and You must include a copy of, or the Uniform Resource Identifier for, this License with every copy or phonorecord of the Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Work that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Work itself to be made subject to the terms of this License. If You create a Collective Work, upon notice from any Licensor You must, to the extent practicable, remove from the Collective Work any reference to such Licensor or the Original Author, as requested. If You create a Derivative Work, upon notice from any Licensor You must, to the extent practicable, remove from the Derivative Work any reference to such Licensor or the Original Author, as requested.\n\n     b. You may distribute, publicly display, publicly perform, or publicly digitally perform a Derivative Work only under the terms of this License, a later version of this License with the same License Elements as this License, or a Creative Commons iCommons license that contains the same License Elements as this License (e.g. Attribution-ShareAlike 2.0 Japan). 
You must include a copy of, or the Uniform Resource Identifier for, this License or other license specified in the previous sentence with every copy or phonorecord of each Derivative Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Derivative Works that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder, and You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Derivative Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Derivative Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Derivative Work itself to be made subject to the terms of this License.\n\n     c. If you distribute, publicly display, publicly perform, or publicly digitally perform the Work or any Derivative Works or Collective Works, You must keep intact all copyright notices for the Work and give the Original Author credit reasonable to the medium or means You are utilizing by conveying the name (or pseudonym if applicable) of the Original Author if supplied; the title of the Work if supplied; to the extent reasonably practicable, the Uniform Resource Identifier, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and in the case of a Derivative Work, a credit identifying the use of the Work in the Derivative Work (e.g., \"French translation of the Work by Original Author,\" or \"Screenplay based on original Work by Original Author\"). 
Such credit may be implemented in any reasonable manner; provided, however, that in the case of a Derivative Work or Collective Work, at a minimum such credit will appear where any other comparable authorship credit appears and in a manner at least as prominent as such other comparable authorship credit.\n\n5. Representations, Warranties and Disclaimer\n\nUNLESS OTHERWISE AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE MATERIALS, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.\n\n6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.\n\n7. Termination\n\n     a. This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Derivative Works or Collective Works from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.\n\n     b. Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). 
Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.\n\n8. Miscellaneous\n\n     a. Each time You distribute or publicly digitally perform the Work or a Collective Work, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.\n\n     b. Each time You distribute or publicly digitally perform a Derivative Work, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.\n\n     c. If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.\n\n     d. No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.\n\n     e. This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. 
This License may not be modified without the mutual written agreement of the Licensor and You.\n\nCreative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor.\n\nExcept for the limited purpose of indicating to the public that the Work is licensed under the CCPL, neither party will use the trademark \"Creative Commons\" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time.\n\nCreative Commons may be contacted at http://creativecommons.org/.\n"
  },
  {
    "path": "LICENSES/CC-BY-SA-2.5.txt",
    "content": "Creative Commons Attribution-ShareAlike 2.5\n\n CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN \"AS-IS\" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.\n\nLicense\n\nTHE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE (\"CCPL\" OR \"LICENSE\"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.\n\nBY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.\n\n1. Definitions\n\n     a. \"Collective Work\" means a work, such as a periodical issue, anthology or encyclopedia, in which the Work in its entirety in unmodified form, along with a number of other contributions, constituting separate and independent works in themselves, are assembled into a collective whole. A work that constitutes a Collective Work will not be considered a Derivative Work (as defined below) for the purposes of this License.\n\n     b. \"Derivative Work\" means a work based upon the Work or upon the Work and other pre-existing works, such as a translation, musical arrangement, dramatization, fictionalization, motion picture version, sound recording, art reproduction, abridgment, condensation, or any other form in which the Work may be recast, transformed, or adapted, except that a work that constitutes a Collective Work will not be considered a Derivative Work for the purpose of this License. 
For the avoidance of doubt, where the Work is a musical composition or sound recording, the synchronization of the Work in timed-relation with a moving image (\"synching\") will be considered a Derivative Work for the purpose of this License.\n\n     c. \"Licensor\" means the individual or entity that offers the Work under the terms of this License.\n\n     d. \"Original Author\" means the individual or entity who created the Work.\n\n     e. \"Work\" means the copyrightable work of authorship offered under the terms of this License.\n\n     f. \"You\" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.\n\n     g. \"License Elements\" means the following high-level license attributes as selected by Licensor and indicated in the title of this License: Attribution, ShareAlike.\n\n2. Fair Use Rights. Nothing in this license is intended to reduce, limit, or restrict any rights arising from fair use, first sale or other limitations on the exclusive rights of the copyright owner under copyright law or other applicable laws.\n\n3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:\n\n     a. to reproduce the Work, to incorporate the Work into one or more Collective Works, and to reproduce the Work as incorporated in the Collective Works;\n\n     b. to create and reproduce Derivative Works;\n\n     c. to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission the Work including as incorporated in Collective Works;\n\n     d. 
to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission Derivative Works.\n\n     e. For the avoidance of doubt, where the work is a musical composition:\n\n          i. Performance Royalties Under Blanket Licenses. Licensor waives the exclusive right to collect, whether individually or via a performance rights society (e.g. ASCAP, BMI, SESAC), royalties for the public performance or public digital performance (e.g. webcast) of the Work.\n\n          ii. Mechanical Rights and Statutory Royalties. Licensor waives the exclusive right to collect, whether individually or via a music rights society or designated agent (e.g. Harry Fox Agency), royalties for any phonorecord You create from the Work (\"cover version\") and distribute, subject to the compulsory license created by 17 USC Section 115 of the US Copyright Act (or the equivalent in other jurisdictions).\n\n     f. Webcasting Rights and Statutory Royalties. For the avoidance of doubt, where the Work is a sound recording, Licensor waives the exclusive right to collect, whether individually or via a performance-rights society (e.g. SoundExchange), royalties for the public digital performance (e.g. webcast) of the Work, subject to the compulsory license created by 17 USC Section 114 of the US Copyright Act (or the equivalent in other jurisdictions).\n\nThe above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. All rights not expressly granted by Licensor are hereby reserved.\n\n4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:\n\n     a. 
You may distribute, publicly display, publicly perform, or publicly digitally perform the Work only under the terms of this License, and You must include a copy of, or the Uniform Resource Identifier for, this License with every copy or phonorecord of the Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Work that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Work itself to be made subject to the terms of this License. If You create a Collective Work, upon notice from any Licensor You must, to the extent practicable, remove from the Collective Work any credit as required by clause 4(c), as requested. If You create a Derivative Work, upon notice from any Licensor You must, to the extent practicable, remove from the Derivative Work any credit as required by clause 4(c), as requested.\n\n     b. You may distribute, publicly display, publicly perform, or publicly digitally perform a Derivative Work only under the terms of this License, a later version of this License with the same License Elements as this License, or a Creative Commons iCommons license that contains the same License Elements as this License (e.g. Attribution-ShareAlike 2.5 Japan). 
You must include a copy of, or the Uniform Resource Identifier for, this License or other license specified in the previous sentence with every copy or phonorecord of each Derivative Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Derivative Works that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder, and You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Derivative Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Derivative Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Derivative Work itself to be made subject to the terms of this License.\n\n     c. If you distribute, publicly display, publicly perform, or publicly digitally perform the Work or any Derivative Works or Collective Works, You must keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) the name of the Original Author (or pseudonym, if applicable) if supplied, and/or (ii) if the Original Author and/or Licensor designate another party or parties (e.g. 
a sponsor institute, publishing entity, journal) for attribution in Licensor's copyright notice, terms of service or by other reasonable means, the name of such party or parties; the title of the Work if supplied; to the extent reasonably practicable, the Uniform Resource Identifier, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and in the case of a Derivative Work, a credit identifying the use of the Work in the Derivative Work (e.g., \"French translation of the Work by Original Author,\" or \"Screenplay based on original Work by Original Author\"). Such credit may be implemented in any reasonable manner; provided, however, that in the case of a Derivative Work or Collective Work, at a minimum such credit will appear where any other comparable authorship credit appears and in a manner at least as prominent as such other comparable authorship credit.\n\n5. Representations, Warranties and Disclaimer\n\nUNLESS OTHERWISE AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE MATERIALS, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.\n\n6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.\n\n7. Termination\n\n     a. 
This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Derivative Works or Collective Works from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.\n\n     b. Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.\n\n8. Miscellaneous\n\n     a. Each time You distribute or publicly digitally perform the Work or a Collective Work, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.\n\n     b. Each time You distribute or publicly digitally perform a Derivative Work, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.\n\n     c. If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.\n\n     d. 
No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.\n\n     e. This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.\n\nCreative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor.\n\nExcept for the limited purpose of indicating to the public that the Work is licensed under the CCPL, neither party will use the trademark \"Creative Commons\" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time.\n\nCreative Commons may be contacted at http://creativecommons.org/.\n"
  },
  {
    "path": "LICENSES/CC-BY-SA-3.0.txt",
    "content": "Creative Commons Attribution-ShareAlike 3.0 Unported\n\n CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN \"AS-IS\" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.\n\nLicense\n\nTHE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE (\"CCPL\" OR \"LICENSE\"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.\n\nBY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.\n\n1. Definitions\n\n     a. \"Adaptation\" means a work based upon the Work, or upon the Work and other pre-existing works, such as a translation, adaptation, derivative work, arrangement of music or other alterations of a literary or artistic work, or phonogram or performance and includes cinematographic adaptations or any other form in which the Work may be recast, transformed, or adapted including in any form recognizably derived from the original, except that a work that constitutes a Collection will not be considered an Adaptation for the purpose of this License. For the avoidance of doubt, where the Work is a musical work, performance or phonogram, the synchronization of the Work in timed-relation with a moving image (\"synching\") will be considered an Adaptation for the purpose of this License.\n\n     b. 
\"Collection\" means a collection of literary or artistic works, such as encyclopedias and anthologies, or performances, phonograms or broadcasts, or other works or subject matter other than works listed in Section 1(f) below, which, by reason of the selection and arrangement of their contents, constitute intellectual creations, in which the Work is included in its entirety in unmodified form along with one or more other contributions, each constituting separate and independent works in themselves, which together are assembled into a collective whole. A work that constitutes a Collection will not be considered an Adaptation (as defined below) for the purposes of this License.\n\n     c. \"Creative Commons Compatible License\" means a license that is listed at http://creativecommons.org/compatiblelicenses that has been approved by Creative Commons as being essentially equivalent to this License, including, at a minimum, because that license: (i) contains terms that have the same purpose, meaning and effect as the License Elements of this License; and, (ii) explicitly permits the relicensing of adaptations of works made available under that license under this License or a Creative Commons jurisdiction license with the same License Elements as this License.\n\n     d. \"Distribute\" means to make available to the public the original and copies of the Work or Adaptation, as appropriate, through sale or other transfer of ownership.\n\n     e. \"License Elements\" means the following high-level license attributes as selected by Licensor and indicated in the title of this License: Attribution, ShareAlike.\n\n     f. \"Licensor\" means the individual, individuals, entity or entities that offer(s) the Work under the terms of this License.\n\n     g. 
\"Original Author\" means, in the case of a literary or artistic work, the individual, individuals, entity or entities who created the Work or if no individual or entity can be identified, the publisher; and in addition (i) in the case of a performance the actors, singers, musicians, dancers, and other persons who act, sing, deliver, declaim, play in, interpret or otherwise perform literary or artistic works or expressions of folklore; (ii) in the case of a phonogram the producer being the person or legal entity who first fixes the sounds of a performance or other sounds; and, (iii) in the case of broadcasts, the organization that transmits the broadcast.\n\n     h. \"Work\" means the literary and/or artistic work offered under the terms of this License including without limitation any production in the literary, scientific and artistic domain, whatever may be the mode or form of its expression including digital form, such as a book, pamphlet and other writing; a lecture, address, sermon or other work of the same nature; a dramatic or dramatico-musical work; a choreographic work or entertainment in dumb show; a musical composition with or without words; a cinematographic work to which are assimilated works expressed by a process analogous to cinematography; a work of drawing, painting, architecture, sculpture, engraving or lithography; a photographic work to which are assimilated works expressed by a process analogous to photography; a work of applied art; an illustration, map, plan, sketch or three-dimensional work relative to geography, topography, architecture or science; a performance; a broadcast; a phonogram; a compilation of data to the extent it is protected as a copyrightable work; or a work performed by a variety or circus performer to the extent it is not otherwise considered a literary or artistic work.\n\n     i. 
\"You\" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.\n\n     j. \"Publicly Perform\" means to perform public recitations of the Work and to communicate to the public those public recitations, by any means or process, including by wire or wireless means or public digital performances; to make available to the public Works in such a way that members of the public may access these Works from a place and at a place individually chosen by them; to perform the Work to the public by any means or process and the communication to the public of the performances of the Work, including by public digital performance; to broadcast and rebroadcast the Work by any means including signs, sounds or images.\n\n     k. \"Reproduce\" means to make copies of the Work by any means including without limitation by sound or visual recordings and the right of fixation and reproducing fixations of the Work, including storage of a protected performance or phonogram in digital form or other electronic medium.\n\n2. Fair Dealing Rights. Nothing in this License is intended to reduce, limit, or restrict any uses free from copyright or rights arising from limitations or exceptions that are provided for in connection with the copyright protection under copyright law or other applicable laws.\n\n3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:\n\n     a. to Reproduce the Work, to incorporate the Work into one or more Collections, and to Reproduce the Work as incorporated in the Collections;\n\n     b. 
to create and Reproduce Adaptations provided that any such Adaptation, including any translation in any medium, takes reasonable steps to clearly label, demarcate or otherwise identify that changes were made to the original Work. For example, a translation could be marked \"The original work was translated from English to Spanish,\" or a modification could indicate \"The original work has been modified.\";\n\n     c. to Distribute and Publicly Perform the Work including as incorporated in Collections; and,\n\n     d. to Distribute and Publicly Perform Adaptations.\n\n     e. For the avoidance of doubt:\n\n          i. Non-waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme cannot be waived, the Licensor reserves the exclusive right to collect such royalties for any exercise by You of the rights granted under this License;\n\n          ii. Waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme can be waived, the Licensor waives the exclusive right to collect such royalties for any exercise by You of the rights granted under this License; and,\n\n          iii. Voluntary License Schemes. The Licensor waives the right to collect royalties, whether individually or, in the event that the Licensor is a member of a collecting society that administers voluntary licensing schemes, via that society, from any exercise by You of the rights granted under this License.\n\nThe above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. Subject to Section 8(f), all rights not expressly granted by Licensor are hereby reserved.\n\n4. Restrictions. 
The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:\n\n     a. You may Distribute or Publicly Perform the Work only under the terms of this License. You must include a copy of, or the Uniform Resource Identifier (URI) for, this License with every copy of the Work You Distribute or Publicly Perform. You may not offer or impose any terms on the Work that restrict the terms of this License or the ability of the recipient of the Work to exercise the rights granted to that recipient under the terms of the License. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties with every copy of the Work You Distribute or Publicly Perform. When You Distribute or Publicly Perform the Work, You may not impose any effective technological measures on the Work that restrict the ability of a recipient of the Work from You to exercise the rights granted to that recipient under the terms of the License. This Section 4(a) applies to the Work as incorporated in a Collection, but this does not require the Collection apart from the Work itself to be made subject to the terms of this License. If You create a Collection, upon notice from any Licensor You must, to the extent practicable, remove from the Collection any credit as required by Section 4(c), as requested. If You create an Adaptation, upon notice from any Licensor You must, to the extent practicable, remove from the Adaptation any credit as required by Section 4(c), as requested.\n\n     b. You may Distribute or Publicly Perform an Adaptation only under the terms of: (i) this License; (ii) a later version of this License with the same License Elements as this License; (iii) a Creative Commons jurisdiction license (either this or a later license version) that contains the same License Elements as this License (e.g., Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible License. 
If you license the Adaptation under one of the licenses mentioned in (iv), you must comply with the terms of that license. If you license the Adaptation under the terms of any of the licenses mentioned in (i), (ii) or (iii) (the \"Applicable License\"), you must comply with the terms of the Applicable License generally and the following provisions: (I) You must include a copy of, or the URI for, the Applicable License with every copy of each Adaptation You Distribute or Publicly Perform; (II) You may not offer or impose any terms on the Adaptation that restrict the terms of the Applicable License or the ability of the recipient of the Adaptation to exercise the rights granted to that recipient under the terms of the Applicable License; (III) You must keep intact all notices that refer to the Applicable License and to the disclaimer of warranties with every copy of the Work as included in the Adaptation You Distribute or Publicly Perform; (IV) when You Distribute or Publicly Perform the Adaptation, You may not impose any effective technological measures on the Adaptation that restrict the ability of a recipient of the Adaptation from You to exercise the rights granted to that recipient under the terms of the Applicable License. This Section 4(b) applies to the Adaptation as incorporated in a Collection, but this does not require the Collection apart from the Adaptation itself to be made subject to the terms of the Applicable License.\n\n     c. 
If You Distribute, or Publicly Perform the Work or any Adaptations or Collections, You must, unless a request has been made pursuant to Section 4(a), keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) the name of the Original Author (or pseudonym, if applicable) if supplied, and/or if the Original Author and/or Licensor designate another party or parties (e.g., a sponsor institute, publishing entity, journal) for attribution (\"Attribution Parties\") in Licensor's copyright notice, terms of service or by other reasonable means, the name of such party or parties; (ii) the title of the Work if supplied; (iii) to the extent reasonably practicable, the URI, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and (iv) , consistent with Ssection 3(b), in the case of an Adaptation, a credit identifying the use of the Work in the Adaptation (e.g., \"French translation of the Work by Original Author,\" or \"Screenplay based on original Work by Original Author\"). The credit required by this Section 4(c) may be implemented in any reasonable manner; provided, however, that in the case of a Adaptation or Collection, at a minimum such credit will appear, if a credit for all contributing authors of the Adaptation or Collection appears, then as part of these credits and in a manner at least as prominent as the credits for the other contributing authors. 
For the avoidance of doubt, You may only use the credit required by this Section for the purpose of attribution in the manner set out above and, by exercising Your rights under this License, You may not implicitly or explicitly assert or imply any connection with, sponsorship or endorsement by the Original Author, Licensor and/or Attribution Parties, as appropriate, of You or Your use of the Work, without the separate, express prior written permission of the Original Author, Licensor and/or Attribution Parties.\n\n     d. Except as otherwise agreed in writing by the Licensor or as may be otherwise permitted by applicable law, if You Reproduce, Distribute or Publicly Perform the Work either by itself or as part of any Adaptations or Collections, You must not distort, mutilate, modify or take other derogatory action in relation to the Work which would be prejudicial to the Original Author's honor or reputation. Licensor agrees that in those jurisdictions (e.g. Japan), in which any exercise of the right granted in Section 3(b) of this License (the right to make Adaptations) would be deemed to be a distortion, mutilation, modification or other derogatory action prejudicial to the Original Author's honor and reputation, the Licensor will waive or not assert, as appropriate, this Section, to the fullest extent permitted by the applicable national law, to enable You to reasonably exercise Your right under Section 3(b) of this License (right to make Adaptations) but not otherwise.\n\n5. 
Representations, Warranties and Disclaimer\n\nUNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.\n\n6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.\n\n7. Termination\n\n     a. This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Adaptations or Collections from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.\n\n     b. Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.\n\n8. 
Miscellaneous\n\n     a. Each time You Distribute or Publicly Perform the Work or a Collection, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.\n\n     b. Each time You Distribute or Publicly Perform an Adaptation, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.\n\n     c. If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.\n\n     d. No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.\n\n     e. This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.\n\n     f. The rights granted under, and the subject matter referenced, in this License were drafted utilizing the terminology of the Berne Convention for the Protection of Literary and Artistic Works (as amended on September 28, 1979), the Rome Convention of 1961, the WIPO Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and the Universal Copyright Convention (as revised on July 24, 1971). 
These rights and subject matter take effect in the relevant jurisdiction in which the License terms are sought to be enforced according to the corresponding provisions of the implementation of those treaty provisions in the applicable national law. If the standard suite of rights granted under applicable copyright law includes additional rights not granted under this License, such additional rights are deemed to be included in the License; this License is not intended to restrict the license of any rights under applicable law.\n\nCreative Commons Notice\n\nCreative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor.\n\nExcept for the limited purpose of indicating to the public that the Work is licensed under the CCPL, Creative Commons does not authorize the use by either party of the trademark \"Creative Commons\" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time. For the avoidance of doubt, this trademark restriction does not form part of the License.\n\nCreative Commons may be contacted at http://creativecommons.org/.\n"
  },
  {
    "path": "LICENSES/CC-BY-SA-4.0.txt",
    "content": "Creative Commons Attribution-ShareAlike 4.0 International\n\n Creative Commons Corporation (“Creative Commons”) is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an “as-is” basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible.\n\nUsing Creative Commons Public Licenses\n\nCreative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses.\n\nConsiderations for licensors: Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. More considerations for licensors.\n\nConsiderations for the public: By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. 
If the licensor’s permission is not necessary for any reason–for example, because of any applicable exception or limitation to copyright–then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described.\n\nAlthough not required by our licenses, you are encouraged to respect those requests where reasonable. More considerations for the public.\n\nCreative Commons Attribution-ShareAlike 4.0 International Public License\n\nBy exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-ShareAlike 4.0 International Public License (\"Public License\"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.\n\nSection 1 – Definitions.\n\n     a.\tAdapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. 
For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.\n\n     b.\tAdapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.\n\n     c.\tBY-SA Compatible License means a license listed at creativecommons.org/compatiblelicenses, approved by Creative Commons as essentially the equivalent of this Public License.\n\n     d.\tCopyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.\n\n     e.\tEffective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.\n\n     f.\tExceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.\n\n     g.\tLicense Elements means the license attributes listed in the name of a Creative Commons Public License. 
The License Elements of this Public License are Attribution and ShareAlike.\n\n     h.\tLicensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License.\n\n     i.\tLicensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.\n\n     j.\tLicensor means the individual(s) or entity(ies) granting rights under this Public License.\n\n     k.\tShare means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.\n\n     l.\tSui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.\n\n     m.\tYou means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.\n\nSection 2 – Scope.\n\n     a.\tLicense grant.\n\n          1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:\n\n               A. reproduce and Share the Licensed Material, in whole or in part; and\n\n               B. produce, reproduce, and Share Adapted Material.\n\n          2. Exceptions and Limitations. 
For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.\n\n          3. Term. The term of this Public License is specified in Section 6(a).\n\n          4. Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.\n\n          5. Downstream recipients.\n\n               A. Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.\n\n               B. Additional offer from the Licensor – Adapted Material. Every recipient of Adapted Material from You automatically receives an offer from the Licensor to exercise the Licensed Rights in the Adapted Material under the conditions of the Adapter’s License You apply.\n\n               C. No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.\n\n          6. No endorsement. 
Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).\n\n     b.\tOther rights.\n\n          1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.\n\n          2. Patent and trademark rights are not licensed under this Public License.\n\n          3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties.\n\nSection 3 – License Conditions.\n\nYour exercise of the Licensed Rights is expressly made subject to the following conditions.\n\n     a.\tAttribution.\n\n          1. If You Share the Licensed Material (including in modified form), You must:\n\n               A. retain the following if it is supplied by the Licensor with the Licensed Material:\n\n                    i.\tidentification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);\n\n                    ii.\ta copyright notice;\n\n                    iii. 
a notice that refers to this Public License;\n\n                    iv.\ta notice that refers to the disclaimer of warranties;\n\n                    v.\ta URI or hyperlink to the Licensed Material to the extent reasonably practicable;\n\n               B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and\n\n               C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.\n\n          2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.\n\n          3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.\n\n     b.\tShareAlike. In addition to the conditions in Section 3(a), if You Share Adapted Material You produce, the following conditions also apply.\n\n          1. The Adapter’s License You apply must be a Creative Commons license with the same License Elements, this version or later, or a BY-SA Compatible License.\n\n          2. You must include the text of, or the URI or hyperlink to, the Adapter's License You apply. You may satisfy this condition in any reasonable manner based on the medium, means, and context in which You Share Adapted Material.\n\n          3. 
You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, Adapted Material that restrict exercise of the rights granted under the Adapter's License You apply.\n\nSection 4 – Sui Generis Database Rights.\n\nWhere the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:\n\n     a.\tfor the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database;\n\n     b.\tif You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material, including for purposes of Section 3(b); and\n\n     c.\tYou must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.\n\nFor the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.\n\nSection 5 – Disclaimer of Warranties and Limitation of Liability.\n\n     a.\tUnless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. 
Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.\n\n     b.\tTo the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.\n\n     c.\tThe disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.\n\nSection 6 – Term and Termination.\n\n     a.\tThis Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.\n\n     b.\tWhere Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:\n\n          1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or\n\n          2. 
upon express reinstatement by the Licensor.\n\n     c.\tFor the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.\n\n     d.\tFor the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.\n\n     e.\tSections 1, 5, 6, 7, and 8 survive termination of this Public License.\n\nSection 7 – Other Terms and Conditions.\n\n     a.\tThe Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.\n\n     b.\tAny arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.\n\nSection 8 – Interpretation.\n\n     a.\tFor the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.\n\n     b.\tTo the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. 
If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.\n\n     c.\tNo term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.\n\n     d.\tNothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.\n\nCreative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at creativecommons.org/policies, Creative Commons does not authorize the use of the trademark “Creative Commons” or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses.\n\nCreative Commons may be contacted at creativecommons.org.\n"
  },
  {
    "path": "LICENSES/GFDL-1.2-or-later.txt",
    "content": "GNU Free Documentation License\nVersion 1.2, November 2002\n\nCopyright (C) 2000,2001,2002 Free Software Foundation, Inc. 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA\n\nEveryone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.\n\n0. PREAMBLE\n\nThe purpose of this License is to make a manual, textbook, or other functional and useful document \"free\" in the sense of freedom: to assure everyone the effective freedom to copy and redistribute it, with or without modifying it, either commercially or noncommercially. Secondarily, this License preserves for the author and publisher a way to get credit for their work, while not being considered responsible for modifications made by others.\n\nThis License is a kind of \"copyleft\", which means that derivative works of the document must themselves be free in the same sense. It complements the GNU General Public License, which is a copyleft license designed for free software.\n\nWe have designed this License in order to use it for manuals for free software, because free software needs free documentation: a free program should come with manuals providing the same freedoms that the software does. But this License is not limited to software manuals; it can be used for any textual work, regardless of subject matter or whether it is published as a printed book. We recommend this License principally for works whose purpose is instruction or reference.\n\n1. APPLICABILITY AND DEFINITIONS\n\nThis License applies to any manual or other work, in any medium, that contains a notice placed by the copyright holder saying it can be distributed under the terms of this License. Such a notice grants a world-wide, royalty-free license, unlimited in duration, to use that work under the conditions stated herein. The \"Document\", below, refers to any such manual or work. Any member of the public is a licensee, and is addressed as \"you\". 
You accept the license if you copy, modify or distribute the work in a way requiring permission under copyright law.\n\nA \"Modified Version\" of the Document means any work containing the Document or a portion of it, either copied verbatim, or with modifications and/or translated into another language.\n\nA \"Secondary Section\" is a named appendix or a front-matter section of the Document that deals exclusively with the relationship of the publishers or authors of the Document to the Document's overall subject (or to related matters) and contains nothing that could fall directly within that overall subject. (Thus, if the Document is in part a textbook of mathematics, a Secondary Section may not explain any mathematics.) The relationship could be a matter of historical connection with the subject or with related matters, or of legal, commercial, philosophical, ethical or political position regarding them.\n\nThe \"Invariant Sections\" are certain Secondary Sections whose titles are designated, as being those of Invariant Sections, in the notice that says that the Document is released under this License. If a section does not fit the above definition of Secondary then it is not allowed to be designated as Invariant. The Document may contain zero Invariant Sections. If the Document does not identify any Invariant Sections then there are none.\n\nThe \"Cover Texts\" are certain short passages of text that are listed, as Front-Cover Texts or Back-Cover Texts, in the notice that says that the Document is released under this License. 
A Front-Cover Text may be at most 5 words, and a Back-Cover Text may be at most 25 words.\n\nA \"Transparent\" copy of the Document means a machine-readable copy, represented in a format whose specification is available to the general public, that is suitable for revising the document straightforwardly with generic text editors or (for images composed of pixels) generic paint programs or (for drawings) some widely available drawing editor, and that is suitable for input to text formatters or for automatic translation to a variety of formats suitable for input to text formatters. A copy made in an otherwise Transparent file format whose markup, or absence of markup, has been arranged to thwart or discourage subsequent modification by readers is not Transparent. An image format is not Transparent if used for any substantial amount of text. A copy that is not \"Transparent\" is called \"Opaque\".\n\nExamples of suitable formats for Transparent copies include plain ASCII without markup, Texinfo input format, LaTeX input format, SGML or XML using a publicly available DTD, and standard-conforming simple HTML, PostScript or PDF designed for human modification. Examples of transparent image formats include PNG, XCF and JPG. Opaque formats include proprietary formats that can be read and edited only by proprietary word processors, SGML or XML for which the DTD and/or processing tools are not generally available, and the machine-generated HTML, PostScript or PDF produced by some word processors for output purposes only.\n\nThe \"Title Page\" means, for a printed book, the title page itself, plus such following pages as are needed to hold, legibly, the material this License requires to appear in the title page. 
For works in formats which do not have any title page as such, \"Title Page\" means the text near the most prominent appearance of the work's title, preceding the beginning of the body of the text.\n\nA section \"Entitled XYZ\" means a named subunit of the Document whose title either is precisely XYZ or contains XYZ in parentheses following text that translates XYZ in another language. (Here XYZ stands for a specific section name mentioned below, such as \"Acknowledgements\", \"Dedications\", \"Endorsements\", or \"History\".) To \"Preserve the Title\" of such a section when you modify the Document means that it remains a section \"Entitled XYZ\" according to this definition.\n\nThe Document may include Warranty Disclaimers next to the notice which states that this License applies to the Document. These Warranty Disclaimers are considered to be included by reference in this License, but only as regards disclaiming warranties: any other implication that these Warranty Disclaimers may have is void and has no effect on the meaning of this License.\n\n2. VERBATIM COPYING\n\nYou may copy and distribute the Document in any medium, either commercially or noncommercially, provided that this License, the copyright notices, and the license notice saying this License applies to the Document are reproduced in all copies, and that you add no other conditions whatsoever to those of this License. You may not use technical measures to obstruct or control the reading or further copying of the copies you make or distribute. However, you may accept compensation in exchange for copies. If you distribute a large enough number of copies you must also follow the conditions in section 3.\n\nYou may also lend copies, under the same conditions stated above, and you may publicly display copies.\n\n3. 
COPYING IN QUANTITY\n\nIf you publish printed copies (or copies in media that commonly have printed covers) of the Document, numbering more than 100, and the Document's license notice requires Cover Texts, you must enclose the copies in covers that carry, clearly and legibly, all these Cover Texts: Front-Cover Texts on the front cover, and Back-Cover Texts on the back cover. Both covers must also clearly and legibly identify you as the publisher of these copies. The front cover must present the full title with all words of the title equally prominent and visible. You may add other material on the covers in addition. Copying with changes limited to the covers, as long as they preserve the title of the Document and satisfy these conditions, can be treated as verbatim copying in other respects.\n\nIf the required texts for either cover are too voluminous to fit legibly, you should put the first ones listed (as many as fit reasonably) on the actual cover, and continue the rest onto adjacent pages.\n\nIf you publish or distribute Opaque copies of the Document numbering more than 100, you must either include a machine-readable Transparent copy along with each Opaque copy, or state in or with each Opaque copy a computer-network location from which the general network-using public has access to download using public-standard network protocols a complete Transparent copy of the Document, free of added material. 
If you use the latter option, you must take reasonably prudent steps, when you begin distribution of Opaque copies in quantity, to ensure that this Transparent copy will remain thus accessible at the stated location until at least one year after the last time you distribute an Opaque copy (directly or through your agents or retailers) of that edition to the public.\n\nIt is requested, but not required, that you contact the authors of the Document well before redistributing any large number of copies, to give them a chance to provide you with an updated version of the Document.\n\n4. MODIFICATIONS\n\nYou may copy and distribute a Modified Version of the Document under the conditions of sections 2 and 3 above, provided that you release the Modified Version under precisely this License, with the Modified Version filling the role of the Document, thus licensing distribution and modification of the Modified Version to whoever possesses a copy of it. In addition, you must do these things in the Modified Version:\n\n     A. Use in the Title Page (and on the covers, if any) a title distinct from that of the Document, and from those of previous versions (which should, if there were any, be listed in the History section of the Document). You may use the same title as a previous version if the original publisher of that version gives permission.\n     B. List on the Title Page, as authors, one or more persons or entities responsible for authorship of the modifications in the Modified Version, together with at least five of the principal authors of the Document (all of its principal authors, if it has fewer than five), unless they release you from this requirement.\n     C. State on the Title page the name of the publisher of the Modified Version, as the publisher.\n     D. Preserve all the copyright notices of the Document.\n     E. Add an appropriate copyright notice for your modifications adjacent to the other copyright notices.\n     F. 
Include, immediately after the copyright notices, a license notice giving the public permission to use the Modified Version under the terms of this License, in the form shown in the Addendum below.\n     G. Preserve in that license notice the full lists of Invariant Sections and required Cover Texts given in the Document's license notice.\n     H. Include an unaltered copy of this License.\n     I. Preserve the section Entitled \"History\", Preserve its Title, and add to it an item stating at least the title, year, new authors, and publisher of the Modified Version as given on the Title Page. If there is no section Entitled \"History\" in the Document, create one stating the title, year, authors, and publisher of the Document as given on its Title Page, then add an item describing the Modified Version as stated in the previous sentence.\n     J. Preserve the network location, if any, given in the Document for public access to a Transparent copy of the Document, and likewise the network locations given in the Document for previous versions it was based on. These may be placed in the \"History\" section. You may omit a network location for a work that was published at least four years before the Document itself, or if the original publisher of the version it refers to gives permission.\n     K. For any section Entitled \"Acknowledgements\" or \"Dedications\", Preserve the Title of the section, and preserve in the section all the substance and tone of each of the contributor acknowledgements and/or dedications given therein.\n     L. Preserve all the Invariant Sections of the Document, unaltered in their text and in their titles. Section numbers or the equivalent are not considered part of the section titles.\n     M. Delete any section Entitled \"Endorsements\". Such a section may not be included in the Modified Version.\n     N. Do not retitle any existing section to be Entitled \"Endorsements\" or to conflict in title with any Invariant Section.\n     O. 
Preserve any Warranty Disclaimers.\n\nIf the Modified Version includes new front-matter sections or appendices that qualify as Secondary Sections and contain no material copied from the Document, you may at your option designate some or all of these sections as invariant. To do this, add their titles to the list of Invariant Sections in the Modified Version's license notice. These titles must be distinct from any other section titles.\n\nYou may add a section Entitled \"Endorsements\", provided it contains nothing but endorsements of your Modified Version by various parties--for example, statements of peer review or that the text has been approved by an organization as the authoritative definition of a standard.\n\nYou may add a passage of up to five words as a Front-Cover Text, and a passage of up to 25 words as a Back-Cover Text, to the end of the list of Cover Texts in the Modified Version. Only one passage of Front-Cover Text and one of Back-Cover Text may be added by (or through arrangements made by) any one entity. If the Document already includes a cover text for the same cover, previously added by you or by arrangement made by the same entity you are acting on behalf of, you may not add another; but you may replace the old one, on explicit permission from the previous publisher that added the old one.\n\nThe author(s) and publisher(s) of the Document do not by this License give permission to use their names for publicity for or to assert or imply endorsement of any Modified Version.\n\n5. 
COMBINING DOCUMENTS\n\nYou may combine the Document with other documents released under this License, under the terms defined in section 4 above for modified versions, provided that you include in the combination all of the Invariant Sections of all of the original documents, unmodified, and list them all as Invariant Sections of your combined work in its license notice, and that you preserve all their Warranty Disclaimers.\n\nThe combined work need only contain one copy of this License, and multiple identical Invariant Sections may be replaced with a single copy. If there are multiple Invariant Sections with the same name but different contents, make the title of each such section unique by adding at the end of it, in parentheses, the name of the original author or publisher of that section if known, or else a unique number. Make the same adjustment to the section titles in the list of Invariant Sections in the license notice of the combined work.\n\nIn the combination, you must combine any sections Entitled \"History\" in the various original documents, forming one section Entitled \"History\"; likewise combine any sections Entitled \"Acknowledgements\", and any sections Entitled \"Dedications\". You must delete all sections Entitled \"Endorsements\".\n\n6. COLLECTIONS OF DOCUMENTS\n\nYou may make a collection consisting of the Document and other documents released under this License, and replace the individual copies of this License in the various documents with a single copy that is included in the collection, provided that you follow the rules of this License for verbatim copying of each of the documents in all other respects.\n\nYou may extract a single document from such a collection, and distribute it individually under this License, provided you insert a copy of this License into the extracted document, and follow this License in all other respects regarding verbatim copying of that document.\n\n7. 
AGGREGATION WITH INDEPENDENT WORKS\n\nA compilation of the Document or its derivatives with other separate and independent documents or works, in or on a volume of a storage or distribution medium, is called an \"aggregate\" if the copyright resulting from the compilation is not used to limit the legal rights of the compilation's users beyond what the individual works permit. When the Document is included in an aggregate, this License does not apply to the other works in the aggregate which are not themselves derivative works of the Document.\n\nIf the Cover Text requirement of section 3 is applicable to these copies of the Document, then if the Document is less than one half of the entire aggregate, the Document's Cover Texts may be placed on covers that bracket the Document within the aggregate, or the electronic equivalent of covers if the Document is in electronic form. Otherwise they must appear on printed covers that bracket the whole aggregate.\n\n8. TRANSLATION\n\nTranslation is considered a kind of modification, so you may distribute translations of the Document under the terms of section 4. Replacing Invariant Sections with translations requires special permission from their copyright holders, but you may include translations of some or all Invariant Sections in addition to the original versions of these Invariant Sections. You may include a translation of this License, and all the license notices in the Document, and any Warranty Disclaimers, provided that you also include the original English version of this License and the original versions of those notices and disclaimers. In case of a disagreement between the translation and the original version of this License or a notice or disclaimer, the original version will prevail.\n\nIf a section in the Document is Entitled \"Acknowledgements\", \"Dedications\", or \"History\", the requirement (section 4) to Preserve its Title (section 1) will typically require changing the actual title.\n\n9. 
TERMINATION\n\nYou may not copy, modify, sublicense, or distribute the Document except as expressly provided for under this License. Any other attempt to copy, modify, sublicense or distribute the Document is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance.\n\n10. FUTURE REVISIONS OF THIS LICENSE\n\nThe Free Software Foundation may publish new, revised versions of the GNU Free Documentation License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. See http://www.gnu.org/copyleft/.\n\nEach version of the License is given a distinguishing version number. If the Document specifies that a particular numbered version of this License \"or any later version\" applies to it, you have the option of following the terms and conditions either of that specified version or of any later version that has been published (not as a draft) by the Free Software Foundation. If the Document does not specify a version number of this License, you may choose any version ever published (not as a draft) by the Free Software Foundation.\n\nADDENDUM: How to use this License for your documents\n\nTo use this License in a document you have written, include a copy of the License in the document and put the following copyright and license notices just after the title page:\n\n Copyright (c) YEAR YOUR NAME. Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.2 or any later version published by the Free Software Foundation; with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. 
A copy of the license is included in the section entitled \"GNU Free Documentation License\".\n\nIf you have Invariant Sections, Front-Cover Texts and Back-Cover Texts, replace the \"with...Texts.\" line with this:\n\n with the Invariant Sections being LIST THEIR TITLES, with the Front-Cover Texts being LIST, and with the Back-Cover Texts being LIST.\n\nIf you have Invariant Sections without Cover Texts, or some other combination of the three, merge those two alternatives to suit the situation.\n\nIf your document contains nontrivial examples of program code, we recommend releasing these examples in parallel under your choice of free software license, such as the GNU General Public License, to permit their use in free software.\n"
  },
  {
    "path": "LICENSES/MIT.txt",
    "content": "MIT License\n\nCopyright (c) <year> <copyright holders>\n\nPermission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n"
  },
  {
    "path": "LICENSES/MPL-2.0.txt",
    "content": "Mozilla Public License Version 2.0\n==================================\n\n1. Definitions\n--------------\n\n1.1. \"Contributor\"\n    means each individual or legal entity that creates, contributes to\n    the creation of, or owns Covered Software.\n\n1.2. \"Contributor Version\"\n    means the combination of the Contributions of others (if any) used\n    by a Contributor and that particular Contributor's Contribution.\n\n1.3. \"Contribution\"\n    means Covered Software of a particular Contributor.\n\n1.4. \"Covered Software\"\n    means Source Code Form to which the initial Contributor has attached\n    the notice in Exhibit A, the Executable Form of such Source Code\n    Form, and Modifications of such Source Code Form, in each case\n    including portions thereof.\n\n1.5. \"Incompatible With Secondary Licenses\"\n    means\n\n    (a) that the initial Contributor has attached the notice described\n        in Exhibit B to the Covered Software; or\n\n    (b) that the Covered Software was made available under the terms of\n        version 1.1 or earlier of the License, but not also under the\n        terms of a Secondary License.\n\n1.6. \"Executable Form\"\n    means any form of the work other than Source Code Form.\n\n1.7. \"Larger Work\"\n    means a work that combines Covered Software with other material, in \n    a separate file or files, that is not Covered Software.\n\n1.8. \"License\"\n    means this document.\n\n1.9. \"Licensable\"\n    means having the right to grant, to the maximum extent possible,\n    whether at the time of the initial grant or subsequently, any and\n    all of the rights conveyed by this License.\n\n1.10. \"Modifications\"\n    means any of the following:\n\n    (a) any file in Source Code Form that results from an addition to,\n        deletion from, or modification of the contents of Covered\n        Software; or\n\n    (b) any new file in Source Code Form that contains any Covered\n        Software.\n\n1.11. 
\"Patent Claims\" of a Contributor\n    means any patent claim(s), including without limitation, method,\n    process, and apparatus claims, in any patent Licensable by such\n    Contributor that would be infringed, but for the grant of the\n    License, by the making, using, selling, offering for sale, having\n    made, import, or transfer of either its Contributions or its\n    Contributor Version.\n\n1.12. \"Secondary License\"\n    means either the GNU General Public License, Version 2.0, the GNU\n    Lesser General Public License, Version 2.1, the GNU Affero General\n    Public License, Version 3.0, or any later versions of those\n    licenses.\n\n1.13. \"Source Code Form\"\n    means the form of the work preferred for making modifications.\n\n1.14. \"You\" (or \"Your\")\n    means an individual or a legal entity exercising rights under this\n    License. For legal entities, \"You\" includes any entity that\n    controls, is controlled by, or is under common control with You. For\n    purposes of this definition, \"control\" means (a) the power, direct\n    or indirect, to cause the direction or management of such entity,\n    whether by contract or otherwise, or (b) ownership of more than\n    fifty percent (50%) of the outstanding shares or beneficial\n    ownership of such entity.\n\n2. License Grants and Conditions\n--------------------------------\n\n2.1. 
Grants\n\nEach Contributor hereby grants You a world-wide, royalty-free,\nnon-exclusive license:\n\n(a) under intellectual property rights (other than patent or trademark)\n    Licensable by such Contributor to use, reproduce, make available,\n    modify, display, perform, distribute, and otherwise exploit its\n    Contributions, either on an unmodified basis, with Modifications, or\n    as part of a Larger Work; and\n\n(b) under Patent Claims of such Contributor to make, use, sell, offer\n    for sale, have made, import, and otherwise transfer either its\n    Contributions or its Contributor Version.\n\n2.2. Effective Date\n\nThe licenses granted in Section 2.1 with respect to any Contribution\nbecome effective for each Contribution on the date the Contributor first\ndistributes such Contribution.\n\n2.3. Limitations on Grant Scope\n\nThe licenses granted in this Section 2 are the only rights granted under\nthis License. No additional rights or licenses will be implied from the\ndistribution or licensing of Covered Software under this License.\nNotwithstanding Section 2.1(b) above, no patent license is granted by a\nContributor:\n\n(a) for any code that a Contributor has removed from Covered Software;\n    or\n\n(b) for infringements caused by: (i) Your and any other third party's\n    modifications of Covered Software, or (ii) the combination of its\n    Contributions with other software (except as part of its Contributor\n    Version); or\n\n(c) under Patent Claims infringed by Covered Software in the absence of\n    its Contributions.\n\nThis License does not grant any rights in the trademarks, service marks,\nor logos of any Contributor (except as may be necessary to comply with\nthe notice requirements in Section 3.4).\n\n2.4. 
Subsequent Licenses\n\nNo Contributor makes additional grants as a result of Your choice to\ndistribute the Covered Software under a subsequent version of this\nLicense (see Section 10.2) or under the terms of a Secondary License (if\npermitted under the terms of Section 3.3).\n\n2.5. Representation\n\nEach Contributor represents that the Contributor believes its\nContributions are its original creation(s) or it has sufficient rights\nto grant the rights to its Contributions conveyed by this License.\n\n2.6. Fair Use\n\nThis License is not intended to limit any rights You have under\napplicable copyright doctrines of fair use, fair dealing, or other\nequivalents.\n\n2.7. Conditions\n\nSections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted\nin Section 2.1.\n\n3. Responsibilities\n-------------------\n\n3.1. Distribution of Source Form\n\nAll distribution of Covered Software in Source Code Form, including any\nModifications that You create or to which You contribute, must be under\nthe terms of this License. You must inform recipients that the Source\nCode Form of the Covered Software is governed by the terms of this\nLicense, and how they can obtain a copy of this License. You may not\nattempt to alter or restrict the recipients' rights in the Source Code\nForm.\n\n3.2. 
Distribution of Executable Form\n\nIf You distribute Covered Software in Executable Form then:\n\n(a) such Covered Software must also be made available in Source Code\n    Form, as described in Section 3.1, and You must inform recipients of\n    the Executable Form how they can obtain a copy of such Source Code\n    Form by reasonable means in a timely manner, at a charge no more\n    than the cost of distribution to the recipient; and\n\n(b) You may distribute such Executable Form under the terms of this\n    License, or sublicense it under different terms, provided that the\n    license for the Executable Form does not attempt to limit or alter\n    the recipients' rights in the Source Code Form under this License.\n\n3.3. Distribution of a Larger Work\n\nYou may create and distribute a Larger Work under terms of Your choice,\nprovided that You also comply with the requirements of this License for\nthe Covered Software. If the Larger Work is a combination of Covered\nSoftware with a work governed by one or more Secondary Licenses, and the\nCovered Software is not Incompatible With Secondary Licenses, this\nLicense permits You to additionally distribute such Covered Software\nunder the terms of such Secondary License(s), so that the recipient of\nthe Larger Work may, at their option, further distribute the Covered\nSoftware under the terms of either this License or such Secondary\nLicense(s).\n\n3.4. Notices\n\nYou may not remove or alter the substance of any license notices\n(including copyright notices, patent notices, disclaimers of warranty,\nor limitations of liability) contained within the Source Code Form of\nthe Covered Software, except that You may alter any license notices to\nthe extent required to remedy known factual inaccuracies.\n\n3.5. Application of Additional Terms\n\nYou may choose to offer, and to charge a fee for, warranty, support,\nindemnity or liability obligations to one or more recipients of Covered\nSoftware. 
However, You may do so only on Your own behalf, and not on\nbehalf of any Contributor. You must make it absolutely clear that any\nsuch warranty, support, indemnity, or liability obligation is offered by\nYou alone, and You hereby agree to indemnify every Contributor for any\nliability incurred by such Contributor as a result of warranty, support,\nindemnity or liability terms You offer. You may include additional\ndisclaimers of warranty and limitations of liability specific to any\njurisdiction.\n\n4. Inability to Comply Due to Statute or Regulation\n---------------------------------------------------\n\nIf it is impossible for You to comply with any of the terms of this\nLicense with respect to some or all of the Covered Software due to\nstatute, judicial order, or regulation then You must: (a) comply with\nthe terms of this License to the maximum extent possible; and (b)\ndescribe the limitations and the code they affect. Such description must\nbe placed in a text file included with all distributions of the Covered\nSoftware under this License. Except to the extent prohibited by statute\nor regulation, such description must be sufficiently detailed for a\nrecipient of ordinary skill to be able to understand it.\n\n5. Termination\n--------------\n\n5.1. The rights granted under this License will terminate automatically\nif You fail to comply with any of its terms. However, if You become\ncompliant, then the rights granted under this License from a particular\nContributor are reinstated (a) provisionally, unless and until such\nContributor explicitly and finally terminates Your grants, and (b) on an\nongoing basis, if such Contributor fails to notify You of the\nnon-compliance by some reasonable means prior to 60 days after You have\ncome back into compliance. 
Moreover, Your grants from a particular\nContributor are reinstated on an ongoing basis if such Contributor\nnotifies You of the non-compliance by some reasonable means, this is the\nfirst time You have received notice of non-compliance with this License\nfrom such Contributor, and You become compliant prior to 30 days after\nYour receipt of the notice.\n\n5.2. If You initiate litigation against any entity by asserting a patent\ninfringement claim (excluding declaratory judgment actions,\ncounter-claims, and cross-claims) alleging that a Contributor Version\ndirectly or indirectly infringes any patent, then the rights granted to\nYou by any and all Contributors for the Covered Software under Section\n2.1 of this License shall terminate.\n\n5.3. In the event of termination under Sections 5.1 or 5.2 above, all\nend user license agreements (excluding distributors and resellers) which\nhave been validly granted by You or Your distributors under this License\nprior to termination shall survive termination.\n\n************************************************************************\n*                                                                      *\n*  6. Disclaimer of Warranty                                           *\n*  -------------------------                                           *\n*                                                                      *\n*  Covered Software is provided under this License on an \"as is\"       *\n*  basis, without warranty of any kind, either expressed, implied, or  *\n*  statutory, including, without limitation, warranties that the       *\n*  Covered Software is free of defects, merchantable, fit for a        *\n*  particular purpose or non-infringing. The entire risk as to the     *\n*  quality and performance of the Covered Software is with You.        
*\n*  Should any Covered Software prove defective in any respect, You     *\n*  (not any Contributor) assume the cost of any necessary servicing,   *\n*  repair, or correction. This disclaimer of warranty constitutes an   *\n*  essential part of this License. No use of any Covered Software is   *\n*  authorized under this License except under this disclaimer.         *\n*                                                                      *\n************************************************************************\n\n************************************************************************\n*                                                                      *\n*  7. Limitation of Liability                                          *\n*  --------------------------                                          *\n*                                                                      *\n*  Under no circumstances and under no legal theory, whether tort      *\n*  (including negligence), contract, or otherwise, shall any           *\n*  Contributor, or anyone who distributes Covered Software as          *\n*  permitted above, be liable to You for any direct, indirect,         *\n*  special, incidental, or consequential damages of any character      *\n*  including, without limitation, damages for lost profits, loss of    *\n*  goodwill, work stoppage, computer failure or malfunction, or any    *\n*  and all other commercial damages or losses, even if such party      *\n*  shall have been informed of the possibility of such damages. This   *\n*  limitation of liability shall not apply to liability for death or   *\n*  personal injury resulting from such party's negligence to the       *\n*  extent applicable law prohibits such limitation. Some               *\n*  jurisdictions do not allow the exclusion or limitation of           *\n*  incidental or consequential damages, so this exclusion and          *\n*  limitation may not apply to You.                                    
*\n*                                                                      *\n************************************************************************\n\n8. Litigation\n-------------\n\nAny litigation relating to this License may be brought only in the\ncourts of a jurisdiction where the defendant maintains its principal\nplace of business and such litigation shall be governed by laws of that\njurisdiction, without reference to its conflict-of-law provisions.\nNothing in this Section shall prevent a party's ability to bring\ncross-claims or counter-claims.\n\n9. Miscellaneous\n----------------\n\nThis License represents the complete agreement concerning the subject\nmatter hereof. If any provision of this License is held to be\nunenforceable, such provision shall be reformed only to the extent\nnecessary to make it enforceable. Any law or regulation which provides\nthat the language of a contract shall be construed against the drafter\nshall not be used to construe this License against a Contributor.\n\n10. Versions of the License\n---------------------------\n\n10.1. New Versions\n\nMozilla Foundation is the license steward. Except as provided in Section\n10.3, no one other than the license steward has the right to modify or\npublish new versions of this License. Each version will be given a\ndistinguishing version number.\n\n10.2. Effect of New Versions\n\nYou may distribute the Covered Software under the terms of the version\nof the License under which You originally received the Covered Software,\nor under the terms of any subsequent version published by the license\nsteward.\n\n10.3. Modified Versions\n\nIf you create software not governed by this License, and you want to\ncreate a new license for such software, you may create and use a\nmodified version of this License if you rename the license and remove\nany references to the name of the license steward (except to note that\nsuch modified license differs from this License).\n\n10.4. 
Distributing Source Code Form that is Incompatible With Secondary\nLicenses\n\nIf You choose to distribute Source Code Form that is Incompatible With\nSecondary Licenses under the terms of this version of the License, the\nnotice described in Exhibit B of this License must be attached.\n\nExhibit A - Source Code Form License Notice\n-------------------------------------------\n\n  This Source Code Form is subject to the terms of the Mozilla Public\n  License, v. 2.0. If a copy of the MPL was not distributed with this\n  file, You can obtain one at https://mozilla.org/MPL/2.0/.\n\nIf it is not possible or desirable to put the notice in a particular\nfile, then You may include the notice in a location (such as a LICENSE\nfile in a relevant directory) where a recipient would be likely to look\nfor such a notice.\n\nYou may add additional accurate notices of copyright ownership.\n\nExhibit B - \"Incompatible With Secondary Licenses\" Notice\n---------------------------------------------------------\n\n  This Source Code Form is \"Incompatible With Secondary Licenses\", as\n  defined by the Mozilla Public License, v. 2.0.\n"
  },
  {
    "path": "LICENSES/Zlib.txt",
    "content": "zlib License\n\nThis software is provided 'as-is', without any express or implied warranty.  In no event will the authors be held liable for any damages arising from the use of this software.\n\nPermission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:\n\n     1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n\n     2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n\n     3. This notice may not be removed or altered from any source distribution.\n"
  },
  {
    "path": "README.md",
    "content": "<!-- SPDX-FileCopyrightText: 2014 Julien Pfefferkorn -->\n<!-- SPDX-FileCopyrightText: 2015 James R. Barlow -->\n<!-- SPDX-License-Identifier: CC-BY-SA-4.0 -->\n\n<img src=\"docs/images/logo.svg\" width=\"240\" alt=\"OCRmyPDF\">\n\n[![Build Status](https://github.com/ocrmypdf/OCRmyPDF/actions/workflows/build.yml/badge.svg)](https://github.com/ocrmypdf/OCRmyPDF/actions/workflows/build.yml) [![PyPI version][pypi]](https://pypi.org/project/ocrmypdf/) ![Homebrew version][homebrew] ![ReadTheDocs][docs] ![Python versions][pyversions]\n\n[pypi]: https://img.shields.io/pypi/v/ocrmypdf.svg \"PyPI version\"\n[homebrew]: https://img.shields.io/homebrew/v/ocrmypdf.svg \"Homebrew version\"\n[docs]: https://readthedocs.org/projects/ocrmypdf/badge/?version=latest \"RTD\"\n[pyversions]: https://img.shields.io/pypi/pyversions/ocrmypdf \"Supported Python versions\"\n\nOCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched or copy-pasted.\n\n```bash\nocrmypdf                      # it's a scriptable command line program\n   -l eng+fra                 # it supports multiple languages\n   --rotate-pages             # it can fix pages that are misrotated\n   --deskew                   # it can deskew crooked PDFs!\n   --title \"My PDF\"           # it can change output metadata\n   --jobs 4                   # it uses multiple cores by default\n   --output-type pdfa         # it produces PDF/A by default\n   input_scanned.pdf          # takes PDF input (or images)\n   output_searchable.pdf      # produces validated PDF output\n```\n\n[See the release notes for details on the latest changes](https://ocrmypdf.readthedocs.io/en/latest/release_notes.html).\n\n## Main features\n\n- Generates a searchable [PDF/A](https://en.wikipedia.org/?title=PDF/A) file from a regular PDF\n- Places OCR text accurately below the image to ease copy / paste\n- Keeps the exact resolution of the original embedded images\n- When possible, inserts OCR information as a 
\"lossless\" operation without disrupting any other content\n- Optimizes PDF images, often producing files smaller than the input file\n- If requested, deskews and/or cleans the image before performing OCR\n- Validates input and output files\n- Distributes work across all available CPU cores\n- Uses [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) engine to recognize more than [100 languages](https://github.com/tesseract-ocr/tessdata)\n- Keeps your private data private.\n- Scales properly to handle files with thousands of pages.\n- Battle-tested on millions of PDFs.\n\n<img src=\"misc/screencast/demo.svg\" alt=\"Demo of OCRmyPDF in a terminal session\">\n\nFor details: please consult the [documentation](https://ocrmypdf.readthedocs.io/en/latest/).\n\n## Motivation\n\nI searched the web for a free command line tool to OCR PDF files: I found many, but none of them were really satisfying:\n\n- Either they produced PDF files with misplaced text under the image (making copy/paste impossible)\n- Or they did not handle accents and multilingual characters\n- Or they changed the resolution of the embedded images\n- Or they generated ridiculously large PDF files\n- Or they crashed when trying to OCR\n- Or they did not produce valid PDF files\n- On top of that none of them produced PDF/A files (format dedicated for long time storage)\n\n...so I decided to develop my own tool.\n\n## Installation\n\nLinux, Windows, macOS and FreeBSD are supported. 
Docker images are also available, for both x64 and ARM.\n\n| Operating system              | Install command               |\n| ----------------------------- | ------------------------------|\n| Debian, Ubuntu                | ``apt install ocrmypdf``      |\n| Windows Subsystem for Linux   | ``apt install ocrmypdf``      |\n| Fedora                        | ``dnf install ocrmypdf``      |\n| macOS (Homebrew)              | ``brew install ocrmypdf``     |\n| macOS (MacPorts)              | ``port install ocrmypdf``     |\n| macOS (nix)                   | ``nix-env -i ocrmypdf``       |\n| LinuxBrew                     | ``brew install ocrmypdf``     |\n| FreeBSD                       | ``pkg install py-ocrmypdf``   |\n| OpenBSD                       | ``pkg_add ocrmypdf``          |\n| Ubuntu Snap                   | ``snap install ocrmypdf``     |\n\nFor everyone else, [see our documentation](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for installation steps.\n\n## Languages\n\nOCRmyPDF uses Tesseract for OCR, and relies on its language packs. 
For Linux users, you can often find packages that provide language packs:\n\n```bash\n\n# Debian/Ubuntu users\napt-cache search tesseract-ocr # Display a list of all Tesseract language packs\napt-get install tesseract-ocr-chi-sim  # Example: Install Chinese Simplified language pack\n\n\n# Arch Linux users\npacman -S tesseract-data-eng tesseract-data-deu # Example: Install the English and German language packs\n\n# OpenBSD users\npkg_info -aQ tesseract  # Display a list of all Tesseract language packs\npkg_add tesseract-cym  # Example: Install the Welsh language pack\n\n# brew macOS users\nbrew install tesseract-lang\n\n# Fedora users\ndnf search tesseract-langpack # Display a list of all Tesseract language packs \ndnf install tesseract-langpack-ita # Example: Install the Italian language pack\n\n\n```\n\nYou can then pass the `-l LANG` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple languages can be requested.\n\nOCRmyPDF supports Tesseract 4.1.1+. It will automatically use whichever version it finds first on the `PATH` environment variable. 
On Windows, if `PATH` does not provide a Tesseract binary, we use the highest version number that is installed according to the Windows Registry.\n\n## Documentation and support\n\nOnce OCRmyPDF is installed, the built-in help which explains the command syntax and options can be accessed via:\n\n```bash\nocrmypdf --help\n```\n\nOur [documentation is served on Read the Docs](https://ocrmypdf.readthedocs.io/en/latest/index.html).\n\nPlease report issues on our [GitHub issues](https://github.com/ocrmypdf/OCRmyPDF/issues) page, and follow the issue template for quick response.\n\n## Feature demo\n\n```bash\n# Add an OCR layer and require PDF/A\nocrmypdf --output-type pdfa input.pdf output.pdf\n\n# Convert an image to single page PDF\nocrmypdf input.jpg output.pdf\n\n# Add OCR to a file in place (only modifies file on success)\nocrmypdf myfile.pdf myfile.pdf\n\n# OCR with non-English languages (look up your language's ISO 639-3 code)\nocrmypdf -l fra LeParisien.pdf LeParisien.pdf\n\n# OCR multilingual documents\nocrmypdf -l eng+fra Bilingual-English-French.pdf Bilingual-English-French.pdf\n\n# Deskew (straighten crooked pages)\nocrmypdf --deskew input.pdf output.pdf\n```\n\nFor more features, see the [documentation](https://ocrmypdf.readthedocs.io/en/latest/index.html).\n\n## Requirements\n\nIn addition to the required Python version, OCRmyPDF requires external program installations of Ghostscript and Tesseract OCR. OCRmyPDF is pure Python, and runs on pretty much everything: Linux, macOS, Windows and FreeBSD.\n\n## Plugins\n\nOCRmyPDF provides a plugin interface allowing its capabilities to be extended or replaced. Here are some plugins we are aware of:\n\n- [OCRmyPDF-AppleOCR](https://github.com/mkyt/ocrmypdf-AppleOCR): replaces the standard Tesseract OCR engine with Apple Vision Framework. Requires macOS.\n- [OCRmyPDF-EasyOCR](https://github.com/ocrmypdf/OCRmyPDF-EasyOCR): replaces the standard Tesseract OCR engine with EasyOCR, a newer OCR engine based on PyTorch. 
GPU strongly recommended.\n- [OCRmyPDF-PaddleOCR](https://github.com/clefru/ocrmypdf-paddleocr): replaces the standard Tesseract OCR engine with PaddleOCR, a powerful GPU accelerated OCR engine.\n\n[paperless-ngx](https://docs.paperless-ngx.com/) provides integration of OCRmyPDF into a searchable document management system.\n\n## Press & Media\n\n- [Going paperless with OCRmyPDF](https://medium.com/@ikirichenko/going-paperless-with-ocrmypdf-e2f36143f46a)\n- [Converting a scanned document into a compressed searchable PDF with redactions](https://medium.com/@treyharris/converting-a-scanned-document-into-a-compressed-searchable-pdf-with-redactions-63f61c34fe4c)\n- [c't 1-2014, page 59](https://heise.de/-2279695): Detailed presentation of OCRmyPDF v1.0 in the leading German IT magazine c't\n- [heise Open Source, 09/2014: Texterkennung mit OCRmyPDF](https://heise.de/-2356670)\n- [heise Durchsuchbare PDF-Dokumente mit OCRmyPDF erstellen](https://www.heise.de/ratgeber/Durchsuchbare-PDF-Dokumente-mit-OCRmyPDF-erstellen-4607592.html)\n- [Excellent Utilities: OCRmyPDF](https://www.linuxlinks.com/excellent-utilities-ocrmypdf-add-ocr-text-layer-scanned-pdfs/)\n- [LinuxUser Texterkennung mit OCRmyPDF und Scanbd automatisieren](https://www.linux-community.de/ausgaben/linuxuser/2021/06/texterkennung-mit-ocrmypdf-und-scanbd-automatisieren/)\n- [Y Combinator discussion](https://news.ycombinator.com/item?id=32028752)\n\n## Business enquiries\n\nOCRmyPDF would not be the software that it is today without companies and users choosing to provide support for feature development and consulting enquiries. We are happy to discuss all enquiries, whether for extending the existing feature set, or integrating OCRmyPDF into a larger system.\n\n## License\n\nThe OCRmyPDF software is licensed under the Mozilla Public License 2.0 (MPL-2.0). 
This license permits integration of OCRmyPDF with other code, including commercial and closed source, but asks you to publish source-level modifications you make to OCRmyPDF.\n\nSome components of OCRmyPDF have other licenses, as indicated by standard SPDX license identifiers or the DEP5 copyright and licensing information file. Generally speaking, non-core code is licensed under MIT, and the documentation and test files are licensed under Creative Commons Attribution-ShareAlike 4.0 (CC-BY-SA 4.0).\n\n## Disclaimer\n\nThe software is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n"
  },
  {
    "path": "README_ZH.md",
    "content": "# OCRmyPDF 中文说明\n\n我将为您创建一个中文版的 README.md 文件。以下是基于原始英文版本的中文翻译：\n\n```markdown:/Users/huapai/PycharmProjects/OCRmyPDF/README.md\n<!-- SPDX-FileCopyrightText: 2014 Julien Pfefferkorn -->\n<!-- SPDX-FileCopyrightText: 2015 James R. Barlow -->\n<!-- SPDX-License-Identifier: CC-BY-SA-4.0 -->\n\n<img src=\"docs/images/logo.svg\" width=\"240\" alt=\"OCRmyPDF\">\n\n[![构建状态](https://github.com/ocrmypdf/OCRmyPDF/actions/workflows/build.yml/badge.svg)](https://github.com/ocrmypdf/OCRmyPDF/actions/workflows/build.yml) [![PyPI 版本][pypi]](https://pypi.org/project/ocrmypdf/) ![Homebrew 版本][homebrew] ![ReadTheDocs][docs] ![Python 版本][pyversions]\n\n[pypi]: https://img.shields.io/pypi/v/ocrmypdf.svg \"PyPI 版本\"\n[homebrew]: https://img.shields.io/homebrew/v/ocrmypdf.svg \"Homebrew 版本\"\n[docs]: https://readthedocs.org/projects/ocrmypdf/badge/?version=latest \"RTD\"\n[pyversions]: https://img.shields.io/pypi/pyversions/ocrmypdf \"支持的 Python 版本\"\n\nOCRmyPDF 为扫描的 PDF 文件添加 OCR 文本层，使其可以被搜索或复制粘贴。\n\n```bash\nocrmypdf                      # 这是一个可脚本化的命令行程序\n   -l eng+fra                 # 支持多种语言\n   --rotate-pages             # 可以修正旋转错误的页面\n   --deskew                   # 可以校正倾斜的 PDF！\n   --title \"My PDF\"           # 可以更改输出元数据\n   --jobs 4                   # 默认使用多核心处理\n   --output-type pdfa         # 默认生成 PDF/A 格式\n   input_scanned.pdf          # 接受 PDF 输入（或图像）\n   output_searchable.pdf      # 生成经过验证的 PDF 输出\n```\n\n[查看发布说明了解最新变更的详情](https://ocrmypdf.readthedocs.io/en/latest/release_notes.html)。\n\n## 主要特点\n\n- 从普通 PDF 生成可搜索的 [PDF/A](https://en.wikipedia.org/?title=PDF/A) 文件\n- 准确地将 OCR 文本放置在图像下方，便于复制/粘贴\n- 保持原始嵌入图像的精确分辨率\n- 在可能的情况下，以\"无损\"操作方式插入 OCR 信息，不破坏任何其他内容\n- 优化 PDF 图像，通常生成比输入文件更小的文件\n- 如果需要，在执行 OCR 前对图像进行校正和/或清理\n- 验证输入和输出文件\n- 在所有可用的 CPU 核心上分配工作\n- 使用 [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) 引擎识别超过 [100 种语言](https://github.com/tesseract-ocr/tessdata)\n- 保护您的私人数据安全\n- 适当扩展以处理包含数千页的文件\n- 在数百万 PDF 上经过实战测试\n\n<img src=\"misc/screencast/demo.svg\" 
alt=\"终端会话中的 OCRmyPDF 演示\">\n\n详情请参阅[文档](https://ocrmypdf.readthedocs.io/en/latest/)。\n\n## 开发动机\n\n我在网上搜索免费的命令行工具来对 PDF 文件进行 OCR：我找到了很多，但没有一个真正令人满意：\n\n- 要么它们生成的 PDF 文件中文本位置错误（使复制/粘贴变得不可能）\n- 要么它们不处理重音和多语言字符\n- 要么它们改变了嵌入图像的分辨率\n- 要么它们生成了体积巨大的 PDF 文件\n- 要么它们在尝试 OCR 时崩溃\n- 要么它们不生成有效的 PDF 文件\n- 最重要的是，它们都不生成 PDF/A 文件（专为长期存储设计的格式）\n\n...所以我决定开发自己的工具。\n\n## 安装\n\n支持 Linux、Windows、macOS 和 FreeBSD。Docker 镜像也可用，同时支持 x64 和 ARM。\n\n| 操作系统                     | 安装命令                      |\n| --------------------------- | ----------------------------- |\n| Debian, Ubuntu              | ``apt install ocrmypdf``      |\n| Windows Subsystem for Linux | ``apt install ocrmypdf``      |\n| Fedora                      | ``dnf install ocrmypdf``      |\n| macOS (Homebrew)            | ``brew install ocrmypdf``     |\n| macOS (MacPorts)            | ``port install ocrmypdf``     |\n| macOS (nix)                 | ``nix-env -i ocrmypdf``       |\n| LinuxBrew                   | ``brew install ocrmypdf``     |\n| FreeBSD                     | ``pkg install py-ocrmypdf``   |\n| Ubuntu Snap                 | ``snap install ocrmypdf``     |\n\n对于其他用户，[请参阅我们的文档](https://ocrmypdf.readthedocs.io/en/latest/installation.html)了解安装步骤。\n\n## 语言\n\nOCRmyPDF 使用 Tesseract 进行 OCR，并依赖其语言包。对于 Linux 用户，您通常可以找到提供语言包的软件包：\n\n```bash\n# 显示所有 Tesseract 语言包的列表\napt-cache search tesseract-ocr\n\n# Debian/Ubuntu 用户\napt-get install tesseract-ocr-chi-sim  # 示例：安装中文简体语言包\n\n# Arch Linux 用户\npacman -S tesseract-data-eng tesseract-data-deu # 示例：安装英语和德语语言包\n\n# brew macOS 用户\nbrew install tesseract-lang\n```\n\n然后，您可以向 OCRmyPDF 传递 `-l LANG` 参数，提示它应该搜索哪些语言。可以请求多种语言。\n\nOCRmyPDF 支持 Tesseract 4.1.1+。它会自动使用在 `PATH` 环境变量中首先找到的版本。在 Windows 上，如果 `PATH` 不提供 Tesseract 二进制文件，我们会根据 Windows 注册表使用已安装的最高版本号。\n\n## 文档和支持\n\n安装 OCRmyPDF 后，可以通过以下方式访问内置帮助，解释命令语法和选项：\n\n```bash\nocrmypdf --help\n```\n\n我们的[文档托管在 Read the Docs 上](https://ocrmypdf.readthedocs.io/en/latest/index.html)。\n\n请在我们的 [GitHub 
issues](https://github.com/ocrmypdf/OCRmyPDF/issues) 页面上报告问题，并遵循问题模板以获得快速响应。\n\n## 功能演示\n\n```bash\n# 添加 OCR 层并转换为 PDF/A\nocrmypdf input.pdf output.pdf\n\n# 将图像转换为单页 PDF\nocrmypdf input.jpg output.pdf\n\n# 就地为文件添加 OCR（仅在成功时修改文件）\nocrmypdf myfile.pdf myfile.pdf\n\n# 使用非英语语言进行 OCR（查找您语言的 ISO 639-3 代码）\nocrmypdf -l fra LeParisien.pdf LeParisien.pdf\n\n# OCR 多语言文档\nocrmypdf -l eng+fra Bilingual-English-French.pdf Bilingual-English-French.pdf\n\n# 校正（矫正倾斜的页面）\nocrmypdf --deskew input.pdf output.pdf\n```\n\n更多功能，请参阅[文档](https://ocrmypdf.readthedocs.io/en/latest/index.html)。\n\n## 要求\n\n除了所需的 Python 版本外，OCRmyPDF 还需要外部程序安装 Ghostscript 和 Tesseract OCR。OCRmyPDF 是纯 Python 编写的，几乎可以在所有平台上运行：Linux、macOS、Windows 和 FreeBSD。\n\n## 媒体报道\n\n- [使用 OCRmyPDF 实现无纸化](https://medium.com/@ikirichenko/going-paperless-with-ocrmypdf-e2f36143f46a)\n- [将扫描文档转换为带有编辑的压缩可搜索 PDF](https://medium.com/@treyharris/converting-a-scanned-document-into-a-compressed-searchable-pdf-with-redactions-63f61c34fe4c)\n- [c't 1-2014, 第 59 页](https://heise.de/-2279695)：在德国领先的 IT 杂志 c't 中详细介绍 OCRmyPDF v1.0\n- [heise Open Source, 09/2014: 使用 OCRmyPDF 进行文本识别](https://heise.de/-2356670)\n- [heise 使用 OCRmyPDF 创建可搜索的 PDF 文档](https://www.heise.de/ratgeber/Durchsuchbare-PDF-Dokumente-mit-OCRmyPDF-erstellen-4607592.html)\n- [优秀实用工具：OCRmyPDF](https://www.linuxlinks.com/excellent-utilities-ocrmypdf-add-ocr-text-layer-scanned-pdfs/)\n- [LinuxUser 使用 OCRmyPDF 和 Scanbd 自动化文本识别](https://www.linux-community.de/ausgaben/linuxuser/2021/06/texterkennung-mit-ocrmypdf-und-scanbd-automatisieren/)\n- [Y Combinator 讨论](https://news.ycombinator.com/item?id=32028752)\n\n## 商业咨询\n\n如果没有公司和用户选择为功能开发和咨询提供支持，OCRmyPDF 就不会成为今天的软件。我们很乐意讨论所有咨询，无论是扩展现有功能集，还是将 OCRmyPDF 集成到更大的系统中。\n\n## 许可证\n\nOCRmyPDF 软件根据 Mozilla 公共许可证 2.0 (MPL-2.0) 授权。此许可证允许将 OCRmyPDF 与其他代码集成，包括商业和闭源代码，但要求您发布对 OCRmyPDF 所做的源代码级修改。\n\nOCRmyPDF 的某些组件有其他许可证，如标准 SPDX 许可证标识符或 DEP5 版权和许可信息文件所示。一般来说，非核心代码根据 MIT 许可，文档和测试文件根据 Creative Commons ShareAlike 4.0 (CC-BY-SA 4.0) 许可。\n\n## 
免责声明\n\n本软件按\"原样\"分发，不提供任何明示或暗示的保证或条件。\n\n这份中文版 README.md 保留了原始文档的所有重要信息，包括功能介绍、安装说明、语言支持、使用示例等内容，同时保持了原始格式和结构。"
  },
  {
    "path": "REUSE.toml",
    "content": "version = 1\nSPDX-PackageName = \"OCRmyPDF\"\nSPDX-PackageSupplier = \"James R. Barlow <james@purplerock.ca>\"\nSPDX-PackageDownloadLocation = \"https://github.com/ocrmypdf/OCRmyPDF\"\n\n[[annotations]]\npath = [\"docs/**\", 'misc/screencast/**']\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = \"(C) 2025 James R. Barlow\"\nSPDX-License-Identifier = \"CC-BY-SA-4.0\"\n\n[[annotations]]\npath = [\n    \"uv.lock\",\n    \".git_archival.txt\",\n    \"docs/images/logo-social.png\",\n    \"docs/images/logo-square-256.svg\",\n    \"docs/images/logo-square.png\",\n    \"docs/images/logo-square.svg\",\n    \"docs/images/logo.svg\",\n]\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = \"(C) 2025 James R. Barlow\"\nSPDX-License-Identifier = \"MPL-2.0\"\n\n[[annotations]]\npath = [\".github/ISSUE_TEMPLATE/**.yml\"]\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = \"(C) 2025 James R. Barlow\"\nSPDX-License-Identifier = \"CC-BY-SA-4.0\"\n\n[[annotations]]\npath = [\n    \"tests/resources/acroform.pdf\",\n    \"tests/resources/aspect.pdf\",\n    \"tests/resources/blank.pdf\",\n    \"tests/resources/cmyk.pdf\",\n    \"tests/resources/crom.png\",\n    \"tests/resources/enormous.pdf\",\n    \"tests/resources/formxobject.pdf\",\n    \"tests/resources/francais.pdf\",\n    \"tests/resources/hugemono.pdf\",\n    \"tests/resources/invalid.pdf\",\n    \"tests/resources/kcs.pdf\",\n    \"tests/resources/livecycle.pdf\",\n    \"tests/resources/meta.pdf\",\n    \"tests/resources/missing_docinfo.pdf\",\n    \"tests/resources/negzero.pdf\",\n    \"tests/resources/no_contents.pdf\",\n    \"tests/resources/tagged**\",\n    \"tests/resources/toc.pdf\",\n    \"tests/resources/trivial.pdf\",\n    \"tests/resources/truetype_font_nomapping.pdf\",\n    \"tests/resources/type3_font_nomapping.pdf\",\n]\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = \"(C) 2025 James R. 
Barlow\"\nSPDX-License-Identifier = \"CC-BY-SA-4.0\"\n\n[[annotations]]\npath = [\"tests/resources/graph.pdf\", \"tests/resources/graph_ocred.pdf\"]\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = \"(C) 2012 SmokeyJoe\"\nSPDX-License-Identifier = \"GFDL-1.2-or-later or CC-BY-SA-3.0\"\n\n[[annotations]]\npath = [\"tests/resources/c02-22.pdf\", \"tests/resources/multipage.pdf\"]\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = \"Public domain\"\nSPDX-License-Identifier = \"public-domain\"\n\n[[annotations]]\npath = \"docs/images/bitmap_vs_svg.svg\"\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = \"(C) 2006 Yug\"\nSPDX-License-Identifier = \"CC-BY-SA-2.5\"\n\n[[annotations]]\npath = \"tests/cache/**\"\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = \"(C) 2025 James R. Barlow\"\nSPDX-License-Identifier = \"CC-BY-SA-4.0\"\n\n[[annotations]]\npath = [\n    \"tests/resources/linn.png\",\n    \"tests/resources/linn.pdf\",\n    \"tests/resources/linn.txt\",\n    \"tests/resources/ccitt.pdf\",\n    \"tests/resources/cardinal.pdf\",\n    \"tests/resources/jbig2.pdf\",\n    \"tests/resources/jbig2_baddevicen.pdf\",\n    \"tests/resources/skew.pdf\",\n    \"tests/resources/rotated_skew.pdf\",\n    \"tests/resources/poster.pdf\",\n]\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = \"(C) 1985 Forat Electronics\"\nSPDX-License-Identifier = \"GFDL-1.2-or-later or CC-BY-SA-3.0\"\n\n[[annotations]]\npath = \"tests/resources/lichtenstein.pdf\"\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = [\"(C) 2001 Andreas Tille\", \"(C) 2007 Alessio Damato\"]\nSPDX-License-Identifier = \"GFDL-1.2-or-later or CC-BY-SA-3.0\"\n\n[[annotations]]\npath = \"tests/resources/masks.pdf\"\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = [\n    \"held by the contributors to the German Wikipedia article \\\"Linux\\\"\",\n    \"see: https://de.wikipedia.org/w/index.php?title=Linux&action=history\",\n    \"(masks.pdf generated from Wikipedia article as of 
2016-08-24)\",\n]\nSPDX-License-Identifier = \"CC-BY-SA-3.0\"\n\n[[annotations]]\npath = \"tests/resources/epson.pdf\"\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = [\n    \"held by the contributors to the Wikipedia article \\\"Optical character recognition\\\"\",\n    \"see: https://en.wikipedia.org/w/index.php?title=Optical_character_recognition&action=history\",\n    \"(epson.pdf generated from Wikipedia article as of 2016-09-14)\",\n]\nSPDX-License-Identifier = \"CC-BY-SA-3.0\"\n\n[[annotations]]\npath = [\"tests/resources/typewriter.png\", \"tests/resources/2400dpi.pdf\"]\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = \"(C) 2005 Ellywa\"\nSPDX-License-Identifier = \"GFDL-1.2-or-later or CC-BY-SA-1.0 or CC-BY-SA-2.0 or CC-BY-SA-2.5 or CC-BY-SA-3.0\"\nSPDX-FileComment = \"\\n Obtained from: https://commons.wikimedia.org/wiki/File:Triumph.typewriter_text_Linzensoep.gif\"\n\n[[annotations]]\npath = \"tests/resources/overlay.pdf\"\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = \"(C) 2017 Max Anderson\"\nSPDX-License-Identifier = \"MIT\"\n\n[[annotations]]\npath = [\n    \"tests/resources/baiona**.png\",\n    \"tests/resources/baiona**.jpg\",\n    \"tests/resources/link.pdf\",\n    \"tests/resources/palette.pdf\",\n]\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = \"(C) 2014 Euskaldunaa\"\nSPDX-License-Identifier = \"CC-BY-SA-4.0\"\n\n[[annotations]]\npath = \"tests/resources/vector.pdf\"\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = \"(C) 2018 Catscratch\"\nSPDX-License-Identifier = \"MIT\"\n\n[[annotations]]\npath = \"src/ocrmypdf/data/sRGB.icc\"\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = [\n    \"Kai-Uwe Behrmann <www.behrmann.name>\",\n    \"Marti Maria <www.littlecms.com>\",\n    \"Photogamut <www.photogamut.org>\",\n    \"Graeme Gill <www.argyllcms.com>\",\n    \"ColorSolutions <www.basICColor.com>\",\n]\nSPDX-License-Identifier = \"Zlib\"\n\n[[annotations]]\npath = \"src/ocrmypdf/data/Occulta.ttf\"\nprecedence = 
\"aggregate\"\nSPDX-FileCopyrightText = [\"(C) 2026 James R. Barlow\"]\nSPDX-License-Identifier = \"Apache-2.0\"\n\n[[annotations]]\npath = \"tests/resources/3small.pdf\"\nprecedence = \"aggregate\"\nSPDX-FileCopyrightText = [\n    \"(C) 2014 Euskaldunaa\",\n    \"(C) 2017 James R. Barlow\",\n    \"(C) 2005 Ellywa\",\n]\nSPDX-License-Identifier = \"CC-BY-SA-4.0 and (GFDL-1.2-or-later or CC-BY-SA-1.0 or CC-BY-SA-2.0 or CC-BY-SA-2.5 or CC-BY-SA-3.0)\"\nSPDX-FileComment = \"concatenation of baiona_gray.png, crom.png and typewriter.png/2400dpi.pdf\"\n"
  },
  {
    "path": "bin/bump_version.py",
    "content": "#!/usr/bin/env python3\n# SPDX-FileCopyrightText: 2017-2019 Joe Rickerby and contributors\n# SPDX-License-Identifier: BSD-2-Clause\n\n\"\"\"Bump the version number in all the right places.\"\"\"\n\nfrom __future__ import annotations\n\nimport glob\nimport os\nimport subprocess\nimport sys\nimport time\nimport urllib.parse\nfrom pathlib import Path\n\nimport cyclopts\nfrom packaging.version import InvalidVersion, Version\n\ntry:\n    from github import Github, GithubException\nexcept ImportError:\n    Github = None  # type: ignore\n    GithubException = Exception  # type: ignore\n\nimport ocrmypdf\n\nconfig = [\n    # file path, version find/replace format\n    (\"src/ocrmypdf/_version.py\", '__version__ = \"{}\"'),\n    (\"pyproject.toml\", 'version = \"{}\"'),\n]\n\nRED = \"\\u001b[31m\"\nGREEN = \"\\u001b[32m\"\nYELLOW = \"\\u001b[33m\"\nOFF = \"\\u001b[0m\"\n\nREPO_NAME = \"ocrmypdf/OCRmyPDF\"\n\n\ndef validate_release_notes(new_version: str) -> bool:\n    \"\"\"Check that the version appears in the release notes.\n\n    Returns True if the version is found, False otherwise.\n    \"\"\"\n    version_obj = Version(new_version)\n    major = version_obj.major\n    release_notes_path = Path(f\"docs/releasenotes/version{major:02d}.md\")\n\n    if not release_notes_path.exists():\n        print(f\"{RED}error:{OFF} Release notes file not found: {release_notes_path}\")\n        return False\n\n    content = release_notes_path.read_text(encoding=\"utf8\")\n    version_header = f\"## v{new_version}\"\n\n    if version_header not in content:\n        print(\n            f\"{RED}error:{OFF} Version v{new_version} not found in {release_notes_path}\"\n        )\n        print(f\"       Expected to find: {version_header}\")\n        return False\n\n    print(f\"{GREEN}Found v{new_version} in {release_notes_path}{OFF}\")\n    return True\n\n\ndef get_github_client():\n    \"\"\"Get an authenticated GitHub client.\"\"\"\n    if Github is None:\n        
print(f\"{RED}error:{OFF} PyGithub is not installed\")\n        print(\"       Install with: pip install PyGithub\")\n        return None\n\n    # Try GITHUB_TOKEN env var first\n    token = os.environ.get(\"GITHUB_TOKEN\")\n\n    # Fall back to gh CLI\n    if not token:\n        try:\n            result = subprocess.run(\n                [\"gh\", \"auth\", \"token\"],\n                capture_output=True,\n                encoding=\"utf8\",\n                check=True,\n            )\n            token = result.stdout.strip()\n        except (FileNotFoundError, subprocess.CalledProcessError):\n            print(f\"{RED}error:{OFF} No GitHub authentication found\")\n            print(\"       Set GITHUB_TOKEN env var or run: gh auth login\")\n            return None\n\n    try:\n        return Github(token)\n    except GithubException as e:\n        print(f\"{RED}error:{OFF} Failed to authenticate with GitHub: {e}\")\n        return None\n\n\ndef wait_for_ci_completion(commit_sha: str, timeout_minutes: int = 30) -> bool:\n    \"\"\"Wait for CI to complete on the given commit.\n\n    Returns True if CI passed, False otherwise.\n    \"\"\"\n    gh = get_github_client()\n    if gh is None:\n        return False\n\n    try:\n        repo = gh.get_repo(REPO_NAME)\n    except GithubException as e:\n        print(f\"{RED}error:{OFF} Failed to access repository: {e}\")\n        return False\n\n    workflow_name = \"Test and deploy\"\n    start_time = time.time()\n    timeout_seconds = timeout_minutes * 60\n    poll_interval = 30  # seconds\n\n    print(f\"Waiting for CI workflow '{workflow_name}' on commit {commit_sha[:8]}...\")\n\n    # First, wait for the workflow run to appear\n    run = None\n    while time.time() - start_time < timeout_seconds:\n        try:\n            runs = repo.get_workflow_runs(head_sha=commit_sha)\n            for r in runs:\n                if r.name == workflow_name:\n                    run = r\n                    break\n            if 
run:\n                break\n        except GithubException as e:\n            print(f\"{YELLOW}Warning:{OFF} Error checking workflow runs: {e}\")\n\n        elapsed = int(time.time() - start_time)\n        print(f\"  Waiting for workflow to start... ({elapsed}s)\")\n        time.sleep(poll_interval)\n\n    if not run:\n        print(\n            f\"{RED}error:{OFF} Workflow run not found within {timeout_minutes} minutes\"\n        )\n        return False\n\n    print(f\"  Found workflow run #{run.run_number} (ID: {run.id})\")\n\n    # Now wait for the workflow to complete\n    while time.time() - start_time < timeout_seconds:\n        try:\n            run = repo.get_workflow_run(run.id)  # Refresh the run\n        except GithubException as e:\n            print(f\"{YELLOW}Warning:{OFF} Error refreshing workflow run: {e}\")\n            time.sleep(poll_interval)\n            continue\n\n        status = run.status\n        conclusion = run.conclusion\n\n        elapsed = int(time.time() - start_time)\n        if status == \"completed\":\n            if conclusion == \"success\":\n                print(f\"{GREEN}CI passed!{OFF} (took {elapsed}s)\")\n                return True\n            else:\n                print(f\"{RED}CI failed!{OFF} Conclusion: {conclusion}\")\n                print(f\"  View details: {run.html_url}\")\n                return False\n        else:\n            print(f\"  Status: {status} ({elapsed}s elapsed)\")\n            time.sleep(poll_interval)\n\n    print(f\"{RED}error:{OFF} CI did not complete within {timeout_minutes} minutes\")\n    return False\n\n\ndef push_and_wait_for_ci(branch: str) -> bool:\n    \"\"\"Push to remote and wait for CI tests to pass.\"\"\"\n    print(\"Pushing to GitHub...\")\n\n    push_result = subprocess.run(\n        [\"git\", \"push\", \"origin\", branch],\n        capture_output=True,\n        encoding=\"utf8\",\n    )\n\n    if push_result.returncode != 0:\n        print(f\"{RED}error:{OFF} Failed to 
push: {push_result.stderr}\")\n        return False\n\n    # Get the commit SHA we just pushed\n    sha_result = subprocess.run(\n        [\"git\", \"rev-parse\", \"HEAD\"],\n        capture_output=True,\n        encoding=\"utf8\",\n        check=True,\n    )\n    commit_sha = sha_result.stdout.strip()\n\n    print(f\"Pushed commit {commit_sha[:8]}\")\n\n    return wait_for_ci_completion(commit_sha)\n\n\ndef push_tag(tag: str) -> bool:\n    \"\"\"Push the tag to trigger release workflow.\"\"\"\n    print(f\"Pushing tag {tag} to trigger release...\")\n\n    result = subprocess.run(\n        [\"git\", \"push\", \"origin\", tag],\n        capture_output=True,\n        encoding=\"utf8\",\n    )\n\n    if result.returncode != 0:\n        print(f\"{RED}error:{OFF} Failed to push tag: {result.stderr}\")\n        return False\n\n    print(f\"{GREEN}Tag {tag} pushed successfully!{OFF}\")\n    return True\n\n\ndef bump_version() -> None:\n    \"\"\"Bump the version number in all the right places.\"\"\"\n    current_version = ocrmypdf.__version__  # type: ignore\n    try:\n        commit_date_str = subprocess.run(\n            [\n                \"git\",\n                \"show\",\n                \"--no-patch\",\n                \"--pretty=format:%ci\",\n                f\"v{current_version}^{{commit}}\",\n            ],\n            check=True,\n            capture_output=True,\n            encoding=\"utf8\",\n        ).stdout\n        cd_date, cd_time, cd_tz = commit_date_str.split(\" \")\n\n        url_opts = urllib.parse.urlencode(\n            {\"q\": f\"is:pr merged:>{cd_date}T{cd_time}{cd_tz}\"}\n        )\n        url = f\"https://github.com/{REPO_NAME}/pulls?{url_opts}\"\n\n        print(f\"PRs merged since last release:\\n  {url}\")\n        print()\n    except subprocess.CalledProcessError as e:\n        print(e)\n        print(\"Failed to get previous version tag information.\")\n        print(\"Is the virtual environment active?\")\n        sys.exit(1)\n\n    
git_changes_result = subprocess.run([\"git diff-index --quiet HEAD --\"], shell=True)\n    repo_has_uncommitted_changes = git_changes_result.returncode != 0\n\n    if repo_has_uncommitted_changes:\n        print(\"error: Uncommitted changes detected.\")\n        sys.exit(1)\n\n    # fmt: off\n    print(              'Current version:', current_version)\n    new_version = input('    New version: ').strip()\n    # fmt: on\n\n    try:\n        Version(new_version)\n    except InvalidVersion:\n        print(\"error: This version doesn't conform to PEP440\")\n        print(\"       https://www.python.org/dev/peps/pep-0440/\")\n        sys.exit(1)\n\n    # Validate release notes contain this version\n    if not validate_release_notes(new_version):\n        print()\n        print(\"Please add release notes for this version before proceeding.\")\n        print(f\"Edit: docs/releasenotes/version{Version(new_version).major:02d}.md\")\n        sys.exit(1)\n\n    actions = []\n\n    for path_pattern, version_pattern in config:\n        paths = [Path(p) for p in glob.glob(path_pattern)]\n\n        if not paths:\n            print(f\"error: Pattern {path_pattern} didn't match any files\")\n            sys.exit(1)\n\n        find_pattern = version_pattern.format(current_version)\n        replace_pattern = version_pattern.format(new_version)\n        found_at_least_one_file_needing_update = False\n\n        for path in paths:\n            contents = path.read_text(encoding=\"utf8\")\n            if find_pattern in contents:\n                found_at_least_one_file_needing_update = True\n                actions.append(\n                    (\n                        path,\n                        find_pattern,\n                        replace_pattern,\n                    )\n                )\n\n        if not found_at_least_one_file_needing_update:\n            print(\n                f'''error: Didn't find any occurrences of \"{find_pattern}\" in \"{path_pattern}\"'''\n           
 )\n            sys.exit(1)\n\n    print()\n    print(\"Here's the plan:\")\n    print()\n\n    for action in actions:\n        path, find, replace = action\n        print(f\"{path}  {RED}{find}{OFF} → {GREEN}{replace}{OFF}\")\n\n    print(f\"Then commit, and tag as v{new_version}\")\n\n    answer = input(\"Proceed? [y/N] \").strip()\n\n    if answer != \"y\":\n        print(\"Aborted\")\n        sys.exit(1)\n\n    for path, find, replace in actions:\n        contents = path.read_text(encoding=\"utf8\")\n        contents = contents.replace(find, replace)\n        path.write_text(contents, encoding=\"utf8\")\n\n    print(\"Files updated.\")\n    print()\n\n    while input('Type \"done\" to continue: ').strip().lower() != \"done\":\n        pass\n\n    subprocess.run(\n        [\n            \"git\",\n            \"commit\",\n            \"--all\",\n            f\"--message=Bump version: v{new_version}\",\n        ],\n        check=True,\n    )\n\n    subprocess.run(\n        [\n            \"git\",\n            \"tag\",\n            \"--annotate\",\n            f\"--message=v{new_version}\",\n            f\"v{new_version}\",\n        ],\n        check=True,\n    )\n\n    print(\"Commit and tag created locally.\")\n    print()\n\n    # Get current branch\n    branch_result = subprocess.run(\n        [\"git\", \"rev-parse\", \"--abbrev-ref\", \"HEAD\"],\n        capture_output=True,\n        encoding=\"utf8\",\n        check=True,\n    )\n    branch = branch_result.stdout.strip()\n\n    # Push commit and wait for CI\n    if not push_and_wait_for_ci(branch):\n        print()\n        print(f\"{RED}CI failed. 
The tag was NOT pushed.{OFF}\")\n        print(\"Fix the issues, then manually push the tag:\")\n        print(f\"    git push origin v{new_version}\")\n        sys.exit(1)\n\n    # Push tag to trigger release\n    if not push_tag(f\"v{new_version}\"):\n        print(f\"{RED}Failed to push tag.{OFF} Push manually:\")\n        print(f\"    git push origin v{new_version}\")\n        sys.exit(1)\n\n    print()\n    print(f\"{GREEN}Done! Release workflow has been triggered.{OFF}\")\n    print()\n\n    release_url = f\"https://github.com/{REPO_NAME}/releases/tag/v{new_version}\"\n    print(\"Monitor the release at:\")\n    print(f\"    {release_url}\")\n\n\nif __name__ == \"__main__\":\n    os.chdir(Path(__file__).parent.parent.resolve())\n    cyclopts.run(bump_version)\n"
  },
  {
    "path": "docs/advanced.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# Advanced features\n\n## Control of unpaper\n\nOCRmyPDF uses `unpaper` to provide the implementation of the\n`--clean` and `--clean-final` arguments.\n[unpaper](https://github.com/Flameeyes/unpaper/blob/main/doc/basic-concepts.md)\nprovides a variety of image processing filters to improve images.\n\nBy default, OCRmyPDF uses only `unpaper` arguments that were found to\nbe safe to use on almost all files without having to inspect every page\nof the file afterwards. This is particularly true when only `--clean`\nis used, since that instructs OCRmyPDF to only clean the image before\nOCR and not the final image.\n\nHowever, if you wish to use the more aggressive options in `unpaper`,\nyou may use `--unpaper-args '...'` to override OCRmyPDF's defaults\nand forward other arguments to unpaper. This option will forward\narguments to `unpaper` without any knowledge of what that program\nconsiders to be valid arguments. The string of arguments must be quoted\nas shown in the examples below. No filename arguments may be included.\nOCRmyPDF will assume it can append input and output filenames of\nintermediate images to the `--unpaper-args` string.\n\nIn this example, we tell `unpaper` to expect two pages of text on a\nsheet (image), such as occurs when two facing pages of a book are\nscanned. `unpaper` uses this information to deskew each independently\nand clean up the margins of both.\n\n```bash\nocrmypdf --clean --clean-final --unpaper-args '--layout double' input.pdf output.pdf\nocrmypdf --clean --clean-final --unpaper-args '--layout double --no-noisefilter' input.pdf output.pdf\n```\n\n:::{warning}\nSome `unpaper` features will reposition text within the image.\n`--clean-final` is recommended to avoid this issue.\n:::\n\n:::{warning}\nSome `unpaper` features cause multiple input or output files to be\nconsumed or produced. 
OCRmyPDF requires `unpaper` to consume one\nfile and produce one file; errors will result if this assumption is not\nmet.\n:::\n\n:::{note}\n`unpaper` uses uncompressed PBM/PGM/PPM files for its intermediate\nfiles. For large images or documents, it can take a lot of temporary\ndisk space.\n:::\n\n## Control of OCR options\n\nOCRmyPDF provides many features to control the behavior of the OCR\nengine, Tesseract.\n\n### OCR processing mode\n\n:::{versionadded} 17.0.0\nThe `--mode` (`-m`) argument consolidates OCR processing options.\n:::\n\nOCRmyPDF provides a unified `--mode` argument to control how pages with\nexisting text are handled:\n\n| Mode | Behavior | Legacy equivalent |\n|------|----------|-------------------|\n| `default` | Error if text is found | (no flag) |\n| `force` | Rasterize all content and run OCR | `--force-ocr` |\n| `skip` | Skip pages with existing text | `--skip-text` |\n| `redo` | Re-OCR pages, stripping old OCR layer | `--redo-ocr` |\n\n```bash\n# Skip pages that already have text\nocrmypdf --mode skip input.pdf output.pdf\n# or equivalently:\nocrmypdf -m skip input.pdf output.pdf\n\n# Force OCR on all pages (rasterizes everything)\nocrmypdf --mode force input.pdf output.pdf\n\n# Re-do OCR, replacing old invisible text\nocrmypdf --mode redo input.pdf output.pdf\n```\n\nThe legacy flags (`--force-ocr`, `--skip-text`, `--redo-ocr`) remain as\nsilent aliases for backward compatibility.\n\n### When OCR is skipped\n\nIf a page in a PDF seems to have text, by default OCRmyPDF will exit\nwithout modifying the PDF. This is to ensure that PDFs that were\npreviously OCRed or were \"born digital\" rather than scanned are not\nprocessed.\n\nIf `--mode skip` (or `--skip-text`) is issued, then no image processing or OCR will be\nperformed on pages that already have text. The page will be copied to\nthe output. 
This may be useful for documents that contain both \"born\ndigital\" and scanned content, or to use OCRmyPDF to normalize and\nconvert to PDF/A regardless of their contents.\n\nIf `--mode redo` (or `--redo-ocr`) is issued, then a detailed text analysis is performed.\nText is categorized as either visible or invisible. Invisible text (OCR)\nis stripped out. Then an image of each page is created with visible text\nmasked out. The page image is sent for OCR, and any additional text is\ninserted as OCR. If a file contains a mix of text and bitmap images that\ncontain text, OCRmyPDF will locate the additional text in images without\ndisrupting the existing text. Some PDF OCR solutions render text as\ntechnically printable or visible in some way, perhaps by drawing it and\nthen painting over it. OCRmyPDF cannot distinguish this type of OCR\ntext from real text, so it will not be \"redone\".\n\nIf `--mode force` (or `--force-ocr`) is issued, then all pages will be rasterized to\nimages, discarding any hidden OCR text, rasterizing any printable\ntext, and flattening form fields or interactive objects into their visual\nrepresentation. This is useful for redoing OCR, for fixing OCR text\nwith a damaged character map (text is selectable but not searchable),\nand destroying redacted information.\n\n### Time and image size limits\n\nBy default, OCRmyPDF permits tesseract to run for three minutes (180\nseconds) per page. This is usually more than enough time to find all\ntext on a reasonably sized page with modern hardware.\n\nIf a page is skipped, it will be inserted without OCR. If preprocessing\nwas requested, the preprocessed image layer will be inserted.\n\nIf you want to adjust the amount of time spent on OCR, change\n`--tesseract-timeout`. You can also automatically skip images that\nexceed a certain number of megapixels with `--skip-big`. 
(A 300 DPI,\n8.5×11\" page image is 8.4 megapixels.)\n\n```bash\n# Allow 300 seconds for OCR; skip any page larger than 50 megapixels\nocrmypdf --tesseract-timeout 300 --skip-big 50 bigfile.pdf output.pdf\n```\n\n### OCR for huge images\n\nTesseract has internal limits on the size\nof images it will process. By default,\n`--tesseract-downsample-large-images` is enabled, and OCRmyPDF will\ndownsample images to fit Tesseract limits. (The limits are usually encountered\nonly for scanned images of oversized media, such as large maps or blueprints exceeding\n110 cm or 43 inches in either dimension, and at high DPI.) This feature can be disabled\nusing `--no-tesseract-downsample-large-images`.\n\n`--tesseract-downsample-above Npixels` adjusts the threshold at which images\nwill be downsampled. By default, only images that exceed any of Tesseract's\ninternal limits are downsampled (32767 pixels on either dimension).\n\nYou will also need to set `--tesseract-timeout` high enough to allow\nfor processing.\n\nOnly the image sent for OCR is downsampled. The original image is\npreserved.\n\n```bash\n# Allow 600 seconds for OCR on huge images\nocrmypdf --tesseract-timeout 600 \\\n    --tesseract-downsample-large-images \\\n    bigfile.pdf output.pdf\n\n# Downsample images above 5000 pixels on the longest dimension to\n# 5000 pixels\nocrmypdf --tesseract-timeout 120 \\\n    --tesseract-downsample-large-images \\\n    --tesseract-downsample-above 5000 \\\n    bigfile.pdf output_downsampled_ocr.pdf\n```\n\n### Overriding default tesseract\n\nOCRmyPDF checks the system `PATH` for the `tesseract` binary.\n\nSome relevant environment variables that influence Tesseract's behavior\ninclude:\n\n```{eval-rst}\n.. envvar:: TESSDATA_PREFIX\n\n   Overrides the path to Tesseract's data files. This can allow\n   simultaneous installation of the \"best\" and \"fast\" training data\n   sets. OCRmyPDF does not manage this environment variable.\n```\n\n```{eval-rst}\n.. 
envvar:: OMP_THREAD_LIMIT\n\n   Controls the number of threads Tesseract will use. OCRmyPDF will\n   manage this environment variable if it is not already set.\n```\n\nFor example, if you have a development build of Tesseract and don't wish to\nuse the system installation, you can launch OCRmyPDF as follows:\n\n```bash\nenv \\\n    PATH=/home/user/src/tesseract/api:$PATH \\\n    TESSDATA_PREFIX=/home/user/src/tesseract \\\n    ocrmypdf input.pdf output.pdf\n```\n\nIn this example `TESSDATA_PREFIX` is required to redirect Tesseract to\nan alternate folder for its \"tessdata\" files.\n\n### Overriding other support programs\n\nIn addition to tesseract, OCRmyPDF uses the following external binaries:\n\n- `gs` (Ghostscript)\n- `unpaper`\n- `pngquant`\n- `jbig2`\n\nIn each case OCRmyPDF will search the `PATH` environment variable to\nlocate the binaries. By modifying the `PATH` environment variable, you\ncan override the binaries that OCRmyPDF uses.\n\n### Changing Tesseract configuration variables\n\nYou can override Tesseract's default [control\nparameters](https://tesseract-ocr.github.io/tessdoc/tess3/ControlParams.html)\nwith a configuration file.\n\nAs an example, this configuration will disable Tesseract's dictionary\nfor the current language. 
Normally the dictionary is helpful for\ninterpolating words that are unclear, but it may interfere with OCR if\nthe document does not contain many words (for example, a list of part\nnumbers).\n\nCreate a file named \"no-dict.cfg\" with these contents:\n\n```\nload_system_dawg 0\nlanguage_model_penalty_non_dict_word 0\nlanguage_model_penalty_non_freq_dict_word 0\n```\n\nthen run ocrmypdf as follows (along with any other desired arguments):\n\n```bash\nocrmypdf --tesseract-config no-dict.cfg input.pdf output.pdf\n```\n\n:::{warning}\nSome combinations of control parameters will break Tesseract or break\nassumptions that OCRmyPDF makes about Tesseract's output.\n:::\n\n### Changing page segmentation mode\n\nThe directive `--tesseract-pagesegmode Nmode` forwards the desired page segmentation\nmode to Tesseract OCR. The default is 3.\n\nPage segmentation can improve OCR results when you know that a PDF ought to be\nanalyzed a particular way, such as PDFs whose pages contain only a single line of\ntext. For the vast majority of users, changing the page segmentation mode will only\nmake things worse.\n\nAs of June 2024, the Tesseract page segmentation modes are:\n\n| ID  | Description                                                                                   |\n| --- | --------------------------------------------------------------------------------------------- |\n| 0   | Orientation and script detection (OSD) only.                                                  |\n| 1   | Automatic page segmentation with OSD.                                                         |\n| 2   | Automatic page segmentation, but no OSD, or OCR. (not implemented)                            |\n| 3   | Fully automatic page segmentation, but no OSD. (Default)                                      |\n| 4   | Assume a single column of text of variable sizes.                                             |\n| 5   | Assume a single uniform block of vertically aligned text.                      
               |\n| 6   | Assume a single uniform block of text.                                                        |\n| 7   | Treat the image as a single text line.                                                        |\n| 8   | Treat the image as a single word.                                                             |\n| 9   | Treat the image as a single word in a circle.                                                 |\n| 10  | Treat the image as a single character.                                                        |\n| 11  | Sparse text. Find as much text as possible in no particular order.                            |\n| 12  | Sparse text with OSD.                                                                         |\n| 13  | Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific. |\n\nModes 0, 1, 2, and 12 (all of those that enable orientation and script detection)\nare not compatible with OCRmyPDF, which performs OSD in a separate step from OCR.\nTheir use may interfere with `--rotate-pages` and other features.\n\nIt is currently not possible to use advanced Tesseract OCR features, such as creating\nOCR information, when using Tesseract through OCRmyPDF.\n\n## Choosing a PDF rasterizer\n\n:::{versionadded} 17.0.0\n:::\n\nrasterizing\n\n: Converting a PDF page to an image for OCR processing.\n\nOCRmyPDF supports two PDF rasterizers:\n\n| Rasterizer | Package | Advantages | Disadvantages |\n|------------|---------|------------|---------------|\n| pypdfium2 | Python package | Faster, fewer version issues | Requires pypdfium2 package |\n| Ghostscript | System binary | More widely packaged | Version consistency issues, restrictive AGPLv3 |\n\nThe `--rasterizer` argument controls which rasterizer is used:\n\n```bash\n# Automatic selection (default) - prefers pypdfium when available\nocrmypdf --rasterizer auto input.pdf output.pdf\n\n# Force pypdfium2\nocrmypdf --rasterizer pypdfium input.pdf output.pdf\n\n# 
Force Ghostscript\nocrmypdf --rasterizer ghostscript input.pdf output.pdf\n```\n\npypdfium2 is a Python binding for pdfium, the PDF rendering library used\nby Google Chrome and Chromium. It generally produces output identical to\nGhostscript but with better performance.\n\n:::{note}\nIf pypdfium2 is not installed and `--rasterizer pypdfium` is requested,\nOCRmyPDF will exit with an error. Install it with: `pip install pypdfium2`\n:::\n\n## Changing the PDF renderer\n\nrendering\n\n: Creating a new PDF from other data (such as an existing PDF).\n\n:::{versionchanged} 17.0.0\nThe fpdf2 renderer is now the default, replacing the legacy hOCR renderer.\n:::\n\nOCRmyPDF uses PDF renderers to create the invisible text layer. The\nrenderer may be selected using `--pdf-renderer`. The default is\n`auto` which selects `fpdf2`.\n\n### The `fpdf2` renderer (default)\n\n:::{versionadded} 17.0.0\n:::\n\nThe fpdf2 renderer creates text layers using the fpdf2 library. It provides:\n\n- Full multilingual support including RTL languages (Arabic, Hebrew, Persian)\n- Accurate text positioning aligned with OCR bounding boxes\n- Improved \"Occulta\" glyphless font handling:\n  - Zero-width markers are properly handled\n  - Double-width CJK characters are properly sized\n- Direct OcrElement tree input (no hOCR intermediate format required)\n\nThe fpdf2 renderer is the recommended choice for all installations.\n\n:::{note}\nThe fpdf2 renderer may be slightly slower than the legacy hocrtransform\nrenderer for some workloads. This is an area of ongoing optimization.\n:::\n\nIn both renderers, a text-only layer is rendered and sandwiched (overlaid)\non to either the original PDF page, or newly rasterized version of the\noriginal PDF page (when `--mode force` is used). In this way, loss\nof PDF information is generally avoided. 
(You may need to disable PDF/A\nconversion and optimization to eliminate all lossy transformations.)\n\n### The `sandwich` renderer\n\nThe `sandwich` renderer uses Tesseract's text-only PDF feature,\nwhich produces a PDF page that lays out the OCR in invisible text.\n\nCurrently some problematic PDF viewers like Mozilla PDF.js and macOS\nPreview have problems with segmenting its text output, and\nmightrunseveralwordstogether. It also does not implement right to left\nfonts (Arabic, Hebrew, Persian). The output of this renderer cannot\nbe edited. The sandwich renderer is retained for testing.\n\nWhen image preprocessing features like `--deskew` are used, the\noriginal PDF will be rendered as a full page and the OCR layer will be\nplaced on top.\n\n### Legacy renderer options\n\nThe `hocr` and `hocrdebug` renderer options are deprecated and\nautomatically redirect to `fpdf2`. They will be removed in a future version.\n\n## Rendering and rasterizing options\n\n:::{versionadded} 14.3.0\n:::\n\nThe `--continue-on-soft-render-error` option allows OCRmyPDF to\nproceed if a page cannot be rasterized/rendered. This is useful if you are\ntrying to get the best possible OCR from a PDF that is not well-formed,\nand you are willing to accept some pages that may not visually match the\ninput, and that may not OCR well.\n\n## Color conversion strategy\n\n:::{versionadded} 15.0.0\n:::\n\nOCRmyPDF uses Ghostscript to convert PDF to PDF/A. In some cases, this\nconversion requires color conversion. The default strategy is to convert\nusing the `LeaveColorUnchanged` strategy, which preserves the original\ncolor space wherever possible (some rare color spaces might still be\nconverted).\n\nUsually document scanners produce PDFs in the sRGB color space, and do\nnot need to be converted, so the default strategy is appropriate.\n\nSuppose that you have a document that was prepared for professional\nprinting in a Separation or CMYK color space, and text was converted to\ncurves. 
In this case, you may want to use a different color conversion\nstrategy. The `--color-conversion-strategy` option allows you to select a\ndifferent strategy, such as `RGB`.\n\n## PDF/A output modes\n\n:::{versionchanged} 17.0.0\nThe default `--output-type` is now `auto` instead of `pdfa`.\n:::\n\nOCRmyPDF can produce PDF/A compliant output for long-term archival. The\n`--output-type` argument controls PDF/A conversion:\n\n| Output type | Behavior |\n|-------------|----------|\n| `auto` | Best-effort PDF/A without requiring Ghostscript (default) |\n| `pdfa` | PDF/A-2b via Ghostscript |\n| `pdfa-1` | PDF/A-1b via Ghostscript |\n| `pdfa-2` | PDF/A-2b via Ghostscript (same as `pdfa`) |\n| `pdfa-3` | PDF/A-3b via Ghostscript |\n| `pdf` | Standard PDF, no PDF/A conversion |\n| `none` | No output file (useful with `--sidecar`) |\n\n### Speculative PDF/A conversion\n\n:::{versionadded} 17.0.0\n:::\n\nWhen `--output-type auto` is used (the default), OCRmyPDF attempts a\nfast \"speculative\" PDF/A conversion that avoids Ghostscript when possible:\n\n1. OCRmyPDF adds an sRGB ICC profile and PDF/A XMP metadata using pikepdf\n2. If verapdf is available, it validates the result\n3. If validation passes, Ghostscript is skipped entirely\n4. 
If validation fails or verapdf is unavailable, falls back to Ghostscript\n\nThis approach is faster and avoids some Ghostscript limitations (such as\nimage transcoding), but only works for PDFs that are already \"mostly\"\nPDF/A compliant.\n\n### PDF/A conversion flow\n\nThe following diagram illustrates the PDF/A conversion decision tree:\n\n```{mermaid}\nflowchart TD\n    A[Start] --> B{--output-type?}\n    B -->|pdf| C[Output standard PDF]\n    B -->|pdfa/pdfa-N| D[Use Ghostscript]\n    B -->|auto| E[Attempt speculative conversion]\n\n    E --> F[\"Add sRGB ICC + XMP metadata (pikepdf)\"]\n    F --> G{verapdf available?}\n\n    G -->|No| H{Ghostscript available?}\n    G -->|Yes| I[Validate with verapdf]\n\n    I --> J{Validation passed?}\n    J -->|Yes| K[Output PDF/A - Ghostscript skipped]\n    J -->|No| H\n\n    H -->|Yes| D\n    H -->|No| L[Output standard PDF + WARNING]\n\n    D --> M[Ghostscript PDF/A conversion]\n    M --> N[Output PDF/A]\n\n    style K fill:#90EE90\n    style N fill:#90EE90\n    style L fill:#FFB6C1\n```\n\n:::{warning}\n**Breaking change:** If neither Ghostscript nor verapdf is installed,\n`--output-type auto` will produce a standard PDF instead of PDF/A.\nThis is a change from previous versions where Ghostscript was required\nand PDF/A was always produced.\n:::\n\n## Return code policy\n\nOCRmyPDF writes all messages to `stderr`. `stdout` is reserved for\npiping output files. `stdin` is reserved for piping input files.\n\nThe return codes generated by OCRmyPDF are considered part of the\nstable user interface. They may be imported from\n`ocrmypdf.exceptions`.\n\n```{eval-rst}\n.. 
list-table:: Return codes\n    :widths: 5 35 60\n    :header-rows: 1\n\n    *   - Code\n        - Name\n        - Interpretation\n    *   - 0\n        - ``ExitCode.ok``\n        - Everything worked as expected.\n    *   - 1\n        - ``ExitCode.bad_args``\n        - Invalid arguments, exited with an error.\n    *   - 2\n        - ``ExitCode.input_file``\n        - The input file does not seem to be a valid PDF.\n    *   - 3\n        - ``ExitCode.missing_dependency``\n        - An external program required by OCRmyPDF is missing.\n    *   - 4\n        - ``ExitCode.invalid_output_pdf``\n        - An output file was created, but it does not seem to be a valid PDF. The file will be available.\n    *   - 5\n        - ``ExitCode.file_access_error``\n        - The user running OCRmyPDF does not have sufficient permissions to read the input file and write the output file.\n    *   - 6\n        - ``ExitCode.already_done_ocr``\n        - The file already appears to contain text so it may not need OCR. See output message.\n    *   - 7\n        - ``ExitCode.child_process_error``\n        - An error occurred in an external program (child process) and OCRmyPDF cannot continue.\n    *   - 8\n        - ``ExitCode.encrypted_pdf``\n        - The input PDF is encrypted. OCRmyPDF does not read encrypted PDFs. Use another program such as ``qpdf`` to remove encryption.\n    *   - 9\n        - ``ExitCode.invalid_config``\n        - A custom configuration file was forwarded to Tesseract using ``--tesseract-config``, and Tesseract rejected this file.\n    *   - 10\n        - ``ExitCode.pdfa_conversion_failed``\n        - A valid PDF was created, but PDF/A conversion failed. 
The file will be available.\n    *   - 15\n        - ``ExitCode.other_error``\n        - Some other error occurred.\n    *   - 130\n        - ``ExitCode.ctrl_c``\n        - The program was interrupted by pressing Ctrl+C.\n\n```\n\n(tmpdir)=\n## Changing temporary storage location\n\nOCRmyPDF generates many temporary files during processing.\n\nTo change where temporary files are stored, change the `TMPDIR`\nenvironment variable for ocrmypdf's environment. (Python's\n`tempfile.gettempdir()` returns the root directory in which temporary\nfiles will be stored.) For example, one could redirect `TMPDIR` to a\nlarge RAM disk to avoid wear on HDD/SSD and potentially improve\nperformance.\n\nOn Windows, the `TEMP` environment variable is used instead.\n\n## Debugging the intermediate files\n\nOCRmyPDF normally saves its intermediate results to a temporary folder\nand deletes this folder when it exits, whether it succeeded or failed.\n\nIf the `--keep-temporary-files` (`-k`) argument is issued on the\ncommand line, OCRmyPDF will keep the temporary folder and print the location,\nwhether it succeeded or failed. An example message is:\n\n```none\nTemporary working files retained at:\n/tmp/ocrmypdf.io.u20wpz07\n```\n\nWhen OCRmyPDF is launched as a snap, this corresponds to the snap filesystem, for instance:\n\n> /tmp/snap-private-tmp/snap.ocrmypdf/tmp/ocrmypdf.io.u20wpz07\n\nThe organization of this folder is an implementation detail and subject\nto change between releases. However the general organization is that\nworking files on a per page basis have the page number as a prefix\n(starting with page 1), an infix indicates the processing stage, and a\nsuffix indicates the file type. 
Some important files include:\n\n- `_rasterize.png` - what the input page looks like\n- `_ocr.png` - the file that is sent to Tesseract for OCR; depending\n  on arguments this may differ from the presentation image\n- `_pp_deskew.png` - the image, after deskewing\n- `_pp_clean.png` - the image, after cleaning with unpaper\n- `_ocr_hocr.pdf` - the OCR file; appears as a blank page with invisible\n  text embedded\n- `_ocr_hocr.txt` - the OCR text (not necessarily all text on the page,\n  if the page is mixed format)\n- `fix_docinfo.pdf` - a temporary file created to fix the PDF DocumentInfo\n  data structure\n- `graft_layers.pdf` - the rendered PDF with OCR layers grafted on\n- `pdfa.pdf` - `graft_layers.pdf` after conversion to PDF/A\n- `pdfa.ps` - a PostScript file used by Ghostscript for PDF/A conversion\n- `optimize.pdf` - the PDF generated before optimization\n- `optimize.out.pdf` - the PDF generated by optimization\n- `origin` - the input file\n- `origin.pdf` - the input file or the input image converted to PDF\n- `images/*` - images extracted during the optimization process; here\n  the prefix indicates a PDF object ID not a page number\n"
  },
  {
    "path": "docs/api.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# Using the OCRmyPDF API\n\nOCRmyPDF originated as a command line program and continues to have this\nlegacy, but parts of it can be imported and used in other Python\napplications.\n\nSome applications may want to consider running ocrmypdf from a\nsubprocess call anyway, as this provides isolation of its activities.\n\n## Example\n\nOCRmyPDF provides one high-level function to run its main engine from an\napplication.\n\n```{versionchanged} 17.0\nThe {func}`ocrmypdf.ocr` function now accepts an {class}`~ocrmypdf.OcrOptions`\nobject as its first argument, providing a cleaner API with full type hints\nand validation. The previous positional argument style remains supported.\n```\n\n### Modern API (recommended)\n\nThe recommended way to call {func}`ocrmypdf.ocr` is to construct an\n{class}`~ocrmypdf.OcrOptions` object with all settings, then pass it\nas the sole argument:\n\n```python\nimport ocrmypdf\nfrom ocrmypdf import OcrOptions\n\nif __name__ == '__main__':  # To ensure correct behavior on Windows and macOS\n    options = OcrOptions(\n        input_file='input.pdf',\n        output_file='output.pdf',\n        deskew=True,\n        languages=['eng'],\n    )\n    ocrmypdf.ocr(options)\n```\n\n{class}`~ocrmypdf.OcrOptions` is a Pydantic model that provides:\n\n- Full type hints and IDE autocompletion\n- Validation of option values at construction time\n- Clear documentation of all available options\n\n```{versionadded} 17.0\nThe {class}`~ocrmypdf.OcrOptions` class is now exported from the top-level\n`ocrmypdf` module.\n```\n\n### Legacy API\n\nFor compatibility with OCRmyPDF < v17, the traditional calling style\nwith positional arguments is still fully supported:\n\n```python\nimport ocrmypdf\n\nif __name__ == '__main__':  # To ensure correct behavior on Windows and macOS\n    ocrmypdf.ocr('input.pdf', 'output.pdf', deskew=True)\n```\n\nWith this style, all 
of the command line arguments are available\nand may be passed as equivalent keywords.\n\nA few differences are that `verbose` and `quiet` are not available.\nInstead, output should be managed by configuring logging.\n\n### Parent process requirements\n\nThe {func}`ocrmypdf.ocr` function runs OCRmyPDF similar to command line\nexecution. To do this, it will:\n\n- create worker processes or threads\n- manage the signal flags of its worker processes\n- execute other subprocesses (forking and executing other programs)\n\nThe Python process that calls {func}`ocrmypdf.ocr()` must be sufficiently\nprivileged to perform these actions.\n\nThere currently is no option to manage how jobs are scheduled other\nthan the argument `jobs=` which will limit the number of worker\nprocesses.\n\nCreating a child process to call {func}`ocrmypdf.ocr()` is suggested. That\nway your application will survive and remain interactive even if\nOCRmyPDF fails for any reason. For example:\n\n```python\nfrom multiprocessing import Process\nimport ocrmypdf\nfrom ocrmypdf import OcrOptions\n\ndef ocrmypdf_process():\n    options = OcrOptions(input_file='input.pdf', output_file='output.pdf')\n    ocrmypdf.ocr(options)\n\ndef call_ocrmypdf_from_my_app():\n    p = Process(target=ocrmypdf_process)\n    p.start()\n    p.join()\n```\n\nPrograms that call {func}`ocrmypdf.ocr()` should also install a SIGBUS signal\nhandler (except on Windows), to raise an exception if access to a memory\nmapped file fails. OCRmyPDF may use memory mapping.\n\n{func}`ocrmypdf.ocr()` will take a threading lock to prevent multiple runs of itself\nin the same Python interpreter process. This is not thread-safe, because of how\nOCRmyPDF's plugins and Python's library import system work. If you need to parallelize\nOCRmyPDF, use processes.\n\n:::{warning}\nOn Windows and macOS, the script that calls {func}`ocrmypdf.ocr()` must be\nprotected by an \"ifmain\" guard (`if __name__ == '__main__'`). 
If you do\nnot do this, process semantics will prevent\nOCRmyPDF from working correctly.\n:::\n\n### Logging\n\nOCRmyPDF will log under loggers named `ocrmypdf`. In addition, it\nimports `pdfminer` and `PIL`, both of which post log messages under\nthose logging namespaces.\n\nYou can configure the logging as desired for your application or call\n{func}`ocrmypdf.configure_logging` to configure logging the same way\nOCRmyPDF itself does. The command line parameters such as `--quiet`\nand `--verbose` have no equivalents in the API; you must use the\nprovided configuration function or do configuration in a way that suits\nyour use case.\n\n### Progress monitoring\n\nOCRmyPDF uses the `rich` package to implement its progress bars.\n{func}`ocrmypdf.configure_logging` will set up logging output to\n`sys.stderr` in a way that is compatible with the display of the\nprogress bar. Use `ocrmypdf.ocr(...progress_bar=False)` to disable\nthe progress bar.\n\n### Standard output\n\nOCRmyPDF is strict about not writing to standard output so that\nusers can safely use it in a pipeline and produce a valid output\nfile. A caller application will have to ensure it does not write to\nstandard output either, if it wants to be compatible with this\nbehavior and support piping to a file. Another benefit of running\nOCRmyPDF in a child process, as recommended above, is that it will\nnot interfere with the parent process's standard output.\n\n### Exceptions\n\nOCRmyPDF may throw standard Python exceptions, `ocrmypdf.exceptions.*`\nexceptions, some exceptions related to multiprocessing, and\n{exc}`KeyboardInterrupt`. The parent process should provide an exception\nhandler. 
OCRmyPDF will clean up its temporary files and worker processes\nautomatically when an exception occurs.\n\nWhen OCRmyPDF succeeds conditionally, it returns an integer exit code.\n\n### Plugin Development Changes\n\n```{versionchanged} 16.13\nPlugin hooks now receive {class}`~ocrmypdf.OcrOptions` objects instead of\n`argparse.Namespace`.\n```\n\n- {class}`~ocrmypdf.OcrOptions` provides the same attribute access as `Namespace` (duck-typing compatible)\n- Plugin developers should update type hints: `from ocrmypdf import OcrOptions`\n- Built-in plugins no longer modify options in-place for better immutability\n\nMost existing plugins will continue working without modification due to the\nduck-typing compatibility between {class}`~ocrmypdf.OcrOptions` and `Namespace`.\n"
  },
  {
    "path": "docs/apiref.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# API reference\n\nThis page summarizes the rest of the public API. Generally speaking this\nshould be mainly of interest to plugin developers.\n\n## ocrmypdf.api\n\n```{eval-rst}\n.. automodule:: ocrmypdf.api\n    :members:\n```\n\n## ocrmypdf._options\n\n```{eval-rst}\n.. automodule:: ocrmypdf._options\n    :members: OcrOptions\n```\n\n## ocrmypdf.exceptions\n\n```{eval-rst}\n.. automodule:: ocrmypdf.exceptions\n    :members:\n    :undoc-members:\n```\n\n## ocrmypdf.helpers\n\n```{eval-rst}\n.. automodule:: ocrmypdf.helpers\n    :members:\n```\n\n## ocrmypdf.hocrtransform\n\n```{eval-rst}\n.. automodule:: ocrmypdf.hocrtransform\n    :members:\n```\n\n## ocrmypdf.pdfa\n\n```{eval-rst}\n.. automodule:: ocrmypdf.pdfa\n    :members:\n```\n\n## ocrmypdf.quality\n\n```{eval-rst}\n.. automodule:: ocrmypdf.quality\n    :members:\n```\n\n## ocrmypdf.subprocess\n\n```{eval-rst}\n.. automodule:: ocrmypdf.subprocess\n    :members:\n```\n"
  },
  {
    "path": "docs/batch.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\nBatch processing\n================\n\nThis article provides information about running OCRmyPDF on multiple\nfiles or configuring it as a service triggered by file system events.\n\nBatch jobs\n----------\n\nConsider using the excellent [GNU\nParallel](https://www.gnu.org/software/parallel/) to apply OCRmyPDF to\nmultiple files at once.\n\nBoth `parallel` and `ocrmypdf` will try to use all available processors.\nTo maximize parallelism without overloading your system with processes,\nconsider using `parallel -j 2` to limit parallel to running two jobs at\nonce.\n\nThis command will run `ocrmypdf` on all files named `*.pdf` in the\ncurrent directory and write them to the previously created `output/`\nfolder. It will not search subdirectories.\n\nThe `--tag` argument tells parallel to print the filename as a prefix\nwhenever a message is printed, so that one can trace any errors to the\nfile that produced them.\n\n:::{code} bash\nparallel --tag -j 2 ocrmypdf '{}' 'output/{}' ::: *.pdf\n:::\n\nOCRmyPDF automatically repairs PDFs before parsing and gathering\ninformation from them.\n\nDirectory trees\n---------------\n\nThis will walk through a directory tree and run OCR on all files in\nplace, and printing each filename in between runs:\n\n:::{code} bash\nfind . -name '*.pdf' -printf '%p\\n' -exec ocrmypdf '{}' '{}' \\;\n:::\n\nThis only runs one `ocrmypdf` process at a time. This variation uses\n`find` to create a directory list and `parallel` to parallelize runs of\n`ocrmypdf`, again updating files in place.\n\n:::{code} bash\nfind . -name '*.pdf' | parallel --tag -j 2 ocrmypdf '{}' '{}'\n:::\n\nIn a Windows batch file, use\n\n:::{code} bat\nfor /r %%f in (*.pdf) do ocrmypdf %%f %%f\n:::\n\nWith a Docker container, you will need to stream through standard input\nand output:\n\n:::{code} bash\nfind . 
-name '*.pdf' -print0 | xargs -0 | while read pdf; do\n    pdfout=$(mktemp)\n    docker run --rm -i jbarlow83/ocrmypdf - - <$pdf >$pdfout && cp $pdfout $pdf\ndone\n:::\n\n### Sample script\n\nThis user contributed script also provides an example of batch\nprocessing.\n\n:::{literalinclude} ../misc/batch.py\n---\ncaption: misc/batch.py\n---\n:::\n\n### Synology DiskStations\n\nSynology DiskStations (Network Attached Storage devices) can run the\nDocker image of OCRmyPDF if the Synology [Docker\npackage](https://www.synology.com/en-global/dsm/packages/Docker) is\ninstalled. Attached is a script to address particular quirks of using\nOCRmyPDF on one of these devices.\n\nAt the time this script was written, it only worked for x86-based\nSynology products. It is not known if it will work on ARM-based Synology\nproducts. Further adjustments might be needed to deal with the\nSynology\\'s relatively limited CPU and RAM.\n\n:::{literalinclude} ../misc/synology.py\n---\ncaption: misc/synology.py - Sample script for Synology DiskStations\n---\n:::\n\n### Huge batch jobs\n\nIf you have thousands of files to work with, contact the author.\nConsulting work related to OCRmyPDF helps fund this open source project\nand all inquiries are appreciated.\n\nHot (watched) folders\n---------------------\n\n### Watched folders with watcher.py\n\nOCRmyPDF has a folder watcher called watcher.py, which is currently\nincluded in source distributions but not part of the main program. It\nmay be used natively or may run in a Docker container. Native instances\ntend to give better performance. 
watcher.py works on all platforms.\n\nUsers may need to customize the script to meet their requirements.\n\n:::{code} bash\n# Using uv (recommended)\nuv sync --extra watcher\n\n# Or using pip\npip3 install ocrmypdf[watcher]\n\nenv OCR_INPUT_DIRECTORY=/mnt/input-pdfs \\\n    OCR_OUTPUT_DIRECTORY=/mnt/output-pdfs \\\n    OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \\\n    python3 watcher.py\n:::\n\n:::{list-table} watcher.py environment variables\n---\nheader-rows: 1\n---\n\n* - Environment variable\n  - Description\n* - OCR\\_INPUT\\_DIRECTORY\n  - Set input directory to monitor (recursive)\n* - OCR\\_OUTPUT\\_DIRECTORY\n  - Set output directory (should not be under input)\n* - OCR\\_ARCHIVE\\_DIRECTORY\n  - Set archive directory for processed originals (should not be under input, requires `OCR_ON_SUCCESS_ARCHIVE` to be set)\n* - OCR\\_ON\\_SUCCESS\\_DELETE\n  - This will delete the input file if the exit code is 0 (OK)\n* - OCR\\_ON\\_SUCCESS\\_ARCHIVE\n  - This will move the processed original file to `OCR_ARCHIVE_DIRECTORY` if the exit code is 0 (OK). Note that `OCR_ON_SUCCESS_DELETE` takes precedence over this option, i.e. if both options are set, the input file will be deleted.\n* - OCR\\_OUTPUT\\_DIRECTORY\\_YEAR\\_MONTH\n  - This will place files in the output in `{output}/{year}/{month}/{filename}`\n* - OCR\\_DESKEW\n  - Apply deskew to crooked input PDFs\n* - OCR\\_JSON\\_SETTINGS\n  - A JSON string specifying any other arguments for `ocrmypdf.ocr`, e.g. `'OCR_JSON_SETTINGS={\"rotate_pages\": true, \"optimize\": \"3\"}'`.\n* - OCR\\_POLL\\_NEW\\_FILE\\_SECONDS\n  - Polling interval\n* - OCR\\_LOGLEVEL\n  - Level of log messages to display\n:::\n\nOne could configure a networked scanner or scanning computer to drop\nfiles in the watched folder.\n\n### Watched folders with Docker\n\nThe watcher service is included in the OCRmyPDF Docker image. 
To run it:\n\n:::{code} bash\ndocker run \\\n    --volume <path to files to convert>:/input \\\n    --volume <path to store results>:/output \\\n    --volume <path to store processed originals>:/processed \\\n    --env OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \\\n    --env OCR_ON_SUCCESS_ARCHIVE=1 \\\n    --env OCR_DESKEW=1 \\\n    --env PYTHONUNBUFFERED=1 \\\n    --interactive --tty --entrypoint python3 \\\n    jbarlow83/ocrmypdf \\\n    watcher.py\n:::\n\nThis service will watch for a file that matches `/input/\\*.pdf`, convert\nit to an OCRed PDF in `/output/`, and move the processed original to\n`/processed`. The parameters to this image are:\n\n:::{list-table} Watcher Docker Parameters\n:header-rows: 1\n\n* - Parameter\n  - Description\n* - `--volume <path to files to convert>:/input`\n  - Files placed in this location will be OCRed\n* - `--volume <path to store results>:/output`\n  - This is where OCRed files will be stored\n* - `--volume <path to store processed originals>:/processed`\n  - Archive processed originals here\n* - `--env OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1`\n  - Define environment variable `OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1` to place files in the output in `{output}/{year}/{month}/{filename}`\n* - `--env OCR_ON_SUCCESS_ARCHIVE=1`\n  - Define environment variable `OCR_ON_SUCCESS_ARCHIVE` to move processed originals\n* - `--env OCR_DESKEW=1`\n  - Define environment variable `OCR_DESKEW` to apply deskew to crooked input PDFs\n* - `--env PYTHONUNBUFFERED=1`\n  - This will force `STDOUT` to be unbuffered and allow you to see messages in docker logs\n* - `--env OCR_LOGLEVEL='DEBUG'`\n  - Level of log messages\n* - `--env OCR_JSON_SETTINGS={\"language\":\"deu+eng\", \"rotate_pages\": true}`\n  - A JSON string specifying any other arguments for `ocrmypdf.ocr`\n:::\n\nThis service relies on polling to check for changes to the filesystem.\nIt may not be suitable for some environments, such as filesystems shared\non a slow network.\n\nA configuration manager such as 
Docker Compose could be used to ensure\nthat the service is always available.\n\n:::{literalinclude} ../misc/docker-compose.example.yml\n---\ncaption: misc/docker-compose.example.yml\n---\n:::\n\n### Caveats\n\n-   `watchmedo` may not work properly on a networked file system,\n    depending on the capabilities of the file system client and server.\n-   This simple recipe does not filter for the type of file system\n    event, so file copies, deletes and moves, and directory operations,\n    will all be sent to ocrmypdf, producing errors in several cases.\n    Disable your watched folder if you are doing anything other than\n    copying files to it.\n-   If the source and destination directory are the same, watchmedo may\n    create an infinite loop.\n-   On BSD, FreeBSD and older versions of macOS, you may need to\n    increase the number of file descriptors to monitor more files, using\n    `ulimit -n 1024` to watch a folder of up to 1024 files.\n\n### Alternatives\n\n-   On Linux, [systemd user\n    services](https://wiki.archlinux.org/index.php/Systemd/User) can be\n    configured to automatically perform OCR on a collection of files.\n-   [Watchman](https://facebook.github.io/watchman/) is a more powerful\n    alternative to `watchmedo`.\n\nmacOS Automator\n---------------\n\nYou can use the Automator app with macOS, to create a Workflow or Quick\nAction. Use a *Run Shell Script* action in your workflow. In the context\nof Automator, the `PATH` may be set differently from your Terminal\\'s `PATH`;\nyou may need to explicitly set the PATH to include `ocrmypdf`. The\nfollowing example may serve as a starting point:\n\n![](images/macos-workflow.png)\n\nYou may customize the command sent to ocrmypdf.\n"
  },
  {
    "path": "docs/cloud.md",
    "content": "% SPDX-FileCopyrightText: 2025 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n(ocr-service)=\n\n# Online deployments\n\nOCRmyPDF is designed to be used as a command line tool, but it can be\nused in a web service. This document describes some considerations for\ndoing so.\n\nA basic web service implementation is provided in the source code\nrepository, as `misc/webservice.py`. It is only demonstration quality\nand is not intended for production use.\n\nOCRmyPDF is not designed for use as a public web service where a\nmalicious user could upload a chosen PDF. In particular, it is not\nnecessarily secure against PDF malware or PDFs that cause denial of\nservice. For further discussion of security, see\n[security](security).\n\nOCRmyPDF relies on Ghostscript, and therefore, if deployed online one\nshould be prepared to comply with Ghostscript\\'s Affero GPL license, and\nany other licenses.\n\nSetting aside these concerns, a side effect of OCRmyPDF is that it may\nincidentally sanitize PDFs containing certain types of malware. It\nrepairs the PDF with pikepdf/libqpdf, which could correct malformed PDF\nstructures that are part of an attack. When PDF/A output is selected\n(the default), the input PDF is partially reconstructed by Ghostscript.\nWhen `--force-ocr` is used, all pages are rasterized and reconverted to\nPDF, which could remove malware in embedded images.\n\n## Limiting CPU usage\n\nOCRmyPDF will attempt to use all available CPUs and storage, so\nexecuting `nice ocrmypdf` or limiting the number of jobs with the\n`--jobs` argument may ensure the server remains responsive. 
Another\noption would be to run OCRmyPDF jobs inside a Docker container, a\nvirtual machine, or a cloud instance, which can impose its own limits on\nCPU usage and be terminated \\\"from orbit\\\" if it fails to complete.\n\n## Temporary storage requirements\n\nOCRmyPDF will use a large amount of temporary storage for its work,\nproportional to the total number of pixels needed to rasterize the PDF.\nThe raster image of an 8.5×11\\\" color page at 300 DPI takes 25 MB\nuncompressed; OCRmyPDF saves its intermediates as PNG, but that still\nmeans it requires about 9 MB per intermediate based on average\ncompression ratios. Multiple intermediates per page are also required,\ndepending on the command line given. A rule of thumb would be to allow\n100 MB of temporary storage per page in a file -- meaning that small\ncloud servers or small VM partitions should be provisioned with plenty\nof extra space, if say, a 500 page file might be sent.\n\nTo change the temporary directory, see [tmpdir](#tmpdir).\n\nOn Amazon Web Services or other cloud vendors, consider setting your\ntemporary directory to [ephemeral\nstorage](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html).\n\n## Timeouts\n\nTo prevent excessively long OCR jobs consider setting\n`--tesseract-timeout` and/or `--skip-big` arguments. 
`--skip-big` is\nparticularly helpful if your PDFs include documents such as reports on\nstandard page sizes with large images attached - often large images are\nnot worth OCR\\'ing anyway.\n\n## Document management systems\n\nIf you are looking for a full document management system, consider\n[paperless-ngx](https://github.com/paperless-ngx/paperless-ngx), which\nis a web application that uses OCRmyPDF to automatically OCR and archive\ndocuments.\n\n## Commercial OCR alternatives\n\nThe author also provides professional services that include OCR and\nbuilding databases around PDFs, and is happy to provide consultation.\n\nAbbyy Cloud OCR is a viable commercial alternative with a web services\nAPI. Amazon Textract, Google Cloud Vision, and Microsoft Azure Computer\nVision provide advanced OCR but have less PDF rendering capability.\n"
  },
  {
    "path": "docs/conf.py",
    "content": "#!/usr/bin/env python3\n# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: CC-BY-SA-4.0\n\n# ruff: noqa: E402\n\n# ocrmypdf documentation build configuration file, created by\n# sphinx-quickstart on Sun Sep  4 14:29:43 2016.\n#\n# This file is execfile()d with the current directory set to its\n# containing dir.\n#\n# Note that not all possible configuration values are present in this\n# autogenerated file.\n#\n# All configuration values have a default; values that are commented out\n# serve to show the default.\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n#\n# import os\n# import sys\n# sys.path.insert(0, os.path.abspath('.'))\n\n# -- General configuration ------------------------------------------------\nfrom __future__ import annotations\n\nneeds_sphinx = '8'\n\nimport datetime as dt\n\n# Add any Sphinx extension module names here, as strings. 
They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\nextensions = [\n    'myst_parser',\n    'sphinx.ext.autodoc',\n    'sphinx.ext.intersphinx',\n    'sphinx.ext.autosummary',\n    'sphinx.ext.napoleon',\n    'sphinx.ext.imgconverter',  # PDF docs needs this for SVG to PNG conversion\n    'sphinx_issues',\n    'sphinx_reredirects',\n    'sphinxcontrib.mermaid',\n]\n\nmyst_enable_extensions = ['colon_fence', 'attrs_block', 'attrs_inline', 'substitution']\n\n# Extension settings\nintersphinx_mapping = {'python': ('https://docs.python.org/3', None)}\nnapoleon_use_rtype = False\nissues_github_path = \"ocrmypdf/OCRmyPDF\"\nredirects = {\n    \"release_notes\": \"releasenotes/index.html\",\n}\n\n# Add any paths that contain templates here, relative to this directory.\ntemplates_path = ['_templates']\n\n# The suffix(es) of source filenames.\nsource_suffix = {'.rst': 'restructuredtext', '.md': 'markdown', '.txt': 'markdown'}\n\n# The master toctree document.\nmaster_doc = 'index'\n\n# General information about the project.\nproject = 'ocrmypdf'\n\nyear = str(dt.date.today().year)\ncopyright = (\n    f'{year}, James R. Barlow. '\n    + 'Licensed under Creative Commons Attribution-ShareAlike 4.0'\n)\nauthor = 'James R. 
Barlow'\n\n# The version info for the project you're documenting, acts as replacement for\n# |version| and |release|, also used in various other places throughout the\n# built documents.\n#\n# The short X.Y version.\n\nimport os\nfrom importlib.metadata import version as package_version\n\non_rtd = os.environ.get('READTHEDOCS') == 'True'\n\nif on_rtd:\n    # Help ReadTheDocs avoid having to install any binary extension modules\n    import sys\n    from unittest.mock import MagicMock\n\n    class Mock(MagicMock):\n        @classmethod\n        def __getattr__(cls, name):\n            return MagicMock()\n\n    MOCK_MODULES = [\n        'pikepdf',\n        'pikepdf.canvas',\n        'pikepdf.models',\n        'pikepdf.models.metadata',\n    ]\n    sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)\n\n\n# The full version, including alpha/beta/rc tags.\nrelease = package_version('ocrmypdf')\nversion = '.'.join(release.split('.')[:2])\n\n\n# The language for content autogenerated by Sphinx. Refer to documentation\n# for a list of supported languages.\n#\n# This is also used if you do content translation via gettext catalogs.\n# Usually you set \"language\" from the command line for these cases.\nlanguage = 'en'\n\n# There are two options for replacing |today|: either, you set today to some\n# non-false value, then it is used:\n#\n# today = ''\n#\n# Else, today_fmt is used as the format for a strftime call.\n#\ntoday_fmt = '%Y-%m-%d'\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\n# This patterns also effect to html_static_path and html_extra_path\nexclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']\n\n# The reST default role (used for this markup: `text`) to use for all\n# documents.\n#\n# default_role = None\n\n# If true, '()' will be appended to :func: etc. 
cross-reference text.\n#\n# add_function_parentheses = True\n\n# If true, the current module name will be prepended to all description\n# unit titles (such as .. function::).\n#\n# add_module_names = True\n\n# If true, sectionauthor and moduleauthor directives will be shown in the\n# output. They are ignored by default.\n#\n# show_authors = False\n\n# The name of the Pygments (syntax highlighting) style to use.\npygments_style = 'sphinx'\n\n# A list of ignored prefixes for module index sorting.\n# modindex_common_prefix = []\n\n# If true, keep warnings as \"system message\" paragraphs in the built documents.\n# keep_warnings = False\n\n# If true, `todo` and `todoList` produce output, else they produce nothing.\ntodo_include_todos = False\n\n\n# -- Options for HTML output ----------------------------------------------\n\nimport sphinx_rtd_theme  # noqa: F401\n\n# The theme to use for HTML and HTML Help pages.  See the documentation for\n# a list of builtin themes.\n#\nhtml_theme = 'sphinx_rtd_theme'\n\n# Theme options are theme-specific and customize the look and feel of a theme\n# further.  For a list of options available for each theme, see the\n# documentation.\n#\nhtml_theme_options = {}\n\n# Add any paths that contain custom themes here, relative to this directory.\n# html_theme_path = []\n\n# The name for this set of Sphinx documents.\n# \"<project> v<release> documentation\" by default.\n#\n# html_title = 'ocrmypdf v4.2'\n\n# A shorter title for the navigation bar.  Default is the same as html_title.\n#\n# html_short_title = None\n\n# The name of an image file (relative to this directory) to place at the top\n# of the sidebar.\n#\n# html_logo = \"images/logo.svg\"  # looks bad\n\n# The name of an image file (relative to this directory) to use as a favicon of\n# the docs.  
This file should be a Windows icon file (.ico) being 16x16 or 32x32\n# pixels large.\n#\n# html_favicon = None\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\n# html_static_path = ['_static']\n\n# Add any extra paths that contain custom files (such as robots.txt or\n# .htaccess) here, relative to this directory. These files are copied\n# directly to the root of the documentation.\n#\n# html_extra_path = []\n\n# If not None, a 'Last updated on:' timestamp is inserted at every page\n# bottom, using the given strftime format.\n# The empty string is equivalent to '%b %d, %Y'.\n#\n# html_last_updated_fmt = None\n\n# If true, SmartyPants will be used to convert quotes and dashes to\n# typographically correct entities.\n#\n# html_use_smartypants = True\n\n# Custom sidebar templates, maps document names to template names.\n#\n# html_sidebars = {}\n\n# Additional templates that should be rendered to pages, maps page names to\n# template names.\n#\n# html_additional_pages = {}\n\n# If false, no module index is generated.\n#\n# html_domain_indices = True\n\n# If false, no index is generated.\n#\n# html_use_index = True\n\n# If true, the index is split into individual pages for each letter.\n#\n# html_split_index = False\n\n# If true, links to the reST sources are added to the pages.\n#\n# html_show_sourcelink = True\n\n# If true, \"Created using Sphinx\" is shown in the HTML footer. Default is True.\n#\n# html_show_sphinx = True\n\n# If true, \"(C) Copyright ...\" is shown in the HTML footer. Default is True.\n#\n# html_show_copyright = True\n\n# If true, an OpenSearch description file will be output, and all pages will\n# contain a <link> tag referring to it.  
The value of this option must be the\n# base URL from which the finished HTML is served.\n#\n# html_use_opensearch = ''\n\n# This is the file name suffix for HTML files (e.g. \".xhtml\").\n# html_file_suffix = None\n\n# Language to be used for generating the HTML full-text search index.\n# Sphinx supports the following languages:\n#   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'\n#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'\n#\n# html_search_language = 'en'\n\n# A dictionary with options for the search language support, empty by default.\n# 'ja' uses this config value.\n# 'zh' user can custom change `jieba` dictionary path.\n#\n# html_search_options = {'type': 'default'}\n\n# The name of a javascript file (relative to the configuration directory) that\n# implements a search results scorer. If empty, the default will be used.\n#\n# html_search_scorer = 'scorer.js'\n\n# Output file base name for HTML help builder.\nhtmlhelp_basename = 'ocrmypdfdoc'\n\n# -- Options for LaTeX output ---------------------------------------------\n\nlatex_elements = {  # type: ignore\n    # The paper size ('letterpaper' or 'a4paper').\n    #\n    # 'papersize': 'letterpaper',\n    # The font size ('10pt', '11pt' or '12pt').\n    #\n    # 'pointsize': '10pt',\n    # Additional stuff for the LaTeX preamble.\n    #\n    # 'preamble': '',\n    # Latex figure (float) alignment\n    #\n    # 'figure_align': 'htbp',\n}\n\n# Grouping the document tree into LaTeX files. List of tuples\n# (source start file, target name, title,\n#  author, documentclass [howto, manual, or own class]).\nlatex_documents = [\n    (master_doc, 'ocrmypdf.tex', 'ocrmypdf Documentation', 'James R. 
Barlow', 'manual')\n]\n\n# The name of an image file (relative to this directory) to place at the top of\n# the title page.\n#\n# latex_logo = None\n\n# For \"manual\" documents, if this is true, then toplevel headings are parts,\n# not chapters.\n#\n# latex_use_parts = False\n\n# If true, show page references after internal links.\n#\n# latex_show_pagerefs = False\n\n# If true, show URL addresses after external links.\n#\n# latex_show_urls = False\n\n# Documents to append as an appendix to all manuals.\n#\n# latex_appendices = []\n\n# If false, will not define \\strong, \\code, \\titleref, \\crossref ... but only\n# \\sphinxstrong, ..., \\sphinxtitleref, ... To help avoid clash with user added\n# packages.\n#\n# latex_keep_old_macro_names = True\n\n# If false, no module index is generated.\n#\n# latex_domain_indices = True\n\n\n# -- Options for manual page output ---------------------------------------\n\n# One entry per manual page. List of tuples\n# (source start file, name, description, authors, manual section).\nman_pages = [(master_doc, 'ocrmypdf', 'ocrmypdf Documentation', [author], 1)]\n\n# If true, show URL addresses after external links.\n#\n# man_show_urls = False\n\n\n# -- Options for Texinfo output -------------------------------------------\n\n# Grouping the document tree into Texinfo files. 
List of tuples\n# (source start file, target name, title, author,\n#  dir menu entry, description, category)\ntexinfo_documents = [\n    (\n        master_doc,\n        'ocrmypdf',\n        'ocrmypdf Documentation',\n        author,\n        'ocrmypdf',\n        'One line description of project.',\n        'Miscellaneous',\n    )\n]\n\n# Documents to append as an appendix to all manuals.\n#\n# texinfo_appendices = []\n\n# If false, no module index is generated.\n#\n# texinfo_domain_indices = True\n\n# How to display URL addresses: 'footnote', 'no', or 'inline'.\n#\n# texinfo_show_urls = 'footnote'\n\n# If true, do not generate a @detailmenu in the \"Top\" node's menu.\n#\n# texinfo_no_detailmenu = False\n"
  },
  {
    "path": "docs/contributing.md",
    "content": "% SPDX-FileCopyrightText: 2025 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# Contributing guidelines\n\nContributions are welcome!\n\n## Big changes\n\nPlease open a new issue to discuss or propose a major change. Not only\nis it fun to discuss big ideas, but we might save each other\\'s time\ntoo. Perhaps some of the work you\\'re contemplating is already half-done\nin a development branch.\n\n## Code style\n\nWe use `ruff` for code formatting.\nThe settings for these programs are in `pyproject.toml`. Pull requests\nshould follow the style guide. One difference we use from \\\"black\\\"\nstyle is that strings shown to the user are always in double quotes\n(`\"`) and strings for internal uses are in single quotes (`'`).\n\n## Tests\n\nNew features should come with tests that confirm their correctness.\n\n## New dependencies\n\nIf you are proposing a change that will require a new dependency, we\nprefer dependencies that are already packaged by Debian or Red Hat. This\nmakes life much easier for our downstream package maintainers. A package\nthat is only available on PyPI or GitHub, and not more widely packaged,\nmay not be accepted.\n\nWe are unlikely to accept a dependency on CUDA or other GPU-based\nlibraries, because these are still difficult to package and install on\nmany systems. We recommend implementing these changes as plugins.\n\nPython dependencies must also be license-compatible. GPLv3 or AGPLv3 are\nlikely incompatible with the project\\'s license, but LGPLv3 is\ncompatible.\n\n## New non-Python dependencies\n\nOCRmyPDF uses several external programs (Tesseract, Ghostscript and\nothers) for its functionality. In general we prefer to avoid adding new\nexternal programs, and if we are to add external programs, we prefer\nthose that are already packaged by Debian or Red Hat.\n\n## Plugins\n\nSome new features may be a good fit for a plugin. 
Plugins are a way to\nadd features to OCRmyPDF without adding them to the core program.\nPlugins are installed separately from OCRmyPDF. They are written in\nPython and can be installed from PyPI. See the [plugin\ndocumentation](https://ocrmypdf.readthedocs.io/en/latest/plugins.html).\n\nWe are happy to link users to your plugin from the documentation.\n\n## Style guide: Is it OCRmyPDF or ocrmypdf?\n\nThe program/project is OCRmyPDF and the name of the executable or\nlibrary is ocrmypdf.\n\n## Copyright and license\n\nFor contributions over 10 lines of code, please add your name to the list of\ncopyright holders for that file. The core program is licensed under\nMPL-2.0, test files and documentation under CC-BY-SA 4.0, and\nmiscellaneous files under MIT, with a few minor exceptions. Please\ncontribute only content that you own or have the right to contribute\nunder these licenses.\n"
  },
  {
    "path": "docs/cookbook.md",
    "content": "% SPDX-FileCopyrightText: 2025 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# Cookbook\n\n## Basic examples\n\n### Help!\n\nocrmypdf has built-in help.\n\n```bash\nocrmypdf --help\n```\n\n### Add an OCR layer and convert to PDF/A\n\n```bash\nocrmypdf input.pdf output.pdf\n```\n\n### Add an OCR layer and output a standard PDF\n\n```bash\nocrmypdf --output-type pdf input.pdf output.pdf\n```\n\n### Create a PDF/A with all color and grayscale images converted to JPEG\n\n```bash\nocrmypdf --output-type pdfa --pdfa-image-compression jpeg input.pdf output.pdf\n```\n\n### Modify a file in place\n\nThe file will only be overwritten if OCRmyPDF is successful.\n\n```bash\nocrmypdf myfile.pdf myfile.pdf\n```\n\n### Correct page rotation\n\nOCR will attempt to automatically correct the rotation of each page. This\ncan help fix a scanning job that contains a mix of landscape and\nportrait pages.\n\n```bash\nocrmypdf --rotate-pages myfile.pdf myfile.pdf\n```\n\nYou can decrease (increase) the parameter `--rotate-pages-threshold` to\nmake page rotation more (less) aggressive. The threshold number is the\nratio of how confident the OCR engine is that the document image should\nbe changed, compared to being kept the same. The default value is quite\nconservative; on some files it may not attempt rotations at all unless\nit is very confident that the current rotation is wrong. A lower value\nof `2.0` will produce more rotations, and more false positives. Run with\n`-v1` to see the confidence level for each page to see if there may be a\nbetter value for your files.\n\nIf the page is \\\"just a little off horizontal\\\", like a crooked picture,\nthen you want `--deskew`. `--rotate-pages` is for when the cardinal\nangle is wrong.\n\n### OCR languages other than English\n\nOCRmyPDF assumes the document is in English unless told otherwise. 
OCR\nquality may be poor if the wrong language is used.\n\n```bash\nocrmypdf -l fra LeParisien.pdf LeParisien.pdf\nocrmypdf -l eng+fra Bilingual-English-French.pdf Bilingual-English-French.pdf\n```\n\nLanguage packs must be installed for all languages specified. See\n`Installing additional language packs <lang-packs>`{.interpreted-text\nrole=\"ref\"}.\n\nUnfortunately, the Tesseract OCR engine has no ability to detect the\nlanguage when it is unknown.\n\n### Produce PDF and text file containing OCR text\n\nThis produces a file named \\\"output.pdf\\\" and a companion text file\nnamed \\\"output.txt\\\".\n\n```bash\nocrmypdf --sidecar output.txt input.pdf output.pdf\n```\n\n:::{note}\nThe sidecar file contains the **OCR text** found by OCRmyPDF. If the\ndocument contains pages that already have text, that text will not\nappear in the sidecar. If the option `--pages` is used, only those pages\non which OCR was performed will be included in the sidecar. If certain\npages were skipped because of options like `--skip-big` or\n`--tesseract-timeout`, those pages will not be in the sidecar.\n\nIf you don\\'t want to generate the output PDF, use `--output-type=none`\nto avoid generating one. Set the output filename to `-` (i.e. redirect\nto stdout).\n\nTo extract all text from a PDF, whether generated from OCR or otherwise,\nuse a program like Poppler\\'s `pdftotext` or `pdfgrep`.\n:::\n\n### OCR images, not PDFs\n\n#### Option: use Tesseract\n\nIf you are starting with images, you can just use Tesseract directly to\nconvert images to PDFs:\n\n```bash\ntesseract my-image.jpg output-prefix pdf\n```\n\n```bash\n# When there are multiple images\ntesseract text-file-containing-list-of-image-filenames.txt output-prefix pdf\n```\n\nTesseract\\'s PDF output is quite good -- OCRmyPDF uses it internally, in\nsome cases. 
However, OCRmyPDF has many features not available in\nTesseract like image processing, metadata control, and PDF/A generation.\n\n#### Option: use img2pdf\n\nYou can also use a program like\n[img2pdf](https://gitlab.mister-muffin.de/josch/img2pdf) to convert your\nimages to PDFs, and then pipe the results to run ocrmypdf. The `-` tells\nocrmypdf to read standard input.\n\n```bash\nimg2pdf my-images*.jpg | ocrmypdf - myfile.pdf\n```\n\n`img2pdf` is recommended because it does an excellent job at generating\nPDFs without transcoding images.\n\n#### Option: use OCRmyPDF (single images only)\n\nFor convenience, OCRmyPDF can also convert single images to PDFs on its\nown. If the resolution (dots per inch, DPI) of an image is not set or is\nincorrect, it can be overridden with `--image-dpi`. (As 1 inch is 2.54\ncm, 1 dpi = 0.39 dpcm).\n\n```bash\nocrmypdf --image-dpi 300 image.png myfile.pdf\n```\n\nIf you have multiple images, you must use `img2pdf` to convert the\nimages to PDF.\n\n#### Not recommended\n\nWe caution against using ImageMagick or Ghostscript to convert images to\nPDF, since they may transcode images or produce downsampled images,\nsometimes without warning.\n\n(image-processing)=\n\n## Image processing\n\nOCRmyPDF performs some image processing on each page of a PDF, if\ndesired. The same processing is applied to each page. It is suggested\nthat the user review files after image processing as these commands\nmight remove desirable content, especially from poor quality scans.\n\n-   `--rotate-pages` attempts to determine the correct orientation for\n    each page and rotates the page if necessary.\n-   `--remove-background` attempts to detect and remove a noisy\n    background from grayscale or color images. Monochrome images are\n    ignored. 
This should not be used on documents that contain color\n    photos as it may remove them.\n-   `--deskew` will correct pages that were scanned at a skewed angle by\n    rotating them back into place.\n-   `--clean` uses [unpaper](https://www.flameeyes.eu/projects/unpaper)\n    to clean up pages before OCR, but does not alter the final output.\n    This makes it less likely that OCR will try to find text in\n    background noise.\n-   `--clean-final` uses unpaper to clean up pages before OCR and\n    inserts the page into the final output. You will want to review each\n    page to ensure that unpaper did not remove something important.\n\n:::{note}\nIn many cases image processing will rasterize PDF pages as images,\npotentially losing quality.\n:::\n\n:::{warning}\n`--clean-final` and `--remove-background` may leave undesirable visual\nartifacts in some images where their algorithms have shortcomings. Files\nshould be visually reviewed after using these options.\n:::\n\n### Example: OCR and correct document skew (crooked scan)\n\nDeskew:\n\n```bash\nocrmypdf --deskew input.pdf output.pdf\n```\n\nImage processing commands can be combined. The order in which options\nare given does not matter. OCRmyPDF always applies the steps of the\nimage processing pipeline in the same order (rotate, remove background,\ndeskew, clean).\n\n```bash\nocrmypdf --deskew --clean --rotate-pages input.pdf output.pdf\n```\n\nDon\\'t actually OCR my PDF\n--------------------------\n\nIf you set `--ocr-engine none` OCRmyPDF will apply its image processing without\nperforming OCR. This works if all you want is to apply image processing or PDF/A\nconversion.\n\n```bash\nocrmypdf --ocr-engine none --deskew --output-type pdfa input.pdf output.pdf\n```\n\n:::{versionchanged} v17.0.0\n\nPrior to this version, `--tesseract-timeout 0` was recommended as an idiom\nto turn off OCR. 
This is no longer recommended, as we move away from\nTesseract OCR as the primary OCR engine.\n\n:::\n\n:::{versionchanged} v14.1.0\n\nPrior to this version, `--tesseract-timeout 0` would prevent other uses\nof Tesseract, such as deskewing, from working. This is no longer the\ncase. Use `--tesseract-non-ocr-timeout` to control the timeout for\nnon-OCR operations, if needed.\n:::\n\n### Remove all text or OCR from my PDF\n\nThis is getting ridiculous, but OCRmyPDF can completely strip all textual\ninformation from a PDF and reconstruct it as a \\\"bag of images\\\" PDF.\n\n```bash\nocrmypdf --ocr-engine none --force-ocr input.pdf output.pdf\n```\n\nWhy would you want to do this? Perhaps you have a PDF where OCR fails to\nproduce useful results, and just want to get rid of all OCR information.\nThis command also removes OCR generated by third party tools.\n\n### Optimize images without performing OCR\n\nYou can also optimize all images without performing any OCR:\n\n```bash\nocrmypdf --ocr-engine none --optimize 3 --skip-text input.pdf output.pdf\n```\n\n## Using v17 features\n\n### Select a rasterizer\n\n:::{versionadded} 17.0.0\n:::\n\nOCRmyPDF can use pypdfium2 or Ghostscript to rasterize PDF pages. pypdfium2\nis generally faster and is preferred when available.\n\n```bash\n# Automatic selection (default) - prefers pypdfium when available\nocrmypdf --rasterizer auto input.pdf output.pdf\n\n# Explicitly use pypdfium2 (requires pip install pypdfium2)\nocrmypdf --rasterizer pypdfium input.pdf output.pdf\n\n# Explicitly use Ghostscript\nocrmypdf --rasterizer ghostscript input.pdf output.pdf\n```\n\n### PDF/A without Ghostscript\n\n:::{versionadded} 17.0.0\n:::\n\nWith verapdf installed, OCRmyPDF can produce PDF/A without using Ghostscript\nfor conversion. 
This is faster and avoids some Ghostscript limitations.\n\n```bash\n# Uses speculative conversion with verapdf validation (default)\nocrmypdf --output-type auto input.pdf output.pdf\n\n# Explicitly request Ghostscript-based PDF/A conversion\nocrmypdf --output-type pdfa input.pdf output.pdf\n```\n\n### Using --mode instead of legacy flags\n\n:::{versionadded} 17.0.0\n:::\n\nThe `--mode` (`-m`) flag consolidates OCR behavior options:\n\n```bash\n# Instead of --skip-text\nocrmypdf --mode skip input.pdf output.pdf\n\n# Instead of --force-ocr\nocrmypdf --mode force input.pdf output.pdf\n\n# Instead of --redo-ocr\nocrmypdf --mode redo input.pdf output.pdf\n\n# Short form\nocrmypdf -m skip input.pdf output.pdf\n```\n\nThe legacy flags continue to work as aliases.\n\n### Process only certain pages\n\nYou can ask OCRmyPDF to only apply [image processing](#image-processing)\nand OCR to certain pages.\n\n```bash\nocrmypdf --pages 2,3,13-17 input.pdf output.pdf\n```\n\nHyphens denote a range of pages and commas separate page numbers. If you\nprefer to use spaces, quote all of the page numbers:\n`--pages '2, 3, 5, 7'`.\n\nOCRmyPDF will warn if your list of page numbers contains duplicates or\noverlapping pages. OCRmyPDF does not currently account for document page\nnumbers, such as an introduction section of a book that uses Roman\nnumerals. It simply counts the number of virtual pieces of paper since\nthe start. If your list of pages is out of numerical order, OCRmyPDF\nwill sort it for you.\n\nRegardless of the argument to `--pages`, OCRmyPDF will optimize all\npages/images in the file and convert it to PDF/A, unless you disable\nthose options. Both of these steps are \\\"whole file\\\" operations. 
In\nthis example, we want to OCR only the title and otherwise change the PDF\nas little as possible:\n\n```bash\nocrmypdf --pages 1 --output-type pdf --optimize 0 input.pdf output.pdf\n```\n\n## Redo existing OCR\n\nTo redo OCR on a file OCRed with other OCR software or a previous\nversion of OCRmyPDF and/or Tesseract, you may use the `--redo-ocr`\nargument. (Normally, OCRmyPDF will exit with an error if asked to modify\na file with OCR.)\n\nThis may be helpful for users who want to take advantage of accuracy\nimprovements in Tesseract for files they previously OCRed with an\nearlier version of Tesseract and OCRmyPDF.\n\n```bash\nocrmypdf --redo-ocr input.pdf output.pdf\n```\n\nThis method will replace OCR without rasterizing, reducing quality or\nremoving vector content. If a file contains a mix of pure digital text\nand OCR, digital text will be ignored and OCR will be replaced. As such\nthis mode is incompatible with image processing options, since they\nalter the appearance of the file.\n\nIn some cases, existing OCR cannot be detected or replaced. Files\nproduced by OCRmyPDF v2.2 or earlier, for example, are internally\nrepresented as having visible text with an opaque image drawn on top.\nThis situation cannot be detected.\n\nIf `--redo-ocr` does not work, you can use `--force-ocr`, which will\nforce rasterization of all pages, potentially reducing quality or losing\nvector content.\n\nImproving OCR quality\n---------------------\n\nThe [Image processing](#image-processing) features can improve OCR\nquality.\n\nRotating pages and deskewing helps to ensure that the page orientation\nis correct before OCR begins. Removing the background and/or cleaning\nthe page can also improve results. 
The `--oversample DPI` argument can\nbe specified to resample images to higher resolution before attempting\nOCR; this can improve results as well.\n\nOCR quality will suffer if the resolution of input images is not correct\n(since the range of pixel sizes that will be checked for possible fonts\nwill also be incorrect).\n\n## PDF optimization\n\nBy default OCRmyPDF will attempt to perform lossless optimizations on\nthe images inside PDFs after OCR is complete. Optimization is performed\neven if no OCR text is found.\n\nThe `--optimize N` (short form `-O`) argument controls optimization,\nwhere `N` ranges from 0 to 3 inclusive, analogous to the optimization\nlevels in the GCC compiler. `-O1` is the default.\n\nFor further details, see the section on [PDF optimization](optimizer).\n\n```bash\nocrmypdf --optimize 3 in.pdf out.pdf  # Make it small\n```\n\nSome users may consider enabling lossy JBIG2. See:\n`jbig2-lossy`{.interpreted-text role=\"ref\"}.\n\n:::{note}\nImage processing and PDF/A conversion can also introduce lossy\ntransformations to your PDF images, even when `--optimize 1` is in use.\n:::\n\nDigitally signed PDFs\n---------------------\n\nOCRmyPDF cannot preserve digital signatures in PDFs and also add OCR to\nthem. By default, it will refuse to modify a signed PDF regardless of\nother settings. You can override this behavior with\n`--invalidate-digital-signatures`; as the name suggests, any digital\nsignatures will be invalidated.\n\nOCRmyPDF cannot open documents that are encrypted with a digital\ncertificate.\n\nVersions of OCRmyPDF prior to 14.4.0 would invalidate existing digital\nsignatures without warning.\n"
  },
  {
    "path": "docs/design_notes.md",
    "content": "% SPDX-FileCopyrightText: 2023 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# Design notes\n\n## Why doesn\\'t OCRmyPDF use PyTesseract?\n\nPyTesseract is a Python wrapper around the Tesseract OCR engine. When\nOCRmyPDF was first written, PyTesseract used ABI bindings to call the\nTesseract library. This was not a good fit for OCRmyPDF because ABI\nbindings can be fragile.\n\nPyTesseract has since evolved to call the Tesseract executable,\nabandoning the ABI approach and using the CLI instead, just like\nOCRmyPDF does. If it were written from scratch today, OCRmyPDF might use\nPyTesseract.\n\nPyTesseract has more features for applications that don\\'t particularly\nneed PDF output, but fewer features than OCRmyPDF\\'s API for creating PDFs.\n\n## What is `executor()`?\n\nOCRmyPDF uses a custom concurrent executor which can support either\nthreads or processes with the same interface. This is useful because\nOCRmyPDF can use either threads or processes to parallelize work,\nwhichever is more appropriate for the task at hand.\n\nThe interface is currently private and subject to change. In particular,\nif experiments with asyncio and anyio are successful, the interface will\nchange.\n"
  },
  {
    "path": "docs/docker.md",
    "content": "# OCRmyPDF Docker image {#docker}\n\nOCRmyPDF is also available in Docker images that packages recent\nversions of all dependencies.\n\nFor users who already have Docker installed this may be an easy and\nconvenient option.\n\nOn platforms other than Linux, Docker runs in a virtual machine, and so\nmay be less performant. You may also want to adjust the Docker virtual\nmachine\\'s memory and CPU allocation. On Linux, the Docker image runs\nnatively and performance is comparable to a system installation.\n\n{#docker-install}\n## Installing the Docker image\n\nIf you have [Docker](https://docs.docker.com/) installed on your system,\nyou can install a Docker image of the latest release.\n\nIf you can run this command successfully, your system is ready to\ndownload and execute the image:\n\n:::{code} bash\ndocker run hello-world\n:::\n\n:::{list-table} Docker Images\n:header-rows: 1\n\n* - Image\n  - Architecture\n  - Description\n* - `jbarlow83/ocrmypdf-alpine`\n  - x86_64 and arm64\n  - Recommended image, based on Alpine Linux.\n* - `jbarlow83/ocrmypdf-ubuntu`\n  - x86_64 and arm64\n  - Alternate image, based on Ubuntu. When the Alpine image is considered stable and available for arm64, this image will be deprecated.\n* - `jbarlow83/ocrmypdf`\n  - x86_64 and arm64\n  - Currently an alias for ocrmypdf-ubuntu. When the Alpine image is considered stable and available for arm64, this name will point to the Alpine image. If you don\\'t know about the difference between Alpine and Ubuntu, use this image.\n:::\n\nTo install:\n\n:::{code} bash\ndocker pull jbarlow83/ocrmypdf-alpine\n:::\n\nThe `ocrmypdf` image is also available, but is deprecated and will be\nremoved in the future.\n\nOCRmyPDF will use all available CPU cores. 
See the Docker documentation\nfor [adjusting memory and CPU on other\nplatforms](https://docs.docker.com/config/containers/resource_constraints/)\nif you are using Docker on macOS or Windows, where you may need to\nmanually assign more resources. On Linux, all resources will be\navailable automatically.\n\nThe underlying operating system and other details in Docker images are\nconsidered implementation details and **subject to change at minor\nreleases**. If you are modifying the image, you should pin the version\nyou intend to use.\n\n## Using the Docker image on the command line\n\n**Unlike typical Docker containers**, in this section the OCRmyPDF\nDocker container is ephemeral -- it runs for one OCR job and terminates,\njust like a command line program. We are using Docker to deliver an\napplication (as opposed to the more conventional case, where a Docker\ncontainer runs as a server). For that reason we usually use the `--rm`\nargument to delete the container when it exits.\n\nTo start a Docker container (instance of the image):\n\n:::{code} bash\ndocker run --rm -i jbarlow83/ocrmypdf-alpine (... all other arguments here...) - -\n:::\n\nFor convenience, create a shell alias to hide the Docker command. 
It is\neasier to send the input file as stdin and read the output from stdout\n-- **this avoids the messy permission issues with Docker entirely**.\n\n:::{code} bash\nalias docker_ocrmypdf='docker run --rm -i jbarlow83/ocrmypdf-alpine'\ndocker_ocrmypdf --version  # runs docker version\ndocker_ocrmypdf - - <input.pdf >output.pdf\n:::\n\nOr in the wonderful [fish shell](https://fishshell.com/):\n\n:::{code} fish\nalias docker_ocrmypdf 'docker run --rm jbarlow83/ocrmypdf-alpine'\nfuncsave docker_ocrmypdf\n:::\n\nAlternately, you could mount the local current working directory as a\nDocker volume:\n\n:::{code} bash\nalias docker_ocrmypdf='docker run --rm  -i --user \"$(id -u):$(id -g)\" --workdir /data -v \"$PWD:/data\" jbarlow83/ocrmypdf-alpine'\ndocker_ocrmypdf /data/input.pdf /data/output.pdf\n:::\n\n## Podman\n\nEspecially if you use [Podman](https://podman.io/) (or use Docker in\nrootless mode), you may need to add `--userns keep-id` there,\notherwise you may get access errors, because the user ID is otherwise not\nmapped to the same UID as on the host:\n\n:::{code} bash\nalias podman_ocrmypdf='podman run --rm -i --user \"$(id -u):$(id -g)\" --userns keep-id --workdir /data -v \"$PWD:/data\" jbarlow83/ocrmypdf-alpine'\npodman_ocrmypdf /data/input.pdf /data/output.pdf\n:::\n\nIf you have SELinux enabled, you may additionally need to add the `:Z` [suffix to\nthe\nvolume](https://docs.podman.io/en/stable/markdown/podman-run.1.html#volume-v-source-volume-host-dir-container-dir-options)\nor disable SELinux for the container using\n`--security-opt label=disable`, which is suggested for system files as\nthey should not be re-labelled. Please refer to the \"Note\" section at\nthe end of the linked podman documentation for details. 
This results in\nthe following full command:\n\n:::{code} bash\nalias podman_ocrmypdf='podman run --rm -i --user \"$(id -u):$(id -g)\" --userns keep-id --workdir /data -v \"$PWD:/data\" --security-opt label=disable jbarlow83/ocrmypdf-alpine'\npodman_ocrmypdf /data/input.pdf /data/output.pdf\n:::\n\n{#docker-lang-packs}\n## Adding languages to the Docker image\n\nBy default the Docker image includes English, German, Simplified\nChinese, French, Portuguese and Spanish, the most popular languages for\nOCRmyPDF users based on feedback. You may add other languages by\ncreating a new Dockerfile based on the public one.\n\n:::{code} dockerfile\nFROM jbarlow83/ocrmypdf\n\n# Example: add Italian\nRUN apt install tesseract-ocr-ita\n:::\n\nTo install language packs (training data) such as the\n[tessdata\\_best](https://github.com/tesseract-ocr/tessdata_best) suite\nor custom data, you first need to determine the version of Tesseract\ndata files, which may differ from the Tesseract program version. Use\nthis command to determine the data file version:\n\n:::{code} bash\ndocker run -i --rm --entrypoint /bin/ls jbarlow83/ocrmypdf /usr/share/tesseract-ocr\n:::\n\nAs of 2021, the data file version is probably `4.00`.\n\nYou can then add new data with either a Dockerfile:\n\n:::{code} dockerfile\nFROM jbarlow83/ocrmypdf:{TAG}\n\n# Example: add a tessdata_best file\nCOPY chi_tra_vert.traineddata /usr/share/tesseract-ocr/<data version>/tessdata/\n:::\n\nWhen creating your own image, you should always pin a specific version\nof the OCRmyPDF Docker image. 
This ensures that your image will not\nbreak when a new version of OCRmyPDF is released.\n\nAlternately, you can copy training data into a Docker container as\nfollows:\n\n:::{code} bash\ndocker cp mycustomtraining.traineddata name_of_container:/usr/share/tesseract-ocr/<tesseract version>/tessdata/\n:::\n\nExtending the Docker image\n--------------------------\n\nYou can extend the Docker image with your own customizations, similar to\nthe way it is extended to add language packs.\n\nNote that the Docker image is subject to change at any time. For\nexample, the base image may be updated to a newer version of Ubuntu or\nDebian. Such changes will be noted in the release notes but might occur\nat minor version releases, unless they affect the way a \\\"casual\\\" user uses\nthe Docker image.\n\nIf you extend the Docker image, you should pin a specific version of the\nOCRmyPDF Docker image.\n\nExecuting the test suite\n------------------------\n\nThe OCRmyPDF test suite is installed with the image. To run it:\n\n:::{code} bash\ndocker run --rm --entrypoint python  jbarlow83/ocrmypdf -m pytest\n:::\n\nAccessing the shell\n-------------------\n\nTo use the shell in the Docker image:\n\n:::{code} bash\ndocker run -it --entrypoint sh  jbarlow83/ocrmypdf\n:::\n\nUsing the OCRmyPDF web service wrapper\n--------------------------------------\n\nThe OCRmyPDF Docker image includes an example, barebones HTTP web\nservice. The webservice may be launched as follows:\n\n:::{code} bash\ndocker run --entrypoint python -p 5000:5000  jbarlow83/ocrmypdf webservice.py\n:::\n\nWe omit the `--rm` parameter so that the container will not be\nautomatically deleted when it exits.\n\nThis will configure the machine to listen on port 5000. On Linux\nmachines this is port 5000 of localhost. On macOS or Windows machines\nrunning Docker, this is port 5000 of the virtual machine that runs your\nDocker images. 
You can find its IP address using the command\n`docker-machine ip`.\n\nUnlike command line usage, this program will open a socket and wait for\nconnections.\n\n:::{warning}\nThe OCRmyPDF web service wrapper is intended for demonstration or\ndevelopment. It provides no security, no authentication, no protection\nagainst denial of service attacks, and no load balancing. The default\nFlask WSGI server is used, which is intended for development only. The\nserver is single-threaded and so can respond to only one client at a\ntime. While running OCR, it cannot respond to any other clients.\n:::\n\nClients must keep their connection open while waiting for OCR to\ncomplete. This may entail setting a long timeout; this interface is more\nuseful for internal HTTP API calls.\n\nUnlike the rest of OCRmyPDF, this web service is licensed under the\nAffero GPLv3 (AGPLv3) since Ghostscript is also licensed in this way.\n\nIn addition to the above, please read our\n{ref}`general remarks on using OCRmyPDF as a service <ocr-service>`.\n"
  },
  {
    "path": "docs/errors.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# Common error messages\n\n## Page already has text\n\n:::{code}\nERROR -    1: page already has text! – aborting (use --force-ocr to force OCR)\n:::\n\nYou ran ocrmypdf on a file that already contains printable text or a\nhidden OCR text layer (it can\\'t quite tell the difference). You\nprobably don\\'t want to do this, because the file is already searchable.\n\nAs the error message suggests, your options are:\n\n-   `ocrmypdf --force-ocr` to\n    {ref}`rasterize <raster-vector>` all vector\n    content and run OCR on the images. This is useful if a previous OCR\n    program failed, or if the document contains a text watermark.\n-   `ocrmypdf --skip-text` to skip OCR and other processing on any pages\n    that contain text. Text pages will be copied into the output PDF\n    without modification.\n-   `ocrmypdf --redo-ocr` to scan the file for any existing OCR\n    (non-printing text), remove it, and do OCR again. This is one way to\n    take advantage of improvements in OCR accuracy. Printable vector\n    text is excluded from OCR, so this can be used on files that contain\n    a mix of digital and scanned pages.\n\n## Input file \\'filename\\' is not a valid PDF\n\nOCRmyPDF checks files with pikepdf, a library that in turn uses libqpdf\nto fix errors in PDFs, before it tries to work on them. In most cases\nthis happens because the PDF is corrupt and truncated (incomplete file\ncopying) and not much can be done.\n\nYou can try rewriting the file with Ghostscript:\n\n:::{code} bash\ngs -o output.pdf -dSAFER -sDEVICE=pdfwrite input.pdf\n:::\n\n`pdftk` can also rewrite PDFs:\n\n:::{code} bash\npdftk input.pdf cat output output.pdf\n:::\n\nSometimes Acrobat can repair PDFs with its [Preflight\ntool](https://helpx.adobe.com/acrobat/using/correcting-problem-areas-preflight-tool.html).\n"
  },
  {
    "path": "docs/index.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# OCRmyPDF documentation\n\n:::{figure} images/logo.svg\n:::\n\nOCRmyPDF adds an optical character recognition (OCR) text layer to scanned PDF\nfiles, allowing them to be searched.\n\nPDF is the best format for storing and exchanging scanned documents.\nUnfortunately, PDFs can be difficult to modify. OCRmyPDF makes it easy to apply\nimage processing and OCR (recognized, searchable text) to existing PDFs.\n\n```{toctree}\n:maxdepth: 1\n\nintroduction\nreleasenotes/index\ninstallation\nlanguages\njbig2\n```\n\n```{toctree}\n:caption: Usage\n:maxdepth: 2\n\ncookbook\noptimizer\ndocker\nadvanced\nbatch\ncloud\nperformance\npdfsecurity\nerrors\n```\n\n```{toctree}\n:caption: Developers\n:maxdepth: 2\n\napi\nplugins\napiref\ndesign_notes\ncontributing\nmaintainers\n```\n\n# Indices and tables\n\n- {ref}`genindex`\n- {ref}`modindex`\n- {ref}`search`\n"
  },
  {
    "path": "docs/installation.md",
    "content": "---\nmyst:\n  substitutions:\n    deb_12: |-\n      :::{image} https://repology.org/badge/version-for-repo/debian_12/ocrmypdf.svg\n      :alt: Debian 12\n      :::\n    deb_13: |-\n      :::{image} https://repology.org/badge/version-for-repo/debian_13/ocrmypdf.svg\n      :alt: Debian 13\n      :::\n    deb_unstable: |-\n      :::{image} https://repology.org/badge/version-for-repo/debian_unstable/ocrmypdf.svg\n      :alt: Debian unstable\n      :::\n    fedora_40: |-\n      :::{image} https://repology.org/badge/version-for-repo/fedora_40/ocrmypdf.svg\n      :alt: Fedora 40\n      :::\n    fedora_41: |-\n      :::{image} https://repology.org/badge/version-for-repo/fedora_41/ocrmypdf.svg\n      :alt: Fedora 41\n      :::\n    fedora_rawhide: |-\n      :::{image} https://repology.org/badge/version-for-repo/fedora_rawhide/ocrmypdf.svg\n      :alt: Fedora Rawhide\n      :::\n    latest: |-\n      :::{image} https://img.shields.io/pypi/v/ocrmypdf.svg\n      :alt: OCRmyPDF latest released version on PyPI\n      :::\n    ubu_2204: |-\n      :::{image} https://repology.org/badge/version-for-repo/ubuntu_22_04/ocrmypdf.svg\n      :alt: Ubuntu 22.04 LTS\n      :::\n    ubu_2404: |-\n      :::{image} https://repology.org/badge/version-for-repo/ubuntu_24_04/ocrmypdf.svg\n      :alt: Ubuntu 24.04 LTS\n      :::\n---\n\n% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# Installing OCRmyPDF\n\n(latest)=\n\nThe easiest way to install OCRmyPDF is to follow the steps for your operating\nsystem/platform. 
This version may be out of date, however.\n\nThese platforms have one-liner installs:\n\n:::{list-table}\n:header-rows: 0\n\n* - Homebrew (macOS and Linux)\n  - ``brew install ocrmypdf``\n* - Debian, Ubuntu\n  - ``apt install ocrmypdf``\n* - Windows Subsystem for Linux\n  - ``apt install ocrmypdf``\n* - Fedora\n  - ``dnf install ocrmypdf tesseract-osd``\n* - macOS (MacPorts)\n  - ``port install ocrmypdf``\n* - FreeBSD\n  - ``pkg install textproc/py-ocrmypdf``\n* - Snap (snapcraft packaging)\n  - ``snap install ocrmypdf``\n:::\n\nMore detailed procedures are outlined below. If you want to do a manual\ninstall, or install a more recent version than your platform provides, read on.\n\n:::{contents} Platform-specific steps\n:depth: 2\n:local: true\n:::\n\n## Installing on Linux\n\n### Debian and Ubuntu 22.04 or newer\n\n:::{list-table}\n:header-rows: 1\n\n* - OCRmyPDF versions in Debian & Ubuntu\n* - {{ latest }}\n* - {{ deb_12 }} {{ deb_13 }} {{ deb_unstable }}\n* - {{ ubu_2204 }} {{ ubu_2404 }}\n:::\n\nUsers of Debian or Ubuntu may simply\n\n```bash\napt install ocrmypdf\n```\n\nAs indicated in the table above, Debian and Ubuntu releases may lag\nbehind the latest version. 
If the version available for your platform is\nout of date, you could opt to install the latest version from source.\nSee [Installing HEAD revision from\nsources](#installing-head-revision-from-sources).\n\nFor full details on version availability for your platform, check the\n[Debian Package Tracker](https://tracker.debian.org/pkg/ocrmypdf) or\n[Ubuntu launchpad.net](https://launchpad.net/ocrmypdf).\n\n:::{note}\nOCRmyPDF for Debian and Ubuntu currently omit the JBIG2 encoder.\nOCRmyPDF works fine without it but will produce larger output files.\nAll JBIG2 patents expired in 2017, so if you build jbig2enc from source,\nOCRmyPDF will automatically detect it on the `PATH`.\nTo add JBIG2 encoding, see {ref}`jbig2`.\n:::\n\n### Fedora\n\n:::{list-table}\n:header-rows: 1\n\n* - OCRmyPDF version\n* - {{latest}}\n* - {{fedora_40}} {{fedora_41}} {{fedora_rawhide}}\n:::\n\nUsers of Fedora may simply\n\n```bash\ndnf install ocrmypdf tesseract-osd\n```\n\nFor full details on version availability, check the [Fedora Package\nTracker](https://packages.fedoraproject.org/pkgs/ocrmypdf/ocrmypdf/).\n\nIf the version available for your platform is out of date, you could opt\nto install the latest version from source. See [Installing HEAD revision\nfrom sources](#installing-head-revision-from-sources).\n\n:::{note}\nOCRmyPDF for Fedora currently omits the JBIG2 encoder. All JBIG2 patents\nexpired in 2017. OCRmyPDF works fine without it but will produce larger\noutput files. If you build jbig2enc from source, OCRmyPDF will automatically\ndetect it on the `PATH`. 
To add JBIG2 encoding, see {ref}`jbig2`.\n:::\n\n### RHEL 9\n\nPrepare the environment by getting Python 3.12:\n\n```bash\ndnf install python3.12 python3.12-pip\n```\n\nThen, follow [Requirements for pip and HEAD install](#requirements-for-pip-and-head-install) to install dependencies:\n\n```bash\ndnf install ghostscript tesseract\n```\n\nand build ocrmypdf in a virtual environment:\n\n```bash\npython3.12 -m venv .venv\n```\n\nTo add JBIG2 encoding, see {ref}`Installing the JBIG2 encoder <jbig2>`.\n\nNote that Fedora packages for language data haven't been branched for RHEL/EPEL, but you can get traineddata files directly from [tesseract](https://github.com/tesseract-ocr/tessdata/) and place them in `/usr/share/tesseract/tessdata`.\n\n(ubuntu-lts-latest)=\n\n### Installing the latest version on Ubuntu 22.04/24.04 LTS\n\nUbuntu includes an older version of OCRmyPDF - you can install that with\n`apt install ocrmypdf`. To install the latest version, we recommend using uv:\n\n```bash\n# Install system dependencies first\nsudo apt-get update\nsudo apt-get -y install ocrmypdf\n\n# Install uv and upgrade to the latest OCRmyPDF\npip install uv\nuv pip install --user --upgrade ocrmypdf\n```\n\nAlternatively, use Homebrew on Linux for a full-featured installation (see below).\n\nTo add JBIG2 encoding, see {ref}`jbig2`.\n\n### Ubuntu 20.04 LTS (and other older distributions)\n\n:::{note}\nUbuntu 20.04 is approaching end of life. 
Consider upgrading to Ubuntu 22.04 or 24.04 LTS.\n:::\n\nFor older distributions, the most convenient way to install a recent version of\nOCRmyPDF is to use Homebrew on Linux:\n\n```bash\nbrew install ocrmypdf\n```\n\nSee {ref}`homebrew-linux` for more information on using Homebrew on Linux.\n\n### Arch Linux (AUR)\n\n:::{image} https://repology.org/badge/version-for-repo/aur/ocrmypdf.svg\n:alt: ArchLinux\n:target: https://repology.org/metapackage/ocrmypdf\n:::\n\nThere is an [Arch User Repository (AUR) package for OCRmyPDF](https://aur.archlinux.org/packages/ocrmypdf/).\n\nInstalling AUR packages as root is not allowed, so you must first [setup a\nnon-root user](https://wiki.archlinux.org/index.php/Users_and_groups#User_management) and\n[configure sudo](https://wiki.archlinux.org/index.php/Sudo#Configuration).\nThe standard Docker image, `archlinux/base:latest`, does **not** have a\nnon-root user configured, so users of that image must follow these guides. If\nyou are using a VM image, such as [the official Vagrant image](https://app.vagrantup.com/archlinux/boxes/archlinux), this work may already\nbe completed for you.\n\nNext you should install the [base-devel package group](https://archlinux.org/packages/core/any/base-devel/). This includes the\nstandard tooling needed to build packages, such as a compiler and binary tools.\n\n```bash\nsudo pacman -S --needed base-devel\n```\n\nNow you are ready to install the OCRmyPDF package.\n\n```bash\ncurl -O https://aur.archlinux.org/cgit/aur.git/snapshot/ocrmypdf.tar.gz\ntar xvzf ocrmypdf.tar.gz\ncd ocrmypdf\nmakepkg -sri\n```\n\nAt this point you will have a working install of OCRmyPDF, but the Tesseract\ninstall won’t include any OCR language data. 
You can install [the\ntesseract-data package group](https://www.archlinux.org/groups/any/tesseract-data/) to add all supported\nlanguages, or use that package listing to identify the appropriate package for\nyour desired language.\n\n```bash\nsudo pacman -S tesseract-data-eng\n```\n\nAs an alternative to this manual procedure, consider using an [AUR helper](https://wiki.archlinux.org/index.php/AUR_helpers). Such a tool will\nautomatically fetch, build and install the AUR package, resolve dependencies\n(including dependencies on AUR packages), and ease the upgrade procedure.\n\nIf you have any difficulties with installation, check the repository package\npage.\n\n:::{note}\nThe OCRmyPDF AUR package currently omits the JBIG2 encoder. OCRmyPDF works\nfine without it but will produce larger output files. The encoder is\navailable from [the jbig2enc-git AUR package](https://aur.archlinux.org/packages/jbig2enc-git/) and may be installed\nusing the same series of steps as for the installation OCRmyPDF AUR\npackage. Alternatively, it may be built manually from source following the\ninstructions in {ref}`Installing the JBIG2 encoder <jbig2>`. 
If JBIG2 is\ninstalled, OCRmyPDF 7.0.0 and later will automatically detect it.\n:::\n\n### Alpine Linux\n\n:::{image} https://repology.org/badge/version-for-repo/alpine_edge/ocrmypdf.svg\n:alt: Alpine Linux\n:target: https://repology.org/metapackage/ocrmypdf\n:::\n\nTo install OCRmyPDF for Alpine Linux:\n\n```bash\napk add ocrmypdf\n```\n\n### Gentoo Linux\n\n:::{image} https://repology.org/badge/version-for-repo/gentoo_ovl_guru/ocrmypdf.svg\n:alt: Gentoo Linux\n:target: https://repology.org/metapackage/ocrmypdf\n:::\n\nTo install OCRmyPDF on Gentoo Linux, use the following commands:\n\n```bash\neselect repository enable guru\nemaint sync --repo guru\nemerge --ask app-text/OCRmyPDF\n```\n\n### Other Linux packages\n\nSee the\n[Repology](https://repology.org/metapackage/ocrmypdf/versions) page.\n\nIn general, first install the OCRmyPDF package for your system, then\noptionally use the procedure [Installing with Python\npip](#installing-with-python-pip) to install a more recent version.\n\n(homebrew-linux)=\n\n## Installing with Homebrew (macOS and Linux)\n\n:::{image} https://img.shields.io/homebrew/v/ocrmypdf.svg\n:alt: homebrew\n:target: https://formulae.brew.sh/formula/ocrmypdf\n:::\n\n[Homebrew](https://brew.sh) provides a full-featured OCRmyPDF installation\non both macOS and Linux with all recommended dependencies. This is often\nthe easiest way to get a complete, up-to-date installation.\n\n```bash\nbrew install ocrmypdf\n```\n\nThis includes Tesseract, Ghostscript, and all required dependencies. English\nlanguage support is included by default. For other languages:\n\n```bash\nbrew install tesseract-lang  # Optional: Install all language packs\n```\n\n:::{tip}\n**For Linux users:** Homebrew on Linux is an excellent choice when your\ndistribution's package is outdated or missing optional dependencies like\njbig2enc, pngquant, or unpaper. 
Homebrew provides a consistent, full-featured\ninstallation that works across many Linux distributions.\n\nInstall Homebrew on Linux: https://brew.sh\n:::\n\n## Installing on macOS\n\n### Homebrew\n\nSee {ref}`homebrew-linux` above - the installation is identical on macOS.\n\n### MacPorts\n\n:::{image} https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fports.macports.org%2Fapi%2Fv1%2Fports%2Focrmypdf%2F%3Fformat%3Djson&query=version&label=MacPorts\n:alt: Macports Version Information\n:target: https://ports.macports.org/port/ocrmypdf\n:::\n\nOCRmyPDF is included in MacPorts:\n\n```bash\nsudo port install ocrmypdf\n```\n\nNote that while this will install tesseract you will need to install\nthe appropriate tesseract [language ports](https://ports.macports.org/search/?selected_facets=categories_exact%3Atextproc&installed_file=&q=tesseract&name=on).\n\n### Manual installation on macOS\n\nThese instructions are for installing a more current version of OCRmyPDF than\nis available from Homebrew. Note that Homebrew versions usually track\nreleases fairly closely.\n\nIf it's not already present, [install Homebrew](http://brew.sh/).\n\nUpdate Homebrew and install dependencies:\n\n```bash\nbrew update\n```\n\nInstall or upgrade the required Homebrew packages, if any are missing.\nTo do this, use `brew edit ocrmypdf` to obtain a recent list of Homebrew\ndependencies. You could also check the `.workflows/build.yml`.\n\nThis will include the English, French, German and Spanish language\npacks. 
If you need other languages you can optionally install them all:\n\n(macos-all-languages)=\n\n> ```bash\n> brew install tesseract-lang  # Option 2: for all language packs\n> ```\n\nInstall uv and OCRmyPDF:\n\n```bash\npip install uv\nuv pip install --user ocrmypdf\n```\n\nThe command line program should now be available:\n\n```bash\nocrmypdf --help\n```\n\n## Installing on Windows\n\n### Native Windows\n\n% If you have a Windows that is not the Home edition, you can use Windows Sandbox to test on a blank Windows instance.\n% https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/\n\n:::{note}\nAdministrator privileges will be required for some of these steps.\n:::\n\nYou must install the following for Windows:\n\n- Python 64-bit\n- Tesseract 64-bit\n- Ghostscript 64-bit\n\nUsing the [winget](https://docs.microsoft.com/en-us/windows/package-manager/winget/)\npackage manager:\n\n- `winget install -e --id Python.Python.3.12`\n- `winget install -e --id UB-Mannheim.TesseractOCR`\n\nYou will need to install Ghostscript manually, [since it does not support automated\ninstalls anymore](https://artifex.com/news/ghostscript-10.01.0-disabling-silent-install-option).\n\n- [Ghostscript download page](https://ghostscript.com/releases/gsdnld.html).\n\n(Or alternately, using the [Chocolatey](https://chocolatey.org/) package manager, install\nthe following when running in an Administrator command prompt):\n\n- `choco install python3`\n- `choco install --pre tesseract`\n- `choco install pngquant` (optional)\n\nEither set of commands will install the required software. At the moment there is no\nsingle command to install all of the dependencies on Windows.\n\nYou may then use `pip` to install ocrmypdf. 
(This can be performed by a user or\nAdministrator.):\n\n- `python3 -m pip install ocrmypdf`\n\n% The Windows Python versions do not place any python or python3 executable in the path.\n% They add the py launcher to the path:\n% https://docs.python.org/3/using/windows.html#python-launcher-for-windows\n\nIf you installed Python using WinGet, then use the following command instead:\n\n- `py -m pip install ocrmypdf`\n\nand use:\n\n- `py -m ocrmypdf`\n\nTo start OCRmyPDF.\n\nIf you intend to use more Python software on your Windows machine, consider the use of\n[pipx](https://pipx.pypa.io/stable/) or a similar tool to create isolated Python\nenvironments for each Python software that you want to use.\n\nOCRmyPDF will check the Windows Registry and standard locations in your Program Files\nfor third party software it needs (specifically, Tesseract and Ghostscript). To\noverride the versions OCRmyPDF selects, you can modify the `PATH` environment\nvariable. [Follow these directions](https://www.computerhope.com/issues/ch000549.htm#dospath)\nto change the PATH.\n\n:::{warning}\n32-bit Windows is not supported.\n:::\n\n### Windows Subsystem for Linux\n\n1. Install Ubuntu 22.04 for Windows Subsystem for Linux, if not already installed.\n2. Follow the procedure to install {ref}`OCRmyPDF on Ubuntu 22.04 <ubuntu-lts-latest>`.\n3. Open the Windows command prompt and create a symlink:\n\n```powershell\nwsl sudo ln -s  /home/$USER/.local/bin/ocrmypdf /usr/local/bin/ocrmypdf\n```\n\nThen confirm that the expected version from PyPI ({{ latest }}) is installed:\n\n```powershell\nwsl ocrmypdf --version\n```\n\nYou can then run OCRmyPDF in the Windows command prompt or Powershell, prefixing\n`wsl`, and call it from Windows programs or batch files.\n\n### Cygwin64\n\nFirst install the following prerequisite Cygwin packages using `setup-x86_64.exe`:\n\n```\npython311 (or later)\npython3?-devel\npython3?-pip\npython3?-lxml\npython3?-imaging\n\n   (where 3? 
means match the version of python3 you installed)\n\ngcc-g++\nghostscript\nlibexempi3\nlibexempi-devel\nlibffi6\nlibffi-devel\npngquant\nqpdf\nlibqpdf-devel\ntesseract-ocr\ntesseract-ocr-devel\n```\n\nThen open a Cygwin terminal (i.e. `mintty`) and run the following commands. Note\nthat if you are using the version of `pip` that was installed with the Cygwin\nPython package, the command name will be `pip3`. If you have since updated\n`pip` (with, for instance, `pip3 install --upgrade pip`), then the command is\nlikely just `pip` instead of `pip3`:\n\n```bash\npip3 install wheel\npip3 install ocrmypdf\n```\n\nThe optional dependency \"unpaper\" is currently not available under Cygwin.\nWithout it, certain options such as `--clean` will produce an error message.\nHowever, the OCR-to-text-layer functionality is available.\n\n### Docker\n\nYou can also [Install the Docker image](docker) on Windows. Ensure that\nyour command prompt can run the docker \"hello world\" container.\n\n## Installing on FreeBSD\n\n:::{image} https://repology.org/badge/version-for-repo/freebsd/ocrmypdf.svg\n:alt: FreeBSD\n:target: https://repology.org/project/ocrmypdf/versions\n:::\n\n```bash\npkg install textproc/py-ocrmypdf\n```\n\nTo install a more recent version, you could attempt to first install the system\nversion with `pkg`, then use `pip install --user ocrmypdf`.\n\n## Installing the Docker image\n\nFor some users, installing the Docker image will be easier than\ninstalling all of OCRmyPDF's dependencies.\n\nSee [Installing the Docker image](docker) for more information.\n\n(installing-with-python-pip)=\n\n## Installing with uv (recommended)\n\nWe recommend using [uv](https://docs.astral.sh/uv/) for installing OCRmyPDF from PyPI.\nuv is a fast, modern Python package manager that provides better dependency resolution\nand consistent behavior across all platforms.\n\nFor best results, first install [your platform's\nversion](https://repology.org/metapackage/ocrmypdf/versions) 
of\n`ocrmypdf` using the instructions elsewhere in this document to satisfy system\ndependencies. Then use uv to get the latest OCRmyPDF version.\n\n```bash\n# Install uv if you don't have it\npip install uv\n\n# Install ocrmypdf in a virtual environment (recommended)\nuv venv\nsource .venv/bin/activate  # On Windows: .venv\\Scripts\\activate\nuv pip install ocrmypdf\n\n# Or install globally\nuv pip install --system ocrmypdf\n```\n\nUse `ocrmypdf --version` to confirm what version was installed.\n\n### Installing with pip\n\nIf you prefer pip, you can still use it:\n\n```bash\npip install --user ocrmypdf\n```\n\n(If the message appears `Requirement already satisfied: ocrmypdf in...`,\nyou will need to use `pip install --user --upgrade ocrmypdf`.)\n\n### Installing with pipx\n\nSome users may prefer pipx for isolated command-line tool installations:\n\n```bash\npipx install ocrmypdf\n```\n\nOr run without permanent installation:\n\n```bash\npipx run ocrmypdf\n```\n\n(requirements-for-pip-and-head-install)=\n\n### Requirements for pip and HEAD install\n\nOCRmyPDF currently requires these external programs and libraries to be\ninstalled, and must be satisfied using the operating system package\nmanager. `pip` cannot provide them.\n\n:::{versionchanged} 17.0.0\nGhostscript is now optional. pypdfium2 can be used for PDF rasterization,\nand verapdf can validate speculative PDF/A conversion.\n:::\n\nThe following versions are required:\n\n- Python 3.11 or newer (3.12+ recommended)\n- Tesseract 4.1.1 or newer\n- One of: Ghostscript 9.54+ **or** pypdfium2 (Python package)\n- One of: Ghostscript 9.54+ **or** verapdf (for PDF/A output)\n- fpdf2 2.8 or newer (Python package)\n- uharfbuzz (Python package)\n- fonts-noto or equivalent (system package, recommended)\n- jbig2enc 0.29 or newer (optional)\n- pngquant 2.5 or newer (optional)\n- unpaper 6.1 (optional)\n\n:::{note}\nFor the best user experience, install both Ghostscript and pypdfium2. 
pypdfium2 is\nfaster for rasterization, while Ghostscript is required for certain PDF/A\nconversions.\n:::\n\n**Dependency summary:**\n\n| Feature | Option 1 | Option 2 | Notes |\n|---------|----------|----------|-------|\n| PDF rasterization | pypdfium2 (Python) | Ghostscript (binary) | pypdfium2 preferred when available |\n| PDF/A conversion | verapdf + pikepdf | Ghostscript | verapdf validates speculative conversion |\n| Text rendering | fpdf2 + uharfbuzz | - | Required |\n| OCR | tesseract-ocr | `--ocr-engine none` | Can be skipped entirely |\n\n**Minimum viable installation:**\ntesseract-ocr + (pypdfium2 OR Ghostscript) + fpdf2 + uharfbuzz\n\n**Recommended installation:**\ntesseract-ocr + pypdfium2 + Ghostscript + verapdf + fpdf2 + uharfbuzz + fonts-noto + unpaper + pngquant + jbig2enc\n\nWe recommend 64-bit versions of all software. (32-bit versions are not\nsupported, although on Linux, they may still work.)\n\n**fpdf2** and **uharfbuzz** are required dependencies that provide the text\nlayer rendering engine. fpdf2 generates the PDF text layer, while uharfbuzz\nprovides text shaping for proper multilingual support. These replace the\nlegacy hOCR-based renderer. Install with: `pip install fpdf2 uharfbuzz`\n\n**fonts-noto** (or an equivalent comprehensive font package) is recommended\nfor proper text rendering, especially for non-Latin scripts. On Debian/Ubuntu:\n`apt install fonts-noto`. On Fedora: `dnf install google-noto-fonts-common`.\nOn macOS with Homebrew: `brew install font-noto`.\n\n**pypdfium2**, if present, provides fast PDF page rasterization using\nthe pdfium library (the same library used by Google Chrome). It is\npreferred over Ghostscript when available due to better performance.\nInstall with: `pip install pypdfium2`\n\n**verapdf**, if present, enables fast speculative PDF/A conversion.\nOCRmyPDF attempts to create PDF/A by adding metadata and ICC profiles\nusing pikepdf, then validates with verapdf. 
If validation passes,\nGhostscript is skipped entirely. See your distribution's package manager\nor visit [verapdf.org](https://verapdf.org/).\n\n**jbig2enc**, if present, will be used to optimize the encoding of\nmonochrome images. This can significantly reduce the file size of the\noutput file. It is not required.\n[jbig2enc](https://github.com/agl/jbig2enc) is not available in some\ndistributions due to historical patent concerns, but all JBIG2 patents\nexpired in 2017. It can easily be built from source. To add JBIG2 encoding,\nsee {ref}`jbig2`.\n\n:::{warning}\nLossy JBIG2 encoding (`--jbig2-lossy`) has been removed in v17.0.0 due to\nwell-documented risks of character substitution errors. Only lossless\nJBIG2 compression is now supported.\n:::\n\n**pngquant**, if present, is optionally used to optimize the encoding of\nPNG-style images in PDFs (actually, any that are losslessly\nencoded) by lossily quantizing to a smaller color palette. It is only\nactivated when the `--optimize` argument is `2` or `3`.\n\n**unpaper**, if present, enables the `--clean` and `--clean-final`\ncommand line options.\n\nThese are in addition to the Python packaging dependencies, meaning that\nunfortunately, the `pip install` command cannot satisfy all of them.\n\n(installing-head-revision-from-sources)=\n\n## Installing HEAD revision from sources\n\nIf you have `git` and Python 3.12 or newer installed, you can install\nfrom source. (Python 3.11 is supported but 3.12+ is recommended.) 
When the `pip` installer runs, it will alert you if\ndependencies are missing.\n\nIf you prefer to build everything from source, you will need to [build\npikepdf from\nsource](https://pikepdf.readthedocs.io/en/latest/installation.html#building-from-source).\nFirst ensure you can build and install pikepdf.\n\nWe recommend using uv to install from sources:\n\n```bash\ngit clone -b main https://github.com/ocrmypdf/OCRmyPDF.git\ncd OCRmyPDF\npip install uv  # If not already installed\nuv sync\n```\n\nThis creates a virtual environment and installs all dependencies. Activate\nthe environment to use ocrmypdf:\n\n```bash\nsource .venv/bin/activate\nocrmypdf --help\n```\n\nAlternatively, install directly from GitHub using pip:\n\n```bash\npip install git+https://github.com/ocrmypdf/OCRmyPDF.git\n```\n\nOr, to install in editable mode allowing customization:\n\n```bash\ngit clone -b main https://github.com/ocrmypdf/OCRmyPDF.git\ncd OCRmyPDF\npip install -e .\n```\n\nNote: `ocrmypdf` will only be accessible when the virtual environment\nis activated.\n\nTo run the program:\n\n```bash\nocrmypdf --help\n```\n\nIf not yet installed, the script will notify you about dependencies that\nneed to be installed. The script requires specific versions of the\ndependencies. Versions older than the ones mentioned in the release notes\nare likely not to be compatible with OCRmyPDF.\n\n## Optional Features\n\nOCRmyPDF provides optional features and development tools. We recommend using `uv` as your package manager.\n\n### Installing User Features\n\nUser features are available as optional dependencies. 
Install them with `uv` (recommended) or `pip`:\n\n```bash\n# Using uv (recommended)\nuv sync --extra watcher        # File watching service\nuv sync --extra webservice     # Streamlit web UI\nuv sync --extra watcher --extra webservice  # Multiple features\n```\n\n### Development Tools\n\nDevelopment tools use dependency groups:\n\n```bash\n# Testing infrastructure\nuv sync --group test\n\n# Documentation building\nuv sync --group docs\n\n# Enhanced Streamlit development\nuv sync --group streamlit-dev\n\n# All development groups\nuv sync\n```\n\n**Why use uv?**\n\n- Modern, fast Python package manager\n- Required for development (testing, docs)\n- Better dependency resolution\n- Consistent across all platforms\n\nInstall uv: `curl -LsSf https://astral.sh/uv/install.sh | sh` or visit https://docs.astral.sh/uv/\n\n### For development\n\nTo install all of the development and test requirements:\n\n```bash\ngit clone -b main https://github.com/ocrmypdf/OCRmyPDF.git\ncd OCRmyPDF\nuv sync --all-groups\n```\n\nTo add JBIG2 encoding, see {ref}`jbig2`.\n\n## Shell completions\n\nCompletions for `bash` and `fish` are available in the project's\n`misc/completion` folder. The `bash` completions are likely `zsh`\ncompatible but this has not been confirmed. Package maintainers, please\ninstall these at the appropriate locations for your system.\n\nTo manually install the `bash` completion, copy\n`misc/completion/ocrmypdf.bash` to `/etc/bash_completion.d/ocrmypdf`\n(rename the file).\n\nTo manually install the `fish` completion, copy\n`misc/completion/ocrmypdf.fish` to\n`~/.config/fish/completions/ocrmypdf.fish`.\n\n## Note on 32-bit support\n\nWe don't support any 32-bit system, including 32-bit Python or 32-bit\nGhostscript on Windows."
  },
  {
    "path": "docs/introduction.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# Introduction\n\nOCRmyPDF is a Python application and library that adds text \"layers\" to images in\nPDFs, making scanned image PDFs searchable. It uses OCR to guess the text\ncontained in images. OCRmyPDF also supports plugins\nthat enable customization of its processing steps, and it is highly tolerant\nof PDFs containing scanned images and \"born digital\" content that doesn't\nrequire text recognition.\n\n## About OCR\n\n[Optical character\nrecognition](https://en.wikipedia.org/wiki/Optical_character_recognition)\nis a technology that converts images of typed or handwritten text, such as\nin a scanned document, into computer text that can be selected, searched and copied.\n\nOCRmyPDF uses\n[Tesseract](https://github.com/tesseract-ocr/tesseract), a widely\navailable open source OCR engine, to perform OCR.\n\n(raster-vector)=\n\n## About PDFs\n\nPDFs are page description files that attempt to preserve a layout\nexactly. They contain [vector\ngraphics](http://vector-conversions.com/vectorizing/raster_vs_vector.html)\nthat can contain raster objects, such as scanned images. Because PDFs can\ncontain multiple pages (unlike many image formats) and can contain fonts\nand text, they are a suitable format for exchanging scanned documents.\n\n:::{image} images/bitmap_vs_svg.svg\n:::\n\nA PDF page may contain multiple images, even if it appears to have only\none image. 
Some scanners or scanning software may segment pages into\nmonochromatic text and color regions, for example, to enhance the compression\nratio and appearance of the page.\n\nRasterizing a PDF is the process of generating corresponding raster images.\nOCR engines like Tesseract work with images, not scalable vector graphics\nor mixed raster-vector-text graphics such as PDF.\n\n## About PDF/A\n\n[PDF/A](https://en.wikipedia.org/wiki/PDF/A) is an ISO-standardized\nsubset of the full PDF specification that is designed for archiving (the\n'A' stands for Archive). PDF/A differs from PDF primarily by omitting\nfeatures that could complicate future file readability,\nsuch as embedded Javascript, video, audio and references to external\nfonts. All fonts and resources needed to interpret the PDF must be\ncontained within it. Because PDF/A disables Javascript and other types\nof embedded content, it is likely more secure.\n\nThere are various conformance levels and versions, such as \"PDF/A-2b\".\n\nIn general, the preferred format for scanned documents is PDF/A. Some\ngovernments and jurisdictions, US Courts in particular, [mandate the use\nof PDF/A](https://pdfblog.com/2012/02/13/what-is-pdfa/) for scanned\ndocuments.\n\nSince most individuals scanning documents aim for long-term readability,\nOCRmyPDF defaults to generating PDF/A-2b.\n\nPDF/A does have a few drawbacks. Some PDF viewers display an alert\nindicating that the file is in PDF/A format, which may confuse some users.\nAdditionally, it tends to result in larger files than standard PDFs because\nit embeds certain resources, even if they are widely available. PDF/A\nfiles can be digitally signed but may not be encrypted to ensure future\nreadability. 
Fortunately, converting from PDF/A to a regular PDF is\nstraightforward, and any PDF viewer can handle PDF/A files.\n\n## What OCRmyPDF does\n\nOCRmyPDF analyzes each page of a PDF to determine the required colorspace\nand resolution (DPI) for capturing all the information on that page without\nlosing content. It uses a PDF rasterizer (pypdfium2 or\n[Ghostscript](http://ghostscript.com/)) to convert each page to an image and\nsubsequently performs OCR on the rasterized image to generate an OCR \"layer.\"\nThis layer is then integrated back into the original PDF.\n\n:::{versionchanged} 17.0.0\nOCRmyPDF now supports pypdfium2 as an alternative rasterizer to Ghostscript.\npypdfium2 is a Python binding for pdfium, the PDF rendering library used by\nGoogle Chrome. The `--rasterizer auto` setting (default) prefers pypdfium2\nwhen available.\n:::\n\nWhile it is possible to use a program like Ghostscript or ImageMagick to\nobtain an image and then run that image through Tesseract OCR, this process\nactually generates a new PDF, potentially resulting in the loss of various\ndetails (such as the document's metadata). In contrast, OCRmyPDF can produce\na minimally altered PDF as the output.\n\nOCRmyPDF also offers several image processing options, such as deskew, which\nenhances the visual quality of files and the accuracy of OCR. When these\noptions are utilized, the OCR layer is integrated into the processed image.\n\nBy default, OCRmyPDF generates archival PDFs in the PDF/A format, which is\na more rigid subset of PDF features designed for long-term archives. If you\nprefer regular PDFs, you can disable this feature using the\n`--output-type pdf` option.\n\n## Why you shouldn't do this manually\n\nA PDF is similar to an HTML file, in that it contains document structure\nalong with images. 
While some PDFs may solely display a full-page image,\nthey often contain additional content that would be forfeited if not preserved.\n\nA manual process could take one of these approaches:\n\n1. Rasterize each page as an image, perform OCR on the images, and then merge the\n   output into a PDF. This method preserves the layout of each page, but\n   resamples all images potentially leading to quality loss, increased file size,\n   and the introduction of compression artifacts, among other issues.\n2. Extract each image, OCR, and combine the output into a PDF. This approach\n   loses the context in which images are used in the PDF, potentially resulting\n   in loss of information related to scaling and position of images. Some scanned\n   PDFs contain multiple images segmented into black and white, grayscale\n   and color regions, with stencil masks to prevent overlap, as this can\n   enhance the appearance of a file while reducing file size.\n   Reassembling these images can be challenging, and risks losing vector art\n   or text that is not part of an image.\n\nIn cases where a PDF solely serves as a container for images without any\nrotation, scaling, or cropping, the second approach can be lossless.\n\nOCRmyPDF uses various strategies depending on input options and the input PDF\nitself. Generally, it rasterizes a page for OCR and then integrates the OCR\ndata back into the original PDF. This approach allows it to handle complex\nPDFs and preserve their content as much as possible.\n\nFurthermore, OCRmyPDF supports a wide range of edge cases that have emerged\nduring several years of development. It accommodates PDF features like\nimages within Form XObjects and pages with UserUnit scaling. It also\nsupports less common image formats like non-monochrome 1-bit images and\nprovides warnings about files you may not want to OCR. Thanks to tools\nlike pikepdf and QPDF, it can auto-repair damaged PDFs. 
You don't need to\nunderstand the intricacies of these issues; you should be able to use\nOCRmyPDF with any PDF file, and expect reasonable results.\n\n## Limitations\n\nOCRmyPDF is subject to limitations imposed by the Tesseract OCR engine.\nThese limitations are inherent to any software relying on Tesseract:\n\n- The OCR accuracy may not match that of commercial OCR solutions.\n- It is incapable of recognizing handwriting.\n- It may detect gibberish and report it as OCR output.\n- Results may be subpar when a document contains languages not specified\n  in the `-l LANG` argument.\n- Tesseract may struggle to analyze the natural reading order of documents.\n  For instance, it might fail to recognize two columns in a document and\n  attempt to join text across columns.\n- Poor quality scans can result in subpar OCR quality. In other words, the\n  quality of the OCR output depends on the quality of the input.\n- Tesseract does not provide information about the font family to which text\n  belongs.\n- Tesseract does not divide text into paragraphs or headings. It only provides\n  the text and its bounding box. As such, the generated PDF does not\n  contain any information about the document's structure.\n\n### Ghostscript considerations\n\n:::{versionchanged} 17.0.0\nGhostscript is no longer strictly required. OCRmyPDF can use pypdfium2\nfor rasterization and verapdf for PDF/A validation.\n:::\n\nWhile Ghostscript remains a capable and feature-rich tool with a long history,\nrecent releases have introduced some compatibility challenges that OCRmyPDF\nv17 addresses through alternative codepaths. When Ghostscript is used:\n\n- PDFs containing JPEG 2000-encoded content may be converted to JPEG\n  encoding, which may introduce compression artifacts, if Ghostscript\n  PDF/A is enabled.\n- Ghostscript may transcode grayscale and color images, potentially\n  lossily, based on an internal algorithm. 
This\n  behavior can be suppressed by setting `--pdfa-image-compression` to\n  `jpeg` or `lossless` to set all images to one type or the other.\n  Ghostscript lacks an option to maintain the input image's format.\n  (Modern Ghostscript can copy JPEG images without transcoding them.)\n- Ghostscript's PDF/A conversion removes any XMP metadata that is not\n  one of the standard XMP metadata namespaces for PDFs. In particular,\n  PRISM Metadata is removed.\n- Ghostscript's PDF/A conversion may remove or deactivate\n  hyperlinks and other active content.\n\nWhen pypdfium2 and verapdf are available, many of these limitations can be\navoided by using the speculative PDF/A conversion path (enabled by default\nwith `--output-type auto`).\n\nYou can use `--output-type pdf` to disable PDF/A conversion and produce\na standard, non-archival PDF.\n\nRegarding OCRmyPDF itself:\n\n- PDFs using transparency are not currently represented in the test\n  suite.\n\n## Similar programs\n\nTo the author's knowledge, OCRmyPDF is the most feature-rich and\nthoroughly tested command line OCR PDF conversion tool. If it does not\nmeet your needs, contributions and suggestions are welcome.\n\nGhostscript recently added three \"pdfocr\" output devices. They work by\nrasterizing all content and converting all pages to a single colour space.\n\n## Web front-ends\n\nThe Docker image of OCRmyPDF provides a web service front-end\nthat allows files to be submitted over HTTP, and the results can be downloaded.\nThis is an HTTP server intended to demonstrate how OCRmyPDF can be\nintegrated into a web service. 
It is not intended to be deployed on the\npublic internet and does not provide any security measures.\n\nIn addition, the following third-party integrations are available:\n\n- [Paperless-ngx](https://docs.paperless-ngx.com/) is a free software\n  document management system that uses OCRmyPDF to perform OCR on\n  uploaded documents.\n- [Nextcloud OCR](https://github.com/janis91/ocr) is a free software\n  plugin for the Nextcloud private cloud software.\n\nOCRmyPDF is not designed to be secure against malware-bearing PDFs (see\n[Using OCRmyPDF online](ocr-service)). Users should ensure they\ncomply with OCRmyPDF's licenses and the licenses of all dependencies. In\nparticular, Ghostscript, which OCRmyPDF uses when it is installed, is\nlicensed under AGPLv3.\n"
  },
  {
    "path": "docs/jbig2.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n{#jbig2}\n\n# Installing the JBIG2 encoder\n\nMost Linux distributions do not include a JBIG2 encoder since JBIG2\nencoding was patented for a long time. All known JBIG2 US patents have\nexpired as of 2017, but it is possible that unknown patents exist.\n\nJBIG2 encoding is recommended for OCRmyPDF and is used to losslessly\ncreate smaller PDFs. If JBIG2 encoding is not available, lower quality\nCCITT encoding will be used for monochrome images.\n\nJBIG2 decoding is not patented and is performed automatically by most\nPDF viewers. It is widely supported and has been part of the PDF\nspecification since 2001.\n\nJBIG2 encoding is automatically provided by these OCRmyPDF packages:\n\n- Docker image (both Ubuntu and Alpine)\n- Snap package\n- ArchLinux AUR package\n- Alpine Linux package\n- Homebrew on macOS\n\nFor all other platforms, you would need to build the JBIG2 encoder from\nsource:\n\n:::{code} bash\ngit clone https://github.com/agl/jbig2enc\ncd jbig2enc\n./autogen.sh\n./configure && make\n[sudo] make install\n:::\n\nDependencies include libtoolize and libleptonica, which on Ubuntu\nsystems are packaged as libtool and libleptonica-dev. On Fedora (35)\nthey are packaged as libtool and leptonica-devel. For this to work,\nplease make sure to install `autotools`, `automake`, `libtool`, `pkg-config`\nand `leptonica` first if not already installed. Other dependencies might\nbe required depending on your system.\n\n:::{code} bash\n[sudo] apt install autotools-dev automake libtool libleptonica-dev pkg-config\n:::\n\n## JBIG2 Compression\n\nOCRmyPDF uses JBIG2 lossless compression for bitonal (black and white)\nimages. This provides excellent compression ratios compared to the older\nCCITT G4 standard, while preserving the exact pixel content of the\noriginal image.\n\nYou can adjust the threshold for JBIG2 compression with\n`--jbig2-threshold`. 
The default is 0.85.\n\n:::{note}\nPrevious versions of OCRmyPDF supported a lossy JBIG2 mode\n(`--jbig2-lossy`). This feature has been removed due to the well-known\nrisk of character substitution errors (e.g., 6/8 confusion). See\n[JBIG2 disadvantages](https://en.wikipedia.org/wiki/JBIG2#Disadvantages)\nfor more information on why lossy JBIG2 is problematic. The `--jbig2-lossy`\nand `--jbig2-page-group-size` arguments are now ignored with a warning.\n:::\n"
  },
  {
    "path": "docs/languages.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n(lang-packs)=\n\n# Installing additional language packs\n\nOCRmyPDF uses Tesseract for OCR, and relies on its language packs for all languages.\nOn most platforms, English is installed with Tesseract by default, but not always.\n\nTesseract supports [most\nlanguages](https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc#languages).\nLanguages are identified by standardized three-letter codes (called ISO 639-2 Alpha-3).\nTesseract's documentation also lists the three-letter code for your language.\nSome are anglicized, e.g. Spanish is `spa` rather than `esp`, while others\nare not, e.g. German is `deu` and French is `fra`.\n\nLanguage packs (strictly speaking, Tesseract \"traineddata\" files) generally correspond\nto the language in question, but different language packs are used in certain\nsituations. For German, the \"Fraktur\" language pack can assist with reading older\nmaterials in the Fraktur typeface family (`deu_frak`). Some communities have changed\ntheir script from Cyrillic to Latin; the Cyrillic version of Uzbek is available\nas `uzb_cyrl` and the Latin version is `uzb`.\n\nAfter you have installed a language pack, you can use it with `ocrmypdf -l <language>`,\nfor example `ocrmypdf -l spa`. For multilingual documents, you can specify\nall languages to be expected, e.g. `ocrmypdf -l eng+fra` for English and French.\nEnglish is assumed by default unless other language(s) are specified.\n\nFor Linux users, you can often find packages that provide language\npacks.\n\n## Platform install steps\n\n### Debian and Ubuntu (apt)\n\n```bash\n# Display a list of all Tesseract language packs\napt-cache search tesseract-ocr\n\n# Install Chinese Simplified language pack\napt-get install tesseract-ocr-chi-sim\n```\n\nYou can then pass the `-l LANG` argument to OCRmyPDF to give a hint as\nto what languages it should search for. 
Multiple languages can be\nrequested using either `-l eng+fra` (English and French) or\n`-l eng -l fra`.\n\n### Fedora\n\n```bash\n# Display a list of all Tesseract language packs\ndnf search tesseract\n\n# Install Chinese Simplified language pack\ndnf install tesseract-langpack-chi_sim\n```\n\nYou can then pass the `-l LANG` argument to OCRmyPDF to give a hint as\nto what languages it should search for. Multiple languages can be\nrequested using either `-l eng+fra` (English and French) or\n`-l eng -l fra`.\n\n### Arch Linux\n\n```bash\n# Display a list of all Tesseract language packs\npacman -Ss tesseract-data\n\n# Install German language pack\npacman -S tesseract-data-deu\n```\n\nYou can then pass the `-l LANG` argument to OCRmyPDF to give a hint as\nto what languages it should search for. Multiple languages can be\nrequested using either `-l eng+fra` (English and French) or\n`-l eng -l fra`.\n\n### Gentoo\n\nOn Gentoo the package `app-text/tessdata_fast`, which `app-text/tesseract` depends on, handles Tesseract languages.\nIt accepts USE flags to select what languages should be installed, these can be set in `/etc/portage/package.use`.\nAlternatively one can globally set the [L10N use extension](https://wiki.gentoo.org/wiki/Localization/Guide#L10N) in `/etc/portage/make.conf`.\nThis enables these languages for all packages (e.g. including aspell).\n\n```bash\n# Display a list of all Tesseract language packs\nequery uses app-text/tessdata_fast\n\n# Add English and German language support for Tesseract only\necho 'app-text/tessdata_fast l10n_de l10n_en' >> /etc/portage/package.use\n\n# Add global English and German language support (the `l10n_` from equery has to be omitted)\necho L10N=\"de en\" >> /etc/portage/make.conf\n\n# update system to reflect changed USE flags\nemerge --update --deep --newuse @world\n```\n\nYou can then pass the `-l LANG` argument to OCRmyPDF to give a hint as\nto what languages it should search for. 
Multiple languages can be\nrequested using either `-l eng+fra` (English and French) or\n`-l eng -l fra`.\n\n### macOS\n\nYou can install additional language packs by\n{ref}`installing Tesseract using Homebrew with all language packs <macos-all-languages>`.\n\n### Docker\n\nUsers of the OCRmyPDF Docker image should install language packs into a\nderived Docker image as\n{ref}`described in that section <docker-lang-packs>`.\n\n### Windows\n\nThe Tesseract installer provided by Chocolatey currently includes only English language.\nTo install other languages, download the respective language pack (`.traineddata` file)\nfrom <https://github.com/tesseract-ocr/tessdata/> and place it in\n`C:\\\\Program Files\\\\Tesseract-OCR\\\\tessdata` (or wherever Tesseract OCR is installed).\n\n## Custom language packs\n\nIf you have fine-tuned or trained Tesseract and generated custom trained data, you can\ncopy your `customlang.traineddata` file into your Tesseract \"tessdata\" folder, and\nthen use the `-l customlang` argument to tell OCRmyPDF to pass that language on to\nTesseract.\n"
  },
  {
    "path": "docs/maintainers.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# Maintainer notes\n\nThis is for those who package OCRmyPDF for downstream use. (Thank you\nfor your hard work.)\n\n## Known ports/packagers\n\nOCRmyPDF has been ported to many platforms already. If you are\ninterested in porting to a new platform, check with\n[Repology](https://repology.org/projects/?search=ocrmypdf) to see the\nstatus of that platform.\n\n### Make sure you can package pikepdf\n\npikepdf, created by the same author, is a mixed Python and C++14 package\nwith much stiffer build requirements. If you want to use OCRmyPDF on\nsome novel platform or distribution, first make sure you can package\npikepdf.\n\n### Core dependencies\n\n:::{versionchanged} 17.0.0\nGhostscript is no longer strictly required. OCRmyPDF now supports alternative\ncodepaths for both PDF rasterization and PDF/A conversion.\n:::\n\nOCRmyPDF has the following runtime dependencies:\n\n**For PDF rasterization** (converting PDF pages to images for OCR):\n\n- `pypdfium2` (Python package) - OR -\n- `ghostscript` (system binary)\n- Recommendation: Install both for best compatibility\n\n**For PDF/A conversion**:\n\n- `verapdf` (system binary) with pikepdf's speculative conversion - OR -\n- `ghostscript` (system binary)\n- Recommendation: Install both for best compatibility\n\n**For OCR**:\n- `tesseract-ocr` (system binary) - Required for MVP\n\n**For text rendering** (expressing OCR results in PDF):\n- `fpdf2` (Python package) - Required for text layer rendering\n- `uharfbuzz` (Python package) - Required for text layer rendering\n- `font-noto` (system package) - Recommended for text layer rendering\n\n**Other dependencies**:\n- `unpaper` (system binary) - Optional, enables `--clean` and `--clean-final`\n- `pngquant` (system binary) - Optional, enables `--optimize 2` and `--optimize 3`\n- `jbig2enc` (system binary) - Optional, improves compression of monochrome images\n\nWhile 
Ghostscript remains a capable and feature-rich tool with a long history,\nrecent releases have introduced some compatibility challenges that OCRmyPDF v17\naddresses through alternative codepaths. For the best user experience, packagers\nshould install both Ghostscript and the alternative tools (pypdfium2, verapdf)\nwhen available.\n\nOn Windows, OCRmyPDF will also check the registry for Tesseract and Ghostscript\nlocations.\n\nTesseract OCR relies on SIMD for performance and only has proper support\nfor this on ARM and x86\\_64. Performance may be poor on other processor\narchitectures.\n\n### Versioning scheme\n\nOCRmyPDF uses hatch-vcs for versioning, which derives the version from\nGit as a single source of truth. This may be unsuitable for some\ndistributions, e.g. to indicate that your distribution modifies OCRmyPDF\nin some way.\n\nYou can patch the `__version__` variable in `src/ocrmypdf/_version.py`\nif necessary, or set the environment variable\n`SETUPTOOLS_SCM_PRETEND_VERSION` to the required version, if you need to\noverride versioning for some reason.\n\n### jbig2enc\n\nOCRmyPDF will use jbig2enc, a JBIG2 encoder, if one can be found. Some\ndistributions have shied away from packaging JBIG2 because it contains\npatented algorithms, but all patents have expired since 2017. If\npossible, consider packaging it too to improve OCRmyPDF's compression.\n\n:::{note}\nLossy JBIG2 encoding has been removed in v17.0.0 due to well-documented\nrisks of character substitution errors. Previously we provided this feature\non a \"caveat emptor\" basis but in the interest of focusing and eliminating\nrisks, we decided to remove this option. 
Now, only lossless JBIG2 compression\nis supported.\n:::\n\n### Dependency matrix for packagers\n\n:::{versionadded} 17.0.0\n:::\n\nThe following table summarizes the dependency options introduced in v17.0.0:\n\n| Feature | Option 1 | Option 2 | Notes |\n|---------|----------|----------|-------|\n| PDF rasterization | pypdfium2 (Python) | ghostscript (binary) | pypdfium2 preferred when available |\n| PDF/A conversion | verapdf + pikepdf | ghostscript | verapdf validates speculative conversion |\n| Text rendering | fpdf2 (Python) | - | Required, replaces legacy hOCR renderer |\n| OCR | tesseract-ocr | `--ocr-engine none` | Can be skipped entirely |\n\n**Minimum viable installation:**\n\n- tesseract-ocr + (pypdfium2 OR ghostscript) + fpdf2\n\n**Recommended installation:**\n\n- tesseract-ocr + pypdfium2 + ghostscript + verapdf + fpdf2 + unpaper + pngquant + jbig2enc\n\n:::{warning}\nIf Ghostscript is not installed and verapdf is not available, PDF/A output\ncannot be produced. The output will be a standard PDF instead. 
This is a\nbreaking change for rare configurations that previously relied on PDF/A\noutput without Ghostscript alternatives.\n:::\n\n**Sample debian/control dependency specification**\n\n```\nDepends:\n fonts-noto,\n fpdf2 (>= 2.8),\n ghostscript (>= 9.55),  # Not strictly required, but best user experience\n icc-profiles-free,\n img2pdf,\n python3-coloredlogs,\n python3-deprecation,\n python3-pdfminer (>= 20181108+dfsg-3),\n python3-pikepdf (>= 8.14.0),\n python3-pil,\n python3-pluggy,\n python3-reportlab,\n python3-rich,\n python3-uharfbuzz,  # Not currently in Debian\n tesseract-ocr (>= 5.0.0),\n zlib1g,\n ${misc:Depends},\n ${python3:Depends},\nRecommends:\n cyclopts,   # Not currently in Debian\n jbig2,\n paddleocr,  # Not currently in Debian\n pngquant,\n pypdfium2,  # Not currently in Debian\n unpaper,\n verapdf,    # Not currently in Debian\nSuggests:\n ocrmypdf-doc,\n python-watchdog,\n```\n\n### Command line completions\n\nPlease ensure that command line completions are installed, as described\nin the installation documentation.\n\n### 32-bit Linux support\n\nIf you maintain a Linux distribution that supports 32-bit x86 or ARM,\nOCRmyPDF should continue to work as long as all of its dependencies\ncontinue to be available in 32-bit form. Please note we do not test on\n32-bit platforms.\n\n### HEIF/HEIC\n\nOCRmyPDF defaults to installing the pi-heif PyPI package, which supports\nconverting HEIF (High Efficiency Image File Format) images to PDF from\nthe command line. If your distribution does not have this library\navailable, you can exclude it and OCRmyPDF will gracefully degrade\nautomatically, losing only support for this feature.\n"
  },
  {
    "path": "docs/optimizer.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# PDF optimization\n\nOCRmyPDF includes an image-oriented PDF optimizer. By default, the\noptimizer runs with safe settings with the goal of improving compression\nat no loss of quality. At higher optimization levels, lossy\noptimizations may be applied and tuned. Optimization occurs after OCR,\nand only if OCR succeeded. It does not perform other possible\noptimizations such as deduplicating resources, consolidating fonts,\nsimplifying vector drawings, or anything of that nature.\n\n:::{list-table} OCRmyPDF optimization settings\n---\nwidths: 33 6 60\nheader-rows: 1\n---\n\n* - Optimization level\n  - Shorthand\n  - Description\n* - ``--optimize 0``\n  - ``-O0``\n  - Disable most optimizations.\n* - ``--optimize 1`` (default)\n  - ``-O1``\n  - Enables lossless optimizations, such as transcoding images to more\n      efficient formats. Also compress other uncompressed objects in the\n      PDF and enables the more efficient \"object streams\" within the PDF.\n* - ``--optimize 2``\n  - ``-O2``\n  - All of the above, and enables lossy optimizations and color quantization.\n* - ``--optimize 3``\n  - ``-O3``\n  - All of the above, and enables more aggressive optimizations and targets lower\n      image quality.\n:::\n\nThe exact type of optimizations performed will vary over time, and\ndepend on what third party tools are installed.\n\nDespite optimizations, OCRmyPDF might still increase the overall file\nsize, since it must embed information about the recognized text, and\ndepending on the settings chosen, may not be able to represent the\noutput file as compactly as the input file.\n\n## Optimizations that always occurs\n\nOCRmyPDF will automatically replace obsolete or inferior compression\nschemes such as RLE or LZW with superior schemes such as Deflate, and\nconvert monochrome images to CCITT G4. 
Since this is lossless, it always\noccurs and there is no way to disable it. Other non-image compressed\nobjects are compressed as well.\n\n## Fast web view\n\nOCRmyPDF automatically optimizes PDFs for \\\"fast web view\\\" in Adobe\nAcrobat\\'s parlance, or equivalently, linearizes PDFs so that the\nresources they reference are presented in the order a viewer needs them\nfor sequential display. This reduces the latency of viewing a PDF both\nonline and from local storage, in exchange for a slight increase in file\nsize.\n\nTo disable this optimization and all others, use\n`ocrmypdf --optimize 0 ...` or the shorthand `-O0`.\n\nAdobe Acrobat might not report the file as being \\\"fast web view\\\".\n\n## Lossless optimizations\n\nAt optimization level `-O1` (the default), OCRmyPDF will also attempt\nlossless image optimization.\n\nIf a JBIG2 encoder is available, then monochrome images will be\nconverted to JBIG2, with the potential for huge savings on large black\nand white images, since JBIG2 is far more efficient than any other\nmonochrome (bi-level) compression. (All known US patents related to\nJBIG2 have probably expired, but it remains the responsibility of the\nuser to supply a JBIG2 encoder such as\n[jbig2enc](https://github.com/agl/jbig2enc). OCRmyPDF does not implement\nJBIG2 encoding on its own.)\n\nOCRmyPDF currently does not attempt to recompress losslessly compressed\nobjects more aggressively.\n\n## Lossy optimizations\n\nAt optimization level `-O1`, `-O2` and `-O3`, OCRmyPDF will attempt some\nlossy image optimization.\n\nIf Ghostscript is used to create a PDF/A (the default), Ghostscript will\noptimize some images by converting them to JPEG, which is lossy. If\n`--output-type pdf` is used, there are no lossy optimizations. 
Ghostscript's\nJPEG conversion is quite safe.\n\nIf `pngquant` is installed, OCRmyPDF will use it to quantize\npaletted images to reduce their size.\n\nThe quality of JPEGs may be lowered, on the assumption that a lower\nquality image may be suitable for storage after OCR.\n\nIt is not possible to optimize all image types. Uncommon image types may\nbe skipped by the optimizer.\n"
  },
  {
    "path": "docs/pdfsecurity.md",
    "content": "(security)=\n\n# PDF security issues\n\n> OCRmyPDF should only be used on PDFs you trust. It is not designed to\n> protect you against malware.\n\nRecognizing that many users have an interest in handling PDFs and\napplying OCR to PDFs they did not generate themselves, this article\ndiscusses the security implications of PDFs and how users can protect\nthemselves.\n\nThe disclaimer applies: this software has no warranties of any kind.\n\n## PDFs may contain malware\n\nPDF is a rich, complex file format. The official PDF 1.7 specification,\nISO 32000:2008, is hundreds of pages long and references several annexes\neach of which are similar in length. PDFs can contain video, audio, XML,\nJavaScript and other programming, and forms. In some cases, they can\nopen internet connections to pre-selected URLs. All of these are\npossible attack vectors.\n\nIn short, PDFs [may contain\nviruses](https://security.stackexchange.com/questions/64052/can-a-pdf-file-contain-a-virus).\n\nIf you do not trust a PDF or its source, do not open it or use OCRmyPDF\non it. Consider using a Docker container or virtual machine to isolate\nan untrusted PDF from your system.\n\n## How OCRmyPDF processes PDFs\n\nOCRmyPDF must open and interpret your PDF in order to insert an OCR\nlayer. First, it runs all PDFs through\n[pikepdf](https://github.com/pikepdf/pikepdf), a library based on\n[QPDF](https://github.com/qpdf/qpdf), a program that repairs PDFs with\nsyntax errors. This is done because, in the author\\'s experience, a\nsignificant number of PDFs in the wild, especially those created by\nscanners, are not well-formed files. QPDF makes it more likely that\nOCRmyPDF will succeed, but offers no security guarantees. 
QPDF is also\nused to split the PDF into single page PDFs.\n\nFinally, OCRmyPDF rasterizes each page of the PDF using\n[Ghostscript](http://ghostscript.com/) in `-dSAFER` mode.\n\nDepending on the options specified, OCRmyPDF may graft the OCR layer\ninto the existing PDF or it may essentially reconstruct (\\\"re-fry\\\") a\nvisually identical PDF that may be quite different at the binary level.\nThat said, OCRmyPDF is not a tool designed for sanitizing PDFs.\n\n## Password protected PDFs\n\nPassword protected PDFs usually have two passwords, an owner and a user\npassword. When the user password is set to empty, PDF readers will open\nthe file automatically and mark it as \\\"(SECURED)\\\". Password security\ncan also request certain restrictions on the PDF, but anyone can remove\nthese restrictions if they have either the owner *or* user password.\nPasswords mainly present a barrier for casual users.\n\nOCRmyPDF cannot remove passwords from PDFs. If you want to remove a\npassword from a PDF, you must use other software, such as `qpdf`.\n\nIf the owner and user password are set, a password is required for\n`qpdf`. If only the owner password is set, then the password can be\nstripped, even if one does not have the owner password. To remove the\npassword from a PDF using QPDF, use:\n\n:::{code} bash\nqpdf --decrypt --password='abc123' input.pdf no_password.pdf\n:::\n\nThen you can run OCRmyPDF on the file.\n\nIn its default mode, OCRmyPDF generates PDF/A. Passwords may not be set\non PDF/A documents. If you want to set a password on the output PDF, you\nmust specify `--output-type pdf`.\n\n## Signature images\n\nMany programs exist which are capable of inserting an image of\nsomeone\\'s signature. On its own, this offers no security guarantees. 
It\nis trivial to remove the signature image and apply it to other files.\nThis practice offers no real security.\n\n## Digital signatures\n\nImportant documents can be digitally signed and certified to attest to\ntheir authorship, approval or execution of a legal agreement. OCRmyPDF\nwill detect signed PDFs and will not modify them, unless the\n`--invalidate-digital-signatures` option is used, which will invalidate\nany signatures. (The signature may still be present in the PDF if\nopened, but PDF readers will not validate it.)\n\nA digital signature adds a cryptographic hash of the document to the\ndocument, so tamper protection is provided. That also precludes OCRmyPDF\nfrom modifying the document and preserving the signature.\n\nDigital signatures are not the same as a signature image. A digital\nsignature is a cryptographic hash of the document that is encrypted with\nthe author\\'s private key. The signature is decrypted with the author\\'s\npublic key. The public key is usually distributed by a certificate\nauthority. The signature is then verified by the PDF reader. If the\ndocument is modified, the signature will be invalidated.\n\n## Certificate-encrypted PDFs\n\nPDFs can be encrypted with a certificate. This is a more secure form of\nencryption than a password. The certificate is usually issued by a\ncertificate authority. A certificate is used to encrypt the document\nusing the public key for the benefit of a specific recipient who\npossesses the private key.\n\nOCRmyPDF cannot open certificate-encrypted PDFs. If you have the\ncertificate, you can use other PDF software, such as Acrobat, to decrypt\nthe PDF.\n"
  },
  {
    "path": "docs/performance.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# Performance\n\nSome users have noticed that current versions of OCRmyPDF do not run as\nquickly as some older versions (specifically 6.x and older). This is\nbecause OCRmyPDF added image optimization as a postprocessing step, and\nit is enabled by default.\n\n## Speed\n\nIf running OCRmyPDF quickly is your main goal, you can use settings such\nas:\n\n-   `--optimize 0` to disable file size optimization\n-   `--output-type pdf` to disable PDF/A generation\n-   `--fast-web-view 999999` to disable fast web view optimization\n-   `--skip-big` to skip large images, if some pages have large images\n\nYou can also avoid:\n\n-   `--force-ocr`\n-   Image preprocessing\n"
  },
  {
    "path": "docs/plugins.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# Plugins\n\n> The key words \"MUST\", \"MUST NOT\", \"REQUIRED\", \"SHALL\", \"SHALL\n> NOT\", \"SHOULD\", \"SHOULD NOT\", \"RECOMMENDED\", \"MAY\", and\n> \"OPTIONAL\" in this document are to be interpreted as described in\n> RFC 2119.\n\nYou can use plugins to customize the behavior of OCRmyPDF at certain points of\ninterest.\n\nCurrently, it is possible to:\n\n- add new command line arguments\n- override the decision for whether or not to perform OCR on a particular file\n- modify the image is about to be sent for OCR\n- modify the page image before it is converted to PDF\n- replace the Tesseract OCR with another OCR engine that has similar behavior\n- replace Ghostscript with another PDF to image converter (rasterizer) or\n  PDF/A generator\n\nOCRmyPDF plugins are based on the Python `pluggy` package and conform to its\nconventions. Note that: plugins installed with as setuptools entrypoints are\nnot checked currently, because OCRmyPDF assumes you may not want to enable\nplugins for all files.\n\nSee \\[OCRmyPDF-EasyOCR\\](<https://github.com/ocrmypdf/OCRmyPDF-EasyOCR>) for an\nexample of a straightforward, fully working plugin.\n\n## Script plugins\n\nScript plugins may be called from the command line, by specifying the name of a file.\nScript plugins may be convenient for informal or \"one-off\" plugins, when a certain\nbatch of files needs a special processing step for example.\n\n```bash\nocrmypdf --plugin ocrmypdf_example_plugin.py input.pdf output.pdf\n```\n\nMultiple plugins may be installed by issuing the `--plugin` argument multiple times.\n\n## Packaged plugins\n\nInstalled plugins may be installed into the same virtual environment as OCRmyPDF\nis installed into. 
They may be invoked using Python standard module naming.\nIf you are intending to distribute a plugin, please package it.\n\n```bash\nocrmypdf --plugin ocrmypdf_fancypants.pockets.contents input.pdf output.pdf\n```\n\nOCRmyPDF does not automatically import plugins, because the assumption is that\nplugins affect different files differently and you may not want them activated\nall the time. The command line or `ocrmypdf.ocr(plugin='...')` must call\nfor them.\n\nThird parties that wish to distribute packages for ocrmypdf should package them\nas packaged plugins, and these modules should begin with the name `ocrmypdf_`\nsimilar to `pytest` packages such as `pytest-cov` (the package) and\n`pytest_cov` (the module).\n\n:::{note}\nWe recommend plugin authors name their plugins with the prefix\n`ocrmypdf-` (for the package name on PyPI) and `ocrmypdf_` (for the\nmodule), just like pytest plugins. At the same time, please make it clear\nthat your package is not official.\n:::\n\n## Plugins\n\nYou can also create a plugin that OCRmyPDF will always automatically load if both are\ninstalled in the same virtual environment, using a project entrypoint.\nOCRmyPDF uses the entrypoint namespace \"ocrmypdf\".\n\nFor example, `pyproject.toml` would need to contain the following, for a plugin named\n`ocrmypdf-exampleplugin`:\n\n```toml\n[project]\nname = \"ocrmypdf-exampleplugin\"\n\n[project.entry-points.\"ocrmypdf\"]\nexampleplugin = \"exampleplugin.pluginmodule\"\n```\n\n## Plugin requirements\n\nOCRmyPDF generally uses multiple worker processes. When a new worker is started,\nPython will import all plugins again, including all plugins that were imported earlier.\nThis means that the global state of a plugin in one worker will not be shared with\nother workers. As such, plugin hook implementations should be stateless, relying\nonly on their inputs. 
Hook implementations may use their input parameters\nto obtain a reference to shared state prepared by another hook implementation.\nPlugins must expect that other instances of the plugin will be running\nsimultaneously.\n\nThe `context` object that is passed to many hooks can be used to share information\nabout a file being worked on. Plugins must write private, plugin-specific data to\na subfolder named `{options.work_folder}/ocrmypdf-plugin-name`. Plugins MAY\nread and write files in `options.work_folder`, but should be aware that their\nsemantics are subject to change.\n\nOCRmyPDF will delete `options.work_folder` when it has finished OCRing\na file, unless invoked with `--keep-temporary-files`.\n\nThe documentation for some plugin hooks contains a detailed description of the\nexecution context in which they will be called.\n\nPlugins should be prepared to work whether executed in worker threads or worker\nprocesses. Generally, OCRmyPDF uses processes, but has a semi-hidden threaded\nargument that simplifies debugging.\n\n## Plugin hooks\n\nA plugin may provide the following hooks. Hooks must be decorated with\n`ocrmypdf.hookimpl`, for example:\n\n```python\nfrom ocrmypdf import hookimpl\n\n@hookimpl\ndef add_options(parser):\n    pass\n```\n\nThe following is a complete list of hooks that are available, and when\nthey are called.\n\n(firstresult)=\n\n**Note on firstresult hooks**\n\nIf multiple plugins install implementations for this hook, they will be called in\nthe reverse of the order in which they are installed (i.e., last plugin wins).\nWhen each hook implementation is called in order, the first implementation that\nreturns a value other than `None` will \"win\" and prevent execution of all other\nhooks. As such, you cannot \"chain\" a series of plugin filters together in this\nway. 
Instead, a single hook implementation should be responsible for any such\nchaining operations.\n\n## Examples\n\n- OCRmyPDF's test suite contains several plugins that are used to simulate certain\n  test conditions.\n- [ocrmypdf-papermerge](https://github.com/papermerge/OCRmyPDF_papermerge) is\n  a production plugin that integrates OCRmyPDF and the Papermerge document\n  management system.\n\n### Suppressing or overriding other plugins\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.initialize\n```\n\n### Custom command line arguments\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.add_options\n```\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.check_options\n```\n\n### Plugin option models\n\nPlugins can define their own option models using Pydantic. This allows plugins to:\n\n- Define type-safe option structures with validation\n- Add CLI arguments that map to their option model fields\n- Access options via nested namespaces (e.g., `options.tesseract.timeout`)\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.register_options\n```\n\nPlugin options can be accessed in two ways:\n\n1. **Flat access** (backward compatible): `options.tesseract_timeout`\n2. **Nested access**: `options.tesseract.timeout`\n\nBoth access patterns are equivalent and return the same values.\n\n:::{note}\n**Plugin Interface Change**: Starting in OCRmyPDF v17.0.0, plugin hooks receive\n`OcrOptions` objects instead of `argparse.Namespace` objects. 
Most plugins will\ncontinue working due to duck-typing compatibility, but plugin developers should\nupdate their type hints accordingly.\n:::\n\n### Migration guide for plugin developers\n\n:::{versionadded} 17.0.0\n:::\n\n**Update imports:**\n\n```python\nfrom ocrmypdf._options import OcrOptions\n```\n\n**Update type hints:**\n\n```python\n# Before (v16 and earlier)\ndef check_options(options: argparse.Namespace) -> None:\n    ...\n\n# After (v17+)\ndef check_options(options: OcrOptions) -> None:\n    ...\n```\n\n**Attribute access unchanged:**\n\n```python\n# These work exactly as before\noptions.languages\noptions.output_type\noptions.tesseract_timeout\n```\n\n**Remove in-place modifications:**\n\n```python\n# Before (v16 pattern - no longer recommended)\ndef check_options(options):\n    options.some_computed_value = compute_value(options)\n\n# After (v17 pattern - compute at point of use)\ndef some_function(options):\n    computed = compute_value(options)\n    use_computed(computed)\n```\n\n### Execution and progress reporting\n\n```{eval-rst}\n.. autoclass:: ocrmypdf.pluginspec.ProgressBar\n    :members:\n    :special-members: __init__, __enter__, __exit__\n```\n\n```{eval-rst}\n.. autoclass:: ocrmypdf.pluginspec.Executor\n    :members:\n    :special-members: __call__\n```\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.get_logging_console\n```\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.get_executor\n```\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.get_progressbar_class\n```\n\n### Applying special behavior before processing\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.validate\n```\n\n### PDF page to image\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.rasterize_pdf_page\n```\n\n### Modifying intermediate images\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.filter_ocr_image\n```\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.filter_page_image\n```\n\n```{eval-rst}\n.. 
autofunction:: ocrmypdf.pluginspec.filter_pdf_page\n```\n\n### OCR engine\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.get_ocr_engine\n```\n\n```{eval-rst}\n.. autoclass:: ocrmypdf.pluginspec.OcrEngine\n    :members:\n\n    .. automethod:: __str__\n```\n\n```{eval-rst}\n.. autoclass:: ocrmypdf.pluginspec.OrientationConfidence\n```\n\n### PDF/A production\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.generate_pdfa\n```\n\n### PDF optimization\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.optimize_pdf\n```\n\n```{eval-rst}\n.. autofunction:: ocrmypdf.pluginspec.is_optimization_enabled\n```\n\n### Working with OcrElement trees\n\n:::{versionadded} 17.0.0\n:::\n\nOCRmyPDF v17 introduces the `OcrElement` dataclass for representing OCR\noutput in an engine-agnostic format. This enables plugins to work with\nOCR results without parsing hOCR XML.\n\n**Key classes:**\n\n```python\nfrom ocrmypdf import OcrElement, OcrClass, BoundingBox\n\n# OcrElement - represents any OCR structural unit\npage = OcrElement(\n    ocr_class=OcrClass.PAGE,\n    bbox=BoundingBox(0, 0, 612, 792),\n    children=[...]\n)\n\n# BoundingBox - axis-aligned bounding box (left, top, right, bottom)\nbbox = BoundingBox(left=100, top=50, right=300, bottom=80)\n\n# OcrClass - constants for element types\nOcrClass.PAGE      # \"ocr_page\"\nOcrClass.LINE      # \"ocr_line\"\nOcrClass.WORD      # \"ocrx_word\"\nOcrClass.PARAGRAPH # \"ocr_par\"\n```\n\n**Navigating the tree:**\n\n```python\n# Get all words in a page\nwords = page.words  # Returns list[OcrElement]\n\n# Get all lines\nlines = page.lines\n\n# Get combined text\ntext = page.get_text_recursive()\n\n# Iterate by class\nfor para in page.paragraphs:\n    print(para.get_text_recursive())\n```\n\n**OCR engine plugins:**\n\nPlugins implementing custom OCR engines can now output `OcrElement` trees\ndirectly via the `generate_ocr()` method, bypassing hOCR entirely:\n\n```python\nfrom pathlib import Path\nfrom 
ocrmypdf.pluginspec import OcrEngine\nfrom ocrmypdf import OcrElement, OcrClass, BoundingBox\n\nclass MyOcrEngine(OcrEngine):\n    def generate_ocr(\n        self,\n        input_file: Path,\n        options,\n        context,\n    ) -> OcrElement:\n        # Perform OCR and return OcrElement tree directly\n        # No need to generate hOCR XML\n        return OcrElement(\n            ocr_class=OcrClass.PAGE,\n            bbox=BoundingBox(0, 0, width, height),\n            dpi=300,\n            children=[\n                OcrElement(\n                    ocr_class=OcrClass.LINE,\n                    bbox=BoundingBox(100, 50, 500, 80),\n                    children=[\n                        OcrElement(\n                            ocr_class=OcrClass.WORD,\n                            bbox=BoundingBox(100, 50, 200, 80),\n                            text=\"Hello\",\n                        ),\n                        # ... more words\n                    ]\n                ),\n                # ... more lines\n            ]\n        )\n\n    def supports_generate_ocr(self) -> bool:\n        return True  # Indicate this engine uses generate_ocr()\n```\n\nThis approach is simpler than generating hOCR and allows modern OCR\nengines to integrate more naturally with OCRmyPDF.\n"
  },
  {
    "path": "docs/releasenotes/index.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# Release notes\n\nOCRmyPDF uses [semantic versioning](http://semver.org/) for its\ncommand line interface and its public API.\n\nOCRmyPDF's output messages are not considered part of the stable interface -\nthat is, output messages may be improved at any release level, so parsing them\nmay be unreliable. Use the API to depend on precise behavior.\n\nThe public API may be useful in scripts that launch OCRmyPDF processes or that\nwish to use some of its features for working with PDFs.\n\nThe most recent release of OCRmyPDF is ![version](https://img.shields.io/pypi/v/ocrmypdf.svg). Any newer versions\nreferred to in these notes may exist the main branch but have not been\ntagged yet.\n\nOCRmyPDF typically supports the three most recent Python versions.\n\n:::{note}\nAttention maintainers: these release notes may be updated with information\nabout a forthcoming release that has not been tagged yet. A release is only\nofficial when it's tagged and posted to PyPI.\n:::\n\n```{toctree}\n:glob: true\n:maxdepth: 1\n:reversed: true\n\nversion*\n```\n"
  },
  {
    "path": "docs/releasenotes/version02.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v2\n\n## v2.2-stable (2014-09-29)\n\nOCRmyPDF versions 1 and 2 were implemented as shell scripts. OCRmyPDF\n3.0+ is a fork that gradually replaced all shell scripts with Python\nwhile maintaining the existing command line arguments. No one is\nmaintaining old versions.\n\nFor details on older versions, see the [final version of its release\nnotes](https://github.com/fritz-hh/OCRmyPDF/blob/7fd3dbdf42ca53a619412ce8add7532c5e81a9d1/RELEASE_NOTES.md).\n"
  },
  {
    "path": "docs/releasenotes/version03.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v3\n\n## v3.2.1\n\nChanges\n\n- Fixed {issue}`47`\n  \"convert() got and unexpected keyword argument 'dpi'\" by upgrading to\n  img2pdf 0.2\n- Tweaked the Dockerfiles\n\n## v3.2\n\nNew features\n\n- Lossless reconstruction: when possible, OCRmyPDF will inject text\n  layers without otherwise manipulating the content and layout of a PDF\n  page. For example, a PDF containing a mix of vector and raster\n  content would see the vector content preserved. Images may still be\n  transcoded during PDF/A conversion. (`--deskew` and\n  `--clean-final` disable this mode, necessarily.)\n- New argument `--tesseract-pagesegmode` allows you to pass page\n  segmentation arguments to Tesseract OCR. This helps for two column\n  text and other situations that confuse Tesseract.\n- Added a new \"polyglot\" version of the Docker image, that generates\n  Tesseract with all languages packs installed, for the polyglots among\n  us. It is much larger.\n\nChanges\n\n- JPEG transcoding quality is now 95 instead of the default 75. Bigger\n  file sizes for less degradation.\n\n## v3.1.1\n\nChanges\n\n- Fixed bug that caused incorrect page size and DPI calculations on\n  documents with mixed page sizes\n\n## v3.1\n\nChanges\n\n- Default output format is now PDF/A-2b instead of PDF/A-1b\n- Python 3.5 and macOS El Capitan are now supported platforms - no\n  changes were needed to implement support\n- Improved some error messages related to missing input files\n- Fixed {issue}`20`: uppercase .PDF extension not accepted\n- Fixed an issue where OCRmyPDF failed to text that certain pages\n  contained previously OCR'ed text, such as OCR text produced by\n  Tesseract 3.04\n- Inserts /Creator tag into PDFs so that errors can be traced back to\n  this project\n- Added new option `--pdf-renderer=auto`, to let OCRmyPDF pick the\n  best PDF renderer. 
Currently it always chooses the 'hocrtransform'\n  renderer but that behavior may change.\n- Set up Travis CI automatic integration testing\n\n## v3.0\n\nNew features\n\n- Easier installation with a Docker container or Python's `pip`\n  package manager\n- Eliminated many external dependencies, so it's easier to set up\n- Now installs `ocrmypdf` to `/usr/local/bin` or equivalent for\n  system-wide access and easier typing\n- Improved command line syntax and usage help (`--help`)\n- Tesseract 3.03+ PDF page rendering can be used instead for better\n  positioning of recognized text (`--pdf-renderer tesseract`)\n- PDF metadata (title, author, keywords) are now transferred to the\n  output PDF\n- PDF metadata can also be set from the command line (`--title`,\n  etc.)\n- Automatically repairs malformed input PDFs if possible\n- Added test cases to confirm everything is working\n- Added option to skip extremely large pages that take too long to OCR\n  and are often not OCRable (e.g. large scanned maps or diagrams);\n  other pages are still processed (`--skip-big`)\n- Added option to kill Tesseract OCR process if it seems to be taking\n  too long on a page, while still processing other pages\n  (`--tesseract-timeout`)\n- Less common colorspaces (CMYK, palette) are now supported by\n  conversion to RGB\n- Multiple images on the same PDF page are now supported\n\nChanges\n\n- New, robust rewrite in Python 3.4+ with\n  [ruffus](http://www.ruffus.org.uk/index.html) pipelines\n\n- Now uses Ghostscript 9.14's improved color conversion model to\n  preserve PDF colors\n\n- OCR text is now rendered in the PDF as invisible text. 
Previous\n  versions of OCRmyPDF incorrectly rendered visible text with an image\n  on top.\n\n- All \"tasks\" in the pipeline can be executed in parallel on any\n  available CPUs, increasing performance\n\n- The `-o DPI` argument has been phased out, in favor of\n  `--oversample DPI`, in case we need `-o OUTPUTFILE` in the future\n\n- Removed several dependencies, so it's easier to install. We no longer\n  use:\n\n  - GNU [parallel](https://www.gnu.org/software/parallel/)\n  - [ImageMagick](http://www.imagemagick.org/script/index.php)\n  - Python 2.7\n  - Poppler\n  - [MuPDF](http://mupdf.com/docs/) tools\n  - shell scripts\n  - Java and [JHOVE](http://jhove.sourceforge.net/)\n  - libxml2\n\n- Some new external dependencies are required or optional, compared to\n  v2.x:\n\n  - Ghostscript 9.14+\n  - [qpdf](http://qpdf.sourceforge.net/) 5.0.0+\n  - [Unpaper](https://github.com/Flameeyes/unpaper) 6.1 (optional)\n  - some automatically managed Python packages\n\nRelease candidates\n\n- rc9:\n\n  - Fix\n    {issue}`118`:\n    report error if ghostscript iccprofiles are missing\n  - fixed another issue related to\n    {issue}`111`: PDF\n    rasterized to palette file\n  - add support for image files with a palette\n  - don't try to validate PDF file after an exception occurs\n\n- rc8:\n\n  - Fix\n    {issue}`111`:\n    exception thrown if PDF is missing DocumentInfo dictionary\n\n- rc7:\n\n  - fix error when installing direct from pip, \"no such file\n    'requirements.txt'\"\n\n- rc6:\n\n  - dropped libxml2 (Python lxml) since Python 3's internal XML parser\n    is sufficient\n  - set up Docker container\n  - fix Unicode errors if recognized text contains Unicode characters\n    and system locale is not UTF-8\n\n- rc5:\n\n  - dropped Java and JHOVE in favour of qpdf\n  - improved command line error output\n  - additional tests and bug fixes\n  - tested on Ubuntu 14.04 LTS\n\n- rc4:\n\n  - dropped MuPDF in favour of qpdf\n  - fixed some installer issues and errors in 
installation\n    instructions\n  - improve performance: run Ghostscript with multithreaded rendering\n  - improve performance: use multiple cores by default\n  - bug fix: checking for wrong exception on process timeout\n\n- rc3: skipping version number intentionally to avoid confusion with\n  Tesseract\n\n- rc2: first release for public testing to test-PyPI, Github\n\n- rc1: testing release process\n\n## Compatibility notes\n\n- `./OCRmyPDF.sh` script is still available for now\n- Stacking the verbosity option like `-vvv` is no longer supported\n- The configuration file `config.sh` has been removed. Instead, you\n  can feed a file to the arguments for common settings:\n\n```\nocrmypdf input.pdf output.pdf @settings.txt\n```\n\nwhere `settings.txt` contains *one argument per line*, for example:\n\n```\n-l\ndeu\n--author\nA. Merkel\n--pdf-renderer\ntesseract\n```\n\nFixes\n\n- Handling of filenames containing spaces: fixed\n\nNotes and known issues\n\n- Some dependencies may work with lower versions than tested, so try\n  overriding dependencies if they are \"in the way\" to see if they work.\n- `--pdf-renderer tesseract` will output files with an incorrect page\n  size in Tesseract 3.03, due to a bug in Tesseract.\n- PDF files containing \"inline images\" are not supported and won't be\n  for the 3.0 release. Scanned images almost never contain inline\n  images.\n\n"
  },
  {
    "path": "docs/releasenotes/version04.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v4\n\n## v4.5.6\n\n- Fixed {issue}`156`,\n  'NoneType' object has no attribute 'getObject' on pages with no\n  optional /Contents record. This should resolve all issues related to\n  pages with no /Contents record.\n- Fixed {issue}`158`, ocrmypdf\n  now stops and terminates if Ghostscript fails on an intermediate\n  step, as it is not possible to proceed.\n- Fixed {issue}`160`,\n  exception thrown on certain invalid arguments instead of error\n  message\n\n## v4.5.5\n\n- Automated update of macOS homebrew tap\n- Fixed {issue}`154`, KeyError\n  '/Contents' when searching for text on blank pages that have no\n  /Contents record. Note: incomplete fix for this issue.\n\n## v4.5.4\n\n- Fixed `--skip-big` raising an exception if a page contains no images\n  ({issue}`152`) (thanks\n  to @TomRaz)\n- Fixed an issue where pages with no images might trigger \"cannot write\n  mode P as JPEG\"\n  ({issue}`151`)\n\n## v4.5.3\n\n- Added a workaround for Ghostscript 9.21 and probably earlier versions\n  would fail with the error message \"VMerror -25\", due to a Ghostscript\n  bug in XMP metadata handling\n- High Unicode characters (U+10000 and up) are no longer accepted for\n  setting metadata on the command line, as Ghostscript may not handle\n  them correctly.\n- Fixed an issue where the `tess4` renderer would duplicate content\n  onto output pages if tesseract failed or timed out\n- Fixed `tess4` renderer not recognized when lossless reconstruction\n  is possible\n\n## v4.5.2\n\n- Fixed {issue}`147`,\n  `--pdf-renderer tess4 --clean` will produce an oversized page\n  containing the original image in the bottom left corner, due to loss\n  DPI information.\n- Make \"using Tesseract 4.0\" warning less ominous\n- Set up machinery for homebrew OCRmyPDF tap\n\n## v4.5.1\n\n- Fixed {issue}`137`,\n  proportions of images with a non-square pixel aspect ratio would be\n  
distorted in output for `--force-ocr` and some other combinations\n  of flags\n\n## v4.5\n\n- PDFs containing \"Form XObjects\" are now supported (issue\n  {issue}`134`; PDF\n  reference manual 8.10), and images they contain are taken into\n  account when determining the resolution for rasterizing\n- The Tesseract 4 Docker image no longer includes all languages,\n  because it took so long to build something would tend to fail\n- OCRmyPDF now warns about using `--pdf-renderer tesseract` with\n  Tesseract 3.04 or lower due to issues with Ghostscript corrupting the\n  OCR text in these cases\n\n## v4.4.2\n\n- The Docker images (ocrmypdf, ocrmypdf-polyglot, ocrmypdf-tess4) are\n  now based on Ubuntu 16.10 instead of Debian stretch\n\n  - This makes supporting the Tesseract 4 image easier\n  - This could be a disruptive change for any Docker users who built\n    customized these images with their own changes, and made those\n    changes in a way that depends on Debian and not Ubuntu\n\n- OCRmyPDF now prevents running the Tesseract 4 renderer with Tesseract\n  3.04, which was permitted in v4.4 and v4.4.1 but will not work\n\n## v4.4.1\n\n- To prevent a [TIFF output\n  error](https://github.com/python-pillow/Pillow/issues/2206) caused\n  by img2pdf >= 0.2.1 and Pillow \\<= 3.4.2, dependencies have been\n  tightened\n- The Tesseract 4.00 simultaneous process limit was increased from 1 to\n  2, since it was observed that 1 lowers performance\n- Documentation improvements to describe the `--tesseract-config`\n  feature\n- Added test cases and fixed error handling for `--tesseract-config`\n- Tweaks to setup.py to deal with issues in the v4.4 release\n\n## v4.4\n\n- Tesseract 4.00 is now supported on an experimental basis.\n\n  - A new rendering option `--pdf-renderer tess4` exploits Tesseract\n    4's new text-only output PDF mode. 
See the documentation on PDF\n    Renderers for details.\n  - The `--tesseract-oem` argument allows control over the Tesseract\n    4 OCR engine mode (tesseract's `--oem`). Use\n    `--tesseract-oem 2` to enforce the new LSTM mode.\n  - Fixed poor performance with Tesseract 4.00 on Linux\n\n- Fixed an issue that caused corruption of output to stdout in some\n  cases\n\n- Removed test for Pillow JPEG and PNG support, as the minimum\n  supported version of Pillow now enforces this\n\n- OCRmyPDF now tests that the intended destination file is writable\n  before proceeding\n\n- The test suite now requires `pytest-helpers-namespace` to run (but\n  not install)\n\n- Significant code reorganization to make OCRmyPDF re-entrant and\n  improve performance. All changes should be backward compatible for\n  the v4.x series.\n\n  - However, OCRmyPDF's dependency \"ruffus\" is not re-entrant, so no\n    Python API is available. Scripts should continue to use the\n    command line interface.\n\n## v4.3.5\n\n- Update documentation to confirm Python 3.6.0 compatibility. 
No code\n  changes were needed, so many earlier versions are likely supported.\n\n## v4.3.4\n\n- Fixed \"decimal.InvalidOperation: quantize result has too many digits\"\n  for high DPI images\n\n## v4.3.3\n\n- Fixed PDF/A creation with Ghostscript 9.20 properly\n- Fixed an exception on inline stencil masks with a missing optional\n  parameter\n\n## v4.3.2\n\n- Fixed a PDF/A creation issue with Ghostscript 9.20 (note: this fix\n  did not actually work)\n\n## v4.3.1\n\n- Fixed an issue where pages produced by the \"hocr\" renderer after a\n  Tesseract timeout would be rotated incorrectly if the input page was\n  rotated with a /Rotate marker\n- Fixed a file handle leak in LeptonicaErrorTrap that would cause a\n  \"too many open files\" error for files around a hundred pages\n  long when `--deskew` or `--remove-background` or other Leptonica\n  based image processing features were in use, depending on the system\n  value of `ulimit -n`\n- Ability to specify multiple languages for multilingual documents is\n  now advertised in documentation\n- Reduced the file sizes of some test resources\n- Cleaned up debug output\n- Tesseract caching in test cases is now more cautious about false\n  cache hits and reproducing exact output, not that any problems were\n  observed\n\n## v4.3\n\n- New feature `--remove-background` to detect and erase the\n  background of color and grayscale images\n\n- Better documentation\n\n- Fixed an issue with PDFs that draw images when the raster stack depth\n  is zero\n\n- ocrmypdf can now redirect its output to stdout for use in a shell\n  pipeline\n\n  - This does not improve performance since temporary files are still\n    used for buffering\n  - Some output validation is disabled in this mode\n\n## v4.2.5\n\n- Fixed an issue\n  ({issue}`100`) with\n  PDFs that omit the optional /BitsPerComponent parameter on images\n- Removed non-free file milk.pdf\n\n## v4.2.4\n\n- Fixed an error\n  ({issue}`90`) caused by\n  PDFs that use stencil masks 
properly\n- Fixed handling of PDFs that try to draw images or stencil masks\n  without properly setting up the graphics state (such images are now\n  ignored for the purposes of calculating DPI)\n\n## v4.2.3\n\n- Fixed an issue with PDFs that store page rotation (/Rotate) in an\n  indirect object\n\n- Integrated a few fixes to simplify downstream packaging (Debian)\n\n  - The test suite no longer assumes it is installed\n  - If running Linux, skip a test that passes Unicode on the command\n    line\n\n- Added a test case to check explicit masks and stencil masks\n\n- Added a test case for indirect objects and linearized PDFs\n\n- Deprecated the OCRmyPDF.sh shell script\n\n## v4.2.2\n\n- Improvements to documentation\n\n## v4.2.1\n\n- Fixed an issue where PDF pages that contained stencil masks would\n  report an incorrect DPI and cause Ghostscript to abort\n- Implemented stdin streaming\n\n## v4.2\n\n- ocrmypdf will now try to convert single image files to PDFs if they\n  are provided as input\n  ({issue}`15`)\n\n  - This is a basic convenience feature. 
It only supports a single\n    image and always makes the image fill the whole page.\n  - For better control over image to PDF conversion, use `img2pdf`\n    (one of ocrmypdf's dependencies)\n\n- New argument `--output-type {pdf|pdfa}` allows disabling\n  Ghostscript PDF/A generation\n\n  - `pdfa` is the default, consistent with past behavior\n  - `pdf` provides a workaround for users concerned about the\n    increase in file size from Ghostscript forcing JBIG2 images to\n    CCITT and transcoding JPEGs\n  - `pdf` preserves as much as it can about the original file,\n    including problems that PDF/A conversion fixes\n\n- PDFs containing images with \"non-square\" pixel aspect ratios, such as\n  200x100 DPI, are now handled and converted properly (fixing a bug\n  that caused them to be cropped)\n\n- `--force-ocr` rasterizes pages even if they contain no images\n\n  - supports users who want to use OCRmyPDF to reconstruct text\n    information in PDFs with damaged Unicode maps (copy and paste text\n    does not match displayed text)\n  - supports reinterpreting PDFs where text was rendered as curves for\n    printing, and text needs to be recovered\n  - fixes issue\n    {issue}`82`\n\n- Fixes an issue where, with certain settings, monochrome images in\n  PDFs would be converted to 8-bit grayscale, increasing file size\n  ({issue}`79`)\n\n- Support for Ubuntu 12.04 LTS \"precise\" has been dropped in favor of\n  (roughly) Ubuntu 14.04 LTS \"trusty\"\n\n  - Some Ubuntu \"PPAs\" (backports) are needed to make it work\n\n- Support for some older dependencies dropped\n\n  - Ghostscript 9.15 or later is now required (available in Ubuntu\n    trusty with backports)\n  - Tesseract 3.03 or later is now required (available in Ubuntu\n    trusty)\n\n- Ghostscript now runs in \"safer\" mode where possible\n\n## v4.1.4\n\n- Bug fix: monochrome images with an ICC profile attached were\n  incorrectly converted to full color images if lossless reconstruction\n  was not possible due to 
other settings; consequence was increased\n  file size for these images\n\n## v4.1.3\n\n- More helpful error message for PDFs with version 4 security handler\n- Update usage instructions for Windows/Docker users\n- Fixed order of operations for matrix multiplication (no effect on most\n  users)\n- Add a few leptonica wrapper functions (no effect on most users)\n\n## v4.1.2\n\n- Replace IEC sRGB ICC profile with Debian's sRGB (from\n  icc-profiles-free) which is more compatible with the MIT license\n- More helpful error message for an error related to certain types of\n  malformed PDFs\n\n## v4.1\n\n- `--rotate-pages` now only rotates pages when reasonably confident\n  in the orientation. This behavior can be adjusted with the new\n  argument `--rotate-pages-threshold`\n- Fixed problems in error checking if `unpaper` is uninstalled or\n  missing at run-time\n- Fixed problems with \"RethrownJobError\" errors during error handling\n  that suppressed the useful error messages\n\n## v4.0.7\n\n- Minor correction to Ghostscript output settings\n\n## v4.0.6\n\n- Update install instructions\n- Provide a sRGB profile instead of using Ghostscript's\n\n## v4.0.5\n\n- Remove some verbose debug messages from v4.0.4\n- Fixed temporary file that wasn't being deleted\n- DPI is now calculated correctly for cropped images, along with other\n  image transformations\n- Inline images are now checked during DPI calculation instead of\n  rejecting the image\n\n## v4.0.4\n\nReleased with verbose debug message turned on. Do not use. 
Skip to\nv4.0.5.\n\n## v4.0.3\n\nNew features\n\n- Page orientations detected are now reported in a summary comment\n\nFixes\n\n- Show stack trace if unexpected errors occur\n- Treat \"too few characters\" error message from Tesseract as a reason\n  to skip that page rather than abort the file\n- Docker: fix blank JPEG2000 issue by insisting on Ghostscript versions\n  that have this fixed\n\n## v4.0.2\n\nFixes\n\n- Fixed compatibility with Tesseract 3.04.01 release, particularly its\n  different way of outputting orientation information\n- Improved handling of Tesseract errors and crashes\n- Fixed use of chmod on Docker that broke most test cases\n\n## v4.0.1\n\nFixes\n\n- Fixed a KeyError if tesseract fails to find page orientation\n  information\n\n## v4.0\n\nNew features\n\n- Automatic page rotation (`-r`) is now available. It ignores\n  any prior rotation information on PDFs and sets rotation based on the\n  dominant orientation of detectable text. This feature is fairly\n  reliable but some false positives occur especially if there is not\n  much text to work with.\n  ({issue}`4`)\n- Deskewing is now performed using Leptonica instead of unpaper.\n  Leptonica is faster and more reliable at image deskewing than\n  unpaper.\n\nFixes\n\n- Fixed an issue where lossless reconstruction could cause some pages\n  to appear incorrectly if the page was rotated by the user in\n  Acrobat after being scanned (specifically if it has a /Rotate tag)\n- Fixed an issue where lossless reconstruction could misalign the\n  graphics layer with respect to text layer if the page had been\n  cropped such that its origin is not (0, 0)\n  ({issue}`49`)\n\nChanges\n\n- Logging output is now much easier to read\n- `--deskew` is now performed by Leptonica instead of unpaper\n  ({issue}`25`)\n- libffi is now required\n- Some changes were made to the Docker and Travis build environments to\n  support libffi\n- `--pdf-renderer=tesseract` now displays a warning if the Tesseract\n  version 
is less than 3.04.01, the planned release that will include\n  fixes to an important OCR text rendering bug in Tesseract 3.04.00.\n  You can also manually install ./share/sharp2.ttf on top of pdf.ttf in\n  your Tesseract tessdata folder to correct the problem.\n\n"
  },
  {
    "path": "docs/releasenotes/version05.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v5\n\n## v5.7.0\n\n- Fixed an issue that caused poor CPU utilization on machines with more\n  than 4 cores when running Tesseract 4. (Related to {issue}`217`.)\n\n- The 'hocr' renderer has been improved. The 'sandwich' and 'tesseract'\n  renderers are still better for most use cases, but 'hocr' may be\n  useful for people who work with the PDF.js renderer in English/ASCII\n  languages. ({issue}`225`)\n\n  - It now formats text in a matter that is easier for certain PDF\n    viewers to select and extract copy and paste text. This should\n    help macOS Preview and PDF.js in particular.\n  - The appearance of selected text and behavior of selecting text is\n    improved.\n  - The PDF content stream now uses relative moves, making it more\n    compact and easier for viewers to determine when two words on the\n    same line.\n  - It can now deal with text on a skewed baseline.\n  - Thanks to @cforcey for the pull request, @jbreiden for many\n    helpful suggestions, @ctbarbour for another round of improvements,\n    and @acaloiaro for an independent review.\n\n## v5.6.3\n\n- Suppress two debug messages that were too verbose\n\n## v5.6.2\n\n- Development branch accidentally tagged as release. Do not use.\n\n## v5.6.1\n\n- Fixed {issue}`219`: change\n  how the final output file is created to avoid triggering permission\n  errors when the output is a special file such as `/dev/null`\n- Fixed test suite failures due to a qpdf 8.0.0 regression and Python\n  3.5's handling of symlink\n- The \"encrypted PDF\" error message was different depending on the type\n  of PDF encryption. Now a single clear message appears for all types\n  of PDF encryption.\n- ocrmypdf is now in Homebrew. 
Homebrew users are advised to use the\n  version of ocrmypdf in the official homebrew-core formulas rather\n  than the private tap.\n- Some linting\n\n## v5.6.0\n\n- Fixed {issue}`216`: preserve\n  \"text as curves\" PDFs without rasterizing file\n- Related to the above, messages about rasterizing are more consistent\n- For consistency, minor releases will now get the trailing .0\n  they always should have had.\n\n## v5.5\n\n- Add new argument `--max-image-mpixels`. Pillow 5.0 now raises an\n  exception when images may be decompression bombs. This argument can\n  be used to override the limit Pillow sets.\n- Fixed output page cropped when using the sandwich renderer and OCR is\n  skipped on a rotated and image-processed page\n- A warning is now issued when old versions of Ghostscript are used in\n  cases known to cause issues with non-Latin characters\n- Fixed a few parameter validation checks for `--output-type pdfa-1` and\n  `pdfa-2`\n\n## v5.4.4\n\n- Fixed {issue}`181`: fix\n  final merge failure for PDFs with more pages than the system file\n  handle limit (`ulimit -n`)\n- Fixed {issue}`200`: an\n  uncommon syntax for formatting decimal numbers in a PDF would cause\n  qpdf to issue a warning, which ocrmypdf treated as an error. Now\n  the warning is relayed.\n- Fixed an issue where intermediate PDFs would be created at version 1.3\n  instead of the version of the original file. 
It's possible but\n  unlikely this had side effects.\n- A warning is now issued when older versions of qpdf are used since\n  issues like\n  {issue}`200` cause\n  qpdf to infinite-loop\n- Address issue\n  {issue}`140`: if\n  Tesseract outputs invalid UTF-8, escape it and print its message\n  instead of aborting with a Unicode error\n- Adding previously unlisted setup requirement, pytest-runner\n- Update documentation: fix an error in the example script for Synology\n  with Docker images, improved security guidance, advised\n  `pip install --user`\n\n## v5.4.3\n\n- If a subprocess fails to report its version when queried, exit\n  cleanly with an error instead of throwing an exception\n- Added test to confirm that the system locale is Unicode-aware and\n  fail early if it's not\n- Clarified some copyright information\n- Updated pinned requirements.txt so the homebrew formula captures more\n  recent versions\n\n## v5.4.2\n\n- Fixed a regression from v5.4.1 that caused sidecar files to be\n  created as empty files\n\n## v5.4.1\n\n- Add workaround for Tesseract v4.00alpha crash when trying to obtain\n  orientation and the latest language packs are installed\n\n## v5.4\n\n- Change wording of a deprecation warning to improve clarity\n- Added option to generate PDF/A-1b output if desired\n  (`--output-type pdfa-1`); default remains PDF/A-2b generation\n- Update documentation\n\n## v5.3.3\n\n- Fixed missing error message that should occur when trying to force\n  `--pdf-renderer sandwich` on old versions of Tesseract\n- Update copyright information in test files\n- Set system `LANG` to UTF-8 in Dockerfiles to avoid UTF-8 encoding\n  errors\n\n## v5.3.2\n\n- Fixed a broken test case related to language packs\n\n## v5.3.1\n\n- Fixed wrong return code given for missing Tesseract language packs\n- Fixed \"brew audit\" crashing on Travis when trying to auto-brew\n\n## v5.3\n\n- Added `--user-words` and `--user-patterns` arguments which are\n  forwarded to Tesseract OCR as words 
and regular expressions\n  respective to use to guide OCR. Supplying a list of subject-domain\n  words should assist Tesseract with resolving words.\n  ({issue}`165`)\n- Using a non Latin-1 language with the \"hocr\" renderer now warns about\n  possible OCR quality and recommends workarounds\n  ({issue}`176`)\n- Output file path added to error message when that location is not\n  writable\n  ({issue}`175`)\n- Otherwise valid PDFs with leading whitespace at the beginning of the\n  file are now accepted\n\n## v5.2\n\n- When using Tesseract 3.05.01 or newer, OCRmyPDF will select the\n  \"sandwich\" PDF renderer by default, unless another PDF renderer is\n  specified with the `--pdf-renderer` argument. The previous behavior\n  was to select `--pdf-renderer=hocr`.\n- The \"tesseract\" PDF renderer is now deprecated, since it can cause\n  problems with Ghostscript on Tesseract 3.05.00\n- The \"tess4\" PDF renderer has been renamed to \"sandwich\". \"tess4\" is\n  now a deprecated alias for \"sandwich\".\n\n## v5.1\n\n- Files with pages larger than 200\" (5080 mm) in either dimension are\n  now supported with `--output-type=pdf` with the page size preserved\n  (in the PDF specification this feature is called UserUnit scaling).\n  Due to Ghostscript limitations this is not available in conjunction\n  with PDF/A output.\n\n## v5.0.1\n\n- Fixed {issue}`169`,\n  exception due to failure to create sidecar text files on some\n  versions of Tesseract 3.04, including the jbarlow83/ocrmypdf Docker\n  image\n\n## v5.0\n\n- Backward incompatible changes\n\n  > - Support for Python 3.4 dropped. Python 3.5 is now required.\n  > - Support for Tesseract 3.02 and 3.03 dropped. Tesseract 3.04 or\n  >   newer is required. Tesseract 4.00 (alpha) is supported.\n  > - The OCRmyPDF.sh script was removed.\n\n- Add a new feature, `--sidecar`, which allows creating \"sidecar\"\n  text files which contain the OCR results in plain text. 
This OCR\n  text is more reliable than extracting text from PDFs. Closes\n  {issue}`126`.\n\n- New feature: `--pdfa-image-compression`, which allows overriding\n  Ghostscript's lossy-or-lossless image encoding heuristic and making\n  all images JPEG encoded or lossless encoded as desired. Fixes\n  {issue}`163`.\n\n- Fixed {issue}`143`, added\n  `--quiet` to suppress \"INFO\" messages\n\n- Fixed {issue}`164`, a typo\n\n- Removed the command line parameters `-n` and `--just-print` since\n  they have not worked for some time (reported as Ubuntu bug\n  [#1687308](https://bugs.launchpad.net/ubuntu/+source/ocrmypdf/+bug/1687308))\n\n"
  },
  {
    "path": "docs/releasenotes/version06.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v6\n\n## v6.2.5\n\n- Disable a failing test due to Tesseract 4.0rc1 behavior change.\n  Previously, Tesseract would exit with an error message if its\n  configuration was invalid, and OCRmyPDF would intercept this message.\n  Now Tesseract issues a warning, which OCRmyPDF v6.2.5 may relay or\n  ignore. (In v7.x, OCRmyPDF will respond to the warning.)\n- This release branch no longer supports using the optional PyMuPDF\n  installation, since it was removed in v7.x.\n- This release branch no longer supports macOS. macOS users should\n  upgrade to v7.x.\n\n## v6.2.4\n\n- Backport Ghostscript 9.25 compatibility fixes, which removes support\n  for setting Unicode metadata\n- Backport blacklisting Ghostscript 9.24\n- Older versions of Ghostscript are still supported\n\n## v6.2.3\n\n- Fixed compatibility with img2pdf >= 0.3.0 by rejecting input images\n  that have an alpha channel\n- This version will be included in Ubuntu 18.10\n\n## v6.2.2\n\n- Backport compatibility fixes for Python 3.7 and ruffus 2.7.0 from\n  v7.0.0\n- Backport fix to ignore masks when deciding what colors are on a page\n- Backport some minor improvements from v7.0.0: better argument\n  validation and warnings about the Tesseract 4.0.0 `--user-words`\n  regression\n\n## v6.2.1\n\n- Fixed recent versions of Tesseract (after 4.0.0-beta1) not being\n  detected as supporting the `sandwich` renderer ({issue}`271`).\n\n## v6.2.0\n\n- **Docker**: The Docker image `ocrmypdf-tess4` has been removed. The\n  main Docker images, `ocrmypdf` and `ocrmypdf-polyglot` now use\n  Ubuntu 18.04 as a base image, and as such Tesseract 4.0.0-beta1 is\n  now the Tesseract version they use. There is no Docker image based on\n  Tesseract 3.05 anymore.\n- Creation of PDF/A-3 is now supported. 
However, there is no ability to\n  attach files to PDF/A-3.\n- Lists more reasons why the file size might grow.\n- Fixed {issue}`262`,\n  `--remove-background` error on PDFs containing colormapped\n  (paletted) images.\n- Fixed another XMP metadata validation issue, in cases where the input\n  file's creation date has no timezone and the creation date is not\n  overridden.\n\n## v6.1.5\n\n- Fixed {issue}`253`, a\n  possible division by zero when using the `hocr` renderer.\n- Fixed incorrectly formatted `<xmp:ModifyDate>` field inside XMP\n  metadata for PDF/As. veraPDF flags this as a PDF/A validation\n  failure. The error caused the timezone and final digit of the\n  seconds of modified time to be omitted, so at worst the modification\n  time stamp is rounded to the nearest 10 seconds.\n\n## v6.1.4\n\n- Fixed {issue}`248`\n  `--clean` argument may remove OCR from left column of text on\n  certain documents. We now set `--layout none` to suppress this.\n- The test cache was updated to reflect the change above.\n- Change test suite to accommodate Ghostscript 9.23's new ability to\n  insert JPEGs into PDFs without transcoding.\n- XMP metadata in PDFs is now examined using `defusedxml` for safety.\n- If an external process exits with a signal when asked to report its\n  version, we now print the system error message instead of suppressing\n  it. 
This occurred when the required executable was found but was\n  missing a shared library.\n- qpdf 7.0.0 or newer is now required as the test suite can no longer\n  pass without it.\n\n### Notes\n\n- An apparent [regression in Ghostscript\n  9.23](https://bugs.ghostscript.com/show_bug.cgi?id=699216) will\n  cause some ocrmypdf output files to become invalid in rare cases; the\n  workaround for the moment is to set `--force-ocr`.\n\n## v6.1.3\n\n- Fixed {issue}`247`,\n  `/CreationDate` metadata not copied from input to output.\n- A warning is now issued when Python 3.5 is used on files with a large\n  page count, as this case is known to regress to single core\n  performance. The cause of this problem is unknown.\n\n## v6.1.2\n\n- Upgrade to PyMuPDF v1.12.5 which includes a more complete fix to\n  {issue}`239`.\n- Add `defusedxml` dependency.\n\n## v6.1.1\n\n- Fixed text being reported as found on all pages if PyMuPDF is not\n  installed.\n\n## v6.1.0\n\n- PyMuPDF is now an optional but recommended dependency, to alleviate\n  installation difficulties on platforms that have less access to\n  PyMuPDF than the author anticipated. (For version 6.x only) install\n  OCRmyPDF with `pip install ocrmypdf[fitz]` to use it to its full\n  potential.\n- Fixed `FileExistsError` that could occur if OCR timed out while it\n  was generating the output file.\n  ({issue}`218`)\n- Fixed table of contents/bookmarks all being redirected to page 1 when\n  generating a PDF/A (with PyMuPDF). 
(Without PyMuPDF the table of\n  contents is removed in PDF/A mode.)\n- Fixed \"RuntimeError: invalid key in dict\" when table of\n  contents/bookmarks titles contained the character `)`.\n  ({issue}`239`)\n- Added a new argument `--skip-repair` to skip the initial PDF repair\n  step if the PDF is already well-formed (because another program\n  repaired it).\n\n## v6.0.0\n\n- The software license has been changed to GPLv3 [it has since changed again].\n  Test resource files and some individual sources may have other licenses.\n\n- OCRmyPDF now depends on\n  [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/installation/).\n  Including PyMuPDF is the primary reason for the change to GPLv3.\n\n- Other backward incompatible changes\n\n  - The `OCRMYPDF_TESSERACT`, `OCRMYPDF_QPDF`, `OCRMYPDF_GS` and\n    `OCRMYPDF_UNPAPER` environment variables are no longer used.\n    Change `PATH` if you need to override the external programs\n    OCRmyPDF uses.\n  - The `ocrmypdf` package has been moved to `src/ocrmypdf` to\n    avoid issues with accidental import.\n  - The function `ocrmypdf.exec.get_program` was removed.\n  - The deprecated module `ocrmypdf.pageinfo` was removed.\n  - The `--pdf-renderer tess4` alias for `sandwich` was removed.\n\n- Fixed an issue where OCRmyPDF failed to detect existing text on\n  pages, depending on how the text and fonts were encoded within the\n  PDF. ({issue}`233,232`)\n\n- Fixed an issue that caused dramatic inflation of file sizes when\n  `--skip-text --output-type pdf` was used. OCRmyPDF now removes\n  duplicate resources such as fonts, images and other objects that it\n  generates. ({issue}`237`)\n\n- Improved performance of the initial page splitting step. Originally\n  this step was not believed to be expensive and ran in a process.\n  Large file testing revealed it to be a bottleneck, so it is now\n  parallelized. On a 700 page file with quad core machine, this change\n  saves about 2 minutes. 
({issue}`234`)\n\n- The test suite now includes a cache that can be used to speed up test\n  runs across platforms. This also does not require computing\n  checksums, so it's faster. ({issue}`217`)\n\n"
  },
  {
    "path": "docs/releasenotes/version07.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v7\n\n## v7.4.0\n\n- `--force-ocr` may now be used with the new `--threshold` and\n  `--mask-barcodes` features\n- pikepdf >= 0.9.1 is now required.\n- Changed metadata handling to pikepdf 0.9.1. As a result, metadata\n  handling of non-ASCII characters in Ghostscript 9.25 or later is\n  fixed.\n- chardet >= 3.0.4 is temporarily listed as required. pdfminer.six\n  depends on it, but the most recent release does not specify this\n  requirement.\n  ({issue}`326`)\n- python-xmp-toolkit and libexempi are no longer required.\n- A new Docker image is now being provided for users who wish to access\n  OCRmyPDF over a simple HTTP interface, instead of the command line.\n- Increase tolerance of PDFs that overflow or underflow the PDF\n  graphics stack.\n  ({issue}`325`)\n\n## v7.3.1\n\n- Fixed performance regression from v7.3.0; fast page analysis was not\n  selected when it should be.\n- Fixed a few exceptions related to the new `--mask-barcodes` feature\n  and improved argument checking\n- Added missing detection of TrueType fonts that lack a Unicode mapping\n\n## v7.3.0\n\n- Added a new feature `--redo-ocr` to detect existing OCR in a file,\n  remove it, and redo the OCR. This may be particularly helpful for\n  anyone who wants to take advantage of OCR quality improvements in\n  Tesseract 4.0. Note that OCR added by OCRmyPDF before version 3.0\n  cannot be detected since it was not properly marked as invisible text\n  in the earliest versions. OCR that constructs a font from visible\n  text, such as Adobe Acrobat's ClearScan.\n\n- OCRmyPDF's content detection is generally more sophisticated. It\n  learns more about the contents of each PDF and makes better\n  recommendations:\n\n  - OCRmyPDF can now detect when a PDF contains text that cannot be\n    mapped to Unicode (meaning it is readable to human eyes but\n    copy-pastes as gibberish). 
In these cases it recommends\n    `--force-ocr` to make the text searchable.\n  - PDFs containing vector objects are now rendered at more\n    appropriate resolution for OCR.\n  - We now exit with an error for PDFs that contain Adobe LiveCycle\n    Designer's dynamic XFA forms. Currently the open source community\n    does not have tools to work with these files.\n  - OCRmyPDF now warns when a PDF contains Adobe AcroForms, since\n    such files probably do not need OCR. It can work with these files.\n\n- Added three new **experimental** features to improve OCR quality in\n  certain conditions. The name, syntax and behavior of these arguments\n  are subject to change. They may also be incompatible with some other\n  features.\n\n  - `--remove-vectors` which strips out vector graphics. This can\n    improve OCR quality since OCR will not search artwork for readable\n    text; however, it currently removes \"text as curves\" as well.\n  - `--mask-barcodes` to detect and suppress barcodes in files. We\n    have observed that barcodes can interfere with OCR because they\n    are \"text-like\" but not actually textual.\n  - `--threshold` which uses a more sophisticated thresholding\n    algorithm than is currently in use in Tesseract OCR. This works\n    around a [known issue in Tesseract\n    4.0](https://github.com/tesseract-ocr/tesseract/issues/1990)\n    with dark text on bright backgrounds.\n\n- Fixed an issue where an error message was not reported when the\n  installed Ghostscript was very old.\n\n- The PDF optimizer now saves files with object streams enabled when\n  the optimization level is `--optimize 1` or higher (the default).\n  This makes files a little bit smaller, but requires PDF 1.5. PDF 1.5\n  was first released in 2003 and is broadly supported by PDF viewers,\n  but some rudimentary PDF parsers such as PyPDF2 do not understand\n  object streams. 
You can use the command line tool\n  `qpdf --object-streams=disable` or\n  [pikepdf](https://github.com/pikepdf/pikepdf) library to remove\n  them.\n\n- New dependency: pdfminer.six 20181108. Note this is a fork of the\n  Python 2-only pdfminer.\n\n- Deprecation notice: At the end of 2018, we will be ending support for\n  Python 3.5 and Tesseract 3.x. OCRmyPDF v7 will continue to work with\n  older versions.\n\n## v7.2.1\n\n- Fixed compatibility with an API change in pikepdf 0.3.5.\n- A kludge to support Leptonica versions older than 1.72 in the test\n  suite was dropped. Older versions of Leptonica are likely still\n  compatible. The only impact is that a portion of the test suite will\n  be skipped.\n\n## v7.2.0\n\n**Lossy JBIG2 behavior change**\n\nA user reported that ocrmypdf was in fact using JBIG2 in **lossy**\ncompression mode. This was not the intended behavior. Users should\n[review the technical concerns with JBIG2 in lossy\nmode](https://abbyy.technology/en:kb:tip:jbig2_compression_and_ocr)\nand decide if this is a concern for their use case.\n\nJBIG2 lossy mode does achieve higher compression ratios than any other\nmonochrome compression technology; for large text documents the savings\nare considerable. JBIG2 lossless still gives great compression ratios\nand is a major improvement over the older CCITT G4 standard.\n\nOnly users who have reviewed the concerns with JBIG2 in lossy mode\nshould opt-in. As such, lossy mode JBIG2 is only turned on when the new\nargument `--jbig2-lossy` is issued. 
This is independent of the setting\nfor `--optimize`.\n\nUsers who did not install an optional JBIG2 encoder are unaffected.\n\n(Thanks to user 'bsdice' for reporting this issue.)\n\n**Other issues**\n\n- When the image optimizer quantizes an image to 1 bit per pixel, it\n  will now attempt to further optimize that image as CCITT or JBIG2,\n  instead of keeping it in the \"flate\" encoding which is not efficient\n  for 1 bpp images.\n  ({issue}`297`)\n- Images in PDFs that are used as soft masks (i.e. transparency masks\n  or alpha channels) are now excluded from optimization.\n- Fixed handling of Tesseract 4.0-rc1 which now accepts invalid\n  Tesseract configuration files, which broke the test suite.\n\n## v7.1.0\n\n- Improve the performance of initial text extraction, which is done to\n  determine if a file contains existing text of some kind or not. On\n  large files, this initial processing is now about 20x times faster.\n  ({issue}`299`)\n- pikepdf 0.3.3 is now required.\n- Fixed {issue}`231`, a\n  problem with JPEG2000 images where image metadata was only available\n  inside the JPEG2000 file.\n- Fixed some additional Ghostscript 9.25 compatibility issues.\n- Improved handling of KeyboardInterrupt error messages.\n  ({issue}`301`)\n- README.md is now served in GitHub markdown instead of\n  reStructuredText.\n\n## v7.0.6\n\n- Blacklist Ghostscript 9.24, now that 9.25 is available and fixes many\n  regressions in 9.24.\n\n## v7.0.5\n\n- Improve capability with Ghostscript 9.24, and enable the JPEG\n  passthrough feature when this version in installed.\n- Ghostscript 9.24 lost the ability to set PDF title, author, subject\n  and keyword metadata to Unicode strings. OCRmyPDF will set ASCII\n  strings and warn when Unicode is suppressed. Other software may be\n  used to update metadata. 
This is a short term work around.\n- PDFs generated by Kodak Capture Desktop, or generally PDFs that\n  contain indirect references to null objects in their table of\n  contents, would have an invalid table of contents after processing by\n  OCRmyPDF that might interfere with other viewers. This has been\n  fixed.\n- Detect PDFs generated by Adobe LiveCycle, which can only be displayed\n  in Adobe Acrobat and Reader currently. When these are encountered,\n  exit with an error instead of performing OCR on the \"Please wait\"\n  error message page.\n\n## v7.0.4\n\n- Fixed exception thrown when trying to optimize a certain type of PNG\n  embedded in a PDF with the `-O2`\n- Update to pikepdf 0.3.2, to gain support for optimizing some\n  additional image types that were previously excluded from\n  optimization (CMYK and grayscale). Fixes\n  {issue}`285`.\n\n## v7.0.3\n\n- Fixed {issue}`284`, an error\n  when parsing inline images that have are also image masks, by\n  upgrading pikepdf to 0.3.1\n\n## v7.0.2\n\n- Fixed a regression with `--rotate-pages` on pages that already had\n  rotations applied.\n  ({issue}`279`)\n- Improve quality of page rotation in some cases by rasterizing a\n  higher quality preview image.\n  ({issue}`281`)\n\n## v7.0.1\n\n- Fixed compatibility with img2pdf >= 0.3.0 by rejecting input images\n  that have an alpha channel\n- Add forward compatibility for pikepdf 0.3.0 (unrelated to img2pdf)\n- Various documentation updates for v7.0.0 changes\n\n## v7.0.0\n\n- The core algorithm for combining OCR layers with existing PDF pages\n  has been rewritten and improved considerably. PDFs are no longer\n  split into single page PDFs for processing; instead, images are\n  rendered and the OCR results are grafted onto the input PDF. 
The new\n  algorithm uses less temporary disk space and is much more performant\n  especially for large files.\n\n- New dependency: [pikepdf](https://github.com/pikepdf/pikepdf).\n  pikepdf is a powerful new Python PDF library driving the latest\n  OCRmyPDF features, built on the QPDF C++ library (libqpdf).\n\n- New feature: PDF optimization with `-O` or `--optimize`. After\n  OCR, OCRmyPDF will perform image optimizations relevant to OCR PDFs.\n\n  - If a JBIG2 encoder is available, then monochrome images will be\n    converted, with the potential for huge savings on large black and\n    white images, since JBIG2 is far more efficient than any other\n    monochrome (bi-level) compression. (All known US patents related\n    to JBIG2 have probably expired, but it remains the responsibility\n    of the user to supply a JBIG2 encoder such as\n    [jbig2enc](https://github.com/agl/jbig2enc). OCRmyPDF does not\n    implement JBIG2 encoding.)\n  - If `pngquant` is installed, OCRmyPDF will optionally use it to\n    perform lossy quantization and compression of PNG images.\n  - The quality of JPEGs can also be lowered, on the assumption that a\n    lower quality image may be suitable for storage after OCR.\n  - This image optimization component will eventually be offered as an\n    independent command line utility.\n  - Optimization ranges from `-O0` through `-O3`, where `0`\n    disables optimization and `3` implements all options. `1`, the\n    default, performs only safe and lossless optimizations. (This is\n    similar to GCC's optimization parameter.) The exact type of\n    optimizations performed will vary over time.\n\n- Small amounts of text in the margins of a page, such as watermarks,\n  page numbers, or digital stamps, will no longer prevent the rest of a\n  page from being OCRed when `--skip-text` is issued. 
This behavior\n  is based on a heuristic.\n\n- Removed features\n\n  - The deprecated `--pdf-renderer tesseract` PDF renderer was\n    removed.\n  - `-g`, the option to generate debug text pages, was removed\n    because it was a maintenance burden and only worked in isolated\n    cases. HOCR pages can still be previewed by running the\n    hocrtransform.py with appropriate settings.\n\n- Removed dependencies\n\n  - `PyPDF2`\n  - `defusedxml`\n  - `PyMuPDF`\n\n- The `sandwich` PDF renderer can be used with all supported versions\n  of Tesseract, including those prior to v3.05 which don't support\n  `-c textonly`. (Tesseract v4.0.0 is recommended and more\n  efficient.)\n\n- `--pdf-renderer auto` option and the diagnostics used to select a\n  PDF renderer now work better with old versions, but may make\n  different decisions than past versions.\n\n- If everything succeeds but PDF/A conversion fails, a distinct return\n  code is now returned (`ExitCode.pdfa_conversion_failed (10)`) where\n  this situation previously returned\n  `ExitCode.invalid_output_pdf (4)`. The latter is now returned only\n  if there is some indication that the output file is invalid.\n\n- Notes for downstream packagers\n\n  - There is also a new dependency on `python-xmp-toolkit` which in\n    turn depends on `libexempi3`.\n  - It may be necessary to separately `pip install pycparser` to\n    avoid [another Python 3.7\n    issue](https://github.com/eliben/pycparser/pull/135).\n\n"
  },
  {
    "path": "docs/releasenotes/version08.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v8\n\n## v8.3.2\n\n- Dropped workaround for macOS that allowed it work without pdfminer.six,\n  now a proper sdist release of pdfminer.six is available.\n- pikepdf 1.5.0 is now required.\n\n## v8.3.1\n\n- Fixed an issue where PDFs with malformed metadata would be rendered as\n  blank pages. {issue}`398`.\n\n## v8.3.0\n\n- Improved the strategy for updating pages when a new image of the page\n  was produced. We now attempt to preserve more content from the\n  original file, for annotations in particular.\n- For PDFs with more than 100 pages and a sequence where one PDF page\n  was replaced and one or more subsequent ones were skipped, an\n  intermediate file would be corrupted while grafting OCR text, causing\n  processing to fail. This is a regression, likely introduced in\n  v8.2.4.\n- Previously, we resized the images produced by Ghostscript by a small\n  number of pixels to ensure the output image size was an exactly what\n  we wanted. Having discovered a way to get Ghostscript to produce the\n  exact image sizes we require, we eliminated the resizing step.\n- Command line completions for `bash` are now available, in addition\n  to `fish`, both in `misc/completion`. Package maintainers, please\n  install these so users can take advantage.\n- Updated requirements.\n- pikepdf 1.3.0 is now required.\n\n## v8.2.4\n\n- Fixed a false positive while checking for a certain type of PDF that\n  only Acrobat can read. 
We now more accurately detect Acrobat-only\n  PDFs.\n- OCRmyPDF holds fewer open file handles and is more prompt about\n  releasing those it no longer needs.\n- Minor optimization: we no longer traverse the table of contents to\n  ensure all references in it are resolved, as changes to libqpdf have\n  made this unnecessary.\n- pikepdf 1.2.0 is now required.\n\n## v8.2.3\n\n- Fixed that `--mask-barcodes` would occasionally leave a unwanted\n  temporary file named `junkpixt` in the current working folder.\n- Fixed (hopefully) handling of Leptonica errors in an environment\n  where a non-standard `sys.stderr` is present.\n- Improved help text for `--verbose`.\n\n## v8.2.2\n\n- Fixed a regression from v8.2.0, an exception that occurred while\n  attempting to report that `unpaper` or another optional dependency\n  was unavailable.\n- In some cases, `ocrmypdf [-c|--clean]` failed to exit with an error\n  when `unpaper` is not installed.\n\n## v8.2.1\n\n- This release was canceled.\n\n## v8.2.0\n\n- A major improvement to our Docker image is now available thanks to\n  hard work contributed by @mawi12345. The new Docker image,\n  ocrmypdf-alpine, is based on Alpine Linux, and includes most of the\n  functionality of three existed images in a smaller package. This\n  image will replace the main Docker image eventually but for now all\n  are being built. [See documentation for\n  details](https://ocrmypdf.readthedocs.io/en/latest/docker.html).\n- Documentation reorganized especially around the use of Docker images.\n- Fixed a problem with PDF image optimization, where the optimizer\n  would unnecessarily decompress and recompress PNG images, in some\n  cases losing the benefits of the quantization it just had just\n  performed. The optimizer is now capable of embedding PNG images into\n  PDFs without transcoding them.\n- Fixed a minor regression with lossy JBIG2 image optimization. 
All\n  JBIG2 candidates images were incorrectly placed into a single\n  optimization group for the whole file, instead of grouping pages\n  together. This usually makes a larger JBIG2Globals dictionary and\n  results in inferior compression, so it worked less well than\n  designed. However, quality would not be impacted. Lossless JBIG2 was\n  entirely unaffected.\n- Updated dependencies, including pikepdf to 1.1.0. This fixes\n  {issue}`358`.\n- The install-time version checks for certain external programs have\n  been removed from setup.py. These tests are now performed at\n  run-time.\n- The non-standard option to override install-time checks\n  (`setup.py install --force`) is now deprecated and prints a\n  warning. It will be removed in a future release.\n\n## v8.1.0\n\n- Added a feature, `--unpaper-args`, which allows passing arbitrary\n  arguments to `unpaper` when using `--clean` or `--clean-final`.\n  The default, very conservative unpaper settings are suppressed.\n- The argument `--clean-final` now implies `--clean`. It was\n  possible to issue `--clean-final` on its before this, but it would\n  have no useful effect.\n- Fixed an exception on traversing corrupt table of contents entries\n  (specifically, those with invalid destination objects)\n- Fixed an issue when using `--tesseract-timeout` and image\n  processing features on a file with more than 100 pages.\n  {issue}`347`\n- OCRmyPDF now always calls `os.nice(5)` to signal to operating\n  systems that it is a background process.\n\n## v8.0.1\n\n- Fixed an exception when parsing PDFs that are missing a required\n  field. {issue}`325`\n- pikepdf 1.0.5 is now required, to address some other PDF parsing\n  issues.\n\n## v8.0.0\n\nNo major features. The intent of this release is to sever support for\nolder versions of certain dependencies.\n\n**Breaking changes**\n\n- Dropped support for Tesseract 3.x. 
Tesseract 4.0 or newer is now\n  required.\n- Dropped support for Python 3.5.\n- Some `ocrmypdf.pdfa` APIs that were deprecated in v7.x were\n  removed. This functionality has been moved to pikepdf.\n\n**Other changes**\n\n- Fixed an unhandled exception when attempting to mask barcodes.\n  {issue}`322`\n- It is now possible to use ocrmypdf without pdfminer.six, to support\n  distributions that do not have it or cannot currently use it (e.g.\n  Homebrew). Downstream maintainers should include pdfminer.six if\n  possible.\n- A warning is now issued when PDF/A conversion removes some XMP\n  metadata from the input PDF. (Only a \"whitelist\" of certain XMP\n  metadata types are allowed in PDF/A.)\n- Fixed several issues that caused PDF/As to be produced with\n  nonconforming XMP metadata (would fail validation with veraPDF).\n- Fixed some instances where invalid DocumentInfo from a PDF caused XMP\n  metadata creation to fail.\n- Fixed a few documentation problems.\n- pikepdf 1.0.2 is now required.\n\n"
  },
  {
    "path": "docs/releasenotes/version09.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v9\n\n## v9.8.2\n\n- Fixed an issue where OCRmyPDF would ignore text inside Form XObject when\n  making certain decisions about whether a document already had text.\n- Fixed file size increase warning to take overhead of small files into account.\n- Added instructions for installing on Cygwin.\n\n## v9.8.1\n\n- Fixed an issue where unexpected files in the `%PROGRAMFILES%\\gs` directory\n  (Windows) caused an exception.\n- Mark pdfminer.six 20200517 as supported.\n- If jbig2enc is missing and optimization is requested, a warning is issued\n  instead of an error, which was the intended behavior.\n- Documentation updates.\n\n## v9.8.0\n\n- Fixed issue where only the first PNG (FlateDecode) image in a file would be\n  considered for optimization. File sizes should be improved from here on.\n- Fixed a startup crash when the chosen language was Japanese ({issue}`543`).\n- Added options to configure polling and log level to watcher.py.\n\n## v9.7.2\n\n- Fixed an issue with `ocrmypdf.ocr(...language=)` not accepting a list of\n  languages as documented.\n- Updated setup.py to confirm that pdfminer.six version 20200402 is supported.\n\n## v9.7.1\n\n- Fixed version check failing when used with qpdf 10.0.0.\n- Added some missing type annotations.\n- Updated documentation to warn about need for \"ifmain\" guard and Windows.\n\n## v9.7.0\n\n- Fixed an error in watcher.py if `OCR_JSON_SETTINGS` was not defined.\n- Ghostscript 9.51 is now blacklisted, due to numerous problems with this version.\n- Added a workaround for a problem with \"txtwrite\" in Ghostscript 9.52.\n- Fixed an issue where the incorrect number of threads used was shown when\n  `OMP_THREAD_LIMIT` was manipulated.\n- Removed a possible performance bottlenecks for files that use hundreds to\n  thousands of images on the same page.\n- Documentation improvements.\n- Optimization will now be applied to some 
monochrome images that have a color\n  profile defined instead of only black and white.\n- ICC profiles are consulted when determining the simplified colorspace of an\n  image.\n\n## v9.6.1\n\n- Documentation improvements - thanks to many users for their contributions!\n\n  > - Fixed installation instructions for ArchLinux (@pigmonkey)\n  > - Updated installation instructions for FreeBSD and other OSes (@knobix)\n  > - Added instructions for using Docker Compose with watchdog (@ianalexander,\n  >   @deisi)\n  > - Other miscellany (@mb720, @toy, @caiofacchinato)\n  > - Some scripts provided in the documentation have been migrated out so that\n  >   they can be copied out as whole files, and to ensure syntax checking\n  >   is maintained.\n\n- Fixed an error that caused bash completions to fail on macOS. ({issue}`502,504`;\n  @AlexanderWillner)\n\n- Fixed a rare case where OCRmyPDF threw an exception while processing a PDF\n  with the wrong object type in its `/Trailer /Info`. The error is now logged\n  and incorrect object is ignored. ({issue}`497`)\n\n- Removed potentially non-free file `enron1.pdf` and simplified the test that\n  used it.\n\n- Removed potentially non-free file `misc/media/logo.afdesign`.\n\n## v9.6.0\n\n- Fixed a regression with transferring metadata from the input PDF to the output\n  PDF in certain situations.\n- pdfminer.six is now supported up to version 2020-01-24.\n- Messages are explaining page rotation decisions are now shown at the standard\n  verbosity level again when `--rotate-pages`. In some previous version they\n  were set to debug level messages that only appeared with the parameter `-v1`.\n- Improvements to `misc/watcher.py`. 
Thanks to @ianalexander and @svenihoney.\n- Documentation improvements.\n\n## v9.5.0\n\n- Added API functions to measure OCR quality.\n- Modest improvements to handling PDFs with difficult/non compliant metadata.\n\n## v9.4.0\n\n- Updated recommended dependency versions.\n- Improvements to test coverage and changes to facilitate better measurement of\n  test coverage, such as when tests run in subprocesses.\n- Improvements to error messages when Leptonica is not installed correctly.\n- Fixed use of pytest \"session scope\" that may have caused some intermittent\n  CI failures.\n- When the argument `--keep-temporary-files` or verbosity is set to `-v1`,\n  a debug log file is generated in the working temporary folder.\n\n## v9.3.0\n\n- Improved native Windows support: we now check in the obvious places in\n  the \"Program Files\" folders installations of Tesseract and Ghostscript,\n  rather than relying on the user to edit `PATH` to specify their location.\n  The `PATH` environment variable can still be used to differentiate when\n  multiple installations are present or the programs are installed to non-\n  standard locations.\n- Fixed an exception on parsing Ghostscript error messages.\n- Added an improved example demonstrating how to set up a watched folder\n  for automated OCR processing (thanks to @ianalexander for the contribution).\n\n## v9.2.0\n\n- Native Windows is now supported.\n- Continuous integration moved to Azure Pipelines.\n- Improved test coverage and speed of tests.\n- Fixed an issue where a page that was originally a JPEG would be saved as a\n  PNG, increasing file size. This occurred only when a preprocessing option\n  was selected along with `--output-type=pdf` and all images on the original\n  page were JPEGs. Regression since v7.0.0.\n- OCRmyPDF no longer depends on the QPDF executable `qpdf` or `libqpdf`.\n  It uses pikepdf (which in turn depends on `libqpdf`). 
Package maintainers\n  should adjust dependencies so that OCRmyPDF no longer calls for libqpdf on\n  its own. For users of Python binary wheels, this change means a separate\n  installation of QPDF is no longer necessary. This change is mainly to\n  simplify installation on Windows.\n- Fixed a rare case where log messages from Tesseract would be discarded.\n- Fixed incorrect function signature for pixFindPageForeground, causing\n  exceptions on certain platforms/Leptonica versions.\n\n## v9.1.1\n\n- Expand the range of pdfminer.six versions that are supported.\n- Fixed Docker build when using pikepdf 1.7.0.\n- Fixed documentation to recommend using pip from get-pip.py.\n\n## v9.1.0\n\n- Improved diagnostics when file size increases at output. Now warns if JBIG2\n  or pngquant were not available.\n- pikepdf 1.7.0 is now required, to pick up changes that remove the need for\n  a source install on Linux systems running Python 3.8.\n\n## v9.0.5\n\n- The Alpine Docker image (jbarlow83/ocrmypdf-alpine) has been dropped due to\n  the difficulties of supporting Alpine Linux.\n- The primary Docker image (jbarlow83/ocrmypdf) has been improved to take on\n  the extra features that used to be exclusive to the Alpine image.\n- No changes to application code.\n- pdfminer.six version 20191020 is now supported.\n\n## v9.0.4\n\n- Fixed compatibility with Python 3.8 (but requires source install for the moment).\n- Fixed Tesseract settings for `--user-words` and `--user-patterns`.\n- Changed to pikepdf 1.6.5 (for Python 3.8).\n- Changed to Pillow 6.2.0 (to mitigate a security vulnerability in earlier Pillow).\n- A debug message now mentions when English is automatically selected if the locale\n  is not English.\n\n## v9.0.3\n\n- Embed an encoded version of the sRGB ICC profile in the intermediate\n  Postscript file (used for PDF/A conversion). Previously we included the\n  filename, which required Postscript to run with file access enabled. 
For\n  security, Ghostscript 9.28 enables `-dSAFER` and as such, no longer\n  permits access to any file by default. This fix is necessary for\n  compatibility with Ghostscript 9.28.\n- Exclude a test that sometimes times out and fails in continuous integration\n  from the standard test suite.\n\n## v9.0.2\n\n- The image optimizer now skips optimizing flate (PNG) encoded images in some\n  situations where the optimization effort was likely wasted.\n- The image optimizer now ignores images that specify arbitrary decode arrays,\n  since these are rare.\n- Fixed an issue that caused inversion of black and white in monochrome images.\n  We are not certain but the problem seems to be linked to Leptonica 1.76.0 and\n  older.\n- Fixed some cases where the test suite failed if\n  English or German Tesseract language packs were not installed.\n- Fixed a runtime error if the Tesseract English language is not installed.\n- Improved explicit closing of Pillow images after use.\n- Actually fixed of Alpine Docker image build.\n- Changed to pikepdf 1.6.3.\n\n## v9.0.1\n\n- Fixed test suite failing when either of optional dependencies unpaper and\n  pngquant were missing.\n- Attempted fix of Alpine Docker image build.\n- Documented that FreeBSD ports are now available.\n- Changed to pikepdf 1.6.1.\n\n## v9.0.0\n\n**Breaking changes**\n\n- The `--mask-barcodes` experimental feature has been dropped due to poor\n  reliability and occasional crashes, both due to the underlying library that\n  implements this feature (Leptonica).\n- The `-v` (verbosity level) parameter now accepts only `0`, `1`, and\n  `2`.\n- Dropped support for Tesseract 4.00.00-alpha releases. 
Tesseract 4.0 beta and\n  later remain supported.\n- Dropped the `ocrmypdf-polyglot` and `ocrmypdf-webservice` images.\n\n**New features**\n\n- Added a high level API for applications that want to integrate OCRmyPDF.\n  Special thanks to Martin Wind (@mawi1988) who made significant contributions\n  to this effort.\n- Added progress bars for long-running steps. ■■■■■■■□□\n- We now create linearized (\"fast web view\") PDFs by default. The new parameter\n  `--fast-web-view` provides control over when this feature is applied.\n- Added a new `--pages` feature to limit OCR to only a specific page range.\n  The list may contain commas or single pages, such as `1, 3, 5-11`.\n- When the number of pages is small compared to the number of allowed jobs, we\n  run Tesseract in multithreaded (OpenMP) mode when available. This should\n  improve performance on files with low page counts.\n- Removed dependency on `ruffus`, and with that, the non-reentrancy\n  restrictions that previously made an API impossible.\n- Output and logging messages overhauled so that ocrmypdf may be integrated\n  into applications that use the logging module.\n- pikepdf 1.6.0 is required.\n- Added a logo. 😊\n\n**Bug fixes**\n\n- Pages with vector artwork are treated as full color. Previously, vectors\n  were ignored when considering the colorspace needed to cover a page, which\n  could cause loss of color under certain settings.\n- Test suite now spawns processes less frequently, allowing more accurate\n  measurement of code coverage.\n- Improved test coverage.\n- Fixed a rare division by zero (if optimization produced an invalid file).\n- Updated Docker images to use newer versions.\n- Fixed an issue where images encoded as JBIG2 with a colorspace other than\n  `/DeviceGray` were not interpreted correctly.\n- Fixed an OCR text-image registration (i.e. alignment) problem when the page\n  MediaBox had a nonzero corner.\n\n"
  },
  {
    "path": "docs/releasenotes/version10.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v10\n\n## v10.3.3\n\n- Fixed a \"KeyError: 'dpi'\" error message when using `--threshold` on an image.\n  ({issue}`607`)\n\n## v10.3.2\n\n- Fixed a case where we reported \"no reason\" for a file size increase, when we\n  could determine the reason.\n- Enabled support for pdfminer.six 20200726.\n\n## v10.3.1\n\n- Fixed a number of test suite failures with pdfminer.six older than version 20200402.\n- Enabled support for pdfminer.six 20200720.\n\n## v10.3.0\n\n- Fixed an issue where we would consider images that were already JBIG2-encoded\n  for optimization, potentially producing a less optimized image than the original.\n  We do not believe this issue would ever cause an image to loss fidelity.\n- Where available, pikepdf memory mapping is now used. This improves performance.\n- When Leptonica 1.79+ is installed, use its new error handling API to avoid\n  a \"messy\" redirection of stderr which was necessary to capture its error\n  messages.\n- For older versions of Leptonica, added a new thread level lock. This fixes a\n  possible race condition in handling error conditions in Leptonica (although\n  there is no evidence it ever caused issues in practice).\n- Documentation improvements and more type hinting.\n\n## v10.2.1\n\n- Disabled calculation of text box order with pdfminer. We never needed this result\n  and it is expensive to calculate on files with complex pre-existing text.\n- Fixed plugin manager to accept `Path(plugin)` as a path to a plugin.\n- Fixed some typing errors.\n- Documentation improvements.\n\n## v10.2.0\n\n- Update Docker image to use Ubuntu 20.04.\n- Fixed issue PDF/A acquires title \"Untitled\" after conversion. 
({issue}`582`)\n- Fixed a problem where, when using `--pdf-renderer hocr`, some text would\n  be missing from the output when using a more recent version of Tesseract.\n  Tesseract began adding more detailed markup about the semantics of text\n  that our HOCR transform did not recognize, so it ignored them. This option is\n  not the default. If necessary `--redo-ocr` also redoing OCR to fix such issues.\n- Fixed an error in Python 3.9 beta, due to removal of deprecated\n  `Element.getchildren()`. ({issue}`584`)\n- Implemented support using the API with `BytesIO` and other file stream objects.\n  ({issue}`545`)\n\n## v10.1.1\n\n- Fixed `OMP_THREAD_LIMIT` set to invalid value error messages on some input\n  files. (The error was harmless, apart from less than optimal performance in\n  some cases.)\n\n## v10.1.0\n\n- Previously, we `--clean-final` would cause an unpaper-cleaned page image to\n  be produced twice, which was necessary in some cases but not in general. We\n  now take this optimization opportunity and reuse the image if possible.\n- We now provide PNG files as input to unpaper, since it accepts them, instead\n  of generating PPM files which can be very large. This can improve performance\n  and temporary disk usage.\n- Documentation updated for plugins.\n\n## v10.0.1\n\n- Fixed regression when `-l lang1+lang2` is used from command line.\n\n## v10.0.0\n\n**Breaking changes**\n\n- Support for pdfminer.six version 20181108 has been dropped, along with a\n  monkeypatch that made this version work.\n- Output messages are now displayed in color (when supported by the terminal)\n  and prefixes describing the severity of the message are removed. As such\n  programs that parse OCRmyPDF's log message will need to be revised. 
(Please\n  consider using OCRmyPDF as a library instead.)\n- The minimum version for certain dependencies has increased.\n- Many API changes; see developer changes.\n- The Python libraries pluggy and coloredlogs are now required.\n\n**New features and improvements**\n\n- PDF page scanning is now parallelized across CPUs, speeding up this phase\n  dramatically for files with high page counts.\n- PDF page scanning is optimized, addressing some performance regressions.\n- PDF page scanning is no longer run on pages that are not selected when the\n  `--pages` argument is used.\n- PDF page scanning is now independent of Ghostscript, ending our past reliance\n  on this occasionally unstable feature in Ghostscript.\n- A plugin architecture has been added, currently allowing one to more easily\n  use a different OCR engine or PDF renderer from Tesseract and Ghostscript,\n  respectively. A plugin can also override some decisions, such as changing\n  the OCR settings after initial scanning.\n- Colored log messages.\n\n**Developer changes**\n\n- The test spoofing mechanism, used to test correct handling of failures in\n  Tesseract and Ghostscript, has been removed in favor of using plugins for\n  testing. The spoofing mechanism was fairly complex and required many special\n  hacks for Windows.\n- Code describing the resolution in DPI of images was refactored into a\n  `ocrmypdf.helpers.Resolution` class.\n- The module `ocrmypdf._exec` is now private to OCRmyPDF.\n- The `ocrmypdf.hocrtransform` module has been updated to follow PEP8 naming\n  conventions.\n- Ghostscript is no longer used for finding the location of text in PDFs, and\n  APIs related to this feature have been removed.\n- Lots of internal reorganization to support plugins.\n\n"
  },
  {
    "path": "docs/releasenotes/version11.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v11\n\n## v11.7.3\n\n- Exclude CCITT Group 3 images from being optimized. Some libraries\n  OCRmyPDF uses do not seem to handle this obscure compression format properly.\n  You may get errors or possible corrupted output images without this fix.\n\n## v11.7.2\n\n- Updated pinned versions in main.txt, primarily to upgrade Pillow to 8.1.2, due\n  to recently disclosed security vulnerabilities in that software.\n- The `--sidecar` parameter now causes an exception if set to the same file as\n  the input or output PDF.\n\n## v11.7.1\n\n- Some exceptions while attempting image optimization were only logged at the debug\n  level, causing them to be suppressed. These errors are now logged appropriately.\n- Improved the error message related to `--unpaper-args`.\n- Updated documentation to mention the new conda distribution.\n\n## v11.7.0\n\n- We now support using `--sidecar` in conjunction with `--pages`; these arguments\n  used to be mutually exclusive. ({issue}`735`)\n- Fixed a possible issue with PDF/A-1b generation. Acrobat complained that our PDFs use\n  object streams. More robust PDF/A validators like veraPDF don't consider this a\n  problem, but we'll honor Acrobat's objection from here on. This may increase file\n  size of PDF/A-1b files. 
PDF/A-2b files will not be affected.\n\n## v11.6.2\n\n- Fixed a regression where the wrong page orientation would be produced when using\n  arguments such as `--deskew --rotate-pages` ({issue}`730`).\n\n## v11.6.1\n\n- Fixed an issue with attempting optimize unusually narrow-width images by excluding\n  these images from optimization ({issue}`732`).\n- Remove an obsolete compatibility shim for a version of pikepdf that is no longer\n  supported.\n\n## v11.6.0\n\n- OCRmyPDF will now automatically register plugins from the same virtual environment\n  with an appropriate setuptools entrypoint.\n- Refactor the plugin manager to remove unnecessary complications and make plugin\n  registration more automatic.\n- `PageContext` and `PdfContext` are now formally part of the API, as they\n  should have been, since they were part of `ocrmypdf.pluginspec`.\n\n## v11.5.0\n\n- Fixed an issue where the output page size might differ by a fractional amount\n  due to rounding, when `--force-ocr` was used and the page contained objects\n  with multiple resolutions.\n- When determining the resolution at which to rasterize a page, we now consider\n  printed text on the page as requiring a higher resolution. 
This fixes issues\n  with certain pages being rendered with unacceptably low resolution text, but\n  may increase output file sizes in some workflows where low resolution text\n  is acceptable.\n- Added a workaround to fix an exception that occurs when trying to\n  `import ocrmypdf.leptonica` on Apple ARM silicon (or potentially, other\n  platforms that do not permit write+executable memory).\n\n## v11.4.5\n\n- Fixed an issue where files may not be closed when the API is used.\n- Improved `setup.cfg` with better settings for test coverage.\n\n## v11.4.4\n\n- Fixed `AttributeError: 'NoneType' object has no attribute 'userunit'` ({issue}`700`),\n  related to OCRmyPDF not properly forwarding an error message from pdfminer.six.\n- Adjusted typing of some arguments.\n- `ocrmypdf.ocr` now takes a `threading.Lock` for reasons outlined in the\n  documentation.\n\n## v11.4.3\n\n- Removed a redundant debug message.\n- Test suite now asserts that most patched functions are called when they should be.\n- Test suite now skips a test that fails on two particular versions of pikepdf.\n\n## v11.4.2\n\n- Fixed support for Cygwin, hopefully.\n- watcher.py: Fixed an issue with the OCR_LOGLEVEL not being interpreted.\n\n## v11.4.1\n\n- Fixed an issue where invalid page ranges passed using the `pages` argument,\n  such as \"1-0\", would cause unhandled exceptions.\n- Accepted a user-contributed change to the Synology demo script in misc/synology.py.\n- Clarified documentation about change of temporary file location `ocrmypdf.io`.\n- Fixed Python wheel tag which was incorrectly set to py35 even though we long\n  since dropped support for Python 3.5.\n\n## v11.4.0\n\n- When looking for Tesseract and Ghostscript, we now check the Windows Registry to\n  see if their installers registered the location of their executables. 
This should\n  help Windows users who have installed these programs to non-standard\n  locations.\n- We now report on the progress of PDF/A conversion, since this operation is\n  sometimes slow.\n- Improved command line completions.\n- The prefix of the temporary folder OCRmyPDF creates has been changed from\n  `com.github.ocrmypdf` to `ocrmypdf.io`. Scripts that chose to depend on this\n  prefix may need to be adjusted. (This has always been an implementation detail so is\n  not considered part of the semantic versioning \"contract\".)\n- Fixed {issue}`692`, where a particular file with malformed fonts would flood an\n  internal message cue by generating so many debug messages.\n- Fixed an exception on processing hOCR files with no page record. Tesseract\n  is not known to generate such files.\n\n## v11.3.4\n\n- Fixed an error message 'called readLinearizationData for file that is not\n  linearized' that may occur when pikepdf 2.1.0 is used. (Upgrading to pikepdf\n  2.1.1 also fixes the issue.)\n- File watcher now automatically includes `.PDF` in addition to `.pdf` to\n  better support case sensitive file systems.\n- Some documentation and comment improvements.\n\n## v11.3.3\n\n- If unpaper outputs non-UTF-8 data, quietly fix this rather than choke on the\n  conversion. (Possibly addresses {issue}`671`.)\n\n## v11.3.2\n\n- Explicitly require pikepdf 2.0.0 or newer when running on Python 3.9. (There are\n  concerns about the stability of pybind11 2.5.x with Python 3.9, which is used in\n  pikepdf 1.x.)\n- Fixed another issue related to page rotation.\n- Fixed an issue where image marked as image masks were not properly considered\n  as optimization candidates.\n- On some systems, unpaper seems to be unable to process the PNGs we offer it\n  as input. 
We now convert the input to PNM format, which unpaper always accepts.\n  Fixes {issue}`665` and {issue}`667`.\n- DPI sent to unpaper is now rounded to a more reasonable number of decimal digits.\n- Debug and error messages from unpaper were being suppressed.\n- Some documentation tweaks.\n\n## v11.3.1\n\n- Declare support for new versions: pdfminer.six 20201018 and pikepdf 2.x\n- Fixed warning related to `--pdfa-image-compression` that appears at the wrong\n  time.\n\n## v11.3.0\n\n- The \"OCR\" step is describing as \"Image processing\" in the output messages when\n  OCR is disabled, to better explain the application's behavior.\n- Debug logs are now only created when run as a command line, and not when OCR\n  is performed for an API call. It is the calling application's responsibility\n  to set up logging.\n- For PDFs with a low number of pages, we gathered information about the input PDF\n  in a thread rather than process (when there are more pages). When run as a\n  thread, we did not close the file handle to the working PDF, leaking one file\n  handle per call of `ocrmypdf.ocr`.\n- Fixed an issue where debug messages send by child worker processes did not match\n  the log settings of parent process, causing messages to be dropped. 
This affected\n  macOS and Windows only where the parent process is not forked.\n- Fixed the hookspec of rasterize_pdf_page to remove default parameters that\n  were not handled in an expected way by pluggy.\n- Fixed another issue with automatic page rotation ({issue}`658`) due to the issue above.\n\n## v11.2.1\n\n- Fixed an issue where optimization of a 1-bit image with a color palette or\n  associated ICC that was optimized to JBIG2 could have its colors inverted.\n\n## v11.2.0\n\n- Fixed an issue with optimizing PNG-type images that had soft masks or image masks.\n  This is a regression introduced in (or about) v11.1.0.\n- Improved type checking of the `plugins` parameter for the `ocrmypdf.ocr`\n  API call.\n\n## v11.1.2\n\n- Fixed hOCR renderer writing the text in roughly reverse order. This should not\n  affect reasonably smart PDF readers that properly locate the position of all\n  text, but may confuse those that rely on the order of objects in the content\n  stream. ({issue}`642`)\n\n## v11.1.1\n\n- We now avoid using named temporary files when using pngquant allowing containerized\n  pngquant installs to be used.\n- Clarified an error message.\n- Highest number of 1's in a release ever!\n\n## v11.1.0\n\n- Fixed page rotation issues: {issue}`634,589`.\n- Fixed some cases where optimization created an invalid image such as a\n  1-bit \"RGB\" image: {issue}`629,620`.\n- Page numbers are now displayed in debug logs when pages are being grafted.\n- ocrmypdf.optimize.rewrite_png and ocrmypdf.optimize.rewrite_png_as_g4 were\n  marked deprecated. 
Strictly speaking these should have been internal APIs,\n  but they were never hidden.\n- As a precaution, pikepdf mmap-based file access has been disabled due to a\n  rare race condition that causes a crash when certain objects are deallocated.\n  The problem is likely in pikepdf's dependency pybind11.\n- Extended the example plugin to demonstrate conversion to mono.\n\n## v11.0.2\n\n- Fixed {issue}`612`, TypeError exception. Fixed by eliminating unnecessary repair of\n  input PDF metadata in memory.\n\n## v11.0.1\n\n- Blacklist pdfminer.six 20200720, which has a regression fixed in 20200726.\n- Approve img2pdf 0.4 as it passes tests.\n- Clarify that the GPL-3 portion of pdfa.py was removed with the changes in v11.0.0;\n  the debian/copyright file did not properly annotate this change.\n\n## v11.0.0\n\n- Project license changed to Mozilla Public License 2.0. Some miscellaneous\n  code is now under MIT license and non-code content/media remains under\n  CC-BY-SA 4.0. License changed with approval of all people who were found\n  to have contributed to GPLv3 licensed sections of the project. ({issue}`600`)\n- Because the license changed, this is being treated as a major version number\n  change; however, there are no known breaking changes in functional behavior\n  or API compared to v10.x.\n\n"
  },
  {
    "path": "docs/releasenotes/version12.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v12\n\n## v12.7.2\n\n- Fixed \"invalid version number\" error for Tesseract packaging with nonstandard\n  version \"5.0.0-rc1.20211030\".\n- Fixed use of deprecated `importlib.resources.read_binary`.\n- Replace some uses of string paths with `pathlib.Path`.\n- Fixed a leaked file handle when using `--output-type none`.\n- Removed shims to support versions of pikepdf that are no longer supported.\n\n## v12.7.1\n\n- Declare support for pdfminer.six v20211012.\n\n## v12.7.0\n\n- Fixed test suite failure when using pikepdf 3.2.0 that was compiled with pybind11\n  2.8.0. {issue}`843`\n- Improve advice to user about using `--max-image-mpixels` if OCR fails for this\n  reason.\n- Minor documentation fixes. (Thanks to @mara004.)\n- Don't require importlib-metadata and importlib-resources backports on versions of\n  Python where the standard library implementation is sufficient.\n  (Thanks to Marco Genasci.)\n\n## v12.6.0\n\n- Implemented `--output-type=none` to skip producing PDFs for applications that\n  only want sidecar files ({issue}`787`).\n- Fixed ambiguities in descriptions of behavior of `--jbig2-lossy`.\n- Various improvements to documentation.\n\n## v12.5.0\n\n- Fixed build failure for the combination of PyPy 3.6 and pikepdf 3.0. This\n  combination can work in a source build but does not work with wheels.\n- Accepted bot that wanted to upgrade our deprecated requirements.txt.\n- Documentation updates.\n- Replace pkg_resources and install dependency on setuptools with\n  importlib-metadata and importlib-resources.\n- Fixed regression in hocrtransform causing text to be omitted when this\n  renderer was used.\n- Fixed some typing errors.\n\n## v12.4.0\n\n- When grafting text layers, use pikepdf's `unparse_content_stream` if available.\n- Confirmed support for pluggy 1.0. 
(Thanks @QuLogic.)\n- Fixed some typing issues, improved pre-commit settings, and fixed issues\n  flagged by linters.\n- PyPy 7.3.3 (=Python 3.6) is now supported. Note that PyPy does not necessarily\n  run faster, because the vast majority of OCRmyPDF's execution time is spent\n  running OCR or generally executing native code. However, PyPy may bring speed\n  improvements in some areas.\n\n## v12.3.3\n\n- watcher.py: fixed interpretation of boolean env vars ({issue}`821`).\n- Adjust CI scripts to test Tesseract 5 betas.\n- Document our support for the Tesseract 5 betas.\n\n## v12.3.2\n\n- Indicate support for flask 2.x, watcher 2.x ({issue}`815, 816`).\n\n## v12.3.1\n\n- Fixed issue with selection of text when using the hOCR renderer ({issue}`813`).\n- Fixed build errors with the Docker image by upgrading to a newer Ubuntu.\n  Also set the timezone of this image to UTC.\n\n## v12.3.0\n\n- Fixed a regression introduced in Pillow 8.3.0. Pillow no longer rounds DPI\n  for image resolutions. We now account for this ({issue}`802`).\n- We no longer use some API calls that are deprecated in the latest versions of\n  pikepdf.\n- Improved error message when a language is requested that doesn't look like a\n  typical ISO 639-2 code.\n- Fixed some tests that attempted to symlink on Windows, breaking tests on a\n  Windows desktop but not usually on CI.\n- Documentation fixes (thanks to @mara004)\n\n## v12.2.0\n\n- Fixed invalid Tesseract version number on Windows ({issue}`795`).\n- Documentation tweaks. Documentation build now depends on sphinx-issues package.\n\n## v12.1.0\n\n- For security reasons we now require Pillow >= 8.2.x. 
(Older versions will continue\n  to work if upgrading is not an option.)\n- The build system was reorganized to rely on `setup.cfg` instead of `setup.py`.\n  All changes should work with previously supported versions of setuptools.\n- The files in `requirements/*` are now considered deprecated but will be retained for v12.\n  Instead use `pip install ocrmypdf[test]` instead of `requirements/test.txt`, etc.\n  These files will be removed in v13.\n\n## v12.0.3\n\n- Expand the list of languages supported by the hocr PDF renderer.\n  Several languages were previously considered not supported, particularly those\n  non-European languages that use the Latin alphabet.\n- Fixed a case where the exception stack trace was suppressed in verbose mode.\n- Improved documentation around commercial OCR.\n\n## v12.0.2\n\n- Fixed exception thrown when using `--remove-background` on files containing small\n  images ({issue}`769`).\n- Improve documentation for description of adding language packs to the Docker image\n  and corrected name of French language pack.\n\n## v12.0.1\n\n- Fixed \"invalid version number\" for untagged tesseract versions ({issue}`770`).\n\n## v12.0.0\n\n**Breaking changes**\n\n- Due to recent security issues in pikepdf, Pillow and reportlab, we now require\n  newer versions of these libraries and some of their dependencies. (If necessary,\n  package maintainers may override these versions at their discretion; lower\n  versions will often work.)\n- We now use the \"LeaveColorUnchanged\" color conversion strategy when directing\n  Ghostscript to create a PDF/A. Generally this is faster than performing a\n  color conversion, which is not always necessary.\n- OCR text is now packaged in a Form XObject. This makes it easier to isolate\n  OCR from other document content. 
However, some poorly implemented PDF text\n  extraction algorithms may fail to detect the text.\n- Many API functions have stricter parameter checking or expect keyword arguments\n  where they previously did not.\n- Some deprecated functions in `ocrmypdf.optimize` were removed.\n- The `ocrmypdf.leptonica` module is now deprecated, due to difficulties with\n  the current strategy of ABI binding on newer platforms like Apple Silicon.\n  It will be removed and replaced, either by repackaging Leptonica as an\n  independent library or by using a different image processing library.\n- Continuous integration moved to GitHub Actions.\n- We no longer depend on `pytest_helpers_namespace` for testing.\n\n**New features**\n\n- New plugin hook: `get_progressbar_class`, for progress reporting,\n  allowing developers to replace the standard console progress bar with some\n  other mechanism, such as updating a GUI progress bar.\n- New plugin hook: `get_executor`, for replacing the concurrency model.\n  This is primarily to support execution on AWS Lambda, which does not support\n  standard Python `multiprocessing` due to its lack of shared memory.\n- New plugin hook: `get_logging_console`, for replacing the standard\n  way OCRmyPDF outputs its messages.\n- New plugin hook: `filter_pdf_page`, for modifying individual PDF\n  pages produced by OCRmyPDF.\n- OCRmyPDF now runs on nonstandard execution environments that do not have\n  interprocess semaphores, such as AWS Lambda and Android Termux. If the environment\n  does not have semaphores, OCRmyPDF will automatically select an alternate\n  process executor that does not use semaphores.\n- Continuous integration moved to GitHub Actions.\n- We now generate an ARM64-compatible Docker image alongside the x64 image.\n  Thanks to @andkrause for doing most of the work in a pull request several months\n  ago, which we were finally able to integrate now. 
Also thanks to @0x326 for\n  review comments.\n\n**Fixes**\n\n- Fixed a possible deadlock on attempting to flush `sys.stderr` when older\n  versions of Leptonica are in use.\n- Some worker processes inherited resources from their parents such as log\n  handlers that may have also led to deadlocks. These resources are now released.\n- Improvements to test coverage.\n- Removed vestiges of support for Tesseract versions older than 4.0.0-beta1\n  (which ships with Ubuntu 18.04).\n- OCRmyPDF can now parse all of Tesseract's version numbers, since several\n  schemes have been in use.\n- Fixed an issue with parsing PDFs that contain images drawn at a scale of 0. ({issue}`761`)\n- Removed a frequently repeated message about disabling mmap.\n\n"
  },
  {
    "path": "docs/releasenotes/version13.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v13\n\n## v13.7.0\n\n- Fixed an exception when attempting to run and Tesseract is not installed.\n- Changed to SPDX license tracking and information files.\n\n## v13.6.2\n\n- Added a shim to prevent an \"error during error handling\" for Python 3.7 and 3.8.\n- Modernized some type annotations.\n- Improved annotations on our \\_windows module to help IDEs and mypy figure out what\n  we're doing.\n\n## v13.6.1\n\n- Require setuptools-scm 7.0.5 to avoid possible issues with source distributions in\n  earlier versions of setuptools-scm.\n- Suppress a spurious warning, improve tests, improve typing and other miscellany.\n\n## v13.6.0\n\n- Added a new `initialize` plugin hook, making it possible to suppress built-in\n  plugins more easily, among other possibilities.\n- Fixed an issue where unpaper would exit with a \"wrong stream\" error, probably\n  related to images with an odd integer width. {issue}`887, 665`\n\n## v13.5.0\n\n- Added a new `optimize_pdf` plugin hook, making it possible to create plugins that\n  replace or enhance OCRmyPDF's PDF optimizer.\n- Removed all max version restrictions. Our new policy is to blacklist known-bad releases\n  and only block known-bad versions of dependencies.\n- The naming schema for object that holds all OCR text that OCRmyPDF inserts has\n  changed. This has always been an implementation detail (and remains so), but possibly,\n  someone was relying on it and would appreciate the heads-up.\n- Cleanup.\n\n## v13.4.7\n\n- Fixed PermissionError when cleaning up temporary files in rare cases. {issue}`974`\n- Fixed PermissionError when calling `os.nice` on platforms that lack it. {issue}`973`\n- Suppressed some warnings from libxmp during tests.\n\n## v13.4.6\n\n- Convert error on corrupt ICC profiles into a warning. 
Thanks to @oscherler.\n\n## v13.4.5\n\n- Remove upper bound on pdfminer.six version.\n- Documentation.\n\n## v13.4.4\n\n- Updated pdfminer.six version.\n- Docker image changed to Ubuntu 22.04 now that it is released and provides the\n  dependencies we need. This seems more consistent than our recent change to\n  Debian.\n\n## v13.4.3\n\n- Fix error on pytest.skip() with older versions of pytest.\n- Documentation updates.\n\n## v13.4.2\n\n- Worked around a\n  [major regression in Ghostscript 9.56.0](https://bugs.ghostscript.com/show_bug.cgi?id=705187)\n  where **all OCR text is stripped out of the PDF**. It simply removes all text,\n  even generated by software other than OCRmyPDF. Fortunately, we can ask\n  Ghostscript 9.56.0 to use its old behavior that worked correctly for our purposes.\n  Users must avoid the combination (Ghostscript 9.56.0, ocrmypdf \\<13.4.2) since\n  older versions of OCRmyPDF have no way of detecting that this particular\n  version of Ghostscript removes all OCR text.\n- Marked pdfminer 20220319 as supported.\n- Fixed some deprecation warnings from recent versions of Pillow and pytest.\n- Test suite now covers Python 3.10 (Python 3.10 worked fine before, but was not\n  being tested).\n- Docker image now uses debian:bookworm-slim as the base image to fix the Docker\n  image build.\n\n## v13.4.1\n\n- Temporarily make threads rather than processes the default executor worker, due\n  to a persistent deadlock issue when processes are used. Add a new command line\n  argument `--no-use-threads` to disable this.\n\n## v13.4.0\n\n- Fixed test failures when using pikepdf 5.0.0.\n- Various improvements to the optimizer. 
In particular, we now recognize PDF images\n  that are encoded with both deflate (PNG) and DCT (JPEG), and also produce PDF\n  with images compressed with deflate and DCT, since this often yields file size\n  improvements compared to plain DCT.\n\n## v13.3.0\n\n- Made a harmless but \"scary\" exception after failing to optimize an image less scary.\n- Added a warning if a page image is too large for unpaper to clean. The image is\n  passed through without cleaning. This is due to a hard-coded limitation in a\n  C library used by unpaper so it cannot be rectified easily.\n- We now use better default settings when calling img2pdf.\n- We no longer try to optimize images that we failed to save in certain situations.\n- We now account for some differences in text output from Tesseract 5 compared to\n  Tesseract 4.\n- Better handling of Ghostscript producing empty images when attempting to rasterize\n  page images.\n\n## v13.2.0\n\n- Removed all runtime uses of distutils since it is deprecated in standard library. We\n  previous used `distutils.version` to examine version numbers of dependencies\n  at run time, and now use `packaging.version` for this. This is a new\n  dependency.\n- Fixed an error message advising the user that Ghostscript was not installed being\n  suppressed when this condition actually happens.\n- Fixed an issue with incorrect page number and totals being displayed in the progress\n  bar. This was purely a display/presentation issue. {issue}`876`.\n\n## v13.1.1\n\n- Fixed issue with attempting to deskew a blank page on Tesseract 5. 
{issue}`868`.\n\n## v13.1.0\n\n- Changed to using Python concurrent.futures-based parallel execution instead of\n  pools, since futures have now exceeded pools in features.\n- If a child worker is terminated (perhaps by the operating system or the user\n  killing it in a task manager), the parallel task will fail with an error message.\n  Previously, the main ocrmypdf process would \"hang\" indefinitely, waiting for the\n  child to report.\n- Added new argument `--tesseract-thresholding` to provide control over Tesseract 5's\n  threshold parameter.\n- Documentation updates and changes. Better documentation for `--output-type none`,\n  added a few releases ago. Removed some obsolete documentation.\n- Improved bash completions - thanks to @FPille.\n\n## v13.0.0\n\n**Breaking changes**\n\n- The deprecated module `ocrmypdf.leptonica` has been removed.\n- We no longer depend on Leptonica (`liblept`) or CFFI (`libffi`,\n  `python3-cffi`). (Note that Tesseract still requires Leptonica; OCRmyPDF no longer\n  directly uses this library.)\n- The argument `--remove-background` is temporarily disabled while we search for an\n  alternative to the Leptonica implementation of this feature.\n- The `--threshold` argument has been removed, since this also depended on Leptonica.\n  Tesseract 5.x has implemented improvements to thresholding, so this feature will be\n  redundant anyway.\n- `--deskew` was previously calculated by a Leptonica algorithm. We now use a feature\n  of Tesseract to find the appropriate angle to deskew a page. The deskew angle\n  according to Tesseract may differ from Leptonica's algorithm. At least in theory,\n  Tesseract's deskew angle is informed by a more complex analysis than Leptonica,\n  so this should improve results in general. 
We also use Pillow to perform the\n  deskewing, which may affect the appearance of the image compared to Leptonica.\n- Support for Python 3.6 was dropped, since this release is approaching end of life.\n- We now require pikepdf 4.0 or newer. This, in turn, means that OCRmyPDF requires\n  a system compatible with the manylinux2014 specification. This change was \"forced\"\n  by Pillow not releasing manylinux2010 wheels anymore.\n- We no longer provide requirements.txt-style files. Use `pip install ocrmypdf[...]`\n  instead.\n- Bumped required versions of several libraries.\n\n**Fixes**\n\n- Fixed an issue where OCRmyPDF failed to find Ghostscript on Windows even when\n  installed, and would exit with an error.\n- By removing Leptonica, we fixed all issues related to Leptonica on Apple\n  Silicon or Leptonica failing to import on Windows.\n\n"
  },
  {
    "path": "docs/releasenotes/version14.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v14\n\n## v14.4.0\n\n- Digitally signed PDFs are now detected. If the PDF is signed, OCRmyPDF will\n  refuse to modify it. Previously, only encrypted PDFs were detected, not\n  those that were signed but not encrypted. {issue}`1040`\n- In addition, `--invalidate-digital-signatures` can be used to override the\n  above behavior and modify the PDF anyway. {issue}`1040`\n- tqdm progress bars replaced with \"rich\" progress bars. The rich library is\n  a new dependency. Certain APIs that used tqdm are now deprecated and will\n  be removed in the next major release.\n- Improved integration with GitHub Releases. Thanks to @stumpylog.\n\n## v14.3.0\n\n- Renamed master branch to main.\n- Improve PDF rasterization accuracy by using the `-dPDFSTOPONERROR` option\n  to Ghostscript. Use `--continue-on-soft-render-error` if you want to render\n  the PDF anyway. The plugin specification was adjusted to support this feature;\n  plugin authors may want to adapt PDF rasterizing and rendering\n  plugins. {issue}`1083`\n- The calculated deskew angle is now recorded in the logged output. {issue}`1101`\n- Metadata can now be unset by setting a metadata type such as `--title` to an\n  empty string. {issue}`1117,1059`\n- Fixed random order of languages due to use of a set. This may have caused output\n  to vary when multiple languages were set for OCR. {issue}`1113`\n- Clarified the optimization ratio reported in the log output.\n- Documentation improvements.\n\n## v14.2.1\n\n- Fixed {issue}`977`, where images inside Form XObjects were always excluded\n  from image optimization.\n\n## v14.2.0\n\n- Added `--tesseract-downsample-above` to downsample larger images even when\n  they do not exceed Tesseract's internal limits. This can be used to speed\n  up OCR, possibly sacrificing accuracy.\n- Fixed resampling AttributeError on older Pillow. 
{issue}`1096`\n- Removed an error about using Ghostscript on PDFs that have the /UserUnit\n  feature in use. Previously, Ghostscript would fail to process these PDFs,\n  but in all supported versions it is now supported, so the error is no longer\n  needed.\n- Improved documentation around installing other language packs for Tesseract.\n\n## v14.1.0\n\n- Added `--tesseract-non-ocr-timeout`. This allows using Tesseract's deskew\n  and other non-OCR features while disabling OCR using `--tesseract-timeout 0`.\n- Added `--tesseract-downsample-large-images`. This downsamples large images\n  that exceed the maximum image size Tesseract can handle. Large images may still\n  take a long time to process, but this allows them to be processed if that\n  is desired.\n- Fixed {issue}`1082`, an issue with snap package building.\n- Change linter to ruff, fix lint errors, update documentation.\n\n## v14.0.4\n\n- Fixed {issue}`1066, 1075`, an exception when processing certain malformed PDFs.\n\n## v14.0.3\n\n- Fixed {issue}`1068`, avoid deleting /dev/null when running as root.\n- Other documentation fixes.\n\n## v14.0.2\n\n- Fixed {issue}`1052`, an exception on attempting to process certain nonconforming PDFs.\n- Explicitly documented that Windows 32-bit is no longer supported.\n- Fixed source installation instructions.\n- Other documentation fixes.\n\n## v14.0.1\n\n- Fixed some version checks done with smart version comparison.\n- Added missing jbig2dec to Docker image.\n\n## v14.0.0\n\n- Dropped support for Python 3.7.\n- Dropped support for, generally speaking, all dependencies older than what Ubuntu 20.04\n  provides.\n- Ghostscript 9.50 or newer is now required. Shims to support old versions were\n  removed.\n- Tesseract 4.1.1 or newer is now required. 
Shims to support old versions were\n  removed.\n- Docker image now uses Tesseract 5.\n- Dropped setup.cfg configuration for pyproject.toml.\n- Removed deprecation exception PdfMergeFailedError.\n- A few more public domain test files were removed or replaced. We are aiming for\n  100% compliance with SPDX and generally towards simplifying copyright.\n\n"
  },
  {
    "path": "docs/releasenotes/version15.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v15\n\n## v15.4.4\n\n- Fixed documentation for installing Ghostscript on Windows. {issue}`1198`\n- Added warning message about security issue in older versions of Ghostscript.\n\n## v15.4.3\n\n- Fixed deprecation warning in pikepdf older than 8.7.1; pikepdf >= 8.7.1 is\n  now required.\n\n## v15.4.2\n\n- We now raise an exception on a certain class of PDFs that likely need an\n  explicit color conversion strategy selected to display correctly\n  for PDF/A conversion.\n- Fixed an error that occurred while trying to write a log message after the\n  debug log handler was removed.\n\n## v15.4.1\n\n- Fixed misc/watcher.py regressions: accept `--ocr-json-settings` as either\n  filename or JSON string, as previously; and argument count mismatch.\n  {issue}`1183,1185`\n- We no longer attempt to set /ProcSet in the PDF output, since this is an\n  obsolete PDF feature.\n- Documentation improvements.\n\n## v15.4.0\n\n- Added new experimental APIs to support offline editing of the final text.\n  Specifically, one can now generate hOCR files with OCRmyPDF, edit them with\n  some other tool, and then finalize the PDF. They are experimental and\n  subject to change, including details of how the working folder is used.\n  There is no command line interface.\n- Code reorganization: executors, progress bars, initialization and setup.\n- Fixed test coverage in cases where the coverage tool did not properly trace\n  into threads or subprocesses. 
This code was still being tested but appeared\n  as not covered.\n- In the test suite, reduced use of subprocesses and other techniques that\n  interfere with coverage measurement.\n- Improved error check for when we appear to be running inside a snap container\n  and files are not available.\n- Plugin specification now properly defines progress bars as a protocol rather\n  than defining them as \"tqdm-like\".\n- We now default to using \"forkserver\" process creation on POSIX platforms\n  rather than fork, since this is method is more robust and avoids some\n  issues when threads are present.\n- Fixed an instance where the user's request to `--no-use-threads` was ignored.\n- If a PDF does not have language metadata on its top level object, we add\n  the OCR language.\n- Replace some cryptic test error messages with more helpful ones.\n- Debug messages for how OCRmyPDF picks the colorspace for a page are now\n  more descriptive.\n\n## v15.3.1\n\n- Fixed an issue with logging settings for misc/watcher.py introduced in the\n  previous release. {issue}`1180`\n- We now attempt to preserve the input's extended attributes when creating\n  the output file.\n- For some reason, the macOS build now needs OpenSSL explicitly installed.\n- Updated documentation on Docker performance concerns.\n\n## v15.3.0\n\n- Update misc/watcher.py to improve command line interface using Typer, and\n  support `.env` specification of environment variables. Improved error\n  messages. Thanks to @mflagg2814 for the PR that prompted this improvement.\n- Improved error message when a file cannot be read because we are running in\n  a snap container.\n\n## v15.2.0\n\n- Added a Docker image based on Alpine Linux. This image is smaller than the\n  Ubuntu-based image and may be useful in some situations. Currently hosted at\n  jbarlow83/ocrmypdf-alpine. 
Currently not available in ARM flavor.\n- The Ubuntu Docker is now aliased to jbarlow83/ocrmypdf-ubuntu.\n- Updated Docker documentation.\n\n## v15.1.0\n\n- We now require Pillow 10.0.1, due a serious security vulnerability in all earlier\n  versions of that dependency. The vulnerability concerns WebP images and could\n  be triggered in OCRmyPDF when creating a PDF from a malicious WebP image.\n- Added some keyword arguments to `ocrmypdf.ocr` that were previously accepted\n  but undocumented.\n- Documentation updates and typing improvements.\n\n## v15.0.2\n\n- Added Python 3.12 to test matrix.\n- Updated documentation for notes on Python 3.12, 32-bit support and some new\n  features in v15.\n\n## v15.0.1\n\n- Wheels Python tag changed to py39.\n- Marked as a expected fail a test that fails on recent Ghostscript versions.\n- Clarified documentation and release notes around the extent of 32-bit support.\n- Updated installation documentation to changes in v15.\n\n## v15.0.0\n\n- Dropped support for Python 3.8.\n- Dropped support some older dependencies, specifically `coloredlogs` and\n  `tqdm` in favor of rich - see `pyproject.toml` for details.\n  Generally speaking, Ubuntu 22.04 is our new baseline system.\n- Tightened version requirements for some dependencies.\n- Dropped support for 32-bit Linux wheels. We strongly recommend a 64-bit operating\n  system, and 64-bit versions of Python, Tesseract and Ghostscript to use OCRmyPDF.\n  Many of our dependencies are dropping 32-bit builds (e.g. Pillow), and we are\n  following suit. (Maintainers may still build 32-bit versions from source.)\n- Changed to trusted release for PyPI publishing.\n- pikepdf memory mapping is enabled again for improved performance, now that an\n  issue with feature in pikepdf is fixed.\n- `ocrmypdf.helpers.calculate_downsample` previously had two variants, one\n  that took a `PIL.Image` and one that took a `tuple[int, int]`. 
The latter\n  was removed.\n- The snap version of ocrmypdf is now based on Ubuntu core22.\n- We now account for situations where a small portion of an image on a page is drawn\n  at high DPI (resolution). Previously, the entire page would be rasterized at the\n  highest resolution of any feature, which caused performance problems. Now,\n  the page is rasterized\n  at a resolution based on the average DPI of the page, weighted by the area that\n  each feature occupies. Typically, small areas of high resolution in PDFs are\n  errors or quirks from the repeated use of assets and high resolution is not\n  beneficial. {issue}`1010,1104,1004,1079,1010`\n- Ghostscript color conversion strategy is now configurable using\n  `--color-conversion-strategy`. {issue}`1143`\n- JBIG2 threshold for optimization is now configurable using\n  `--jbig2-threshold`. {issue}`1133`\n\n"
  },
  {
    "path": "docs/releasenotes/version16.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v16\n\n## v16.13.0\n\n- Added detection and repair for Ghostscript 10.6 JPEG corruption. When GS 10.6\n  truncates JPEG data by 1-15 bytes, OCRmyPDF now restores the original image\n  bytes from the input PDF. A warning is issued when GS 10.6+ is detected.\n  {issue}`1603`\n- We continue to force re-optimization of JPEGs, since this catches some issues with corruption for situations where Ghostscript modifies an image. It is likely there are still cases where we cannot mitigate all corruption issues. {issue}`1585`\n- Fixed handling of PDF page boxes (ArtBox, BleedBox) which were not being\n  processed correctly in some cases. {issue}`1181,1360`\n- Documentation: clarified podman usage instructions.\n\n## v16.12.0\n\n- Disable Ghostscript's subset fonts feature, which was found to corrupt text in certain\n  PDFs. Thanks @mnaegler for identifying this issue. {issue}`1592`\n- Users of Ghostscript 10.6.0+ reported that Ghostscript seems to generate corrupted\n  JPEGs. We force re-optimization of these JPEGs to mitigate the corruption until\n  Ghostscript fixes the issue. {issue}`1585`\n- OCRmyPDF now avoids applying flate compression to large JPEG images, unless maximum\n  optimization is requested, since flate+DCT compression reduces performances in PDF\n  viewers with large images.\n- Updated Dockerfiles to use more recent base operating systems.\n- Updated build and test matrix to include Python 3.14.\n- Minor documentation improvements.\n- pikepdf >= 10.0.0 is now required.\n\n## v16.11.1\n\n- Fixed issue with Tesseract changing an error message related to skew. {issue}`1576`\n- Dropped macOS 13 from build-test matrix since it is no longer supported by Apple.\n\n## v16.11.0\n\n- Deprecated \"semfree\" plugin in favor of falling back to threads if the platform\n  does not support semaphores. 
Fixes an issue with Python 3.14.\n- Fixed references to PDF/A compliance levels to be consistent with ISO nomenclature.\n  Thanks @5HT2. {issue}`1557`\n- Fixed an issue around using plugin_manager as an argument. {issue}`1555`\n- Added OpenBSD install steps to README. {issue}`1554`\n- Removed PyPy from test matrix due to declining support in third party libraries.\n- Documentation improvements.\n\n## v16.10.4\n\n- Corrected build errors in Python 3.13.3 and 3.13.4.\n\n## v16.10.3 (not released)\n\n- Blocked optimization of images with pre-blended soft masks. {issue}`1536`\n- Fixed warning from hypothesis on running tests.\n- Release incomplete due to new test failures in Python 3.13.3 and 3.13.4.\n\n## v16.10.2\n\n- Blacklist pikepdf 9.8.0 due to an incompatible change.\n\n## v16.10.1\n\n- No changes affecting OCRmyPDF functionality for command line end users.\n- webservice: made page specification easier to find in UI.\n- webservice: fixed download button downloading the wrong file.\n- Converted project documentation from rST to Markdown.\n- Added README translation to Simplified Chinese. Thanks @HuaPai.\n- Modernized license specification in pyproject.toml.\n- Modernized SPDX license to REUSE.toml.\n\n## v16.10.0\n\n- Added hocr textangle processing, improving handling of text at angles.\n  Thanks @0dinD {issue}`1467`\n- Docker documentation updates related to podman. Thanks @rugk. {issue}`1489,1488`\n- Dropped webservice.py's fragile use of ttyd. Instead, messages from ocrmypdf are\n  printed to the console.\n- Fixed broken test test_hocrtransform_matches_sandwich, which had become\n  an invalid test. Thanks @QuLogic for reporting.\n- Improved install instructions for Windows. Thanks @alex.\n\n## v16.9.0\n\n- Added hocr caption processing. Thanks @0dinD {issue}`1466`\n- ocrmypdf-alpine Docker image is now built with Alpine 3.21.\n- Fixed error handling of PDFs that contain invalid images with both ImageMask\n  and ColorSpace defined. 
{issue}`1453`\n- Fixed test suite regression when only older Ghostscripts are installed.\n- Improved documentation of \\_progressbar.py. Thanks @QuentinFuxa. {issue}`1456`\n- Disabled building of documentation as PDF on ReadTheDocs, as this caused\n  complex build issues deemed not worth solving.\n\n## v16.8.0\n\n- Upgraded webservice.py demonstration using streamlit. It's now possible to\n  exercise most of OCRmyPDF's functionality in a simple web UI.\n- Added cache to Dockerfiles to improve build speed.\n- Fixed numerous formatting errors in the documentation that prevented some\n  parts of documentation from generating correctly.\n- Improved OCR text rendering by suppressing negative-width spaces. Thanks\n  @pajowu. {issue}`1446`\n- Improved detecting of invisible text when using `--redo-ocr`. Thanks\n  @pajowu. {issue}`1448`\n\n## v16.7.0\n\n- Fixed further issues with Docker build and updated some versions.\n- Main Docker image returned to Ubuntu 24.04 since the fix in v16.6.2 resolved\n  that concern.\n- Code that previously sent Ghostscript output to stdout has been changed to\n  output to temporary files, since Ghostscript was doing that anyway internally.\n  This is a modest efficiency improvement.\n- Fixed an issue with debug log output being parsed as rich markup. {issue}`1444`\n\n## v16.6.2\n\n- Remove invalid hyperlink annotations to satisfy Ghostscript 10.x during PDF/A\n  conversion. {issue}`1425`\n\n## v16.6.1\n\n- Fixed some issues with Docker build, such as removing unnecessary content and using\n  a stable Tesseract version.\n- Reverted Docker image to Ubuntu 22.04 to access older/more stable Ghostscript\n  for now.\n- Clarified batch commands in documentation.\n- Fixed an issue with JSON serialization and pickling of HOCRResult. {issue}`1427`\n\n## v16.6.0\n\n- Fixed an issue where damaged PDFs would fail with `--redo-ocr`. 
{issue}`1403`\n- Fixed an error that prevented JBIG2 optimization on Windows if the image\n  was optimized in an earlier step. {issue}`1396`\n- Fixed an error detecting the version of unpaper 7.0.0. {issue}`1409`\n- Fixed a performance regression when scanning pages. {issue}`1378`. Thanks @aliemjay.\n- Fixed Alpine Docker image by enforcing Alpine 3.19. Alpine 3.20 includes a\n  defective version of Tesseract OCR and so is not usable.\n- Upgraded Ubuntu Docker image to use Ubuntu 24.04.\n- Build and test scripts/actions switched to uv.\n- When running in a container, we now remind the user that temporary folders\n  are inside the container and may not be accessible.\n- Fixed Linux test coverage matrix, which was missing some key versions.\n\n## v16.5.0\n\n- Fixed issue with interpreting PDFs that have images with array masks.\n  {issue}`1377`\n- Enabled testing on Python 3.13.\n- Fixed a test that did not work correctly but still passed. {issue}`1382`\n- Improved \"PDF/A conversion failed\" warning message to better describe implications.\n- Updated documentation to better explain OCR_JSON_SETTINGS in batch processing.\n- Build backend changed from setuptools to hatchling.\n\n## v16.4.3\n\n- Work around pdfminer.six issue where a token on the buffer boundary is incorrectly\n  parsed as two tokens. {issue}`1361`\n- New rules are applied to stencil masks and explicit masks when calculating the\n  optimal page DPI for rendering. {issue}`1362`\n- Fixed attempts to use an incompatible jbig2.EXE provided by TeX Live. {issue}`1363`\n\n## v16.4.2\n\n- Fixed order of filenames passed to Ghostscript for PDF/A generation. {issue}`1359`\n- Suppressed missing jbig2dec warning message. {issue}`1358`\n- Fixed calculation of image size when soft mask dimensions don't match image\n  dimension. {issue}`1351`\n- Several fixes to documentation. 
Thanks to users Iris and JoKalliauer\n  who contributed these changes.\n- Fixed error on processing PDFs that are missing certain image metadata. {issue}`1315`\n\n## v16.4.1\n\n- Fixed calculation of image printed area (used in finding weighted DPI for OCR).\n  {issue}`1334`\n- Fixed \"NotImplementedError: not sure how to get colorspace\" error\n  messages in logs which simply record a failure to optimize images with\n  print production colorspaces. {issue}`1315`\n\n## v16.4.0\n\n- Selecting the `osd` and `equ` pseudo-languages with `-l/--language` now\n  exits with an error when using Tesseract OCR, because these are not\n  regular Tesseract languages but implementation details.\n  Using them can cause Tesseract to crash.\n- The hOCR renderer is more tolerant of extra whitespace in input files.\n- watcher.py now changes the output file extension to .pdf when the input is not\n  .pdf.\n- Improved handling of PDFs that contain circularly referenced Form XObjects.\n  {issue}`1321`\n- Fixed Alpine Docker image for ARM64, which was not building correctly.\n- Docker images now use pikepdf 9.0.0.\n- Prevent use of Tesseract OCR 5.4.0, a version with known regressions.\n- Disabled progressbar for \"Linearizing\" when `--no-progress-bar` is set.\n- Fixed some tests that warn about missing JBIG2 decoding via pikepdf, by\n  installing the necessary libraries during tests.\n\n## v16.3.1\n\n- Fixed a test suite failure with Ghostscript 10.03.0+. {issue}`1316`\n- Fixed an issue with the presentation of the \"OCR\" progress bar. {issue}`1313`\n\n## v16.3.0\n\n- Fixed progress bar not displaying for Ghostscript PDF/A conversion. {issue}`1313`\n- Added progress bar for linearization. {issue}`1313`\n- If `--rotate-pages-threshold` is issued without `--rotate-pages` we now exit with\n  an error since the user likely intended to use `--rotate-pages`. {issue}`1309`\n- If Tesseract hOCR gives an invalid line box, print an error message instead of\n  exiting with an error. 
{issue}`1312`\n\n## v16.2.0\n\n- Fixed issue 'NoneType' object has no attribute 'get' when optimizing certain PDFs.\n  {issue}`1293,1271`\n- Switched formatting from black to ruff.\n- Added support for sending sidecar output to io.BytesIO.\n- Added support for converting HEIF/HEIC images (the native image format of iPhones and\n  some other devices) to PDFs, when the appropriate pi-heif library is installed.\n  This library is marked as a dependency, but maintainers may opt out if needed.\n- We now default to downsampling large images that would exceed Tesseract's internal\n  limits, but only if it would cause processing to fail. Previously, this behavior only\n  occurred if specifically requested on command line. It can still be configured\n  and disabled. See the --tesseract command line options.\n- Added Macports install instructions. Thanks @akierig.\n- Improved logging output when an unexpected error occurs while trying to obtain\n  the version of a third party program.\n\n## v16.1.2\n\n- Fixed test suite failure when using Ghostscript 10.3.\n- Other minor corrections.\n\n## v16.1.1\n\n- Fixed PyPy 3.10 support.\n\n## v16.1.0\n\n- Improved hOCR renderer is now default for left to right languages.\n- Improved handling of rotated pages. Previously, OCR text might be missing for\n  pages that were rotated with a /Rotate tag on the page entry.\n- Improved handling of cropped pages. Previously, in some cases a page with a\n  crop box would not have its OCR applied correctly and misalignment between\n  OCR text and visible text could occur.\n- Documentation improvements, especially installation instructions for less\n  common platforms.\n\n## v16.0.4\n\n- Fixed some issues for left-to-right text with the new hOCR renderer. It is still\n  not default yet but will be made so soon. Right-to-left text is still in progress.\n- Added an error to prevent use of several versions of Ghostscript that seem\n  to corrupt existing text in input PDFs. 
Newly generated OCR is not affected.\n  For best results, use Ghostscript 10.02.1 or newer, which contains the fix\n  for the issue.\n\n## v16.0.3\n\n- Changed minimum required Ghostscript to 9.54, to support users of RHEL 9 and its\n  derivatives, since that is the latest version available there.\n- Removed warning message about CVE-2023-43115, on the assumption that most\n  distributions have backported the patch by now.\n\n## v16.0.2\n\n- Temporarily changed PDF text renderer back to sandwich by default to address\n  regressions in macOS Preview.\n\n## v16.0.1\n\n- Fixed text rendering issue with new hOCR text renderer - extraneous byte order\n  marks.\n- Tightened dependencies.\n\n## v16.0.0\n\n- Added OCR text renderer, combined the best ideas of Tesseract's PDF\n  generator and the older hOCR transformer renderer. The result is a hopefully\n  permanent fix for wordssmushedtogetherwithoutspaces issues in extracted text,\n  better registration/position of text on skewed baselines {issue}`1009`,\n  fixes to character output when the German Fraktur script is used {issue}`1191`,\n  proper rendering of right to left languages (Arabic, Hebrew, Persian) {issue}`1157`.\n  Asian languages may still have excessive word breaks compared to expectations.\n  The new renderer is the default; the old sandwich renderer is still available\n  using `--pdf-renderer sandwich`; the old hOCR renderer is no more.\n- The `ocrmypdf.hocrtransform` API has changed substantially.\n- Support for Python 3.9 has been dropped. Python 3.10+ is now required.\n- pikepdf >= 8.8.0 is now required.\n\n"
  },
  {
    "path": "docs/releasenotes/version17.md",
    "content": "% SPDX-FileCopyrightText: 2022 James R. Barlow\n% SPDX-License-Identifier: CC-BY-SA-4.0\n\n# v17\n\n## v17.3.0\n\n- Fixed Python API ignoring the ``language`` parameter, always defaulting to\n  ``eng``. The API now correctly maps ``language`` to OcrOptions ``languages``\n  and splits ``+``-separated codes (e.g. ``eng+deu``) to match CLI behavior.\n  {issue}`1640`\n- Fixed Python API producing empty OCR output because ``tesseract_timeout``\n  defaulted to 0, causing Tesseract to time out immediately. The default is\n  now ``None``, falling back to the plugin's 180-second timeout. {issue}`1636`\n- Fixed OCR text layer displacement on PDFs with non-zero MediaBox origins\n  (e.g. JSTOR or cropped PDFs). The coordinate transformation matrix is now\n  always computed, not skipped when rotation is zero. {issue}`1630`\n- Restored image overlay support (``--image``) for the hocrtransform tool,\n  enabling sandwich PDF output with the fpdf2 renderer. {issue}`1634`\n- Docker: updated Alpine base image to 3.23.\n- Documentation restructured into per-major-version release notes files.\n- Release process improvements.\n\n## v17.2.0\n\n- Fixed incorrect word spacing in poppler-based PDF viewers and tools (Evince,\n  pdftotext, and others) where words on the same line appeared separated by\n  double newlines. This works around a poppler bug where Tz (horizontal scaling)\n  is not carried across BT/ET boundaries. {issue}`1632`\n- Fixed OCR text layer being visible instead of invisible due to incorrect fpdf2\n  text rendering mode attribute. This caused OCR text to appear when images were\n  removed from the PDF. {issue}`1631`\n- Fixed OCR text layer misalignment with non-zero mediabox origins, which\n  affected cropped PDFs and JSTOR PDFs generated by iText. The ``--redo-ocr``\n  mode would shift text vertically on these files. 
{issue}`1630`\n- Fixed Ghostscript rasterization failure with very low DPI values (below 10).\n  OCRmyPDF now renders at a minimum of 10 DPI and resizes the output to match\n  the originally requested dimensions. {issue}`1612`\n\n## v17.1.0\n\n- Added `--tagged-pdf-mode` to allow skipping the TaggedPDF error message, if desired.\n- Fixed an issue where deflated JPEGs (FlateDecode + DCTDecode) were counted as\n  lossless images for the purpose of determining whether to compress to JPEG,\n  causing file size inflation with some workflows (`--mode force` in particular).\n\n## v17.0.1\n\n- Fixed output file size inflation when using pypdfium as rasterizer and force-ocr\n  mode.\n\n## v17.0.0\n\n**Breaking changes**\n\n- **Plugin interface migration**: Plugin hooks now receive `OcrOptions` objects instead of\n  `argparse.Namespace` objects. Most plugins will continue working due to duck-typing\n  compatibility, but plugin developers should update their type hints from `Namespace`\n  to `OcrOptions`.\n- Built-in plugins no longer modify options in-place, improving immutability and\n  code clarity.\n- **Lossy JBIG2 removed**: The `--jbig2-lossy` and `--jbig2-page-group-size` options have been\n  removed due to well-documented risks of character substitution errors. These options are now\n  deprecated and will emit warnings if used. Only lossless JBIG2 compression is supported.\n- **PDF/A output behavior change**: If neither Ghostscript nor verapdf is installed,\n  `--output-type auto` (the new default) will produce a standard PDF instead of PDF/A. This is\n  a change from previous versions where Ghostscript was required and PDF/A was always produced.\n  This configuration is rare but users should be aware of the change.\n\n**New features**\n\n- **pypdfium2 rasterizer**: Added optional pypdfium2-based PDF rasterization plugin as an\n  alternative to Ghostscript for page rendering. Use `--rasterizer pypdfium` to enable\n  (requires `pip install pypdfium2`). 
The default `--rasterizer auto` prefers pypdfium when\n  available and falls back to Ghostscript.\n- **Pluggable OCR engines**: New `--ocr-engine` option allows selecting OCR engines:\n  - `auto` (default): Uses Tesseract\n  - `tesseract`: Explicit Tesseract selection\n  - `none`: Skip OCR entirely for PDF processing-only workflows\n\n  This prepares the foundation for future third-party OCR engine plugins.\n- **Smart PDF/A conversion**: New `--output-type auto` (now the default) produces best-effort\n  PDF/A output without requiring Ghostscript when the verapdf validator is available. Falls back\n  to traditional Ghostscript conversion when needed.\n- **verapdf integration**: Added optional verapdf validation for fast PDF/A conversion. When\n  available, OCRmyPDF attempts speculative PDF/A conversion using pikepdf, validates with verapdf,\n  and skips Ghostscript if validation passes.\n- **Optional Ghostscript**: As a consequence of the changes above, Ghostscript is no longer a required dependency. It is optional.\n- **fpdf2 text renderer**: Replaced legacy hOCR text renderer with new fpdf2-based implementation,\n  providing better multilingual support and more accurate text positioning.\n- **Improved Occulta glyphless font**: The new Occulta font provides better handling of\n  zero-width markers and double-width CJK characters for accurate text layer positioning.\n- **Expanded multilingual font support**: Added FontProvider infrastructure with language-aware\n  font selection for Devanagari (Hindi, Sanskrit, Marathi, Nepali), CJK (Chinese, Japanese,\n  Korean), Arabic script, and many other scripts. 
System font discovery reduces package size.\n- **Simplified mode selection**: New `--mode` (`-m`) argument consolidates processing options:\n  - `default`: Error if text is found (standard behavior)\n  - `force`: Rasterize all content and run OCR (replaces `--force-ocr`)\n  - `skip`: Skip pages with existing text (replaces `--skip-text`)\n  - `redo`: Re-OCR pages, stripping old text layer (replaces `--redo-ocr`)\n\n  Legacy flags remain as silent aliases for backward compatibility.\n\n**API improvements**\n\n- Centralized validation logic in the `OcrOptions` Pydantic model\n- Removed scattered option mutation throughout the codebase\n- Better type safety for plugin development\n- Simplified plugin option handling\n- New `OcrElement`, `OcrClass`, and `BoundingBox` exports for OCR engine plugin developers\n- Extended `OcrEngine` ABC with `generate_ocr()` method for direct OCR tree output, eliding the need to translate a modern engine's output to hOCR or directly write to PDF.\n\n**Bug fixes**\n\n- Fixed double-compression of already-deflated JPEGs.\n- Fixed tesseract_cache plugin to properly handle cache misses.\n- Fixed handling of PDF page boxes (ArtBox, BleedBox) which were not being processed correctly.\n- Added thread safety lock to pypdfium plugin for concurrent operations.\n- Improved pdfminer.six compatibility with explicit word spacing.\n\n**Documentation**\n\n- Updated cookbook to replace deprecated `--tesseract-timeout 0` with `--ocr-engine none`.\n- Added comprehensive plugin documentation for new OCR engine framework.\n\n**Dependency changes**\n\n- Requires: one of `pypdfium2` or `ghostscript` for PDF rasterization (PDF to image)\n  - Preferred: both\n- Requires: one of `verapdf` or `ghostscript` for PDF/A generation\n  - Preferred: both\n- Recommended: `pypdfium2` for PDF rasterization (new dependency)\n- Recommended: `ghostscript` (used to be Required)\n- Recommended: Noto fonts for improved OCR text positioning\n- Optional: `verapdf` for fast PDF/A 
validation (new dependency)\n- Requires: `fpdf2` for text layer rendering (new dependency)\n- Recommended: replace `typer` with `cyclopts` in misc scripts (new dependency)\n- See docs/maintainers.md for details.\n\n**Migration guide for plugin developers**\n\n- Update imports: `from ocrmypdf._options import OcrOptions`\n- Update type hints: `def check_options(options: OcrOptions)` instead of `options: Namespace`\n- Attribute access remains unchanged: `options.languages`, `options.output_type`, etc.\n- Remove any in-place option modifications - compute values at point of use instead\n- Most existing plugins will continue working without changes due to duck-typing\n\n"
  },
  {
    "path": "misc/_webservice.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: AGPL-3.0-or-later\n\n\"\"\"This is a simple web service/HTTP wrapper for OCRmyPDF.\n\nThis may be more convenient than the command line tool for some Docker users.\nNote that OCRmyPDF uses Ghostscript, which is licensed under AGPLv3+. While\nOCRmyPDF is under GPLv3, this file is distributed under the Affero GPLv3+ license,\nto emphasize that SaaS deployments should make sure they comply with\nGhostscript's license as well as OCRmyPDF's.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport os\nimport subprocess\nimport sys\nfrom functools import partial\nfrom operator import getitem\nfrom pathlib import Path\nfrom tempfile import NamedTemporaryFile\n\nimport pikepdf\nimport streamlit as st\n\nfrom ocrmypdf._defaults import DEFAULT_ROTATE_PAGES_THRESHOLD\n\n\ndef get_host_url_with_port(port: int) -> str:\n    \"\"\"Get the host URL for the web service. Hacky.\"\"\"\n    host_url = st.context.headers[\"host\"]\n    try:\n        host, _streamlit_port = host_url.split(\":\", maxsplit=1)\n    except ValueError:\n        host = host_url\n    return f\"//{host}:{port}\"  # Use the same protocol\n\n\nst.title(\"OCRmyPDF Web Service\")\n\nuploaded = st.file_uploader(\"Upload input PDF or image\", type=[\"pdf\"], key=\"file\")\n\nmode = st.selectbox(\"Mode\", options=[\"normal\", \"skip-text\", \"force-ocr\", \"redo-ocr\"])\n\npages = st.text_input(\n    \"Pages\", value=\"\", help=\"Comma-separated list of pages to process\"\n)\n\nwith st.expander(\"Input options\"):\n    invalidate_digital_signatures = st.checkbox(\n        \"Invalidate digital signatures\", value=False\n    )\n    language = st.selectbox(\"Language\", options=[\"eng\", \"deu\", \"fra\", \"spa\"])\n\n    image_dpi = st.slider(\n        \"Image DPI\", value=300, key=\"image_dpi\", min_value=1, max_value=5000, step=50\n    )\nwith st.expander(\"Preprocessing\"):\n    skip_big = st.checkbox(\"Skip OCR on big pages\", 
value=False, key=\"skip_big\")\n    oversample = st.slider(\"Oversample\", min_value=0, max_value=5000, value=0, step=50)\n    rotate_pages = st.checkbox(\"Rotate pages\", value=False, key=\"rotate\")\n    deskew = st.checkbox(\"Deskew pages\", value=False, key=\"deskew\")\n    clean = st.checkbox(\"Clean pages before OCR\", value=False, key=\"clean\")\n    clean_final = st.checkbox(\"Clean final\", value=False, key=\"clean_final\")\n    remove_vectors = st.checkbox(\"Remove vectors\", value=False, key=\"remove_vectors\")\n\n\nwith st.expander(\"Output options\"):\n    output_type = st.selectbox(\n        \"Output type\", options=[\"pdfa\", \"pdf\", \"pdfa-1\", \"pdfa-2\", \"pdfa-3\", \"none\"]\n    )\n\n    pdf_renderer = st.selectbox(\n        \"PDF renderer\", options=[\"auto\", \"hocr\", \"hocrdebug\", \"sandwich\"]\n    )\n\n    optimize = st.selectbox(\"Optimize\", options=[\"0\", \"1\", \"2\", \"3\"])\n\n    st.selectbox(\"PDF/A compression\", options=[\"auto\", \"jpeg\", \"lossless\"])\n\nwith st.expander(\"Metadata\"):\n    title = author = keywords = subject = None\n    if uploaded:\n        with pikepdf.open(uploaded) as pdf, pdf.open_metadata() as meta:\n            st.code(str(meta), language=\"xml\")\n            title = st.text_input(\"Title\", value=meta.get('dc:title', ''))\n            author = st.text_input(\"Author\", value=meta.get('dc:creator', ''))\n            keywords = st.text_input(\"Keywords\", value=meta.get('dc:subject', ''))\n            subject = st.text_input(\"Subject\", value=meta.get('dc:description', ''))\n\n\nwith st.expander(\"Optimization after OCR\"):\n    jpeg_quality = st.slider(\n        \"JPEG quality\", min_value=0, max_value=100, value=75, key=\"jpeg_quality\"\n    )\n    png_quality = st.slider(\n        \"PNG quality\", min_value=0, max_value=100, value=75, key=\"png_quality\"\n    )\n    jbig2_threshold = st.number_input(\n        \"JBIG2 threshold\", value=0.85, key=\"jbig2_threshold\"\n    )\n\nwith 
st.expander(\"Advanced options\"):\n    jobs = st.slider(\n        \"Threads\",\n        min_value=1,\n        max_value=os.cpu_count(),\n        value=os.cpu_count(),\n        key=\"threads\",\n    )\n    max_image_mpixels = st.number_input(\n        \"Max image size\",\n        value=250.0,\n        min_value=0.0,\n        help=\"Maximum image size in megapixels\",\n    )\n    rotate_pages_threshold = st.number_input(\n        \"Rotate pages threshold\",\n        value=DEFAULT_ROTATE_PAGES_THRESHOLD,\n        min_value=0.0,\n        max_value=1000.0,\n        help=\"Threshold for automatic page rotation\",\n    )\n    fast_web_view = st.number_input(\n        \"Fast web view\",\n        value=1.0,\n        min_value=0.0,\n        help=\"Linearize files above this size in MB\",\n    )\n    continue_on_soft_render_error = st.checkbox(\n        \"Continue on soft render error\", value=True\n    )\n    verbose_labels = [\"quiet\", \"default\", \"debug\", \"debug_all\"]\n    verbose = st.selectbox(\n        \"Verbosity level\",\n        options=[-1, 0, 1, 2],\n        index=1,\n        format_func=partial(getitem, verbose_labels),\n    )\n\nif uploaded:\n    args = []\n    if mode and mode != 'normal':\n        args.append(f\"--{mode}\")\n    if language:\n        args.append(f\"--language={language}\")\n    if not uploaded.name.lower().endswith(\".pdf\") and image_dpi:\n        args.append(f\"--image-dpi={image_dpi}\")\n    if skip_big:\n        args.append(\"--skip-big\")\n    if oversample:\n        args.append(f\"--oversample={oversample}\")\n    if rotate_pages:\n        args.append(\"--rotate-pages\")\n    if deskew:\n        args.append(\"--deskew\")\n    if clean:\n        args.append(\"--clean\")\n    if clean_final:\n        args.append(\"--clean-final\")\n    if remove_vectors:\n        args.append(\"--remove-vectors\")\n    if output_type:\n        args.append(f\"--output-type={output_type}\")\n    if pdf_renderer:\n        
args.append(f\"--pdf-renderer={pdf_renderer}\")\n    if optimize:\n        args.append(f\"--optimize={optimize}\")\n    if title:\n        args.append(f\"--title={title}\")\n    if author:\n        args.append(f\"--author={author}\")\n    if keywords:\n        args.append(f\"--keywords={keywords}\")\n    if subject:\n        args.append(f\"--subject={subject}\")\n    if pages:\n        args.append(f\"--pages={pages}\")\n    if max_image_mpixels:\n        args.append(f\"--max-image-mpixels={max_image_mpixels}\")\n    if rotate_pages_threshold:\n        args.append(f\"--rotate-pages-threshold={rotate_pages_threshold}\")\n    if fast_web_view:\n        args.append(f\"--fast-web-view={fast_web_view}\")\n    if continue_on_soft_render_error:\n        args.append(\"--continue-on-soft-render-error\")\n    if verbose:\n        args.append(f\"--verbose={verbose}\")\n    if optimize > '0' and jpeg_quality:\n        args.append(f\"--jpeg-quality={jpeg_quality}\")\n    if optimize > '0' and png_quality:\n        args.append(f\"--png-quality={png_quality}\")\n    if jbig2_threshold:\n        args.append(f\"--jbig2-threshold={jbig2_threshold}\")\n    if jobs:\n        args.append(f\"--jobs={jobs}\")\n    with NamedTemporaryFile(delete=True, suffix=f\"_{uploaded.name}\") as input_file:\n        input_file.write(uploaded.getvalue())\n        input_file.flush()\n        input_file.seek(0)\n        args.append(str(input_file.name))\n        with NamedTemporaryFile(delete=True, suffix=\".pdf\") as output_file:\n            args.append(str(output_file.name))\n\n            st.session_state['running'] = (\n                'run_button' in st.session_state and st.session_state.run_button\n            )\n            if st.button(\n                \"Run OCRmyPDF\",\n                disabled=st.session_state.get(\"running\", False),\n                key='run_button',\n            ):\n                st.session_state['running'] = True\n                args = [sys.executable, '-u', '-m', 
\"ocrmypdf\"] + args\n\n                proc = subprocess.Popen(\n                    args, stdout=subprocess.PIPE, stderr=subprocess.PIPE\n                )\n                with st.container(border=True):\n                    while proc.poll() is None:\n                        line = proc.stderr.readline()\n                        if line:\n                            st.html(\"<code>\" + line.decode().strip() + \"</code>\")\n\n                if proc.returncode != 0:\n                    st.error(f\"ocrmypdf failed with exit code {proc.returncode}\")\n                    st.session_state['running'] = False\n                    st.stop()\n\n                if Path(output_file.name).stat().st_size == 0:\n                    st.error(\"No output PDF file was generated\")\n                    st.stop()\n\n                st.download_button(\n                    label=\"Download output PDF\",\n                    data=output_file.read(),\n                    file_name=uploaded.name,\n                    mime=\"application/pdf\",\n                )\n                st.session_state['running'] = False\n"
  },
  {
    "path": "misc/batch.py",
    "content": "#!/usr/bin/env python3\n# SPDX-FileCopyrightText: 2016 findingorder <https://github.com/findingorder>\n# SPDX-FileCopyrightText: 2024 nilsro <https://github.com/nilsro>\n# SPDX-License-Identifier: MIT\n\n\"\"\"Example of using ocrmypdf as a library in a script.\n\nThis script will recursively search a directory for PDF files and run OCR on\nthem. It will log the results. It runs OCR on every file, even if it already\nhas text. OCRmyPDF will detect files that already have text.\n\nYou should edit this script to meet your needs.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport filecmp\nimport logging\nimport os\nimport posixpath\nimport shutil\nimport sys\nfrom pathlib import Path\n\nimport ocrmypdf\n\n# pylint: disable=logging-format-interpolation\n# pylint: disable=logging-not-lazy\n\n\ndef filecompare(a, b):\n    try:\n        return filecmp.cmp(a, b, shallow=True)\n    except FileNotFoundError:\n        return False\n\n\nscript_dir = Path(__file__).parent\n# set archive_dir to a path for backup original documents. 
Leave empty if not required.\narchive_dir = \"/pdfbak\"\n\nstart_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(\".\")\n\nif len(sys.argv) > 2:\n    log_file = Path(sys.argv[2])\nelse:\n    log_file = script_dir.with_name(\"ocr-tree.log\")\n\nlogging.basicConfig(\n    level=logging.INFO,\n    format=\"%(asctime)s %(message)s\",\n    filename=log_file,\n    filemode=\"a\",\n)\n\nlogging.info(f\"Start directory {start_dir}\")\n\nocrmypdf.configure_logging(ocrmypdf.Verbosity.default)\n\nfor filename in start_dir.glob(\"**/*.pdf\"):\n    logging.info(f\"Processing {filename}\")\n    if ocrmypdf.pdfa.file_claims_pdfa(filename)[\"pass\"]:\n        logging.info(\"Skipped document because it already contained text\")\n    else:\n        archive_filename = archive_dir + str(filename)\n        if len(archive_dir) > 0 and not filecompare(filename, archive_filename):\n            logging.info(f\"Archiving document to {archive_filename}\")\n            try:\n                shutil.copy2(filename, posixpath.dirname(archive_filename))\n            except OSError:\n                os.makedirs(posixpath.dirname(archive_filename))\n                shutil.copy2(filename, posixpath.dirname(archive_filename))\n        try:\n            result = ocrmypdf.ocr(filename, filename, deskew=True)\n            logging.info(result)\n        except ocrmypdf.exceptions.EncryptedPdfError:\n            logging.info(\"Skipped document because it is encrypted\")\n        except ocrmypdf.exceptions.PriorOcrFoundError:\n            logging.info(\"Skipped document because it already contained text\")\n        except ocrmypdf.exceptions.DigitalSignatureError:\n            logging.info(\"Skipped document because it has a digital signature\")\n        except ocrmypdf.exceptions.TaggedPDFError:\n            logging.info(\n                \"Skipped document because it does not need ocr as it is tagged\"\n            )\n        except Exception:\n            logging.error(\"Unhandled error 
occurred\")\n        logging.info(\"OCR complete\")\n"
  },
  {
    "path": "misc/bisect_pdf.py",
    "content": "#!/usr/bin/env python3\n# SPDX-FileCopyrightText: 2024 James R. Barlow\n# SPDX-License-Identifier: MIT\n\n\"\"\"Helper script for bisecting PDFs to find a page with an issue.\"\"\"\nfrom __future__ import annotations\n\nimport sys\n\nimport pikepdf\n\nif len(sys.argv) != 2:\n    print(f\"Usage: {sys.argv[0]} <input.pdf>\")\n    sys.exit(1)\n\nwith pikepdf.open(sys.argv[1]) as pdf:\n    num_pages = len(pdf.pages)\n    low = 0\n    high = num_pages - 1\n    while low <= high:\n        mid = (low + high) // 2\n        with pikepdf.new() as new_pdf:\n            new_pdf.pages.extend(pdf.pages[low : mid + 1])\n            new_pdf.save(f\"bisect-issue-{low + 1}-{mid + 1}.pdf\")\n        print(f\"Is bisect-issue-{low + 1}-{mid + 1}.pdf good or bad?\", end=\" \")\n        while True:\n            response = input().lower()\n            if response == \"good\":\n                low = mid + 1\n                break\n            elif response == \"bad\":\n                high = mid - 1\n                break\n            else:\n                print(\"Please respond with 'good' or 'bad'.\")\n    print(f\"The issue is on page {low + 1} of the original PDF.\")\n    with pikepdf.new() as new_pdf:\n        new_pdf.pages.extend(pdf.pages[low])\n        new_pdf.save(f\"bisect-issue-bad-{low + 1}.pdf\")\n    with pikepdf.new() as new_pdf:\n        new_pdf.pages.extend(pdf.pages[:low])\n        new_pdf.pages.extend(pdf.pages[low + 1 :])\n        new_pdf.save(f\"bisect-issue-good-{low + 1}.pdf\")\n"
  },
  {
    "path": "misc/completion/ocrmypdf.bash",
    "content": "# SPDX-FileCopyrightText: 2021 Frank Pille\n# SPDX-FileCopyrightText: 2020 Alex Willner\n# SPDX-License-Identifier: MIT\n\nset -o errexit\n\n__ocrmypdf_arguments()\n{\n    local arguments=\"\\\n--help                          (show help message)\n--language                      (language(s) of the file to be OCRed)\n--image-dpi                     (assume this DPI if input image DPI is unknown)\n--output-type                   (select PDF output options)\n--sidecar                       (write OCR to text file)\n--version                       (print program version and exit)\n--jobs                          (how many worker processes to use)\n--quiet                         (suppress INFO messages)\n--verbose                       (set verbosity level)\n--title                         (set metadata)\n--author                        (set metadata)\n--subject                       (set metadata)\n--keywords                      (set metadata)\n--rotate-pages                  (rotate pages to correct orientation)\n--deskew                        (fix small horizontal alignment skew)\n--clean                         (clean document images before OCR)\n--clean-final                   (clean document images and keep result)\n--unpaper-args                  (a quoted string of arguments to pass to unpaper)\n--oversample                    (oversample images to this DPI)\n--remove-vectors                (don\\'t send vector objects to OCR)\n--mode                          (processing mode for pages with existing text)\n--force-ocr                     (OCR documents that already have printable text)\n--skip-text                     (skip OCR on any pages that already contain text)\n--redo-ocr                      (redo OCR on any pages that seem to have OCR already)\n--invalidate-digital-signatures (remove digital signatures from PDF)\n--tagged-pdf-mode               (control behavior for Tagged PDFs)\n--skip-big                      (skip OCR on pages 
larger than this many MPixels)\n--optimize                      (select optimization level)\n--jpeg-quality                  (JPEG quality [0..100])\n--png-quality                   (PNG quality [0..100])\n--jbig2-lossy                   (enable lossy JBIG2 (see docs))\n--jbig2-threshold               (set JBIG2 threshold (see docs))\n--pages                         (apply OCR to only the specified pages)\n--max-image-mpixels             (image decompression bomb threshold)\n--pdf-renderer                  (select PDF renderer options)\n--ocr-engine                    (OCR engine to use)\n--rasterizer                    (PDF page rasterizer)\n--rotate-pages-threshold        (page rotation confidence)\n--pdfa-image-compression        (set PDF/A image compression options)\n--fast-web-view                 (if file size is above this amount in MB, linearize PDF)\n--continue-on-soft-render-error (continue after recoverable render errors)\n--plugin                        (name of plugin to import)\n--keep-temporary-files          (keep temporary files (debug))\n--tesseract-config              (set custom tesseract config file)\n--tesseract-pagesegmode         (set tesseract --psm)\n--tesseract-oem                 (set tesseract --oem)\n--tesseract-thresholding        (set tesseract image thresholding)\n--tesseract-timeout             (maximum number of seconds to wait for OCR)\n--tesseract-non-ocr-timeout     (maximum seconds for non-OCR operations)\n--tesseract-downsample-large-images    (downsample large images before OCR)\n--no-tesseract-downsample-large-images (do not downsample large images)\n--tesseract-downsample-above    (downsample images larger than this pixel size)\n--user-words                    (specify location of user words file)\n--user-patterns                 (specify location of user patterns file)\n--no-progress-bar               (disable the progress bar)\n--color-conversion-strategy     (select color conversion strategy)\n\"\n\n    COMPREPLY=( $( 
compgen -W \"$arguments\" -- \"$cur\") )\n\n    # Remove description if only one completion exists\n    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( ${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_output-type()\n{\n    local choices=\"auto   (best-effort PDF/A without Ghostscript (default))\npdfa   (output a PDF/A-2b)\npdf    (output a standard PDF)\npdfa-1 (output a PDF/A-1b)\npdfa-2 (output a PDF/A-2b)\npdfa-3 (output a PDF/A-3b)\nnone   (do not produce an output PDF (for example, if you only care about --sidecar))\"\n\n    COMPREPLY=( $( compgen -W \"$choices\" -- \"$cur\") )\n\n    # Remove description if only one completion exists\n    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( ${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_verbose()\n{\n    local choices=\"0  (standard output messages)\n1  (troubleshooting output messages)\n2  (debugging output messages)\"\n\n    COMPREPLY=( $( compgen -W \"$choices\" -- \"$cur\") )\n\n    # Remove description if only one completion exists\n    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( ${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_optimize()\n{\n    local choices=\"0  (do not optimize)\n1  (do safe, lossless optimizations (default))\n2  (do some lossy optimizations)\n3  (do aggressive lossy optimizations (including lossy JBIG2))\"\n\n    COMPREPLY=( $( compgen -W \"$choices\" -- \"$cur\") )\n\n    # Remove description if only one completion exists\n    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( ${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_pdf-renderer()\n{\n    local choices=\"auto      (auto select PDF renderer, uses fpdf2)\nfpdf2     (use fpdf2 renderer with full language support)\nsandwich  (use sandwich renderer)\nhocr      (use hOCR renderer - deprecated)\nhocrdebug (uses hOCR renderer in debug mode - deprecated)\"\n\n    COMPREPLY=( $( compgen -W \"$choices\" -- \"$cur\") )\n\n    # Remove description if only one completion exists\n    if [[ 
${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( ${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_pdfa-image-compression()\n{\n    local choices=\"auto     (let Ghostscript decide how to compress images)\njpeg     (convert color and grayscale images to JPEG)\nlossless (convert color and grayscale images to lossless (PNG))\"\n\n    COMPREPLY=( $( compgen -W \"$choices\" -- \"$cur\") )\n\n    # Remove description if only one completion exists\n    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( ${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_tesseract-pagesegmode()\n{\n    local choices=\"0  (orientation and script detection (OSD) only)\n1  (automatic page segmentation with OSD)\n2  (automatic page segmentation, but no OSD, or OCR)\n3  (fully automatic page segmentation, but no OSD (default))\n4  (assume a single column of text of variable sizes)\n5  (assume a single uniform block of vertically aligned text)\n6  (assume a single uniform block of text)\n7  (treat the image as a single text line)\n8  (treat the image as a single word)\n9  (treat the image as a single word in a circle)\n10 (treat the image as a single character)\n11 (sparse text - find as much text as possible in no particular order)\n12 (sparse text with OSD)\n13 (raw line - treat the image as a single text line)\"\n\n    COMPREPLY=( $( compgen -W \"$choices\" -- \"$cur\") )\n\n    # Remove description if only one completion exists\n    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( ${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_tesseract-oem()\n{\n    local choices=\"0 (legacy engine only)\n1 (neural nets LSTM engine only)\n2 (legacy + LSTM engines)\n3 (default, based on what is available)\"\n\n    COMPREPLY=( $( compgen -W \"$choices\" -- \"$cur\") )\n\n    # Remove description if only one completion exists\n    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( ${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_tesseract-thresholding()\n{\n    local choices=\"auto         
 (let OCRmyPDF pick thresholding - currently always uses otsu)\notsu          (use legacy Otsu thresholding)\nadaptive-otsu (use adaptive Otsu thresholding)\nsauvola       (use Sauvola thresholding)\"\n\n    COMPREPLY=( $( compgen -W \"$choices\" -- \"$cur\") )\n    # Remove description if only one completion exists\n    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( ${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_color-conversion-strategy()\n{\n    local choices=\"LeaveColorUnchanged (default)\nCMYK (convert to CMYK)\nGray (convert to grayscale)\nRGB (convert to RGB)\nUseDeviceIndependentColor (convert with device independent color)\"\n\n    COMPREPLY=( $( compgen -W \"$choices\" -- \"$cur\") )\n    # Remove description if only one completion exists\n    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( ${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_mode()\n{\n    local choices=\"default (error if text is found)\nforce   (rasterize all content and run OCR)\nskip    (skip pages with existing text)\nredo    (re-OCR pages, replacing old invisible text)\"\n\n    COMPREPLY=( $( compgen -W \"$choices\" -- \"$cur\") )\n    # Remove description if only one completion exists\n    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( ${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_tagged-pdf-mode()\n{\n    local choices=\"default (error if --mode is default, otherwise warn)\nignore  (always warn but continue processing)\"\n\n    COMPREPLY=( $( compgen -W \"$choices\" -- \"$cur\") )\n    # Remove description if only one completion exists\n    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( ${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_ocr-engine()\n{\n    local choices=\"auto      (select best available engine)\ntesseract (use Tesseract OCR)\nnone      (skip OCR entirely)\"\n\n    COMPREPLY=( $( compgen -W \"$choices\" -- \"$cur\") )\n    # Remove description if only one completion exists\n    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( 
${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_rasterizer()\n{\n    local choices=\"auto        (prefer pypdfium, fall back to Ghostscript)\nghostscript (use Ghostscript rasterizer)\npypdfium    (use pypdfium rasterizer - faster)\"\n\n    COMPREPLY=( $( compgen -W \"$choices\" -- \"$cur\") )\n    # Remove description if only one completion exists\n    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then\n        COMPREPLY=( ${COMPREPLY[0]%% *} )\n    fi\n}\n\n__ocrmypdf_check_previous()\n{\n    case $prev in\n        -h|--help|--version)\n            return 0\n            ;;\n        -l|--language)\n            COMPREPLY=$( command tesseract --list-langs 2>/dev/null )\n            COMPREPLY=( $( compgen -W '${COMPREPLY[@]##*:}' -- \"$cur\" ) )\n            return 0\n            ;;\n        --output-type)\n            __ocrmypdf_output-type\n            return 0\n            ;;\n        -j|--jobs)\n            COMPREPLY=( $( compgen -W '{1..'$( _ncpus )'}' -- \"$cur\" ) )\n            return 0\n            ;;\n        -v|--verbose)\n            __ocrmypdf_verbose\n            return 0\n            ;;\n        -O|--optimize)\n            __ocrmypdf_optimize\n            return 0\n            ;;\n        --pdf-renderer)\n            __ocrmypdf_pdf-renderer\n            return 0\n            ;;\n        -m|--mode)\n            __ocrmypdf_mode\n            return 0\n            ;;\n        --tagged-pdf-mode)\n            __ocrmypdf_tagged-pdf-mode\n            return 0\n            ;;\n        --ocr-engine)\n            __ocrmypdf_ocr-engine\n            return 0\n            ;;\n        --rasterizer)\n            __ocrmypdf_rasterizer\n            return 0\n            ;;\n        --pdfa-image-compression)\n            __ocrmypdf_pdfa-image-compression\n            return 0\n            ;;\n        --tesseract-pagesegmode)\n            __ocrmypdf_tesseract-pagesegmode\n            return 0\n            ;;\n        --tesseract-oem)\n            __ocrmypdf_tesseract-oem\n            
return 0\n            ;;\n        --tesseract-thresholding)\n            __ocrmypdf_tesseract-thresholding\n            return 0\n            ;;\n\n        --title|--author|--subject|--keywords|--unpaper-args|--pages|--plugin|\\\n        --jpeg-quality|--png-quality|--image-dpi|--oversample|--skip-big|--max-image-mpixels|\\\n        --tesseract-timeout|--tesseract-non-ocr-timeout|--tesseract-downsample-above|\\\n        --rotate-pages-threshold|--fast-web-view)\n            # argument required but no completions available\n            return 0\n            ;;\n        --tesseract-config|--user-words|--user-patterns|--sidecar)\n            _filedir\n            return 0\n            ;;\n        --color-conversion-strategy)\n            __ocrmypdf_color-conversion-strategy\n            return 0\n            ;;\n    esac\n\n    return 1\n}\n\n_ocrmypdf()\n{\n    local OLDIFS=\"$IFS\"\n    local IFS=$'\\n'\n\n    local cur prev\n\n    # Homebrew on Macs has version 1.3 of bash-completion, which doesn't include _init_completion - see #502\n    if declare -F _init_completion >/dev/null 2>&1; then\n      _init_completion  || return\n    else\n        COMPREPLY=()\n        _get_comp_words_by_ref cur prev\n    fi\n\n    if __ocrmypdf_check_previous -ne 0; then\n        return\n    fi\n\n    if [[ \"$cur\" == -* ]]; then\n        __ocrmypdf_arguments\n    else\n        _filedir\n    fi\n\n    IFS=\"$OLDIFS\"\n\n    return\n} &&\ncomplete -F _ocrmypdf ocrmypdf\n\nset +o errexit\n\n# ex: filetype=sh\n"
  },
  {
    "path": "misc/completion/ocrmypdf.fish",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\n\ncomplete -c ocrmypdf -x -n __fish_is_first_arg -l version\ncomplete -c ocrmypdf -x -n __fish_is_first_arg -s h -s \"?\" -l help\n\ncomplete -c ocrmypdf -r -l sidecar -d \"write OCR to text file\"\ncomplete -c ocrmypdf -x -s q -l quiet\n\ncomplete -c ocrmypdf -s r -l rotate-pages -d \"rotate pages to correct orientation\"\ncomplete -c ocrmypdf -s d -l deskew -d \"fix small horizontal alignment skew\"\ncomplete -c ocrmypdf -s c -l clean -d \"clean document images before OCR\"\ncomplete -c ocrmypdf -s i -l clean-final -d \"clean document images and keep result\"\ncomplete -c ocrmypdf -x -l unpaper-args -d \"quoted string of arguments to pass to unpaper\"\ncomplete -c ocrmypdf -l remove-vectors -d \"don't send vector objects to OCR\"\n\nfunction __fish_ocrmypdf_mode\n    echo -e \"default\\t\"(_ \"error if text is found\")\n    echo -e \"force\\t\"(_ \"rasterize all content and run OCR\")\n    echo -e \"skip\\t\"(_ \"skip pages with existing text\")\n    echo -e \"redo\\t\"(_ \"re-OCR pages, replacing old invisible text\")\nend\ncomplete -c ocrmypdf -x -s m -l mode -a '(__fish_ocrmypdf_mode)' -d \"processing mode for pages with existing text\"\ncomplete -c ocrmypdf -s f -l force-ocr -d \"OCR documents that already have printable text\"\ncomplete -c ocrmypdf -s s -l skip-text -d \"skip OCR on any pages that already contain text\"\ncomplete -c ocrmypdf -l redo-ocr -d \"redo OCR on any pages that seem to have OCR already\"\ncomplete -c ocrmypdf -l invalidate-digital-signatures -d \"invalidate digital signatures and allow OCR to proceed\"\n\nfunction __fish_ocrmypdf_tagged_pdf_mode\n    echo -e \"default\\t\"(_ \"error if --mode is default, otherwise warn\")\n    echo -e \"ignore\\t\"(_ \"always warn but continue processing\")\nend\ncomplete -c ocrmypdf -x -l tagged-pdf-mode -a '(__fish_ocrmypdf_tagged_pdf_mode)' -d \"control behavior for Tagged PDFs\"\n\ncomplete -c ocrmypdf 
-s k -l keep-temporary-files -d \"keep temporary files (debug)\"\n\nfunction __fish_ocrmypdf_languages\n    set langs (tesseract --list-langs ^/dev/null)\n    set arr (string split '\\n' $langs)\n    for lang in $arr[2..-1]\n        echo $lang\n    end\nend\ncomplete -c ocrmypdf -x -s l -l language -a '(__fish_ocrmypdf_languages)' -d language\n\ncomplete -c ocrmypdf -x -l image-dpi -d \"assume this DPI if input image DPI is unknown\"\n\nfunction __fish_ocrmypdf_output_type\n    echo -e \"auto\\t\"(_ \"best-effort PDF/A without requiring Ghostscript (default)\")\n    echo -e \"pdfa\\t\"(_ \"output a PDF/A-2b\")\n    echo -e \"pdf\\t\"(_ \"output a standard PDF\")\n    echo -e \"pdfa-1\\t\"(_ \"output a PDF/A-1b\")\n    echo -e \"pdfa-2\\t\"(_ \"output a PDF/A-2b\")\n    echo -e \"pdfa-3\\t\"(_ \"output a PDF/A-3b\")\n    echo -e \"none\\t\"(_ \"do not produce an output PDF (for example, if you only care about --sidecar)\")\nend\ncomplete -c ocrmypdf -x -l output-type -a '(__fish_ocrmypdf_output_type)' -d \"select PDF output options\"\n\nfunction __fish_ocrmypdf_pdf_renderer\n    echo -e \"auto\\t\"(_ \"auto select PDF renderer (default, uses fpdf2)\")\n    echo -e \"fpdf2\\t\"(_ \"use fpdf2 renderer with full language support\")\n    echo -e \"sandwich\\t\"(_ \"use sandwich renderer\")\n    echo -e \"hocr\\t\"(_ \"use hOCR renderer (deprecated)\")\n    echo -e \"hocrdebug\\t\"(_ \"uses hOCR renderer in debug mode (deprecated)\")\nend\ncomplete -c ocrmypdf -x -l pdf-renderer -a '(__fish_ocrmypdf_pdf_renderer)' -d \"select PDF renderer options\"\n\nfunction __fish_ocrmypdf_ocr_engine\n    echo -e \"auto\\t\"(_ \"select best available engine (default)\")\n    echo -e \"tesseract\\t\"(_ \"use Tesseract OCR\")\n    echo -e \"none\\t\"(_ \"skip OCR entirely\")\nend\ncomplete -c ocrmypdf -x -l ocr-engine -a '(__fish_ocrmypdf_ocr_engine)' -d \"OCR engine to use\"\n\nfunction __fish_ocrmypdf_rasterizer\n    echo -e \"auto\\t\"(_ \"prefer pypdfium, fall back to Ghostscript 
(default)\")\n    echo -e \"ghostscript\\t\"(_ \"use Ghostscript rasterizer\")\n    echo -e \"pypdfium\\t\"(_ \"use pypdfium rasterizer (faster)\")\nend\ncomplete -c ocrmypdf -x -l rasterizer -a '(__fish_ocrmypdf_rasterizer)' -d \"PDF page rasterizer\"\n\nfunction __fish_ocrmypdf_optimize\n    echo -e \"0\\t\"(_ \"do not optimize\")\n    echo -e \"1\\t\"(_ \"do safe, lossless optimizations (default)\")\n    echo -e \"2\\t\"(_ \"do some lossy optimizations\")\n    echo -e \"3\\t\"(_ \"do aggressive lossy optimizations (including lossy JBIG2)\")\nend\ncomplete -c ocrmypdf -x -s O -l optimize -a '(__fish_ocrmypdf_optimize)' -d \"select optimization level\"\n\nfunction __fish_ocrmypdf_verbose\n    echo -e \"0\\t\"(_ \"standard output messages\")\n    echo -e \"1\\t\"(_ \"troubleshooting output messages\")\n    echo -e \"2\\t\"(_ \"debugging output messages\")\nend\ncomplete -c ocrmypdf -x -s v -l verbose -a '(__fish_ocrmypdf_verbose)' -d \"set verbosity level\"\n\ncomplete -c ocrmypdf -x -l no-progress-bar -d \"disable the progress bar\"\n\nfunction __fish_ocrmypdf_pdfa_compression\n    echo -e \"auto\\t\"(_ \"let Ghostscript decide how to compress images\")\n    echo -e \"jpeg\\t\"(_ \"convert color and grayscale images to JPEG\")\n    echo -e \"lossless\\t\"(_ \"convert color and grayscale images to lossless (PNG)\")\nend\ncomplete -c ocrmypdf -x -l pdfa-image-compression -a '(__fish_ocrmypdf_pdfa_compression)' -d \"set PDF/A image compression options\"\n\ncomplete -c ocrmypdf -x -s j -l jobs -d \"how many worker processes to use\"\ncomplete -c ocrmypdf -x -l title -d \"set metadata\"\ncomplete -c ocrmypdf -x -l author -d \"set metadata\"\ncomplete -c ocrmypdf -x -l subject -d \"set metadata\"\ncomplete -c ocrmypdf -x -l keywords -d \"set metadata\"\ncomplete -c ocrmypdf -x -l oversample -d \"oversample images to this DPI\"\ncomplete -c ocrmypdf -x -l skip-big -d \"skip OCR on pages larger than this many MPixels\"\n\ncomplete -c ocrmypdf -x -l jpeg-quality -d \"JPEG 
quality [0..100]\"\ncomplete -c ocrmypdf -x -l png-quality -d \"PNG quality [0..100]\"\ncomplete -c ocrmypdf -x -l jbig2-lossy -d \"enable lossy JBIG2 (see docs)\"\ncomplete -c ocrmypdf -x -l jbig2-threshold -d \"JBIG2 compression threshold (see docs)\"\ncomplete -c ocrmypdf -x -l max-image-mpixels -d \"image decompression bomb threshold\"\ncomplete -c ocrmypdf -x -l pages -d \"apply OCR to only the specified pages\"\ncomplete -c ocrmypdf -x -l tesseract-config -d \"set custom tesseract config file\"\n\nfunction __fish_ocrmypdf_tesseract_pagesegmode\n    echo -e \"0\\t\"(_ \"orientation and script detection (OSD) only\")\n    echo -e \"1\\t\"(_ \"automatic page segmentation with OSD\")\n    echo -e \"2\\t\"(_ \"automatic page segmentation, but no OSD, or OCR\")\n    echo -e \"3\\t\"(_ \"fully automatic page segmentation, but no OSD (default)\")\n    echo -e \"4\\t\"(_ \"assume a single column of text of variable sizes\")\n    echo -e \"5\\t\"(_ \"assume a single uniform block of vertically aligned text\")\n    echo -e \"6\\t\"(_ \"assume a single uniform block of text\")\n    echo -e \"7\\t\"(_ \"treat the image as a single text line\")\n    echo -e \"8\\t\"(_ \"treat the image as a single word\")\n    echo -e \"9\\t\"(_ \"treat the image as a single word in a circle\")\n    echo -e \"10\\t\"(_ \"treat the image as a single character\")\n    echo -e \"11\\t\"(_ \"sparse text - find as much text as possible in no particular order\")\n    echo -e \"12\\t\"(_ \"sparse text with OSD\")\n    echo -e \"13\\t\"(_ \"raw line - treat the image as a single text line\")\nend\ncomplete -c ocrmypdf -x -l tesseract-pagesegmode -a '(__fish_ocrmypdf_tesseract_pagesegmode)' -d \"set tesseract --psm\"\n\nfunction __fish_ocrmypdf_tesseract_oem\n    echo -e \"0\\t\"(_ \"legacy engine only\")\n    echo -e \"1\\t\"(_ \"neural nets LSTM engine only\")\n    echo -e \"2\\t\"(_ \"legacy + LSTM engines\")\n    echo -e \"3\\t\"(_ \"default, based on what is available\")\nend\ncomplete -c 
ocrmypdf -x -l tesseract-oem -a '(__fish_ocrmypdf_tesseract_oem)' -d \"set tesseract --oem\"\n\nfunction __fish_ocrmypdf_tesseract_thresholding\n    echo -e \"auto\\t\"(_ \"let OCRmyPDF pick thresholding (current always uses otsu)\")\n    echo -e \"otsu\\t\"(_ \"legacy Otsu thresholding\")\n    echo -e \"adaptive-otsu\\t\"(_ \"use adaptive Otsu thresholding\")\n    echo -e \"sauvola\\t\"(_ \"use Sauvola thresholding\")\nend\ncomplete -c ocrmypdf -x -l tesseract-thresholding -a '(__fish_ocrmypdf_tesseract_thresholding)' -d \"set tesseract thresholding method (needs Tesseract 5.x)\"\n\ncomplete -c ocrmypdf -x -l tesseract-timeout -d \"maximum number of seconds to wait for OCR\"\ncomplete -c ocrmypdf -x -l tesseract-non-ocr-timeout -d \"maximum seconds to wait for non-OCR operations\"\ncomplete -c ocrmypdf -l tesseract-downsample-large-images -d \"downsample large images before OCR\"\ncomplete -c ocrmypdf -l no-tesseract-downsample-large-images -d \"do not downsample large images\"\ncomplete -c ocrmypdf -x -l tesseract-downsample-above -d \"downsample images larger than this pixel size\"\ncomplete -c ocrmypdf -x -l rotate-pages-threshold -d \"page rotation confidence\"\n\ncomplete -c ocrmypdf -r -l user-words -d \"specify location of user words file\"\ncomplete -c ocrmypdf -r -l user-patterns -d \"specify location of user patterns file\"\ncomplete -c ocrmypdf -x -l fast-web-view -d \"if file size if above this amount in MB, linearize PDF\"\ncomplete -c ocrmypdf -l continue-on-soft-render-error -d \"continue processing after recoverable render errors\"\ncomplete -c ocrmypdf -r -l plugin -d \"name of plugin to import\"\n\nfunction __fish_ocrmypdf_color_conversion_strategy\n    echo -e \"LeaveColorUnchanged\\t\"(_ \"do not convert color spaces (default)\")\n    echo -e \"CMYK\\t\"(_ \"convert all color spaces to CMYK\")\n    echo -e \"Gray\\t\"(_ \"convert all color spaces to grayscale\")\n    echo -e \"RGB\\t\"(_ \"convert all color spaces to RGB\")\n    echo -e 
\"UseDeviceIndependentColor\\t\"(_ \"convert all color spaces to ICC-based color spaces\")\nend\n\ncomplete -c ocrmypdf -x -l color-conversion-strategy -a '(__fish_ocrmypdf_color_conversion_strategy)' -d \"set color conversion strategy\"\n\nfunction __fish_ocrmypdf_input_file_given\n    set -l tokens (commandline -opc)\n    for token in $tokens\n        if string match -q -r '^-' -- $token\n            continue\n        end\n        if test -f \"$token\"\n            return 0\n        end\n    end\n    return 1\nend\n\ncomplete -c ocrmypdf -x -n 'not __fish_ocrmypdf_input_file_given' -a \"(__fish_complete_suffix .pdf)\" -d \"input file\"\n"
  },
  {
    "path": "misc/docker-compose.example.yml",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\n---\nversion: \"3.3\"\nservices:\n  ocrmypdf:\n    restart: always\n    container_name: ocrmypdf\n    image: jbarlow83/ocrmypdf\n    volumes:\n      - \"/media/scan:/input\"\n      - \"/mnt/scan:/output\"\n    environment:\n      - OCR_OUTPUT_DIRECTORY_YEAR_MONTH=0\n    user: \"<SET TO YOUR USER ID>:<SET TO YOUR GROUP ID>\"\n    entrypoint: python3\n    command: watcher.py\n"
  },
  {
    "path": "misc/example_plugin.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R Barlow: https://github.com/jbarlow83\n# SPDX-License-Identifier: MIT\n\n\"\"\"An example of an OCRmyPDF plugin.\n\nThis plugin adds two new command line arguments\n    --grayscale-ocr: converts the image to grayscale before performing OCR on it\n        (This is occasionally useful for images whose color confounds OCR. It only\n        affects the image shown to OCR. The image is not saved.)\n    --mono-page: converts all pages in the output file to black and white\n\nTo use this from the command line:\n    ocrmypdf --plugin path/to/example_plugin.py --mono-page input.pdf output.pdf\n\nTo use this as an API:\n    import ocrmypdf\n    ocrmypdf.ocr('input.pdf', 'output.pdf',\n        plugins=['path/to/example_plugin.py'], mono_page=True\n    )\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\n\nfrom PIL import Image\n\nfrom ocrmypdf import hookimpl\n\nlog = logging.getLogger(__name__)\n\n\n@hookimpl\ndef add_options(parser):\n    parser.add_argument('--grayscale-ocr', action='store_true')\n    parser.add_argument('--mono-page', action='store_true')\n\n\n@hookimpl\ndef prepare(options):\n    pass\n\n\n@hookimpl\ndef validate(pdfinfo, options):\n    pass\n\n\n@hookimpl\ndef filter_ocr_image(page, image):\n    if page.options.grayscale_ocr:\n        log.info(\"graying\")\n        return image.convert('L')\n    return image\n\n\n@hookimpl\ndef filter_page_image(page, image_filename):\n    if page.options.mono_page:\n        with Image.open(image_filename) as im:\n            im = im.convert('1')\n            im.save(image_filename)\n        return image_filename\n    else:\n        output = image_filename.with_suffix('.jpg')\n        with Image.open(image_filename) as im:\n            im.save(output)\n        return output\n"
  },
  {
    "path": "misc/flatpak/io.ocrmypdf.ocrmypdf.metainfo.xml",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<component type=\"console-application\">\n  <id>io.ocrmypdf.ocrmypdf</id>\n\n  <name>OCRmyPDF</name>\n  <summary>Adds an OCR text layer to scanned PDF files, allowing them to be searched</summary>\n\n  <developer id=\"io.ocrmypdf\">\n      <name>OCRmyPDF Developers</name>\n  </developer>\n\n  <url type=\"homepage\">https://github.com/ocrmypdf/ocrmypdf</url>\n  <url type=\"bugtracker\">https://github.com/ocrmypdf/OCRmyPDF/issues</url>\n\n  <content_rating type=\"oars-1.1\" />\n\n  <metadata_license>CC0-1.0</metadata_license>\n  <project_license>MPL-2.0</project_license>\n\n  <description>\n    <ul>\n        <li>Generates a searchable PDF/A file from a regular PDF</li>\n        <li>Places OCR text accurately below the image to ease copy / paste</li>\n        <li>Keeps the exact resolution of the original embedded images</li>\n        <li>When possible, inserts OCR information as a lossless operation without disrupting any other content</li>\n        <li>Optimizes PDF images, often producing files smaller than the input file</li>\n        <li>If requested, deskews and/or cleans the image before performing OCR</li>\n        <li>Validates input and output files</li>\n        <li>Distributes work across all available CPU cores</li>\n        <li>Uses Tesseract OCR engine to recognize more than 100 languages</li>\n        <li>Keeps your private data private</li>\n        <li>Scales properly to handle files with thousands of pages</li>\n        <li>Battle-tested on millions of PDFs</li>\n    </ul>\n  </description>\n\n  <provides>\n    <binary>ocrmypdf</binary>\n  </provides>\n\n  <icon type=\"stock\">io.ocrmypdf.ocrmypdf</icon>\n\n  <screenshots>\n    <screenshot type=\"default\">\n      <image>https://raw.githubusercontent.com/ocrmypdf/OCRmyPDF/f7ad5f16bd0340b0b1803dada0c02f9f40542bd8/misc/flatpak/sample_screenshot.png</image>\n      <caption>Sample usage of OCRmyPDF</caption>\n    </screenshot>\n  </screenshots>\n\n  
<categories>\n    <category>Office</category>\n    <category>Utility</category>\n  </categories>\n\n  <keywords>\n    <keyword>ocr</keyword>\n    <keyword>pdf</keyword>\n    <keyword>tool</keyword>\n  </keywords>\n\n  <releases>\n    <release version=\"16.8.0\" date=\"2025-01-05\"/>\n  </releases>\n</component>\n"
  },
  {
    "path": "misc/ocrmypdf_compare.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MIT\n\n\"\"\"Run OCRmyPDF on the same PDF with different options.\"\"\"\n\nfrom __future__ import annotations\n\nimport os\nimport shlex\nfrom io import BytesIO\nfrom pathlib import Path\nfrom subprocess import check_output, run\nfrom tempfile import TemporaryDirectory\n\nimport pikepdf\nimport pymupdf\nimport streamlit as st\nfrom lxml import etree\nfrom streamlit_pdf_viewer import pdf_viewer\n\n\ndef do_column(label, suffix, d):\n    cli = st.text_area(\n        f\"Command line arguments for {label}\",\n        key=f\"args{suffix}\",\n        value=\"ocrmypdf {in_} {out}\",\n    )\n    env_text = st.text_area(f\"Environment variables for {label}\", key=f\"env{suffix}\")\n    env = os.environ.copy()\n    for line in env_text.splitlines():\n        if line:\n            try:\n                k, v = line.split(\"=\", 1)\n            except ValueError:\n                st.error(f\"Invalid environment variable: {line}\")\n                break\n            env[k] = v\n    args = shlex.split(\n        cli.format(\n            in_=os.path.join(d, \"input.pdf\"),\n            out=os.path.join(d, f\"output{suffix}.pdf\"),\n        )\n    )\n    with st.expander(\"Environment variables\", expanded=bool(env_text.strip())):\n        st.code('\\n'.join(f\"{k}={v}\" for k, v in env.items()))\n    st.code(shlex.join(args))\n    return env, args\n\n\ndef main():\n    st.set_page_config(layout=\"wide\")\n\n    st.title(\"OCRmyPDF Compare\")\n    st.write(\"Run OCRmyPDF on the same PDF with different options.\")\n    st.warning(\"This is a testing tool and is not intended for production use.\")\n\n    uploaded_pdf = st.file_uploader(\"Upload a PDF\", type=[\"pdf\"])\n    if uploaded_pdf is None:\n        return\n\n    pdf_bytes = uploaded_pdf.read()\n\n    with pikepdf.open(BytesIO(pdf_bytes)) as p, TemporaryDirectory() as d:\n        with st.expander(\"PDF Metadata\"):\n            with 
p.open_metadata() as meta:\n                xml_txt = str(meta)\n                parser = etree.XMLParser(remove_blank_text=True)\n                tree = etree.fromstring(xml_txt, parser=parser)\n                st.code(\n                    etree.tostring(tree, pretty_print=True).decode(\"utf-8\"),\n                    language=\"xml\",\n                )\n            st.write(p.docinfo)\n            st.write(\"Number of pages:\", len(p.pages))\n\n        col1, col2 = st.columns(2)\n        with col1:\n            env1, args1 = do_column(\"A\", \"1\", d)\n        with col2:\n            env2, args2 = do_column(\"B\", \"2\", d)\n\n        if not st.button(\"Execute and Compare\"):\n            return\n        with st.spinner(\"Executing...\"):\n            Path(d, \"input.pdf\").write_bytes(pdf_bytes)\n            run(args1, env=env1)\n            run(args2, env=env2)\n\n            col1, col2 = st.columns(2)\n            with col1:\n                st.text(\n                    \"Ghostscript version A: \"\n                    + check_output(\n                        [\"gs\", \"--version\"],\n                        env=env1,\n                        text=True,\n                    )\n                )\n            with col2:\n                st.text(\n                    \"Ghostscript version B: \"\n                    + check_output(\n                        [\"gs\", \"--version\"],\n                        env=env2,\n                        text=True,\n                    )\n                )\n\n            doc1 = pymupdf.open(os.path.join(d, \"output1.pdf\"))\n            doc2 = pymupdf.open(os.path.join(d, \"output2.pdf\"))\n            for i, page1_2 in enumerate(zip(doc1, doc2, strict=False)):\n                st.write(f\"Page {i+1}\")\n                page1, page2 = page1_2\n                col1, col2 = st.columns(2)\n                with col1, st.container(border=True):\n                    st.write(page1.get_text())\n                with col2, 
st.container(border=True):\n                    st.write(page2.get_text())\n\n            col1, col2 = st.columns(2)\n            with col1, st.expander(\"PDF Viewer\"):\n                pdf_viewer(Path(d, \"output1.pdf\"))\n            with col2, st.expander(\"PDF Viewer\"):\n                pdf_viewer(Path(d, \"output2.pdf\"))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "misc/pdf_compare.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MIT\n\n\"\"\"Compare two PDFs.\"\"\"\n\nfrom __future__ import annotations\n\nimport os\nfrom io import BytesIO\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\n\nimport pikepdf\nimport pymupdf\nimport streamlit as st\nfrom lxml import etree\nfrom streamlit_pdf_viewer import pdf_viewer\n\n\ndef do_metadata(pdf):\n    with pikepdf.open(pdf) as pdf:\n        with pdf.open_metadata() as meta:\n            xml_txt = str(meta)\n            parser = etree.XMLParser(remove_blank_text=True)\n            tree = etree.fromstring(xml_txt, parser=parser)\n            st.code(\n                etree.tostring(tree, pretty_print=True).decode(\"utf-8\"),\n                language=\"xml\",\n            )\n        st.write(pdf.docinfo)\n        st.write(\"Number of pages:\", len(pdf.pages))\n\n\ndef main():\n    st.set_page_config(layout=\"wide\")\n\n    st.title(\"PDF Compare\")\n    st.write(\"Compare two PDFs.\")\n\n    col1, col2 = st.columns(2)\n    with col1:\n        uploaded_pdf1 = st.file_uploader(\"Upload a PDF\", type=[\"pdf\"], key='pdf1')\n    with col2:\n        uploaded_pdf2 = st.file_uploader(\"Upload a PDF\", type=[\"pdf\"], key='pdf2')\n    if uploaded_pdf1 is None or uploaded_pdf2 is None:\n        return\n\n    pdf_bytes1 = uploaded_pdf1.getvalue()\n    pdf_bytes2 = uploaded_pdf2.getvalue()\n\n    with st.expander(\"PDF Metadata\"):\n        col1, col2 = st.columns(2)\n        with col1:\n            do_metadata(BytesIO(pdf_bytes1))\n        with col2:\n            do_metadata(BytesIO(pdf_bytes2))\n\n    with TemporaryDirectory() as d:\n        Path(d, \"1.pdf\").write_bytes(pdf_bytes1)\n        Path(d, \"2.pdf\").write_bytes(pdf_bytes2)\n\n        with st.expander(\"Text\"):\n            doc1 = pymupdf.open(os.path.join(d, \"1.pdf\"))\n            doc2 = pymupdf.open(os.path.join(d, \"2.pdf\"))\n            for i, page1_2 in enumerate(zip(doc1, doc2, 
strict=False)):\n                st.write(f\"Page {i+1}\")\n                page1, page2 = page1_2\n                col1, col2 = st.columns(2)\n                with col1, st.container(border=True):\n                    st.write(page1.get_text())\n                with col2, st.container(border=True):\n                    st.write(page2.get_text())\n\n        with st.expander(\"PDF Viewer\"):\n            col1, col2 = st.columns(2)\n            with col1:\n                pdf_viewer(Path(d, \"1.pdf\"), key='pdf_viewer1', render_text=True)\n            with col2:\n                pdf_viewer(Path(d, \"2.pdf\"), key='pdf_viewer2', render_text=True)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "misc/pdf_text_diff.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Compare text in PDFs.\"\"\"\n\nfrom __future__ import annotations\n\nfrom pathlib import Path\nfrom subprocess import run\nfrom tempfile import NamedTemporaryFile\nfrom typing import Annotated\n\nimport cyclopts\n\napp = cyclopts.App()\n\n\n@app.default\ndef main(\n    pdf1: Annotated[Path, cyclopts.Parameter()],\n    pdf2: Annotated[Path, cyclopts.Parameter()],\n    *,\n    engine: Annotated[str, cyclopts.Parameter()] = 'pdftotext',\n):\n    \"\"\"Compare text in PDFs.\"\"\"\n    with open(pdf1, 'rb') as f1, open(pdf2, 'rb') as f2:\n        text1 = run(\n            ['pdftotext', '-layout', '-', '-'],\n            stdin=f1,\n            capture_output=True,\n            check=True,\n        )\n        text2 = run(\n            ['pdftotext', '-layout', '-', '-'],\n            stdin=f2,\n            capture_output=True,\n            check=True,\n        )\n\n    with NamedTemporaryFile() as t1, NamedTemporaryFile() as t2:\n        t1.write(text1.stdout)\n        t1.flush()\n        t2.write(text2.stdout)\n        t2.flush()\n        diff = run(\n            ['diff', '--color=always', '--side-by-side', t1.name, t2.name],\n            capture_output=True,\n        )\n        run(['less', '-R'], input=diff.stdout, check=True)\n        if text1.stdout.strip() != text2.stdout.strip():\n            return 1\n\n    return 0\n\n\nif __name__ == '__main__':\n    app()\n"
  },
  {
    "path": "misc/screencast/README.md",
    "content": "<!-- SPDX-License-Identifier: CC-BY-SA-4.0 -->\n\nTo regenerate\n=============\n\nUsing asciinema and svg-term (`npm install -g svg-term-cli`).\n\nCreate `~/.config/asciinema/config` to disable prompt.\n\n```\n[record]\n\ncommand = fish --init-command 'alias fish_prompt=\"echo \\>\\ \"'\n```\n\nRun asciinema\n\n```\nasciinema rec new_input.cast\n```\n\nRe-record faster version with fewer pauses\n\n```\nasciinema rec demo.cast -c \"asciinema play new_input.cast --speed 2 --idle-time-limit 0.5\"\n```\n\nConvert to SVG\n```\nsvg-term --in=misc/screencast/demo.cast --out=misc/screencast/demo.svg --window\n```\n"
  },
  {
    "path": "misc/screencast/demo.cast",
    "content": "{\"version\": 2, \"width\": 131, \"height\": 24, \"timestamp\": 1687247006, \"env\": {\"SHELL\": \"/usr/bin/fish\", \"TERM\": \"xterm-256color\"}}\n[0.103649, \"o\", \"\\u001b[?2004h\\u001b]7; \\u0007\"]\n[0.104223, \"o\", \"\\u001b]0;fish  \\u0007\\u001b[30m\\u001b(B\\u001b[m\\r> \\u001b[K\\r\\u001b[C\\u001b[C\"]\n[0.604542, \"o\", \"o\\r\\u001b[3C\\b\\u001b[38;2;255;0;0mo\\r\\u001b[3C\\u001b[30m\\u001b(B\\u001b[m\\u001b[38;2;85;85;85mcrmypdf multipage.pdf multipage_with_ocr.pdf\\r\\u001b[3C\\u001b[30m\\u001b(B\\u001b[m\"]\n[0.679571, \"o\", \"\\u001b[38;2;255;0;0mc\\u001b[38;2;85;85;85mrmypdf multipage.pdf multipage_with_ocr.pdf\\r\\u001b[4C\\u001b[30m\\u001b(B\\u001b[m\"]\n[0.767271, \"o\", \"\\u001b[38;2;255;0;0mr\\u001b[38;2;85;85;85mmypdf multipage.pdf multipage_with_ocr.pdf\\r\\u001b[5C\\u001b[30m\\u001b(B\\u001b[m\"]\n[0.814505, \"o\", \"\\u001b[38;2;255;0;0mm\\u001b[38;2;85;85;85mypdf multipage.pdf multipage_with_ocr.pdf\\r\\u001b[6C\\u001b[30m\\u001b(B\\u001b[m\"]\n[0.938919, \"o\", \"\\u001b[38;2;255;0;0my\\u001b[38;2;85;85;85mpdf multipage.pdf multipage_with_ocr.pdf\\r\\u001b[7C\\u001b[30m\\u001b(B\\u001b[m\"]\n[0.967347, \"o\", \"\\u001b[38;2;255;0;0mp\\u001b[38;2;85;85;85mdf multipage.pdf multipage_with_ocr.pdf\\r\\u001b[8C\\u001b[30m\\u001b(B\\u001b[m\"]\n[1.009954, \"o\", \"\\u001b[38;2;255;0;0md\\u001b[38;2;85;85;85mf multipage.pdf multipage_with_ocr.pdf\\r\\u001b[9C\\u001b[30m\\u001b(B\\u001b[m\"]\n[1.034488, \"o\", \"\\u001b[38;2;255;0;0mf\\u001b[38;2;85;85;85m multipage.pdf multipage_with_ocr.pdf\\r\\u001b[10C\\u001b[30m\\u001b(B\\u001b[m\\b\\b\\b\\b\\b\\b\\b\\b\\u001b[38;2;0;95;215mocrmypdf\\u001b[38;2;85;85;85m multipage.pdf multipage_with_ocr.pdf\\r\\u001b[10C\\u001b[30m\\u001b(B\\u001b[m\"]\n[1.069226, \"o\", \"\\u001b[38;2;0;95;215m \\u001b[38;2;85;85;85mmultipage.pdf multipage_with_ocr.pdf\\r\\u001b[11C\\u001b[30m\\u001b(B\\u001b[m\\b \\u001b[38;2;85;85;85mmultipage.pdf 
multipage_with_ocr.pdf\\r\\u001b[11C\\u001b[30m\\u001b(B\\u001b[m\"]\n[1.569682, \"o\", \"-\\u001b[K\\r\\u001b[12C\\u001b[38;2;85;85;85m-version\\r\\u001b[12C\\u001b[30m\\u001b(B\\u001b[m\\b\\u001b[38;2;0;175;255m-\\u001b[38;2;85;85;85m-version\\r\\u001b[12C\\u001b[30m\\u001b(B\\u001b[m\"]\n[1.642096, \"o\", \"\\u001b[38;2;0;175;255m-\\u001b[38;2;85;85;85mversion\\r\\u001b[13C\\u001b[30m\\u001b(B\\u001b[m\"]\n[1.71793, \"o\", \"\\u001b[38;2;0;175;255ms\\u001b[30m\\u001b(B\\u001b[m\\u001b[K\\r\\u001b[14C\"]\n[1.771483, \"o\", \"\\u001b[38;2;0;175;255mk\\r\\u001b[15C\\u001b[30m\\u001b(B\\u001b[m\"]\n[1.864664, \"o\", \"\\u001b[38;2;0;175;255mi\\r\\u001b[16C\\u001b[30m\\u001b(B\\u001b[m\"]\n[1.876085, \"o\", \"\\u001b[38;2;0;175;255mp\\r\\u001b[17C\\u001b[30m\\u001b(B\\u001b[m\"]\n[2.092979, \"o\", \"\\u001b[38;2;0;175;255m-\\r\\u001b[18C\\u001b[30m\\u001b(B\\u001b[m\"]\n[2.138821, \"o\", \"\\u001b[38;2;0;175;255mt\\r\\u001b[19C\\u001b[30m\\u001b(B\\u001b[m\"]\n[2.18017, \"o\", \"\\u001b[38;2;0;175;255me\\r\\u001b[20C\\u001b[30m\\u001b(B\\u001b[m\"]\n[2.268222, \"o\", \"\\u001b[38;2;0;175;255mx\\r\\u001b[21C\\u001b[30m\\u001b(B\\u001b[m\"]\n[2.277031, \"o\", \"\\u001b[38;2;0;175;255mt\\r\\u001b[22C\\u001b[30m\\u001b(B\\u001b[m\"]\n[2.322469, \"o\", \"\\u001b[38;2;0;175;255m \\r\\u001b[23C\\u001b[30m\\u001b(B\\u001b[m\\b \\r\\u001b[23C\"]\n[2.824696, \"o\", \"m\\r\\u001b[24C\\b\\u001b[38;2;0;175;255m\\u001b[4mm\\r\\u001b[24C\\u001b[30m\\u001b(B\\u001b[m\\u001b[38;2;85;85;85masks.pdf \\r\\u001b[24C\\u001b[30m\\u001b(B\\u001b[m\"]\n[2.923234, \"o\", \"\\u001b[38;2;0;175;255m\\u001b[4mu\\u001b[30m\\u001b(B\\u001b[m\\u001b[K\\r\\u001b[25C\\u001b[38;2;85;85;85mltipage.pdf \\r\\u001b[25C\\u001b[30m\\u001b(B\\u001b[m\"]\n[2.960685, \"o\", \"\\u001b[38;2;0;175;255m\\u001b[4ml\\u001b[38;2;85;85;85m\\u001b[24mtipage.pdf \\r\\u001b[26C\\u001b[30m\\u001b(B\\u001b[m\"]\n[3.03365, \"o\", \"\\u001b[38;2;0;175;255m\\u001b[4mt\\u001b[38;2;85;85;85m\\u001b[24mipage.pdf 
\\r\\u001b[27C\\u001b[30m\\u001b(B\\u001b[m\"]\n[3.479338, \"o\", \"\\u001b[38;2;0;175;255m\\u001b[4mipage.pdf \\r\\u001b[37C\\u001b[30m\\u001b(B\\u001b[m\\b \\r\\u001b[37C\"]\n[3.754818, \"o\", \"m\\r\\u001b[38C\\b\\u001b[38;2;0;175;255m\\u001b[4mm\\r\\u001b[38C\\u001b[30m\\u001b(B\\u001b[m\\u001b[38;2;85;85;85masks.pdf \\r\\u001b[38C\\u001b[30m\\u001b(B\\u001b[m\"]\n[3.873318, \"o\", \"\\u001b[38;2;0;175;255m\\u001b[4mu\\u001b[30m\\u001b(B\\u001b[m\\u001b[K\\r\\u001b[39C\\u001b[38;2;85;85;85mltipage.pdf \\r\\u001b[39C\\u001b[30m\\u001b(B\\u001b[m\"]\n[3.926829, \"o\", \"\\u001b[38;2;0;175;255m\\u001b[4ml\\u001b[38;2;85;85;85m\\u001b[24mtipage.pdf \\r\\u001b[40C\\u001b[30m\\u001b(B\\u001b[m\"]\n[4.272251, \"o\", \"\\u001b[38;2;0;175;255m\\u001b[4mtipage.pdf \\r\\u001b[51C\\u001b[30m\\u001b(B\\u001b[m\\b \\r\\u001b[51C\"]\n[4.343464, \"o\", \"\\r\\u001b[50C\"]\n[4.416286, \"o\", \"\\r\\u001b[49C\"]\n[4.490574, \"o\", \"\\r\\u001b[48C\"]\n[4.564115, \"o\", \"\\r\\u001b[47C\"]\n[4.630398, \"o\", \"\\r\\u001b[46C\"]\n[4.76825, \"o\", \"\\u001b[38;2;0;175;255m\\u001b[4m_.pd\\u001b[30m\\u001b(B\\u001b[mf \\r\\u001b[47C\\u001b[10D\\u001b[38;2;0;175;255mmultipage_.pdf\\u001b[30m\\u001b(B\\u001b[m \\r\\u001b[47C\"]\n[5.012506, \"o\", \"\\u001b[38;2;0;175;255mo.pd\\u001b[30m\\u001b(B\\u001b[mf \\r\\u001b[48C\\u001b[3C\\u001b[38;2;0;175;255mf\\u001b[30m\\u001b(B\\u001b[m \\r\\u001b[48C\"]\n[5.053615, \"o\", \"\\u001b[38;2;0;175;255mc.pd\\u001b[30m\\u001b(B\\u001b[mf \\r\\u001b[49C\\u001b[3C\\u001b[38;2;0;175;255mf\\u001b[30m\\u001b(B\\u001b[m \\r\\u001b[49C\"]\n[5.103957, \"o\", \"\\u001b[38;2;0;175;255mr.pd\\u001b[30m\\u001b(B\\u001b[mf \\r\\u001b[50C\\u001b[3C\\u001b[38;2;0;175;255mf\\u001b[30m\\u001b(B\\u001b[m \\r\\u001b[50C\"]\n[5.226183, \"o\", \"\\r\\u001b[55C\"]\n[5.728321, \"o\", \"\\r\\n\\u001b[30m\\u001b(B\\u001b[m\\u001b[?2004l\\u001b]0;ocrmypdf --skip-text multipage.pdf multipage_ocr.pdf  
/home/jb/src/ocrmypdf/tests/resources\\u0007\\u001b[30m\\u001b(B\\u001b[m\\r\"]\n[5.801032, \"o\", \"\\rScanning contents:   0%|                                                                                   | 0/6 [00:00<?, ?page/s]\"]\n[5.802664, \"o\", \"\\rScanning contents: 100%|█████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1270.68page/s]\\r\\n\"]\n[5.802747, \"o\", \"Start processing 6 pages concurrently\\r\\n\"]\n[5.803488, \"o\", \"\\rOCR:   0%|                                                                                             | 0.0/6.0 [00:00<?, ?page/s]\"]\n[5.804896, \"o\", \"\\r                                                                                                                                   \\r    4 skipping all processing on this page\\r\\n\\rOCR:   0%|                                                                                             | 0.0/6.0 [00:00<?, ?page/s]\"]\n[5.896969, \"o\", \"\\rOCR:  25%|█████████████████████▎                                                               | 1.5/6.0 [00:00<00:00,  8.12page/s]\"]\n[6.170021, \"o\", \"\\rOCR:  42%|███████████████████████████████████▍                                                 | 2.5/6.0 [00:00<00:01,  3.05page/s]\"]\n[6.292338, \"o\", \"\\rOCR:  58%|█████████████████████████████████████████████████▌                                   | 3.5/6.0 [00:00<00:00,  3.39page/s]\"]\n[6.586017, \"o\", \"\\rOCR:  75%|███████████████████████████████████████████████████████████████▊                     | 4.5/6.0 [00:01<00:00,  2.49page/s]\"]\n[7.087058, \"o\", \"\\rOCR:  92%|█████████████████████████████████████████████████████████████████████████████▉       | 5.5/6.0 [00:06<00:00,  1.98s/page]\\rOCR: 100%|█████████████████████████████████████████████████████████████████████████████████████| 6.0/6.0 [00:06<00:00,  1.09s/page]\\r\\nPostprocessing...\\r\\n\"]\n[7.104927, \"o\", \"\\rPDF/A conversion:   0%|                    
                                                                | 0/6 [00:00<?, ?page/s]\"]\n[7.607392, \"o\", \"\\rPDF/A conversion:  50%|██████████████████████████████████████                                      | 3/6 [00:01<00:01,  1.61page/s]\"]\n[7.653781, \"o\", \"\\rPDF/A conversion:  83%|███████████████████████████████████████████████████████████████▎            | 5/6 [00:01<00:00,  2.90page/s]\"]\n[7.774532, \"o\", \"\\rPDF/A conversion: 100%|████████████████████████████████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.71page/s]\\r\\n\"]\n[7.778252, \"o\", \"\\u001b[33mSome input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.\\u001b[0m\\r\\n\"]\n[8.280789, \"o\", \"\\rRecompressing JPEGs: 0image [00:00, ?image/s]\\rRecompressing JPEGs: 0image [00:00, ?image/s]\\r\\n\\rDeflating JPEGs:   0%|                                                                                    | 0/4 [00:00<?, ?image/s]\\rDeflating JPEGs: 100%|███████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 238.28image/s]\\r\\n\"]\n[8.28149, \"o\", \"\\rJBIG2: 0item [00:00, ?item/s]\\rJBIG2: 0item [00:00, ?item/s]\\r\\n\"]\n[8.289998, \"o\", \"Image optimization ratio: 1.01 savings: 1.3%\\r\\nTotal file size ratio: 1.02 savings: 1.6%\\r\\n\"]\n[8.291209, \"o\", \"Output file is a PDF/A-2b (as expected)\\r\\n\"]\n[8.361316, \"o\", \"\\u001b[2m⏎\\u001b(B\\u001b[m                                                                                                                                  \\r⏎ \\r\\u001b[K\\u001b[?2004h\\u001b]0;fish /home/jb/src/ocrmypdf/tests/resources\\u0007\\u001b[30m\\u001b(B\\u001b[m> \\u001b[K\\r\\u001b[C\\u001b[C\"]\n[8.862206, \"o\", \"\\r\\n\\u001b[30m\\u001b(B\\u001b[m\\u001b[30m\\u001b(B\\u001b[m\\u001b[?2004l\"]\n"
  },
  {
    "path": "misc/synology.py",
    "content": "#!/bin/env python3\n# SPDX-FileCopyrightText: 2017 Enantiomerie\n# SPDX-License-Identifier: MIT\n\n\"\"\"Example OCRmyPDF for Synology NAS.\"\"\"\n\nfrom __future__ import annotations\n\n# This script must be edited to meet your needs.\nimport logging\nimport os\nimport shutil\nimport subprocess\nimport sys\nimport time\n\n# pylint: disable=logging-format-interpolation\n# pylint: disable=logging-not-lazy\n\nscript_dir = os.path.dirname(os.path.realpath(__file__))\ntimestamp = time.strftime(\"%Y-%m-%d-%H%M_\")\nlog_file = script_dir + '/' + timestamp + 'ocrmypdf.log'\nlogging.basicConfig(\n    level=logging.INFO,\n    format='%(asctime)s %(message)s',\n    filename=log_file,\n    filemode='w',\n)\n\nstart_dir = sys.argv[1] if len(sys.argv) > 1 else '.'\n\nfor dir_name, _subdirs, file_list in os.walk(start_dir):\n    logging.info(dir_name)\n    os.chdir(dir_name)\n    for filename in file_list:\n        file_stem, file_ext = os.path.splitext(filename)\n        if file_ext != '.pdf':\n            continue\n        full_path = os.path.join(dir_name, filename)\n        timestamp_ocr = time.strftime(\"%Y-%m-%d-%H%M_OCR_\")\n        filename_ocr = timestamp_ocr + file_stem + '.pdf'\n        # create string for pdf processing\n        # the script is processed as root user via chron\n        cmd = [\n            'docker',\n            'run',\n            '--rm',\n            '-i',\n            'jbarlow83/ocrmypdf',\n            '--deskew',\n            '-',\n            '-',\n        ]\n        logging.info(cmd)\n        full_path_ocr = os.path.join(dir_name, filename_ocr)\n        with (\n            open(filename, 'rb') as input_file,\n            open(full_path_ocr, 'wb') as output_file,\n        ):\n            proc = subprocess.run(\n                cmd,\n                stdin=input_file,\n                stdout=output_file,\n                stderr=subprocess.PIPE,\n                check=False,\n                text=True,\n                
errors='ignore',\n            )\n        logging.info(proc.stderr)\n        os.chmod(full_path_ocr, 0o664)\n        os.chmod(full_path, 0o664)\n        full_path_ocr_archive = sys.argv[2]\n        full_path_archive = sys.argv[2] + '/no_ocr'\n        shutil.move(full_path_ocr, full_path_ocr_archive)\n        shutil.move(full_path, full_path_archive)\nlogging.info('Finished.\\n')\n"
  },
  {
    "path": "misc/watcher.py",
    "content": "#!/usr/bin/env python3\n# SPDX-FileCopyrightText: 2019 Ian Alexander <https://github.com/ianalexander>\n# SPDX-FileCopyrightText: 2020 James R Barlow <https://github.com/jbarlow83>\n# SPDX-License-Identifier: MIT\n\n\"\"\"Watch a directory for new PDFs and OCR them.\"\"\"\n\nfrom __future__ import annotations\n\nimport datetime as dt\nimport json\nimport logging\nimport shutil\nimport sys\nimport time\nfrom enum import Enum\nfrom pathlib import Path\nfrom typing import Annotated, Any\n\nimport cyclopts\nimport pikepdf\nfrom dotenv import load_dotenv\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers import Observer\nfrom watchdog.observers.polling import PollingObserver\n\nimport ocrmypdf\n\nload_dotenv()\n\n\n# pylint: disable=logging-format-interpolation\napp = cyclopts.App(name=\"ocrmypdf-watcher\")\n\nlog = logging.getLogger('ocrmypdf-watcher')\n\n\nclass LoggingLevelEnum(str, Enum):\n    \"\"\"Enum for logging levels.\"\"\"\n\n    DEBUG = \"DEBUG\"\n    INFO = \"INFO\"\n    WARNING = \"WARNING\"\n    ERROR = \"ERROR\"\n    CRITICAL = \"CRITICAL\"\n\n\ndef get_output_path(root: Path, basename: str, output_dir_year_month: bool) -> Path:\n    assert '/' not in basename, \"basename must not contain '/'\"\n    if output_dir_year_month:\n        today = dt.datetime.today()\n        output_directory_year_month = root / str(today.year) / f'{today.month:02d}'\n        if not output_directory_year_month.exists():\n            output_directory_year_month.mkdir(parents=True, exist_ok=True)\n        output_path = Path(output_directory_year_month) / Path(basename).with_suffix(\n            '.pdf'\n        )\n    else:\n        output_path = root / Path(basename).with_suffix('.pdf')\n    return output_path\n\n\ndef wait_for_file_ready(\n    file_path: Path, poll_new_file_seconds: int, retries_loading_file: int\n):\n    # This loop waits to make sure that the file is completely loaded on\n    # disk before attempting to read. 
Docker sometimes will publish the\n    # watchdog event before the file is actually fully on disk, causing\n    # pikepdf to fail.\n\n    tries = retries_loading_file + 1\n    while tries:\n        try:\n            with pikepdf.Pdf.open(file_path) as pdf:\n                log.debug(f\"{file_path} ready with {pdf.pages} pages\")\n                return True\n        except (FileNotFoundError, OSError) as e:\n            log.info(f\"File {file_path} is not ready yet\")\n            log.debug(\"Exception was\", exc_info=e)\n            time.sleep(poll_new_file_seconds)\n            tries -= 1\n        except pikepdf.PdfError as e:\n            log.info(f\"File {file_path} is not full written yet\")\n            log.debug(\"Exception was\", exc_info=e)\n            time.sleep(poll_new_file_seconds)\n            tries -= 1\n\n    return False\n\n\ndef execute_ocrmypdf(\n    *,\n    file_path: Path,\n    archive_dir: Path,\n    output_dir: Path,\n    ocrmypdf_kwargs: dict[str, Any],\n    on_success_delete: bool,\n    on_success_archive: bool,\n    poll_new_file_seconds: int,\n    retries_loading_file: int,\n    output_dir_year_month: bool,\n):\n    output_path = get_output_path(output_dir, file_path.name, output_dir_year_month)\n\n    log.info(\"-\" * 20)\n    log.info(f'New file: {file_path}. Waiting until fully written...')\n    if not wait_for_file_ready(file_path, poll_new_file_seconds, retries_loading_file):\n        log.info(f\"Gave up waiting for {file_path} to become ready\")\n        return\n    log.info(f'Attempting to OCRmyPDF to: {output_path}')\n\n    log.debug(\n        f'OCRmyPDF input_file={file_path} output_file={output_path} '\n        f'kwargs: {ocrmypdf_kwargs}'\n    )\n    exit_code = ocrmypdf.ocr(\n        ocrmypdf.OcrOptions(\n            input_file=file_path,\n            output_file=output_path,\n            **ocrmypdf_kwargs,\n        )\n    )\n    if exit_code == 0:\n        if on_success_delete:\n            log.info(f'OCR is done. 
Deleting: {file_path}')\n            file_path.unlink()\n        elif on_success_archive:\n            log.info(f'OCR is done. Archiving {file_path.name} to {archive_dir}')\n            shutil.move(file_path, f'{archive_dir}/{file_path.name}')\n        else:\n            log.info('OCR is done')\n    else:\n        log.info('OCR is done')\n\n\nclass HandleObserverEvent(PatternMatchingEventHandler):\n    def __init__(  # noqa: D107\n        self,\n        patterns=None,\n        ignore_patterns=None,\n        ignore_directories=False,\n        case_sensitive=False,\n        settings=None,\n    ):\n        super().__init__(\n            patterns=patterns,\n            ignore_patterns=ignore_patterns,\n            ignore_directories=ignore_directories,\n            case_sensitive=case_sensitive,\n        )\n        self._settings = settings if settings else {}\n\n    def on_any_event(self, event):\n        if event.event_type in ['created']:\n            execute_ocrmypdf(file_path=Path(event.src_path), **self._settings)\n\n\n@app.default\ndef main(\n    input_dir: Annotated[\n        Path,\n        cyclopts.Parameter(\n            env_var='OCR_INPUT_DIRECTORY',\n        ),\n    ] = Path('/input'),\n    output_dir: Annotated[\n        Path,\n        cyclopts.Parameter(\n            env_var='OCR_OUTPUT_DIRECTORY',\n        ),\n    ] = Path('/output'),\n    archive_dir: Annotated[\n        Path,\n        cyclopts.Parameter(\n            env_var='OCR_ARCHIVE_DIRECTORY',\n        ),\n    ] = Path('/processed'),\n    *,\n    output_dir_year_month: Annotated[\n        bool,\n        cyclopts.Parameter(\n            env_var='OCR_OUTPUT_DIRECTORY_YEAR_MONTH',\n            help='Create a subdirectory in the output directory for each year/month',\n        ),\n    ] = False,\n    on_success_delete: Annotated[\n        bool,\n        cyclopts.Parameter(\n            env_var='OCR_ON_SUCCESS_DELETE',\n            help='Delete the input file after successful OCR',\n        ),\n    ] = 
False,\n    on_success_archive: Annotated[\n        bool,\n        cyclopts.Parameter(\n            env_var='OCR_ON_SUCCESS_ARCHIVE',\n            help='Archive the input file after successful OCR',\n        ),\n    ] = False,\n    deskew: Annotated[\n        bool,\n        cyclopts.Parameter(\n            env_var='OCR_DESKEW',\n            help='Deskew the input file before OCR',\n        ),\n    ] = False,\n    ocr_json_settings: Annotated[\n        str | None,\n        cyclopts.Parameter(\n            env_var='OCR_JSON_SETTINGS',\n            help='JSON settings to pass to OCRmyPDF (JSON string or file path)',\n        ),\n    ] = None,\n    poll_new_file_seconds: Annotated[\n        int,\n        cyclopts.Parameter(\n            env_var='OCR_POLL_NEW_FILE_SECONDS',\n            help='Seconds to wait before polling a new file',\n        ),\n    ] = 1,\n    use_polling: Annotated[\n        bool,\n        cyclopts.Parameter(\n            env_var='OCR_USE_POLLING',\n            help='Use polling instead of filesystem events',\n        ),\n    ] = False,\n    retries_loading_file: Annotated[\n        int,\n        cyclopts.Parameter(\n            env_var='OCR_RETRIES_LOADING_FILE',\n            help='Number of times to retry loading a file before giving up',\n        ),\n    ] = 5,\n    loglevel: Annotated[\n        LoggingLevelEnum,\n        cyclopts.Parameter(\n            env_var='OCR_LOGLEVEL',\n            help='Logging level',\n        ),\n    ] = LoggingLevelEnum.INFO,\n    patterns: Annotated[\n        str,\n        cyclopts.Parameter(\n            env_var='OCR_PATTERNS',\n            help='File patterns to watch',\n        ),\n    ] = '*.pdf,*.PDF',\n):\n    ocrmypdf.configure_logging(\n        verbosity=(\n            ocrmypdf.Verbosity.default\n            if loglevel != LoggingLevelEnum.DEBUG\n            else ocrmypdf.Verbosity.debug\n        ),\n        manage_root_logger=True,\n    )\n    log.setLevel(loglevel.value)\n    log.info(\n        
f\"Starting OCRmyPDF watcher with config:\\n\"\n        f\"Input Directory: {input_dir}\\n\"\n        f\"Output Directory: {output_dir}\\n\"\n        f\"Output Directory Year & Month: {output_dir_year_month}\\n\"\n        f\"Archive Directory: {archive_dir}\"\n    )\n    log.info(\n        f\"INPUT_DIRECTORY: {input_dir}\\n\"\n        f\"OUTPUT_DIRECTORY: {output_dir}\\n\"\n        f\"ARCHIVE_DIRECTORY: {archive_dir}\\n\"\n        f\"OUTPUT_DIRECTORY_YEAR_MONTH: {output_dir_year_month}\\n\"\n        f\"ON_SUCCESS_DELETE: {on_success_delete}\\n\"\n        f\"ON_SUCCESS_ARCHIVE: {on_success_archive}\\n\"\n        f\"DESKEW: {deskew}\\n\"\n        f\"ARGS: {ocr_json_settings}\\n\"\n        f\"POLL_NEW_FILE_SECONDS: {poll_new_file_seconds}\\n\"\n        f\"RETRIES_LOADING_FILE: {retries_loading_file}\\n\"\n        f\"USE_POLLING: {use_polling}\\n\"\n        f\"LOGLEVEL: {loglevel.value}\"\n    )\n\n    if ocr_json_settings and Path(ocr_json_settings).exists():\n        json_settings = json.loads(Path(ocr_json_settings).read_text())\n    else:\n        json_settings = json.loads(ocr_json_settings or '{}')\n\n    if 'input_file' in json_settings or 'output_file' in json_settings:\n        log.error(\n            'OCR_JSON_SETTINGS (--ocr-json-settings) may not specify input/output file'\n        )\n        sys.exit(1)\n\n    handler = HandleObserverEvent(\n        patterns=patterns.split(','),\n        settings={\n            'archive_dir': archive_dir,\n            'output_dir': output_dir,\n            'ocrmypdf_kwargs': json_settings | {'deskew': deskew},\n            'on_success_delete': on_success_delete,\n            'on_success_archive': on_success_archive,\n            'poll_new_file_seconds': poll_new_file_seconds,\n            'retries_loading_file': retries_loading_file,\n            'output_dir_year_month': output_dir_year_month,\n        },\n    )\n    observer = PollingObserver() if use_polling else Observer()\n    observer.schedule(handler, input_dir, 
recursive=True)\n    observer.start()\n    print(f\"Watching {input_dir} for new PDFs. Press Ctrl+C to exit.\")\n    try:\n        while True:\n            time.sleep(30)\n    except KeyboardInterrupt:\n        observer.stop()\n    observer.join()\n\n\nif __name__ == \"__main__\":\n    app()\n"
  },
  {
    "path": "misc/webservice.py",
    "content": "#!/usr/bin/env python\n# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: AGPL-3.0-or-later\n\n\"\"\"Run the OCRmyPDF web service.\"\"\"\n\nfrom __future__ import annotations\n\nimport os\nimport sys\n\ntry:\n    import streamlit  # noqa: F401\nexcept ImportError:\n    raise ImportError(\n        'You need to install streamlit in the Python environment '\n        'to run the web service.\\n'\n    ) from None\n\nif __name__ == '__main__':\n    os.execvp(\n        sys.executable,\n        [\n            sys.executable,\n            '-m',\n            'streamlit',\n            'run',\n            'misc/_webservice.py',\n            *sys.argv[1:],\n        ],\n    )\n"
  },
  {
    "path": "pyproject.toml",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n[build-system]\nrequires = [\"hatchling\"]\nbuild-backend = \"hatchling.build\"\n\n[project]\nname = \"ocrmypdf\"\nversion = \"17.3.0\"\ndescription = \"OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched\"\nreadme = \"README.md\"\nlicense = \"MPL-2.0\"\nrequires-python = \">=3.11\"\ndependencies = [\n  \"deprecation>=2.1.0\",\n  \"fpdf2>=2.8.0\",\n  \"img2pdf>=0.5\",\n  \"packaging>=20\",\n  \"pdfminer.six>=20220319\",\n  \"pi-heif\",                # Heif image format - maintainers: if this is removed, it will NOT break\n  \"pikepdf>=10\",\n  \"Pillow>=10.0.1\",\n  \"pluggy>=1\",\n  \"pydantic>=2.12.5\",\n  \"pypdfium2>=5.0.0\",\n  \"rich>=13\",\n  \"uharfbuzz>=0.53.2\",\n]\nauthors = [{ name = \"James R. Barlow\", email = \"james@purplerock.ca\" }]\nclassifiers = [\n  \"Development Status :: 5 - Production/Stable\",\n  \"Environment :: Console\",\n  \"Intended Audience :: End Users/Desktop\",\n  \"Intended Audience :: Science/Research\",\n  \"Intended Audience :: System Administrators\",\n  \"Operating System :: MacOS\",\n  \"Operating System :: Microsoft :: Windows\",\n  \"Operating System :: POSIX\",\n  \"Operating System :: POSIX :: BSD\",\n  \"Operating System :: POSIX :: Linux\",\n  \"Programming Language :: Python :: 3\",\n  \"Topic :: Scientific/Engineering :: Image Recognition\",\n  \"Topic :: Text Processing :: Indexing\",\n  \"Topic :: Text Processing :: Linguistic\",\n]\nkeywords = [\"PDF\", \"OCR\", \"optical character recognition\", \"PDF/A\", \"scanning\"]\n\n[project.urls]\nDocumentation = \"https://ocrmypdf.readthedocs.io/\"\nSource = \"https://github.com/ocrmypdf/OCRmyPDF\"\nTracker = \"https://github.com/ocrmypdf/OCRmyPDF/issues\"\nChangelog = \"https://github.com/ocrmypdf/OCRmyPDF/tree/main/docs/releasenotes\"\n\n[project.optional-dependencies]\n# User-installable features - use `uv sync --extra <name>` or `pip 
install ocrmypdf[name]`\nwatcher = [\"watchdog>=1.0.2\", \"cyclopts>=3\", \"python-dotenv\"]\nwebservice = [\"streamlit>=1.41.0\"]\n\n[project.scripts]\nocrmypdf = \"ocrmypdf.__main__:run\"\n\n[tool.distutils.bdist_wheel]\npython-tag = \"py311\"\n\n[tool.coverage.run]\nbranch = true\nparallel = true\nconcurrency = [\"multiprocessing\", \"thread\"]\nsigterm = true\n\n[tool.coverage.paths]\nsource = [\"src/ocrmypdf\"]\n\n[tool.coverage.report]\n# Regexes for lines to exclude from consideration\nexclude_lines = [\n  # Have to re-enable the standard pragma\n  \"pragma: no cover\",\n  # Don't complain if tests don't hit defensive assertion code:\n  \"raise AssertionError\",\n  \"raise NotImplementedError\",\n  # Don't complain if non-runnable code isn't run:\n  \"if 0:\",\n  \"if False:\",\n  \"if __name__ == .__main__.:\",\n  \"if TYPE_CHECKING:\",\n]\n\n[tool.pytest.ini_options]\nminversion = \"6.0\"\ntestpaths = [\"tests\"]\naddopts = \"-n auto\"\nmarkers = [\"slow\"]\nfilterwarnings = [\n  \"ignore:.*XMLParser.*:DeprecationWarning\",\n  \"ignore:.*ast.NameConstant.*:DeprecationWarning:reportlab\",\n  \"ignore:.*distutils.*:DeprecationWarning:libxmp\",\n]\n\n[tool.mypy]\n\n[[tool.mypy.overrides]]\nmodule = [\n  'pluggy',\n  'img2pdf',\n  'pdfminer.*',\n  'reportlab.*',\n  'fitz',\n  'libxmp.utils',\n]\nignore_missing_imports = true\n\n[tool.ruff]\ntarget-version = \"py311\"\nexclude = [\"src/ocrmypdf/_version.py\"] # Autogenerated\n\n[tool.ruff.lint]\n\"select\" = [\n  \"D\",   # pydocstyle\n  \"E\",   # pycodestyle\n  \"W\",   # pycodestyle\n  \"F\",   # pyflakes\n  \"I\",   # isort\n  \"UP\",  # pyupgrade\n  \"SIM\", # simplify\n  \"B\",   # flake8-bugbear\n  \"ICN\", # flake8-import-conventions\n]\nignore = [\n  \"B028\", # warning with no explicit stacklevel\n  # rule is key in dict instead of key in dict.keys(); but pikepdf semantics differ\n  \"SIM118\",\n]\n\n[tool.ruff.lint.isort]\nknown-first-party = [\"ocrmypdf\"]\nrequired-imports = [\"from __future__ 
import annotations\"]\n\n[tool.ruff.lint.flake8-import-conventions]\n# Prohibit explicit imports from the 'datetime' module\nbanned-from = [\"datetime\"]\n# Optionally, suggest an alias for 'import datetime' (e.g., as dt)\nextend-aliases = { \"datetime\" = \"dt\" }\n\n[tool.ruff.lint.pydocstyle]\nconvention = \"google\"\n\n[tool.ruff.lint.per-file-ignores]\n\"docs/conf.py\" = [\"D100\", \"D101\", \"D105\"]\n\"tests/*.py\" = [\"D100\", \"D101\", \"D102\", \"D103\", \"D105\", \"E501\"]\n\"misc/*.py\" = [\"D103\", \"D101\", \"D102\"]\n\"src/ocrmypdf/builtin_plugins/*.py\" = [\"D103\", \"D102\", \"D105\"]\n\n[tool.ruff.format]\nquote-style = \"preserve\"\n\n[dependency-groups]\n# Developer-only tools - use `uv sync --group <name>`\ndev = [\"mypy>=1.13.0\", \"ipykernel>=6.29.5\", \"reportlab>=4.4.4\"]\ntest = [\n  # Core testing framework\n  \"coverage[toml]>=6.2\",\n  \"hypothesis>=6.36.0\",\n  \"pytest>=6.2.5\",\n  \"pytest-cov>=3.0.0\",\n  \"pytest-xdist>=2.5.0\",\n  # Test dependencies\n  \"python-xmp-toolkit==2.0.1\", # also requires apt-get install libexempi3\n  \"reportlab>=3.6.8\",\n  # Type stubs for testing\n  \"types-Pillow\",\n  \"types-humanfriendly\",\n  # Extended test capabilities (merged from extended_test)\n  \"pymupdf>=1.24.14\",\n]\ndocs = [\n  \"myst-parser>=4.0.1\",\n  \"sphinx\",\n  \"sphinx-issues\",\n  \"sphinx-reredirects\",\n  \"sphinx-rtd-theme\",\n  \"sphinxcontrib-mermaid\",\n]\nstreamlit-dev = [\"streamlit>=1.40.2\", \"streamlit-pdf-viewer>=0.0.19\"]\n"
  },
  {
    "path": "scripts/generate_glyphless_font.py",
    "content": "#!/usr/bin/env python3\n# SPDX-FileCopyrightText: 2024 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Generate the Occulta glyphless font for OCRmyPDF.\n\nOcculta (Latin for \"hidden\") is a glyphless font designed for invisible text layers\nin searchable PDFs. It has proper Unicode cmap coverage using format 13 (many-to-one)\nfor efficient mapping of all BMP codepoints to a small set of width-specific glyphs.\n\nFeatures:\n- Full BMP coverage (U+0000 to U+FFFF)\n- Width-aware glyphs for proper text selection:\n  - Zero-width for combining marks and invisible characters\n  - Regular width (500 units) for Latin, Greek, Cyrillic, Arabic, Hebrew, etc.\n  - Double width (1000 units) for CJK and fullwidth characters\n- Uses cmap format 13 (many-to-one) for ~12KB size vs ~780KB with format 12\n- Compatible with fpdf2 and other modern PDF libraries\n\nUsage:\n    python scripts/generate_glyphless_font.py\n\nOutput:\n    src/ocrmypdf/data/Occulta.ttf\n\"\"\"\n\nfrom __future__ import annotations\n\nimport unicodedata\nfrom pathlib import Path\n\nfrom fontTools.fontBuilder import FontBuilder\nfrom fontTools.ttLib import TTFont\nfrom fontTools.ttLib.tables._c_m_a_p import CmapSubtable\nfrom fontTools.ttLib.tables._g_l_y_f import Glyph\n\n# Output path relative to this script\nOUTPUT_PATH = Path(__file__).parent.parent / \"src\" / \"ocrmypdf\" / \"data\" / \"Occulta.ttf\"\n\n# Font metrics (units per em = 1000)\nUNITS_PER_EM = 1000\nASCENT = 800\nDESCENT = -200\n\n# Glyph definitions: (name, advance_width, left_side_bearing)\nGLYPHS = [\n    (\".notdef\", 500, 0),  # Required, used for unmapped characters\n    (\"space\", 500, 0),  # U+0020 SPACE\n    (\"nbspace\", 500, 0),  # U+00A0 NO-BREAK SPACE\n    (\"blank0\", 0, 0),  # Zero-width (combining marks, ZWNJ, ZWJ, BOM)\n    (\"blank1\", 500, 0),  # Regular width (most scripts)\n    (\"blank2\", 1000, 0),  # Double width (CJK, fullwidth)\n]\n\n# Explicit zero-width character 
codepoints\nZERO_WIDTH_CHARS = frozenset(\n    [\n        0x200B,  # ZERO WIDTH SPACE\n        0x200C,  # ZERO WIDTH NON-JOINER\n        0x200D,  # ZERO WIDTH JOINER\n        0xFEFF,  # ZERO WIDTH NO-BREAK SPACE (BOM)\n        0x200E,  # LEFT-TO-RIGHT MARK\n        0x200F,  # RIGHT-TO-LEFT MARK\n        0x202A,  # LEFT-TO-RIGHT EMBEDDING\n        0x202B,  # RIGHT-TO-LEFT EMBEDDING\n        0x202C,  # POP DIRECTIONAL FORMATTING\n        0x202D,  # LEFT-TO-RIGHT OVERRIDE\n        0x202E,  # RIGHT-TO-LEFT OVERRIDE\n        0x2060,  # WORD JOINER\n        0x2061,  # FUNCTION APPLICATION\n        0x2062,  # INVISIBLE TIMES\n        0x2063,  # INVISIBLE SEPARATOR\n        0x2064,  # INVISIBLE PLUS\n    ]\n)\n\n\ndef classify_codepoint(codepoint: int) -> str:\n    \"\"\"Classify a Unicode codepoint into one of our glyph categories.\n\n    Args:\n        codepoint: Unicode codepoint (0x0000 to 0xFFFF)\n\n    Returns:\n        Glyph name to map this codepoint to\n    \"\"\"\n    # Special cases first\n    if codepoint == 0x0020:\n        return \"space\"\n    if codepoint == 0x00A0:\n        return \"nbspace\"\n    if codepoint in ZERO_WIDTH_CHARS:\n        return \"blank0\"\n\n    # Use Unicode properties for the rest\n    char = chr(codepoint)\n    try:\n        category = unicodedata.category(char)\n        east_asian_width = unicodedata.east_asian_width(char)\n\n        # Combining marks are zero-width\n        if category.startswith(\"M\"):\n            return \"blank0\"\n\n        # Wide and Fullwidth characters are double-width\n        if east_asian_width in (\"W\", \"F\"):\n            return \"blank2\"\n\n        # Everything else is regular width\n        return \"blank1\"\n\n    except (ValueError, TypeError):\n        # Fallback for any edge cases\n        return \"blank1\"\n\n\ndef build_cmap() -> dict[int, str]:\n    \"\"\"Build the Unicode to glyph name mapping for the entire BMP.\n\n    Returns:\n        Dictionary mapping codepoints to glyph names\n    
\"\"\"\n    return {cp: classify_codepoint(cp) for cp in range(0x10000)}\n\n\ndef create_font() -> TTFont:\n    \"\"\"Create the Occulta glyphless font.\n\n    Returns:\n        TTFont object ready to be saved\n    \"\"\"\n    glyph_names = [g[0] for g in GLYPHS]\n\n    # Start building the font\n    fb = FontBuilder(UNITS_PER_EM, isTTF=True)\n    fb.setupGlyphOrder(glyph_names)\n\n    # Create empty (invisible) glyphs\n    glyphs = {}\n    for name, _, _ in GLYPHS:\n        glyph = Glyph()\n        glyph.numberOfContours = 0\n        glyphs[name] = glyph\n    fb.setupGlyf(glyphs)\n\n    # Set up horizontal metrics\n    metrics = {name: (width, lsb) for name, width, lsb in GLYPHS}\n    fb.setupHorizontalMetrics(metrics)\n\n    # Minimal cmap to satisfy FontBuilder (we'll replace it later)\n    fb.setupCharacterMap({0x0020: \"space\", 0x00A0: \"nbspace\"})\n\n    # Set up other required tables\n    fb.setupHorizontalHeader(ascent=ASCENT, descent=DESCENT)\n    fb.setupOS2(\n        sTypoAscender=ASCENT,\n        sTypoDescender=DESCENT,\n        sTypoLineGap=0,\n        usWinAscent=UNITS_PER_EM,\n        usWinDescent=abs(DESCENT),\n        sxHeight=500,\n        sCapHeight=700,\n    )\n    import time\n\n    # Use current time for font timestamps\n    now = int(time.time())\n    fb.setupHead(unitsPerEm=UNITS_PER_EM, created=now, modified=now)\n    fb.setupPost()\n    fb.setupNameTable(\n        {\n            \"familyName\": \"Occulta\",\n            \"styleName\": \"Regular\",\n            \"uniqueFontIdentifier\": \"OCRmyPDF;Occulta-Regular;2026\",\n            \"fullName\": \"Occulta Regular\",\n            \"version\": \"Version 2.0\",\n            \"psName\": \"Occulta-Regular\",\n        }\n    )\n\n    # Build the font\n    font = fb.font\n\n    # Now replace the cmap with format 13 for efficient many-to-one mapping\n    char_to_glyph = build_cmap()\n\n    cmap13 = CmapSubtable.newSubtable(13)\n    cmap13.platformID = 3  # Windows\n    cmap13.platEncID = 10  # 
Unicode full repertoire\n    cmap13.language = 0\n    cmap13.cmap = char_to_glyph\n\n    font[\"cmap\"].tables = [cmap13]\n\n    return font\n\n\ndef main() -> None:\n    \"\"\"Generate the Occulta font and save it.\"\"\"\n    print(\"Generating Occulta glyphless font...\")\n\n    font = create_font()\n\n    # Create output directory if needed\n    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)\n\n    # Save the font\n    font.save(str(OUTPUT_PATH))\n    font.close()\n\n    # Report statistics\n    size = OUTPUT_PATH.stat().st_size\n    print(f\"Saved to: {OUTPUT_PATH}\")\n    print(f\"Size: {size:,} bytes\")\n\n    # Verify cmap\n    font = TTFont(str(OUTPUT_PATH))\n    for table in font[\"cmap\"].tables:\n        print(\n            f\"cmap: Platform {table.platformID}, \"\n            f\"Encoding {table.platEncID}, \"\n            f\"Format {table.format}, \"\n            f\"{len(table.cmap)} mappings\"\n        )\n    font.close()\n\n    print(\"Done!\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "snapcraft.yaml",
    "content": "# SPDX-FileCopyrightText: 2022 Alexander Langanke\n# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-FileCopyrightText: 2023 林博仁(Buo-ren, Lin) <Buo.Ren.Lin@gmail.com>\n# SPDX-License-Identifier: MPL-2.0\n\nname: ocrmypdf\ntitle: OCRmyPDF\nbase: core24\nversion: git\nsummary: OCRmyPDF adds a searchable text layer to scanned PDF files\ndescription: OCRmyPDF packaged for snap\ngrade: stable\nconfinement: strict\nicon: docs/images/logo-square-256.svg\nlicense: MPL-2.0\n\nplatforms:\n  amd64:\n\nenvironment:\n  TESSDATA_PREFIX: $SNAP/usr/share/tesseract-ocr/5/tessdata\n  GS_LIB: $SNAP/usr/share/ghostscript/10.02.1/Resource/Init\n  GS_FONTPATH: $SNAP/usr/share/ghostscript/10.02.1/Resource/Font\n  LD_LIBRARY_PATH: $SNAP/usr/lib/x86_64-linux-gnu\n\napps:\n  ocrmypdf:\n    command: usr/bin/snapcraft-preload python3 -m ocrmypdf\n    plugs:\n      - desktop\n      - desktop-legacy\n      - wayland\n      - x11\n      - home\n      - removable-media\n\nparts:\n  snapcraft-preload:\n    source: https://github.com/sergiusens/snapcraft-preload.git\n    plugin: cmake\n    cmake-parameters:\n      - -DCMAKE_INSTALL_PREFIX=/usr -DLIBPATH=/usr/lib\n    build-packages:\n      - on amd64:\n          - gcc-multilib\n          - g++-multilib\n    stage-packages:\n      - lib32stdc++6\n\n  jbig2enc:\n    plugin: autotools\n    source: https://github.com/agl/jbig2enc.git\n    source-tag: \"0.29\"\n    build-packages:\n      - libleptonica-dev\n\n  ocrmypdf:\n    plugin: python\n    source: .\n\n    build-packages:\n      - python3-pip\n\n    stage-packages:\n      - ghostscript\n      - icc-profiles-free\n      - liblept5\n      - libxml2\n      - pngquant\n      - tesseract-ocr-all\n      - unpaper\n      - qpdf\n      - zlib1g\n\n    python-packages:\n      - cffi\n      - pdfminer.six\n      - pikepdf\n      - Pillow\n      - pluggy\n      - reportlab\n      - setuptools\n      - tqdm\n      - pipe\n      - wheel\n\n    override-build: |\n      craftctl default\n    
  ln -sf ../usr/lib/libsnapcraft-preload.so $CRAFT_PART_INSTALL/lib/libsnapcraft-preload.so\n"
  },
  {
    "path": "src/ocrmypdf/RELEASE.md",
    "content": "<!-- SPDX-FileCopyrightText: 2022 James R. Barlow -->\n<!-- SPDX-License-Identifier: CC-BY-SA-4.0 -->\n\n# Release checklist\n\n## Patch release\n\n- Check `pytest`\n\n- Update release notes\n\n## Minor release\n\n## Major release\n\n- Run `pre-commit autoupdate`\n\n- Check README.md\n\n- Check pyproject.toml\n\n    - Are classifiers up to date?\n    - Is `python_requires` correct?\n    - Is it to drop support for older Pythons?\n    - Can we tighten any `install_requires` dependencies?\n\n- Search for old version shims we can remove\n\n    - \"shim\"\n    - ` pikepdf.__version__`\n\n- Search for deprecation: search all files for deprec*, etc.\n\n- Check requirements in setup.cfg\n\n- Delete `tests/cache`, do `pytest --runslow`, and update cache.\n\n- Do `pytest --cov-report html`\n"
  },
  {
    "path": "src/ocrmypdf/__init__.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Adds OCR layer to PDFs.\"\"\"\n\nfrom __future__ import annotations\n\nfrom pluggy import HookimplMarker as _HookimplMarker\n\nfrom ocrmypdf import helpers, hocrtransform, pdfa, pdfinfo\nfrom ocrmypdf._concurrent import Executor\nfrom ocrmypdf._defaults import PROGRAM_NAME\nfrom ocrmypdf._jobcontext import PageContext, PdfContext\nfrom ocrmypdf._options import OcrOptions, TaggedPdfMode\nfrom ocrmypdf._pipelines._common import (\n    configure_debug_logging,\n)\nfrom ocrmypdf._version import __version__\nfrom ocrmypdf.api import (\n    Verbosity,\n    configure_logging,\n    ocr,\n)\nfrom ocrmypdf.exceptions import (\n    BadArgsError,\n    DpiError,\n    EncryptedPdfError,\n    ExitCode,\n    ExitCodeException,\n    InputFileError,\n    MissingDependencyError,\n    OutputFileAccessError,\n    PriorOcrFoundError,\n    SubprocessOutputError,\n    TesseractConfigError,\n    UnsupportedImageFormatError,\n)\nfrom ocrmypdf.models.ocr_element import (\n    Baseline,\n    BoundingBox,\n    FontInfo,\n    OcrClass,\n    OcrElement,\n)\nfrom ocrmypdf.pluginspec import OcrEngine, OrientationConfidence\n\nhookimpl = _HookimplMarker('ocrmypdf')\n\n__all__ = [\n    '__version__',\n    'BadArgsError',\n    'Baseline',\n    'BoundingBox',\n    'configure_debug_logging',\n    'configure_logging',\n    'DpiError',\n    'EncryptedPdfError',\n    'Executor',\n    'ExitCode',\n    'ExitCodeException',\n    'FontInfo',\n    'helpers',\n    'hocrtransform',\n    'hookimpl',\n    'InputFileError',\n    'MissingDependencyError',\n    'ocr',\n    'OcrClass',\n    'OcrElement',\n    'OcrEngine',\n    'OcrOptions',\n    'OrientationConfidence',\n    'OutputFileAccessError',\n    'PageContext',\n    'pdfa',\n    'PdfContext',\n    'pdfinfo',\n    'PriorOcrFoundError',\n    'PROGRAM_NAME',\n    'SubprocessOutputError',\n    'TaggedPdfMode',\n    'TesseractConfigError',\n    
'UnsupportedImageFormatError',\n    'Verbosity',\n]\n"
  },
  {
    "path": "src/ocrmypdf/__main__.py",
    "content": "#!/usr/bin/env python3\n# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"ocrmypdf command line entrypoint.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport multiprocessing\nimport os\nimport signal\nimport sys\nfrom contextlib import suppress\n\nfrom ocrmypdf import __version__\nfrom ocrmypdf._pipelines.ocr import run_pipeline_cli\nfrom ocrmypdf._validation import check_options\nfrom ocrmypdf.api import Verbosity, configure_logging\nfrom ocrmypdf.cli import get_options_and_plugins\nfrom ocrmypdf.exceptions import (\n    BadArgsError,\n    ExitCode,\n    InputFileError,\n    MissingDependencyError,\n)\n\nlog = logging.getLogger('ocrmypdf')\n\n\ndef sigbus(*args):\n    \"\"\"Handle SIGBUS signals.\n\n    pikepdf, depending on configuration, may use mmap so SIGBUS is a\n    possibility.\n    \"\"\"\n    raise InputFileError(\"Lost access to the input file\")\n\n\ndef run(args=None):\n    \"\"\"Run the ocrmypdf command line interface.\"\"\"\n    options, plugin_manager = get_options_and_plugins(args=args)\n\n    with suppress(AttributeError, PermissionError):\n        os.nice(5)\n\n    verbosity = options.verbose\n    if not os.isatty(sys.stderr.fileno()):\n        options.progress_bar = False\n    if options.quiet:\n        verbosity = Verbosity.quiet\n        options.progress_bar = False\n    configure_logging(\n        verbosity,\n        progress_bar_friendly=options.progress_bar,\n        manage_root_logger=True,\n        plugin_manager=plugin_manager,\n    )\n    log.debug('ocrmypdf %s', __version__)\n    try:\n        check_options(options, plugin_manager)\n    except ValueError as e:\n        log.error(e)\n        return ExitCode.bad_args\n    except BadArgsError as e:\n        log.error(e)\n        return e.exit_code\n    except MissingDependencyError as e:\n        log.error(e)\n        return ExitCode.missing_dependency\n\n    with suppress(AttributeError, OSError):\n        
signal.signal(signal.SIGBUS, sigbus)\n\n    result = run_pipeline_cli(options=options, plugin_manager=plugin_manager)\n    return result\n\n\nif __name__ == '__main__':\n    multiprocessing.freeze_support()\n    if sys.platform not in ('win32', 'darwin'):\n        with suppress(RuntimeError):\n            multiprocessing.set_start_method('forkserver')\n    sys.exit(run())\n"
  },
  {
    "path": "src/ocrmypdf/_annots.py",
    "content": "# SPDX-FileCopyrightText: 2024 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"OCRmyPDF PDF annotation cleanup.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\n\nfrom pikepdf import Dictionary, Name, NameTree, Pdf\n\nlog = logging.getLogger(__name__)\n\n\ndef remove_broken_goto_annotations(pdf: Pdf) -> bool:\n    \"\"\"Remove broken goto annotations from a PDF.\n\n    If a PDF contains a GoTo Action that points to a named destination that does not\n    exist, Ghostscript PDF/A conversion will fail. In any event, a named destination\n    that is not defined is not useful.\n\n    Args:\n        pdf: Opened PDF file.\n\n    Returns:\n        bool: True if the file was modified, False if not.\n    \"\"\"\n    modified = False\n\n    # Check if there are any named destinations\n    if Name.Names not in pdf.Root:\n        return modified\n    if Name.Dests not in pdf.Root[Name.Names]:\n        return modified\n\n    dests = pdf.Root[Name.Names][Name.Dests]\n    if not isinstance(dests, Dictionary):\n        return modified\n    nametree = NameTree(dests)\n\n    # Create a set of all named destinations\n    names = set(k for k in nametree.keys())\n\n    for n, page in enumerate(pdf.pages):\n        if Name.Annots not in page:\n            continue\n        for annot in page[Name.Annots]:\n            if not isinstance(annot, Dictionary):\n                continue\n            if Name.A not in annot or Name.D not in annot[Name.A]:\n                continue\n            # We found an annotation that points to a named destination\n            named_destination = str(annot[Name.A][Name.D])\n            if named_destination not in names:\n                # If there is no corresponding named destination, remove the\n                # annotation. 
Having no destination set is still valid and just\n                # makes the link non-functional.\n                log.warning(\n                    f\"Disabling a hyperlink annotation on page {n + 1} to a \"\n                    \"non-existent named destination \"\n                    f\"{named_destination}.\"\n                )\n                del annot[Name.A][Name.D]\n                modified = True\n\n    return modified\n"
  },
  {
    "path": "src/ocrmypdf/_concurrent.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"OCRmyPDF concurrency abstractions.\"\"\"\n\nfrom __future__ import annotations\n\nimport threading\nfrom abc import ABC, abstractmethod\nfrom collections.abc import Callable, Iterable\nfrom typing import Any, TypeVar\n\nfrom ocrmypdf._progressbar import NullProgressBar, ProgressBar\n\nT = TypeVar('T')\n\n\ndef _task_noop(*_args, **_kwargs) -> None:\n    return\n\n\ndef _task_finished_noop(_result: Any, pbar: ProgressBar):\n    pbar.update()\n\n\nclass Executor(ABC):\n    \"\"\"Abstract concurrent executor.\"\"\"\n\n    pool_lock = threading.Lock()\n    pbar_class = NullProgressBar\n\n    def __init__(self, *, pbar_class=None):\n        if pbar_class:\n            self.pbar_class = pbar_class\n\n    def __call__(\n        self,\n        *,\n        use_threads: bool,\n        max_workers: int,\n        progress_kwargs: dict,\n        worker_initializer: Callable | None = None,\n        task: Callable[..., T] | None = None,\n        task_arguments: Iterable | None = None,\n        task_finished: Callable[[T, ProgressBar], None] | None = None,\n    ) -> None:\n        \"\"\"Set up parallel execution and progress reporting.\n\n        Args:\n            use_threads: If ``False``, the workload is the sort that will benefit from\n                running in a multiprocessing context (for example, it uses Python\n                heavily, and parallelizing it with threads is not expected to be\n                performant).\n            max_workers: The maximum number of workers that should be run.\n            progress_kwargs: Arguments to set up the progress bar.\n            worker_initializer: Called when a worker is initialized, in the worker's\n                execution context. 
If the child workers are processes, it must be\n                possible to marshall/pickle the worker initializer.\n                ``functools.partial`` can be used to bind parameters.\n            task: Called when the worker starts a new task, in the worker's execution\n                context. Must be possible to marshall to the worker.\n            task_finished: Called when a worker finishes a task, in the parent's\n                context.\n            task_arguments: An iterable that generates a group of parameters for each\n                task. This runs in the parent's context, but the parameters must be\n                marshallable to the worker.\n        \"\"\"\n        if not task_arguments:\n            return  # Nothing to do!\n        if not worker_initializer:\n            worker_initializer = _task_noop\n        if not task_finished:\n            task_finished = _task_finished_noop\n        if not task:\n            task = _task_noop\n\n        with self.pool_lock:\n            self._execute(\n                use_threads=use_threads,\n                max_workers=max_workers,\n                progress_kwargs=progress_kwargs,\n                worker_initializer=worker_initializer,\n                task=task,\n                task_arguments=task_arguments,\n                task_finished=task_finished,\n            )\n\n    @abstractmethod\n    def _execute(\n        self,\n        *,\n        use_threads: bool,\n        max_workers: int,\n        progress_kwargs: dict,\n        worker_initializer: Callable,\n        task: Callable,\n        task_arguments: Iterable,\n        task_finished: Callable,\n    ):\n        \"\"\"Custom executors should override this method.\"\"\"\n\n\ndef setup_executor(plugin_manager) -> Executor:\n    pbar_class = plugin_manager.get_progressbar_class()\n    return plugin_manager.get_executor(progressbar_class=pbar_class)\n\n\nclass SerialExecutor(Executor):\n    \"\"\"Implements a purely sequential executor using the 
parallel protocol.\n\n    The current process/thread will be the worker that executes all tasks\n    in order. As such, ``worker_initializer`` will never be called.\n    \"\"\"\n\n    def _execute(\n        self,\n        *,\n        use_threads: bool,\n        max_workers: int,\n        progress_kwargs: dict,\n        worker_initializer: Callable,\n        task: Callable,\n        task_arguments: Iterable,\n        task_finished: Callable,\n    ):  # pylint: disable=unused-argument\n        with self.pbar_class(**progress_kwargs) as pbar:\n            for args in task_arguments:\n                result = task(*args)\n                task_finished(result, pbar)\n"
  },
  {
    "path": "src/ocrmypdf/_defaults.py",
    "content": "# SPDX-FileCopyrightText: 2024 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n# Enforce English hegemony\nfrom __future__ import annotations\n\nDEFAULT_LANGUAGE = 'eng'\n\n# Default rotation threshold\nDEFAULT_ROTATE_PAGES_THRESHOLD = 14.0\n\nPROGRAM_NAME = 'OCRmyPDF'\n"
  },
  {
    "path": "src/ocrmypdf/_exec/__init__.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Manage third party executables.\"\"\"\n\nfrom __future__ import annotations\n"
  },
  {
    "path": "src/ocrmypdf/_exec/ghostscript.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Interface to Ghostscript executable.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport re\nfrom collections import deque\nfrom os import fspath\nfrom pathlib import Path\nfrom subprocess import PIPE, CalledProcessError\n\nfrom packaging.version import Version\nfrom PIL import Image, UnidentifiedImageError\n\nfrom ocrmypdf.exceptions import (\n    ColorConversionNeededError,\n    InputFileError,\n    SubprocessOutputError,\n)\nfrom ocrmypdf.helpers import Resolution\nfrom ocrmypdf.pluginspec import GhostscriptRasterDevice\nfrom ocrmypdf.subprocess import get_version, run, run_polling_stderr\n\nCOLOR_CONVERSION_STRATEGIES = frozenset(\n    [\n        'CMYK',\n        'Gray',\n        'LeaveColorUnchanged',\n        'RGB',\n        'UseDeviceIndependentColor',\n    ]\n)\n# Ghostscript executable - gswin32c is not supported\nGS = 'gswin64c' if os.name == 'nt' else 'gs'\n\n\nlog = logging.getLogger(__name__)\n\n\nclass DuplicateFilter(logging.Filter):\n    \"\"\"Filter out duplicate log messages.\n\n    A context window of default 5 messages is used to determine if a message is a\n    duplicate. 
This is because some Ghostscript messages are word wrapped.\n    \"\"\"\n\n    def __init__(self, logger: logging.Logger, context_window=5):\n        self.window: deque[str] = deque([], maxlen=context_window)\n        self.logger = logger\n        self.levelno = logging.DEBUG\n        self.count = 0\n\n    def filter(self, record):\n        if record.msg in self.window:\n            self.count += 1\n            self.levelno = record.levelno\n            return False\n        else:\n            if self.count >= 1:\n                rep_msg = f\"(suppressed {self.count} repeated lines)\"\n                self.count = 0  # Avoid infinite recursion\n                self.logger.log(self.levelno, rep_msg)\n                self.window.clear()\n            self.window.append(record.msg)\n            return True\n\n\nlog.addFilter(DuplicateFilter(log))\n\n\ndef version() -> Version:\n    return Version(get_version(GS))\n\n\ndef _gs_error_reported(stream) -> bool:\n    match = re.search(r'error', stream, flags=re.IGNORECASE)\n    return bool(match)\n\n\ndef _gs_devicen_reported(stream) -> bool:\n    \"\"\"Did Ghostscript warn about a DeviceN with inappropriate alternate?\n\n    If so, we need the user to select a color conversion, or the resulting PDF will\n    not present correctly in some PDF viewers.\n    \"\"\"\n    match = re.search(\n        r'DeviceN.*inappropriate alternate',\n        stream,\n        flags=re.IGNORECASE | re.MULTILINE,\n    )\n    return bool(match)\n\n\ndef rasterize_pdf(\n    input_file: os.PathLike,\n    output_file: os.PathLike,\n    *,\n    raster_device: GhostscriptRasterDevice,\n    raster_dpi: Resolution,\n    pageno: int = 1,\n    page_dpi: Resolution | None = None,\n    rotation: int | None = None,\n    filter_vector: bool = False,\n    stop_on_error: bool = False,\n    use_cropbox: bool = False,\n):\n    \"\"\"Rasterize one page of a PDF at resolution raster_dpi in canvas units.\n\n    Args:\n        input_file: The PDF file to 
rasterize.\n        output_file: The file to write the rasterized PDF to.\n        raster_device: The Ghostscript raster device to use to rasterize the PDF.\n        raster_dpi: Resolution in dots per inch at which to rasterize page.\n        pageno: Page number to rasterize (beginning at page 1).\n        page_dpi: Resolution, overriding output image DPI.\n        rotation: Cardinal angle, clockwise, to rotate page.\n        filter_vector: If True, remove vector graphics objects.\n        stop_on_error: If True, stop rasterizing on the first error.\n        use_cropbox: If True, rasterize the CropBox instead of MediaBox.\n            Default is False (use MediaBox).\n    \"\"\"\n    raster_dpi = raster_dpi.round(6)\n    if not page_dpi:\n        page_dpi = raster_dpi\n\n    # Ghostscript may fail with very low DPI values (below 10). If the requested\n    # DPI is too low, use a minimum of 10 DPI and resize the output afterward.\n    MIN_RASTER_DPI = 10\n    needs_low_dpi_resize = (\n        raster_dpi.x < MIN_RASTER_DPI or raster_dpi.y < MIN_RASTER_DPI\n    )\n    if needs_low_dpi_resize:\n        effective_dpi = Resolution(\n            max(raster_dpi.x, MIN_RASTER_DPI), max(raster_dpi.y, MIN_RASTER_DPI)\n        )\n    else:\n        effective_dpi = raster_dpi\n\n    args_gs = (\n        [\n            GS,\n            '-dSAFER',\n            '-dBATCH',\n            '-dNOPAUSE',\n            '-dInterpolateControl=-1',\n            f'-sDEVICE={raster_device}',\n            f'-dFirstPage={pageno}',\n            f'-dLastPage={pageno}',\n            f'-r{effective_dpi.x:f}x{effective_dpi.y:f}',\n        ]\n        + (['-dUseCropBox'] if use_cropbox else [])\n        + (['-dFILTERVECTOR'] if filter_vector else [])\n        + (['-dPDFSTOPONERROR'] if stop_on_error else [])\n        + [\n            '-o',\n            fspath(output_file),\n            '-sstdout=%stderr',  # Literal %s, not string interpolation\n            '-dAutoRotatePages=/None',  # Probably has no 
effect on raster\n            '-f',\n            fspath(input_file),\n        ]\n    )\n\n    try:\n        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)\n    except CalledProcessError as e:\n        log.error(e.stderr.decode(errors='replace'))\n        Path(output_file).unlink(missing_ok=True)\n        raise SubprocessOutputError(\"Ghostscript rasterizing failed\") from e\n\n    stderr = p.stderr.decode(errors='replace')\n    if _gs_error_reported(stderr):\n        log.error(stderr)\n        if stop_on_error and \"recoverable image error\" in stderr:\n            Path(output_file).unlink(missing_ok=True)\n            raise InputFileError(\n                \"Ghostscript rasterizing failed. The input file contains errors that \"\n                \"cause PDF viewers to interpret it differently and incorrectly. \"\n                \"Try using --continue-on-soft-render-error and manually inspect the \"\n                \"input and output files to check for visual differences or errors.\"\n            )\n\n    try:\n        with Image.open(output_file) as im:\n            if needs_low_dpi_resize:\n                # Resize to the dimensions that would have resulted from the\n                # original low DPI request\n                scale_x = raster_dpi.x / effective_dpi.x\n                scale_y = raster_dpi.y / effective_dpi.y\n                new_size = (\n                    max(1, int(round(im.width * scale_x))),\n                    max(1, int(round(im.height * scale_y))),\n                )\n                im = im.resize(new_size, Image.Resampling.LANCZOS)\n            if rotation is not None:\n                log.debug(\"Rotating output by %i\", rotation)\n                # rotation is a clockwise angle and Image.ROTATE_* is\n                # counterclockwise so this cancels out the rotation\n                if rotation == 90:\n                    im = im.transpose(Image.Transpose.ROTATE_90)\n                elif rotation == 180:\n                    
im = im.transpose(Image.Transpose.ROTATE_180)\n                elif rotation == 270:\n                    im = im.transpose(Image.Transpose.ROTATE_270)\n                if rotation % 180 == 90:\n                    page_dpi = page_dpi.flip_axis()\n            im.save(output_file, dpi=page_dpi)\n    except UnidentifiedImageError:\n        log.error(\n            f\"Ghostscript (using {raster_device} at {raster_dpi} dpi) produced \"\n            \"an invalid page image file.\"\n        )\n        raise\n    except OSError as e:\n        log.error(\n            f\"Ghostscript (using {raster_device} at {raster_dpi} dpi) produced \"\n            \"an invalid page image file.\"\n        )\n        raise UnidentifiedImageError() from e\n\n\nclass GhostscriptFollower:\n    \"\"\"Parses the output of Ghostscript and uses it to update the progress bar.\"\"\"\n\n    re_process = re.compile(r\"Processing pages \\d+ through (\\d+).\")\n    re_page = re.compile(r\"Page (\\d+)\")\n\n    def __init__(self, progressbar_class):\n        self.count = 0\n        self.progressbar_class = progressbar_class\n        self.progressbar = None\n\n    def __enter__(self):\n        # We can't actually set up the progressbar here, because we don't know\n        # how many pages there are until the first __call__() happens. 
So we\n        # do it in __call__().\n        return self\n\n    def __exit__(self, exc_type, exc_value, traceback):\n        if self.progressbar:\n            return self.progressbar.__exit__(exc_type, exc_value, traceback)\n        return False\n\n    def __call__(self, line):\n        if not self.progressbar_class:\n            return\n        if not self.progressbar:\n            m = self.re_process.match(line.strip())\n            if m:\n                self.count = int(m.group(1))\n                self.progressbar = self.progressbar_class(\n                    total=self.count, desc=\"PDF/A conversion\", unit='page'\n                )\n                # Now that we know the count, we can set up the progressbar.\n                self.progressbar.__enter__()\n        else:\n            if self.re_page.match(line.strip()):\n                self.progressbar.update()\n\n\ndef generate_pdfa(\n    pdf_pages,\n    output_file: os.PathLike,\n    *,\n    compression: str,\n    color_conversion_strategy: str,\n    pdf_version: str = '1.5',\n    pdfa_part: str = '2',\n    progressbar_class=None,\n    stop_on_error: bool = False,\n):\n    # Ghostscript's compression is all or nothing. 
We can either force all images\n    # to JPEG, force all to Flate/PNG, or let it decide how to encode the images.\n    # In most cases it's best to let it decide.\n    compression_args = []\n    if compression == 'jpeg':\n        compression_args = [\n            \"-dAutoFilterColorImages=false\",\n            \"-dColorImageFilter=/DCTEncode\",\n            \"-dAutoFilterGrayImages=false\",\n            \"-dGrayImageFilter=/DCTEncode\",\n        ]\n    elif compression == 'lossless':\n        compression_args = [\n            \"-dAutoFilterColorImages=false\",\n            \"-dColorImageFilter=/FlateEncode\",\n            \"-dAutoFilterGrayImages=false\",\n            \"-dGrayImageFilter=/FlateEncode\",\n        ]\n    else:\n        compression_args = [\n            \"-dAutoFilterColorImages=true\",\n            \"-dAutoFilterGrayImages=true\",\n        ]\n\n    gs_version = version()\n    if gs_version == Version('9.56.0'):\n        # 9.56.0 breaks our OCR, should be fixed in 9.56.1\n        # https://bugs.ghostscript.com/show_bug.cgi?id=705187\n        compression_args.append('-dNEWPDF=false')\n\n    if os.name == 'nt':\n        # Windows has lots of fatal \"permission denied\" errors\n        stop_on_error = False\n\n    # nb no need to specify ProcessColorModel when ColorConversionStrategy\n    # is set; see:\n    # https://bugs.ghostscript.com/show_bug.cgi?id=699392\n    args_gs = (\n        [\n            GS,\n            \"-dBATCH\",\n            \"-dNOPAUSE\",\n            \"-dSAFER\",\n            f\"-dCompatibilityLevel={str(pdf_version)}\",\n            \"-sDEVICE=pdfwrite\",\n            \"-dAutoRotatePages=/None\",\n            f\"-sColorConversionStrategy={color_conversion_strategy}\",\n        ]\n        + (['-dPDFSTOPONERROR'] if stop_on_error else [])\n        + compression_args\n        + [\n            \"-dJPEGQ=95\",\n            \"-dSubsetFonts=false\",  # Prevents GS from messing up some encodings\n            f\"-dPDFA={pdfa_part}\",\n        
    \"-dPDFACompatibilityPolicy=1\",\n            \"-o\",\n            fspath(output_file),\n            \"-sstdout=%stderr\",  # Literal %s, not string interpolation\n        ]\n    )\n    args_gs.extend(fspath(s) for s in pdf_pages)  # Stringify Path objs\n    try:\n        with GhostscriptFollower(progressbar_class) as pbar:\n            p = run_polling_stderr(\n                args_gs,\n                stderr=PIPE,\n                check=True,\n                text=True,\n                encoding='utf-8',\n                errors='replace',\n                callback=pbar,\n            )\n    except CalledProcessError as e:\n        # Ghostscript does not change return code when it fails to create\n        # PDF/A - check PDF/A status elsewhere\n        log.error(e.stderr)\n        raise SubprocessOutputError('Ghostscript PDF/A rendering failed') from e\n    else:\n        stderr = p.stderr\n        # If there is an error we log the whole stderr, except for filtering\n        # duplicates.\n        if _gs_error_reported(stderr):\n            # Ghostscript outputs the pattern **** Error: ....  frequently.\n            # Occasionally the error message is spammed many times. We filter\n            # out duplicates of this message using the filter above. We use\n            # the **** pattern to split the stderr into parts.\n            for part in stderr.split('****'):\n                log.error(part)\n        if _gs_devicen_reported(stderr):\n            raise ColorConversionNeededError()\n"
  },
  {
    "path": "src/ocrmypdf/_exec/jbig2enc.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Interface to jbig2 executable.\"\"\"\n\nfrom __future__ import annotations\n\nfrom subprocess import PIPE, CalledProcessError\n\nfrom packaging.version import Version\n\nfrom ocrmypdf.exceptions import MissingDependencyError\nfrom ocrmypdf.subprocess import get_version, run\n\n\ndef version() -> Version:\n    try:\n        version = get_version('jbig2', regex=r'jbig2enc (\\d+(\\.\\d+)*).*')\n    except CalledProcessError as e:\n        # TeX Live for Windows provides an incompatible jbig2.EXE which may\n        # be on the PATH.\n        raise MissingDependencyError('jbig2enc') from e\n    return Version(version)\n\n\ndef available():\n    try:\n        version()\n    except MissingDependencyError:\n        return False\n    return True\n\n\ndef convert_single(cwd, infile, outfile, threshold):\n    args = ['jbig2', '--pdf', '-t', str(threshold), infile]\n    with open(outfile, 'wb') as fstdout:\n        proc = run(args, cwd=cwd, stdout=fstdout, stderr=PIPE)\n    proc.check_returncode()\n    return proc\n"
  },
  {
    "path": "src/ocrmypdf/_exec/pngquant.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Interface to pngquant executable.\"\"\"\n\nfrom __future__ import annotations\n\nfrom pathlib import Path\nfrom subprocess import PIPE\n\nfrom packaging.version import Version\n\nfrom ocrmypdf.exceptions import MissingDependencyError\nfrom ocrmypdf.subprocess import get_version, run\n\n\ndef version() -> Version:\n    return Version(get_version('pngquant', regex=r'(\\d+(\\.\\d+)*).*'))\n\n\ndef available():\n    try:\n        version()\n    except MissingDependencyError:\n        return False\n    return True\n\n\ndef quantize(input_file: Path, output_file: Path, quality_min: int, quality_max: int):\n    \"\"\"Quantize a PNG image using pngquant.\n\n    Args:\n        input_file: Input PNG image\n        output_file: Output PNG image\n        quality_min: Minimum quality to use\n        quality_max: Maximum quality to use\n    \"\"\"\n    with open(input_file, 'rb') as input_stream:\n        args = [\n            'pngquant',\n            '--force',\n            '--skip-if-larger',\n            '--quality',\n            f'{quality_min}-{quality_max}',\n            '--',  # pngquant: stop processing arguments\n            '-',  # pngquant: stream input and output\n        ]\n        result = run(args, stdin=input_stream, stdout=PIPE, stderr=PIPE, check=False)\n\n    if result.returncode == 0:\n        # input_file could be the same as output_file, so we defer the write\n        output_file.write_bytes(result.stdout)\n"
  },
  {
    "path": "src/ocrmypdf/_exec/tesseract.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Interface to Tesseract executable.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport re\nfrom contextlib import suppress\nfrom enum import IntEnum\nfrom math import pi\nfrom os import fspath\nfrom pathlib import Path\nfrom subprocess import PIPE, STDOUT, CalledProcessError, TimeoutExpired\n\nfrom packaging.version import Version\n\nfrom ocrmypdf.exceptions import (\n    MissingDependencyError,\n    SubprocessOutputError,\n    TesseractConfigError,\n)\nfrom ocrmypdf.pluginspec import OrientationConfidence\nfrom ocrmypdf.subprocess import get_version, run\n\nlog = logging.getLogger(__name__)\n\n\ndef _tesseract_env(omp_thread_limit: int | None) -> dict[str, str] | None:\n    \"\"\"Create environment dict with OMP_THREAD_LIMIT set for Tesseract subprocesses.\"\"\"\n    if omp_thread_limit is None:\n        return None\n    env = os.environ.copy()\n    env['OMP_THREAD_LIMIT'] = str(omp_thread_limit)\n    return env\n\n\nclass ThresholdingMethod(IntEnum):\n    \"\"\"Tesseract thresholding methods for image binarization.\"\"\"\n\n    AUTO = 0\n    OTSU = 0  # Alias for AUTO - uses Tesseract's default (legacy Otsu)\n    ADAPTIVE_OTSU = 1\n    SAUVOLA = 2\n\n\n# Legacy dictionary for backward compatibility\nTESSERACT_THRESHOLDING_METHODS: dict[str, int] = {\n    'auto': ThresholdingMethod.AUTO,\n    'otsu': ThresholdingMethod.OTSU,\n    'adaptive-otsu': ThresholdingMethod.ADAPTIVE_OTSU,\n    'sauvola': ThresholdingMethod.SAUVOLA,\n}\n\n\nclass TesseractLoggerAdapter(logging.LoggerAdapter):\n    \"\"\"Prepend [tesseract] to messages emitted from tesseract.\"\"\"\n\n    def process(self, msg, kwargs):\n        kwargs['extra'] = self.extra\n        return f'[tesseract] {msg}', kwargs\n\n\nTESSERACT_VERSION_PATTERN = r\"\"\"\n    v?\n    (?:\n        (?:(?P<epoch>[0-9]+)!)?                           
# epoch\n        (?P<release>[0-9]+(?:\\.[0-9]+)*)                  # release segment\n        (?P<pre>                                          # pre-release\n            [-_\\.]?\n            (?P<pre_l>(a|b|c|rc|alpha|beta|pre|preview))\n            [-_\\.]?\n            (?P<pre_n>[0-9]+)?\n        )?\n        (?P<post>                                         # post release\n            (?:-(?P<post_n1>[0-9]+))\n            |\n            (?:\n                [-_\\.]?\n                (?P<post_l>post|rev|r)\n                [-_\\.]?\n                (?P<post_n2>[0-9]+)?\n            )\n        )?\n        (?P<dev>                                          # dev release\n            [-_\\.]?\n            (?P<dev_l>dev)\n            [-_\\.]?\n            (?P<dev_n>[0-9]+)?\n        )?\n        (?P<date>\n            [-_\\.]\n            (?:20[0-9][0-9] [0-1][0-9] [0-3][0-9])       # yyyy mm dd\n        )?\n        (?P<gitcount>\n            [-_\\.]?\n            [0-9]+\n        )?\n        (?P<gitcommit>\n            [-_\\.]?\n            g[0-9a-f]{2,10}\n        )?\n    )\n    (?:\\+(?P<local>[a-z0-9]+(?:[-_\\.][a-z0-9]+)*))?       
# local version\n\"\"\"\n\n\nclass TesseractVersion(Version):\n    \"\"\"Modify standard packaging.Version regex to support Tesseract idiosyncrasies.\"\"\"\n\n    _regex = re.compile(\n        r\"^\\s*\" + TESSERACT_VERSION_PATTERN + r\"\\s*$\", re.VERBOSE | re.IGNORECASE\n    )\n\n\ndef version() -> Version:\n    return TesseractVersion(get_version('tesseract', regex=r'tesseract\\s(.+)'))\n\n\ndef has_thresholding() -> bool:\n    \"\"\"Does Tesseract have -c thresholding method capability?\"\"\"\n    return version() >= Version('5.0')\n\n\ndef get_languages() -> set[str]:\n    def lang_error(output):\n        msg = (\n            \"Tesseract failed to report available languages.\\n\"\n            \"Output from Tesseract:\\n\"\n            \"-----------\\n\"\n        )\n        msg += output\n        return msg\n\n    args_tess = ['tesseract', '--list-langs']\n    try:\n        proc = run(\n            args_tess,\n            text=True,\n            stdout=PIPE,\n            stderr=STDOUT,\n            logs_errors_to_stdout=True,\n            check=True,\n        )\n        output = proc.stdout\n    except CalledProcessError as e:\n        raise MissingDependencyError(lang_error(e.output)) from e\n\n    for line in output.splitlines():\n        if line.startswith('Error'):\n            raise MissingDependencyError(lang_error(output))\n    _header, *rest = output.splitlines()\n    return {lang.strip() for lang in rest}\n\n\ndef tess_base_args(langs: list[str], engine_mode: int | None) -> list[str]:\n    args = ['tesseract']\n    if langs:\n        args.extend(['-l', '+'.join(langs)])\n    if engine_mode is not None:\n        args.extend(['--oem', str(engine_mode)])\n    return args\n\n\ndef _parse_tesseract_output(binary_output: bytes) -> dict[str, str]:\n    def gen():\n        for line in binary_output.decode().splitlines():\n            line = line.strip()\n            parts = line.split(':', maxsplit=2)\n            if len(parts) == 2:\n                yield 
parts[0].strip(), parts[1].strip()\n\n    return dict(gen())\n\n\ndef get_orientation(\n    input_file: Path,\n    engine_mode: int | None,\n    timeout: float,\n    omp_thread_limit: int | None = None,\n) -> OrientationConfidence:\n    args_tesseract = tess_base_args(['osd'], engine_mode) + [\n        '--psm',\n        '0',\n        fspath(input_file),\n        'stdout',\n    ]\n\n    try:\n        p = run(\n            args_tesseract,\n            stdout=PIPE,\n            stderr=STDOUT,\n            timeout=timeout,\n            check=True,\n            env=_tesseract_env(omp_thread_limit),\n        )\n    except TimeoutExpired:\n        return OrientationConfidence(angle=0, confidence=0.0)\n    except CalledProcessError as e:\n        tesseract_log_output(e.stdout)\n        tesseract_log_output(e.stderr)\n        # Check both stdout (e.output) and stderr for known non-fatal messages\n        all_output = (e.output or b'') + (e.stderr or b'')\n        if (\n            b'Too few characters. Skipping this page' in all_output\n            or b'Image too large' in all_output\n        ):\n            return OrientationConfidence(0, 0)\n        raise SubprocessOutputError() from e\n\n    osd = _parse_tesseract_output(p.stdout)\n    angle = int(osd.get('Orientation in degrees', 0))\n    orient_conf = OrientationConfidence(\n        angle=angle, confidence=float(osd.get('Orientation confidence', 0))\n    )\n    return orient_conf\n\n\ndef _is_empty_page_error(exc):\n    if b'Empty page!!' 
in exc.output:  # Tesseract 4.x\n        return True\n\n    return exc.returncode == 1 and (\n        # Tesseract 5.0-5.4 or so\n        exc.output == b''\n        # Tesseract 5.5+\n        or exc.output.startswith(b\"Error in boxClipToRectangle: box outside rectangle\")\n    )\n\n\ndef get_deskew(\n    input_file: Path,\n    languages: list[str],\n    engine_mode: int | None,\n    timeout: float,\n    omp_thread_limit: int | None = None,\n) -> float:\n    \"\"\"Gets angle to deskew this page, in degrees.\"\"\"\n    args_tesseract = tess_base_args(languages, engine_mode) + [\n        '--psm',\n        '2',\n        fspath(input_file),\n        'stdout',\n    ]\n\n    try:\n        p = run(\n            args_tesseract,\n            stdout=PIPE,\n            stderr=STDOUT,\n            timeout=timeout,\n            check=True,\n            env=_tesseract_env(omp_thread_limit),\n        )\n    except TimeoutExpired:\n        return 0.0\n    except CalledProcessError as e:\n        tesseract_log_output(e.stdout)\n        tesseract_log_output(e.stderr)\n        if _is_empty_page_error(e):\n            # Not enough info for a skew angle\n            return 0.0\n        raise SubprocessOutputError() from e\n\n    parsed = _parse_tesseract_output(p.stdout)\n    deskew_radians = float(parsed.get('Deskew angle', 0))\n    deskew_degrees = 180 / pi * deskew_radians\n    log.debug(f\"Deskew angle: {deskew_degrees:.3f}\")\n    return deskew_degrees\n\n\ndef tesseract_log_output(stream: bytes) -> None:\n    tlog = TesseractLoggerAdapter(\n        log,\n        extra=log.extra if hasattr(log, 'extra') else None,  # type: ignore\n    )\n\n    if not stream:\n        return\n    try:\n        text = stream.decode()\n    except UnicodeDecodeError:\n        text = stream.decode('utf-8', 'ignore')\n\n    lines = text.splitlines()\n    for line in lines:\n        if line.startswith(\n            (\"Tesseract Open Source\", \"Warning in pixReadMem\")\n        ):\n            continue\n   
     elif 'diacritics' in line:\n            tlog.warning(\"lots of diacritics - possibly poor OCR\")\n        elif line.startswith('OSD: Weak margin'):\n            tlog.warning(\"unsure about page orientation\")\n        elif 'Error in pixScanForForeground' in line:\n            pass  # Appears to be spurious/problem with nonwhite borders\n        elif 'Error in boxClipToRectangle' in line:\n            pass  # Always appears with pixScanForForeground message\n        elif 'parameter not found: ' in line.lower():\n            tlog.error(line.strip())\n            problem = line.split('found: ')[1]\n            raise TesseractConfigError(problem)\n        elif 'error' in line.lower() or 'exception' in line.lower():\n            tlog.error(line.strip())\n        elif 'warning' in line.lower():\n            tlog.warning(line.strip())\n        elif 'read_params_file' in line.lower():\n            tlog.error(line.strip())\n        else:\n            tlog.info(line.strip())\n\n\ndef page_timedout(timeout: float) -> None:\n    if timeout == 0:\n        return\n    log.warning(\"[tesseract] took too long to OCR - skipping\")\n\n\ndef _generate_null_hocr(output_hocr: Path, output_text: Path, image: Path) -> None:\n    \"\"\"Produce an empty .hocr file.\n\n    Ensures page is the same size as the input image.\n    \"\"\"\n    output_hocr.write_text('', encoding='utf-8')\n    output_text.write_text('[skipped page]', encoding='utf-8')\n\n\ndef generate_hocr(\n    *,\n    input_file: Path,\n    output_hocr: Path,\n    output_text: Path,\n    languages: list[str],\n    engine_mode: int,\n    tessconfig: list[str],\n    timeout: float,\n    pagesegmode: int,\n    thresholding: ThresholdingMethod,\n    user_words,\n    user_patterns,\n    omp_thread_limit: int | None = None,\n) -> None:\n    \"\"\"Generate a hOCR file, which must be converted to PDF.\"\"\"\n    prefix = output_hocr.with_suffix('')\n\n    args_tesseract = tess_base_args(languages, engine_mode)\n\n    if 
pagesegmode is not None:\n        args_tesseract.extend(['--psm', str(pagesegmode)])\n\n    if thresholding != ThresholdingMethod.AUTO and has_thresholding():\n        args_tesseract.extend(['-c', f'thresholding_method={thresholding}'])\n\n    if user_words:\n        args_tesseract.extend(['--user-words', user_words])\n\n    if user_patterns:\n        args_tesseract.extend(['--user-patterns', user_patterns])\n\n    # Reminder: test suite tesseract test plugins will break after any changes\n    # to the number or order of parameters here\n    args_tesseract.extend([fspath(input_file), fspath(prefix), 'hocr', 'txt'])\n    args_tesseract.extend(tessconfig)\n    try:\n        p = run(\n            args_tesseract,\n            stdout=PIPE,\n            stderr=STDOUT,\n            timeout=timeout,\n            check=True,\n            env=_tesseract_env(omp_thread_limit),\n        )\n        stdout = p.stdout\n    except TimeoutExpired:\n        # Generate a HOCR file with no recognized text if tesseract times out\n        # Temporary workaround to hocrTransform not being able to function if\n        # it does not have a valid hOCR file.\n        page_timedout(timeout)\n        _generate_null_hocr(output_hocr, output_text, input_file)\n    except CalledProcessError as e:\n        tesseract_log_output(e.output)\n        if b'Image too large' in e.output or b'Empty page!!' 
in e.output:\n            _generate_null_hocr(output_hocr, output_text, input_file)\n            return\n\n        raise SubprocessOutputError() from e\n    else:\n        tesseract_log_output(stdout)\n        # The sidecar text file will get the suffix .txt; rename it to\n        # whatever caller wants it named\n        with suppress(FileNotFoundError):\n            prefix.with_suffix('.txt').replace(output_text)\n\n\ndef use_skip_page(output_pdf: Path, output_text: Path) -> None:\n    output_text.write_text('[skipped page]', encoding='utf-8')\n\n    # A 0 byte file to the output to indicate a skip\n    output_pdf.write_bytes(b'')\n\n\ndef generate_pdf(\n    *,\n    input_file: Path,\n    output_pdf: Path,\n    output_text: Path,\n    languages: list[str],\n    engine_mode: int,\n    tessconfig: list[str],\n    timeout: float,\n    pagesegmode: int,\n    thresholding: ThresholdingMethod,\n    user_words,\n    user_patterns,\n    omp_thread_limit: int | None = None,\n) -> None:\n    \"\"\"Generate a PDF using Tesseract's internal PDF generator.\n\n    We specifically request a text-only PDF, which is more suitable for combining with\n    the input page.\n    \"\"\"\n    args_tesseract = tess_base_args(languages, engine_mode)\n\n    if pagesegmode is not None:\n        args_tesseract.extend(['--psm', str(pagesegmode)])\n\n    args_tesseract.extend(['-c', 'textonly_pdf=1'])\n\n    if thresholding != ThresholdingMethod.AUTO and has_thresholding():\n        args_tesseract.extend(['-c', f'thresholding_method={thresholding}'])\n\n    if user_words:\n        args_tesseract.extend(['--user-words', user_words])\n\n    if user_patterns:\n        args_tesseract.extend(['--user-patterns', user_patterns])\n\n    prefix = output_pdf.parent / Path(output_pdf.stem)\n\n    # Reminder: test suite tesseract test plugins might break after any changes\n    # to the number or order of parameters here\n\n    args_tesseract.extend([fspath(input_file), fspath(prefix), 'pdf', 'txt'])\n    
args_tesseract.extend(tessconfig)\n    try:\n        p = run(\n            args_tesseract,\n            stdout=PIPE,\n            stderr=STDOUT,\n            timeout=timeout,\n            check=True,\n            env=_tesseract_env(omp_thread_limit),\n        )\n        stdout = p.stdout\n        with suppress(FileNotFoundError):\n            prefix.with_suffix('.txt').replace(output_text)\n    except TimeoutExpired:\n        page_timedout(timeout)\n        use_skip_page(output_pdf, output_text)\n    except CalledProcessError as e:\n        tesseract_log_output(e.output)\n        if b'Image too large' in e.output or b'Empty page!!' in e.output:\n            use_skip_page(output_pdf, output_text)\n            return\n        raise SubprocessOutputError() from e\n    else:\n        tesseract_log_output(stdout)\n"
  },
  {
    "path": "src/ocrmypdf/_exec/unpaper.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Interface to unpaper executable.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nfrom collections.abc import Iterator\nfrom contextlib import contextmanager\nfrom decimal import Decimal\nfrom pathlib import Path\nfrom subprocess import PIPE, STDOUT\nfrom tempfile import TemporaryDirectory\n\nfrom packaging.version import Version\nfrom PIL import Image\n\nfrom ocrmypdf.exceptions import SubprocessOutputError\nfrom ocrmypdf.subprocess import get_version, run\n\n# unpaper documentation:\n# https://github.com/Flameeyes/unpaper/blob/main/doc/basic-concepts.md\n\n\nUNPAPER_IMAGE_PIXEL_LIMIT = 256 * 1024 * 1024\n\nDecFloat = Decimal | float\n\nlog = logging.getLogger(__name__)\n\n\nclass UnpaperImageTooLargeError(Exception):\n    \"\"\"To capture details when an image is too large for unpaper.\"\"\"\n\n    def __init__(\n        self,\n        w,\n        h,\n        message=\"Image with size {}x{} is too large for cleaning with 'unpaper'.\",\n    ):\n        self.w = w\n        self.h = h\n        self.message = message.format(w, h)\n        super().__init__(self.message)\n\n\ndef version() -> Version:\n    return Version(get_version('unpaper', regex=r'(?m).*?(\\d+(\\.\\d+)(\\.\\d+)?)'))\n\n\n@contextmanager\ndef _setup_unpaper_io(input_file: Path) -> Iterator[tuple[Path, Path, Path]]:\n    with Image.open(input_file) as im:\n        if im.width * im.height >= UNPAPER_IMAGE_PIXEL_LIMIT:\n            raise UnpaperImageTooLargeError(w=im.width, h=im.height)\n\n    with TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir:\n        tmppath = Path(tmpdir)\n        # No changes, PNG input, just use the file we already have\n        input_png = input_file\n        # unpaper can write .png too, but it seems to write them slowly\n        # adds a few seconds to test suite - so just use pnm\n        output_pnm = tmppath / 'output.pnm'\n        
yield input_png, output_pnm, tmppath\n\n\ndef run_unpaper(\n    input_file: Path, output_file: Path, *, dpi: DecFloat, mode_args: list[str]\n) -> None:\n    args_unpaper = ['unpaper', '-v', '--dpi', str(round(dpi, 6))] + mode_args\n\n    with _setup_unpaper_io(input_file) as (input_png, output_pnm, tmpdir):\n        # To prevent any shenanigans from accepting arbitrary parameters in\n        # --unpaper-args, we:\n        # 1) run with cwd set to a tmpdir with only unpaper's files\n        # 2) forbid the use of '/' in arguments, to prevent changing paths\n        # 3) append absolute paths for the input and output file\n        # This should ensure that a user cannot clobber some other file with\n        # their unpaper arguments (whether intentionally or otherwise)\n        args_unpaper.extend([os.fspath(input_png), os.fspath(output_pnm)])\n        run(\n            args_unpaper,\n            close_fds=True,\n            check=True,\n            stderr=STDOUT,  # unpaper writes logging output to stdout and stderr\n            stdout=PIPE,  # and cannot send file output to stdout\n            cwd=tmpdir,\n            logs_errors_to_stdout=True,\n        )\n        try:\n            with Image.open(output_pnm) as imout:\n                imout.save(output_file, dpi=(dpi, dpi))\n        except OSError as e:\n            raise SubprocessOutputError(\n                \"unpaper: failed to produce the expected output file. 
\"\n                + \" Called with: \"\n                + str(args_unpaper)\n            ) from e\n\n\ndef clean(\n    input_file: Path,\n    output_file: Path,\n    *,\n    dpi: DecFloat,\n    unpaper_args: list[str] | None = None,\n) -> Path:\n    default_args = [\n        '--layout',\n        'none',\n        '--mask-scan-size',\n        '100',  # don't blank out narrow columns\n        '--no-border-align',  # don't align visible content to borders\n        '--no-mask-center',  # don't center visible content within page\n        '--no-grayfilter',  # don't remove light gray areas\n        '--no-blackfilter',  # don't remove solid black areas\n        '--no-deskew',  # don't deskew\n    ]\n    if not unpaper_args:\n        unpaper_args = default_args\n    try:\n        run_unpaper(input_file, output_file, dpi=dpi, mode_args=unpaper_args)\n        return output_file\n    except UnpaperImageTooLargeError as e:\n        log.warning(str(e))\n        return input_file\n"
  },
  {
    "path": "src/ocrmypdf/_exec/verapdf.py",
    "content": "# SPDX-FileCopyrightText: 2024 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Interface to verapdf executable.\"\"\"\n\nfrom __future__ import annotations\n\nimport json\nimport logging\nfrom pathlib import Path\nfrom subprocess import PIPE\nfrom typing import NamedTuple\n\nfrom packaging.version import Version\n\nfrom ocrmypdf.exceptions import MissingDependencyError\nfrom ocrmypdf.subprocess import get_version, run\n\nlog = logging.getLogger(__name__)\n\n\nclass ValidationResult(NamedTuple):\n    \"\"\"Result of PDF/A validation.\"\"\"\n\n    valid: bool\n    failed_rules: int\n    message: str\n\n\ndef version() -> Version:\n    \"\"\"Get verapdf version.\"\"\"\n    return Version(get_version('verapdf', regex=r'veraPDF (\\d+(\\.\\d+)*)'))\n\n\ndef available() -> bool:\n    \"\"\"Check if verapdf is available.\"\"\"\n    try:\n        version()\n    except (MissingDependencyError, OSError):\n        return False\n    return True\n\n\ndef output_type_to_flavour(output_type: str) -> str:\n    \"\"\"Map OCRmyPDF output_type to verapdf flavour.\n\n    Args:\n        output_type: One of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'\n\n    Returns:\n        verapdf flavour string like '1b', '2b', '3b'\n    \"\"\"\n    mapping = {\n        'pdfa': '2b',\n        'pdfa-1': '1b',\n        'pdfa-2': '2b',\n        'pdfa-3': '3b',\n    }\n    return mapping.get(output_type, '2b')\n\n\ndef validate(input_file: Path, flavour: str) -> ValidationResult:\n    \"\"\"Validate a PDF against a PDF/A profile.\n\n    Args:\n        input_file: Path to PDF file to validate\n        flavour: verapdf flavour (1a, 1b, 2a, 2b, 2u, 3a, 3b, 3u)\n\n    Returns:\n        ValidationResult with validation status\n    \"\"\"\n    args = [\n        'verapdf',\n        '--format',\n        'json',\n        '--flavour',\n        flavour,\n        str(input_file),\n    ]\n\n    try:\n        proc = run(args, stdout=PIPE, stderr=PIPE, check=False)\n    except FileNotFoundError as 
e:\n        raise MissingDependencyError('verapdf') from e\n\n    try:\n        result = json.loads(proc.stdout)\n        jobs = result.get('report', {}).get('jobs', [])\n        if not jobs:\n            return ValidationResult(False, -1, 'No validation jobs in result')\n        validation_results = jobs[0].get('validationResult', [])\n        if not validation_results:\n            return ValidationResult(False, -1, 'No validation result in output')\n        validation_result = validation_results[0]\n        details = validation_result.get('details', {})\n        failed_rules = details.get('failedRules', 0)\n\n        if failed_rules == 0:\n            return ValidationResult(True, 0, 'PDF/A validation passed')\n        else:\n            return ValidationResult(\n                False,\n                failed_rules,\n                f'PDF/A validation failed with {failed_rules} rule violations',\n            )\n    except (json.JSONDecodeError, KeyError, TypeError) as e:\n        log.debug('Failed to parse verapdf output: %s', e)\n        return ValidationResult(False, -1, f'Failed to parse verapdf output: {e}')\n"
  },
  {
    "path": "src/ocrmypdf/_graft.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"For grafting text-only PDF pages onto freeform PDF pages.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom contextlib import suppress\nfrom dataclasses import dataclass\nfrom enum import Enum\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from ocrmypdf.hocrtransform import OcrElement\n\nfrom pikepdf import (\n    Dictionary,\n    Name,\n    Operator,\n    Page,\n    Pdf,\n    Stream,\n    parse_content_stream,\n    unparse_content_stream,\n)\n\nfrom ocrmypdf._jobcontext import PdfContext\nfrom ocrmypdf._options import ProcessingMode\nfrom ocrmypdf._pipeline import VECTOR_PAGE_DPI\n\n\nclass RenderMode(Enum):\n    \"\"\"Controls where the OCR text layer is placed relative to page content.\n\n    ON_TOP: Text layer renders above page content (reserved for future use).\n    UNDERNEATH: Text layer renders below page content (current default behavior).\n    \"\"\"\n\n    ON_TOP = 0\n    UNDERNEATH = 1\n\n\n@dataclass\nclass Fpdf2PageInfo:\n    \"\"\"Information needed to render and graft an fpdf2 page.\"\"\"\n\n    pageno: int\n    hocr_path: Path\n    dpi: float\n    autorotate_correction: int\n    emplaced_page: bool\n\n\n@dataclass\nclass Fpdf2ParsedPage:\n    \"\"\"Parsed page data ready for fpdf2 rendering.\"\"\"\n\n    pageno: int\n    ocr_tree: OcrElement\n    dpi: float\n    autorotate_correction: int\n    emplaced_page: bool\n\n\n# Alias for backward compatibility with plan documentation\nFpdf2DirectPage = Fpdf2ParsedPage\n\n\ndef _compute_text_misalignment(\n    content_rotation: int, autorotate_correction: int, emplaced_page: bool\n) -> int:\n    \"\"\"Compute rotation needed to align text layer with page content.\n\n    Args:\n        content_rotation: Original page /Rotate value (degrees).\n        autorotate_correction: Rotation applied during rasterization (degrees).\n        emplaced_page: 
Whether the page content was replaced with rasterized image.\n\n    Returns:\n        Rotation in degrees to apply to text layer to align with content.\n    \"\"\"\n    if emplaced_page:\n        # New image is upright after autorotation was applied\n        content_rotation = autorotate_correction\n    text_rotation = autorotate_correction\n    return (text_rotation - content_rotation) % 360\n\n\ndef _compute_page_rotation(\n    content_rotation: int, autorotate_correction: int, emplaced_page: bool\n) -> int:\n    \"\"\"Compute final page /Rotate value after grafting.\n\n    Args:\n        content_rotation: Original page /Rotate value (degrees).\n        autorotate_correction: Rotation applied during rasterization (degrees).\n        emplaced_page: Whether the page content was replaced with rasterized image.\n\n    Returns:\n        Final /Rotate value for the page.\n    \"\"\"\n    if emplaced_page:\n        content_rotation = autorotate_correction\n    return (content_rotation - autorotate_correction) % 360\n\n\ndef _build_text_layer_ctm(\n    text_width: float,\n    text_height: float,\n    page_width: float,\n    page_height: float,\n    page_origin_x: float,\n    page_origin_y: float,\n    text_rotation: int,\n):\n    \"\"\"Build transformation matrix to align text layer with page content.\n\n    Always computes the full CTM to handle non-zero page origins (e.g.,\n    JSTOR PDFs with MediaBox like [0, 100, 595, 982]) and minor scale\n    differences due to DPI rounding.\n\n    Args:\n        text_width: Width of text layer mediabox.\n        text_height: Height of text layer mediabox.\n        page_width: Width of target page mediabox.\n        page_height: Height of target page mediabox.\n        page_origin_x: X origin of target page mediabox.\n        page_origin_y: Y origin of target page mediabox.\n        text_rotation: Rotation in degrees (clockwise) to apply to text layer.\n\n    Returns:\n        pikepdf.Matrix transformation matrix, or None if 
identity.\n    \"\"\"\n    from pikepdf import Matrix\n\n    wt, ht = text_width, text_height\n\n    # Center text, rotate, scale to fit page, then position at page origin\n    translate = Matrix().translated(-wt / 2, -ht / 2)\n    untranslate = Matrix().translated(page_width / 2, page_height / 2)\n    corner = Matrix().translated(page_origin_x, page_origin_y)\n\n    # Negate rotation because input is clockwise angle\n    rotate = Matrix().rotated(-text_rotation % 360)\n\n    # Swap dimensions if 90 or 270 degree rotation\n    if text_rotation in (90, 270):\n        wt, ht = ht, wt\n\n    # Scale to fit page dimensions\n    scale_x = page_width / wt if wt else 1.0\n    scale_y = page_height / ht if ht else 1.0\n    scale = Matrix().scaled(scale_x, scale_y)\n\n    ctm = translate @ rotate @ scale @ untranslate @ corner\n\n    # Return None if the result is effectively identity\n    identity = Matrix()\n    if ctm == identity:\n        return None\n\n    return ctm\n\n\nlog = logging.getLogger(__name__)\nMAX_REPLACE_PAGES = 100\n\n\ndef _ensure_dictionary(obj: Dictionary | Stream, name: Name):\n    if name not in obj:\n        obj[name] = Dictionary({})\n    return obj[name]\n\n\ndef strip_invisible_text(pdf: Pdf, page: Page):\n    stream = []\n    in_text_obj = False\n    render_mode = 0\n    render_mode_stack = []\n    text_objects = []\n\n    for operands, operator in parse_content_stream(page, ''):\n        if operator == Operator('Tr'):\n            render_mode = operands[0]\n\n        if operator == Operator('q'):\n            render_mode_stack.append(render_mode)\n\n        if operator == Operator('Q'):\n            # IndexError is raised if stack is empty; try to carry on\n            with suppress(IndexError):\n                render_mode = render_mode_stack.pop()\n\n        if not in_text_obj:\n            if operator == Operator('BT'):\n                in_text_obj = True\n                text_objects.append((operands, operator))\n            else:\n        
        stream.append((operands, operator))\n        else:\n            text_objects.append((operands, operator))\n            if operator == Operator('ET'):\n                in_text_obj = False\n                if render_mode != 3:\n                    stream.extend(text_objects)\n                text_objects.clear()\n\n    content_stream = unparse_content_stream(stream)\n    page.Contents = Stream(pdf, content_stream)\n\n\nclass OcrGrafter:\n    \"\"\"Manages grafting text-only PDFs onto regular PDFs.\"\"\"\n\n    def __init__(self, context: PdfContext):\n        self.context = context\n        self.path_base = context.origin\n\n        self.pdf_base = Pdf.open(self.path_base)\n\n        self.pdfinfo = context.pdfinfo\n        self.output_file = context.get_path('graft_layers.pdf')\n\n        self.emplacements = 1\n        self.render_mode = RenderMode.UNDERNEATH\n\n        # Check renderer type\n        pdf_renderer = context.options.pdf_renderer\n        self.use_sandwich_renderer = pdf_renderer == 'sandwich'\n\n        # For fpdf2: accumulate pages before rendering\n        self.fpdf2_hocr_pages: list[Fpdf2PageInfo] = []\n        self.fpdf2_parsed_pages: list[Fpdf2ParsedPage] = []\n\n    def graft_page(\n        self,\n        *,\n        pageno: int,\n        image: Path | None,\n        ocr_output: Path | None,\n        ocr_tree: OcrElement | None,\n        autorotate_correction: int,\n    ):\n        \"\"\"Graft OCR output onto a page of the base PDF.\n\n        Args:\n            pageno: Zero-based page number.\n            image: Path to the visible page image PDF, or None if not replacing.\n            ocr_output: Path to OCR output file. 
For fpdf2 renderer this is an\n                hOCR file; for sandwich renderer this is a text-only PDF.\n            ocr_tree: OCR tree for fpdf2 renderer.\n            autorotate_correction: Orientation correction in degrees (0, 90, 180, 270).\n        \"\"\"\n        if ocr_output and ocr_tree:\n            raise ValueError(\n                'Cannot specify both ocr_output and ocr_tree for fpdf2 renderer'\n            )\n        # Handle image emplacement first\n        emplaced_page = False\n        content_rotation = self.pdfinfo[pageno].rotation\n        path_image = Path(image).resolve() if image else None\n        if path_image is not None and path_image != self.path_base:\n            # We are updating the old page with a rasterized PDF of the new\n            # page (without changing objgen, to preserve references)\n            log.debug(\"Emplacement update\")\n            with Pdf.open(path_image) as pdf_image:\n                self.emplacements += 1\n                foreign_image_page = pdf_image.pages[0]\n                self.pdf_base.pages.append(foreign_image_page)\n                local_image_page = self.pdf_base.pages[-1]\n                self.pdf_base.pages[pageno].emplace(\n                    local_image_page, retain=(Name.Parent,)\n                )\n                del self.pdf_base.pages[-1]\n            emplaced_page = True\n\n        if self.use_sandwich_renderer:\n            # Sandwich renderer: graft pre-rendered PDF immediately\n            if ocr_output:\n                text_misaligned = _compute_text_misalignment(\n                    content_rotation, autorotate_correction, emplaced_page\n                )\n                self._graft_sandwich_text_layer(\n                    pageno=pageno,\n                    textpdf=ocr_output,\n                    text_rotation=text_misaligned,\n                )\n                page_rotation = _compute_page_rotation(\n                    content_rotation, autorotate_correction, 
emplaced_page\n                )\n                self.pdf_base.pages[pageno].Rotate = page_rotation\n        else:\n            # fpdf2 renderer: accumulate page info for batch rendering.\n            # The hOCR coordinates are in the corrected (upright) coordinate system.\n            # We store autorotate_correction and emplaced_page to set the final\n            # page /Rotate tag after grafting.\n            if ocr_tree:\n                self.fpdf2_parsed_pages.append(\n                    Fpdf2ParsedPage(\n                        ocr_tree=ocr_tree,\n                        pageno=pageno,\n                        autorotate_correction=autorotate_correction,\n                        emplaced_page=emplaced_page,\n                        dpi=self.pdfinfo[pageno].dpi.to_scalar(),\n                    )\n                )\n            if ocr_output:\n                self.fpdf2_hocr_pages.append(\n                    Fpdf2PageInfo(\n                        hocr_path=ocr_output,\n                        pageno=pageno,\n                        autorotate_correction=autorotate_correction,\n                        emplaced_page=emplaced_page,\n                        dpi=self.pdfinfo[pageno].dpi.to_scalar(),\n                    )\n                )\n\n    def finalize(self):\n        # Can have hocr OR parsed pages OR neither (no OCR), but not both\n        assert not (\n            self.fpdf2_hocr_pages and self.fpdf2_parsed_pages\n        ), \"Can't have both hocr and ocrtree pages\"\n\n        if self.fpdf2_hocr_pages:\n            # Render all pages with fpdf2, then graft\n            parsed_pages = self._parse_hocr_pages()\n            self.fpdf2_parsed_pages = parsed_pages\n\n        if self.fpdf2_parsed_pages:\n            self._render_and_graft_fpdf2_pages()\n\n        self.pdf_base.save(self.output_file)\n        self.pdf_base.close()\n        return self.output_file\n\n    def _parse_hocr_pages(self):\n        \"\"\"Render all pages to multi-page PDF with 
shared fonts, then graft.\"\"\"\n        from ocrmypdf.hocrtransform.hocr_parser import HocrParser\n\n        log.info(\n            \"Parsing %d pages with HocrParser\",\n            len(self.fpdf2_hocr_pages),\n        )\n\n        # Parse all hOCR files and collect OcrElements\n        pages_data: list[Fpdf2ParsedPage] = []\n        for page_info in self.fpdf2_hocr_pages:\n            if page_info.hocr_path.stat().st_size == 0:\n                continue  # Skip empty pages\n\n            # Parse hOCR to OcrElement\n            parser = HocrParser(page_info.hocr_path)\n            ocr_tree = parser.parse()\n\n            # Use DPI from hOCR (scan_res) which reflects actual rasterization DPI.\n            # Fall back to pdfinfo DPI or VECTOR_PAGE_DPI for vector-only pages.\n            effective_dpi = ocr_tree.dpi or page_info.dpi or float(VECTOR_PAGE_DPI)\n            pages_data.append(\n                Fpdf2ParsedPage(\n                    pageno=page_info.pageno,\n                    ocr_tree=ocr_tree,\n                    dpi=effective_dpi,\n                    autorotate_correction=page_info.autorotate_correction,\n                    emplaced_page=page_info.emplaced_page,\n                )\n            )\n\n        return pages_data\n\n    def _render_and_graft_fpdf2_pages(self):\n        font_dir = Path(__file__).parent / \"data\"\n\n        # Render all pages to single PDF\n        multi_page_pdf_path = self.context.get_path('fpdf2_multipage.pdf')\n\n        from ocrmypdf.font import MultiFontManager\n        from ocrmypdf.fpdf_renderer import Fpdf2MultiPageRenderer\n\n        multi_font_manager = MultiFontManager(font_dir)\n        # Build renderer input as (pageno, ocr_tree, dpi) tuples\n        renderer_pages_data = [\n            (parsed.pageno, parsed.ocr_tree, parsed.dpi)\n            for parsed in self.fpdf2_parsed_pages\n        ]\n        renderer = Fpdf2MultiPageRenderer(\n            pages_data=renderer_pages_data,\n            
multi_font_manager=multi_font_manager,\n            invisible_text=True,\n        )\n\n        renderer.render(multi_page_pdf_path)\n\n        # Now graft each page from the multi-page PDF\n        with Pdf.open(multi_page_pdf_path) as pdf_text:\n            for idx, parsed in enumerate(self.fpdf2_parsed_pages):\n                # Copy page from multi-page PDF\n                text_page = pdf_text.pages[idx]\n\n                content_rotation = self.pdfinfo[parsed.pageno].rotation\n                text_misaligned = _compute_text_misalignment(\n                    content_rotation,\n                    parsed.autorotate_correction,\n                    parsed.emplaced_page,\n                )\n                self._graft_fpdf2_text_layer(parsed.pageno, text_page, text_misaligned)\n\n                page_rotation = _compute_page_rotation(\n                    content_rotation,\n                    parsed.autorotate_correction,\n                    parsed.emplaced_page,\n                )\n                self.pdf_base.pages[parsed.pageno].Rotate = page_rotation\n\n        # Clean up multi-page PDF if not keeping temp files\n        if not self.context.options.keep_temporary_files:\n            with suppress(FileNotFoundError):\n                multi_page_pdf_path.unlink()\n\n    def _graft_fpdf2_text_layer(self, pageno: int, text_page: Page, text_rotation: int):\n        \"\"\"Graft a single text page onto the base PDF.\n\n        Similar to existing _graft_text_layer but works with\n        already-rendered pikepdf Page instead of file path.\n\n        Args:\n            pageno: Zero-based page number.\n            text_page: The text-only PDF page to graft.\n            text_rotation: Rotation to apply to align text with content (degrees).\n        \"\"\"\n        from pikepdf import Array\n\n        base_page = self.pdf_base.pages[pageno]\n\n        # Extract content stream from text_page\n        text_contents = text_page.Contents.read_bytes()\n\n        # Get 
the mediabox from the text page\n        mediabox = Array([float(x) for x in text_page.mediabox])  # type: ignore[misc]\n        wt = float(mediabox[2]) - float(mediabox[0])\n        ht = float(mediabox[3]) - float(mediabox[1])\n\n        # Get base page mediabox\n        base_mediabox = base_page.mediabox\n        wp = float(base_mediabox[2]) - float(base_mediabox[0])\n        hp = float(base_mediabox[3]) - float(base_mediabox[1])\n\n        # Create Form XObject from text page content\n        base_resources = _ensure_dictionary(base_page.obj, Name.Resources)\n        base_xobjs = _ensure_dictionary(base_resources, Name.XObject)\n        text_xobj_name = Name.random(prefix=\"OCR-\")\n        xobj = self.pdf_base.make_stream(text_contents)\n        base_xobjs[text_xobj_name] = xobj\n        xobj.Type = Name.XObject\n        xobj.Subtype = Name.Form\n        xobj.FormType = 1\n        xobj.BBox = base_mediabox\n\n        # Copy resources from text page's Resources to xobj\n        # We need to handle this carefully since text_page is from a foreign PDF\n        if hasattr(text_page, 'Resources') and text_page.Resources:\n            # Create empty Resources dictionary for xobj\n            xobj_resources = _ensure_dictionary(xobj, Name.Resources)\n\n            # Copy fonts if they exist\n            if Name.Font in text_page.Resources:\n                xobj_fonts = _ensure_dictionary(xobj_resources, Name.Font)\n                text_fonts = text_page.Resources[Name.Font]\n                # Copy each font from the foreign PDF\n                for font_name, font_obj in text_fonts.items():\n                    xobj_fonts[font_name] = self.pdf_base.copy_foreign(font_obj)\n\n            # Copy ExtGState (graphics state) if it exists - needed for transparency\n            if Name.ExtGState in text_page.Resources:\n                xobj_extstates = _ensure_dictionary(xobj_resources, Name.ExtGState)\n                text_extstates = text_page.Resources[Name.ExtGState]\n    
            # Copy each graphics state from the foreign PDF\n                for gs_name, gs_obj in text_extstates.items():\n                    xobj_extstates[gs_name] = self.pdf_base.copy_foreign(gs_obj)\n\n        # Build transformation matrix for rotation and scaling\n        ctm = _build_text_layer_ctm(\n            wt,\n            ht,\n            wp,\n            hp,\n            float(base_mediabox[0]),\n            float(base_mediabox[1]),\n            text_rotation,\n        )\n        if ctm is not None:\n            pdf_draw_xobj = (\n                (b'q %s cm\\n' % ctm.encode()) + (b'%s Do\\n' % text_xobj_name) + b'Q\\n'\n            )\n        else:\n            pdf_draw_xobj = b'q\\n' + (b'%s Do\\n' % text_xobj_name) + b'\\nQ\\n'\n\n        new_text_layer = Stream(self.pdf_base, pdf_draw_xobj)\n\n        # Strip old invisible text if redo mode is enabled\n        if self.context.options.mode == ProcessingMode.redo:\n            strip_invisible_text(self.pdf_base, base_page)\n\n        # Add text layer to base page\n        base_page.contents_coalesce()\n        base_page.contents_add(\n            new_text_layer, prepend=self.render_mode == RenderMode.UNDERNEATH\n        )\n        base_page.contents_coalesce()\n\n    def _graft_sandwich_text_layer(\n        self,\n        *,\n        pageno: int,\n        textpdf: Path,\n        text_rotation: int,\n    ):\n        \"\"\"Graft a pre-rendered text-only PDF onto the base PDF.\n\n        This is used by the sandwich renderer which generates PDFs directly\n        from Tesseract rather than going through hOCR.\n        \"\"\"\n        from pikepdf import PdfError\n\n        log.debug(\"Grafting sandwich text layer\")\n        if Path(textpdf).stat().st_size == 0:\n            return\n\n        try:\n            with Pdf.open(textpdf) as pdf_text:\n                pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()\n\n                base_page = self.pdf_base.pages[pageno]\n\n                # 
Get font from the text PDF\n                pdf_text_fonts = pdf_text.pages[0].Resources.get(\n                    Name.Font, Dictionary()\n                )\n                font = None\n                font_key = None\n                for f in ('/f-0-0', '/F1'):\n                    pdf_text_font = pdf_text_fonts.get(f, None)\n                    if pdf_text_font is not None:\n                        font_key = Name(f)\n                        font = self.pdf_base.copy_foreign(pdf_text_font)\n                        break\n\n                # Get mediabox dimensions for rotation calculations\n                mediabox = pdf_text.pages[0].mediabox\n                wt = float(mediabox[2]) - float(mediabox[0])\n                ht = float(mediabox[3]) - float(mediabox[1])\n\n                base_mediabox = base_page.mediabox\n                wp = float(base_mediabox[2]) - float(base_mediabox[0])\n                hp = float(base_mediabox[3]) - float(base_mediabox[1])\n\n                # Build transformation matrix for rotation and scaling\n                ctm = _build_text_layer_ctm(\n                    wt,\n                    ht,\n                    wp,\n                    hp,\n                    float(base_mediabox[0]),\n                    float(base_mediabox[1]),\n                    text_rotation,\n                )\n                log.debug(\"Grafting with ctm %r\", ctm)\n\n                # Create Form XObject\n                base_resources = _ensure_dictionary(base_page.obj, Name.Resources)\n                base_xobjs = _ensure_dictionary(base_resources, Name.XObject)\n                text_xobj_name = Name.random(prefix=\"OCR-\")\n                xobj = self.pdf_base.make_stream(pdf_text_contents)\n                base_xobjs[text_xobj_name] = xobj\n                xobj.Type = Name.XObject\n                xobj.Subtype = Name.Form\n                xobj.FormType = 1\n                xobj.BBox = base_mediabox\n\n                # Add font to xobj 
resources\n                if font_key is not None and font is not None:\n                    xobj_resources = _ensure_dictionary(xobj, Name.Resources)\n                    xobj_fonts = _ensure_dictionary(xobj_resources, Name.Font)\n                    if font_key not in xobj_fonts:\n                        xobj_fonts[font_key] = font\n\n                if ctm is not None:\n                    pdf_draw_xobj = (\n                        (b'q %s cm\\n' % ctm.encode())\n                        + (b'%s Do\\n' % text_xobj_name)\n                        + b'\\nQ\\n'\n                    )\n                else:\n                    pdf_draw_xobj = b'q\\n' + (b'%s Do\\n' % text_xobj_name) + b'\\nQ\\n'\n                new_text_layer = Stream(self.pdf_base, pdf_draw_xobj)\n\n                if self.context.options.mode == ProcessingMode.redo:\n                    strip_invisible_text(self.pdf_base, base_page)\n                base_page.contents_coalesce()\n                base_page.contents_add(\n                    new_text_layer, prepend=self.render_mode == RenderMode.UNDERNEATH\n                )\n                base_page.contents_coalesce()\n\n                # Add font to page resources\n                if font_key is not None and font is not None:\n                    page_resources = _ensure_dictionary(base_page.obj, Name.Resources)\n                    page_fonts = _ensure_dictionary(page_resources, Name.Font)\n                    if font_key not in page_fonts:\n                        page_fonts[font_key] = font\n        except (FileNotFoundError, PdfError):\n            # PdfError occurs if a 0-length file is written e.g. due to OCR timeout\n            pass\n"
  },
  {
    "path": "src/ocrmypdf/_jobcontext.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Defines context objects that are passed to child processes/threads.\"\"\"\n\nfrom __future__ import annotations\n\nfrom collections.abc import Iterator\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING\n\nfrom ocrmypdf._options import OcrOptions\nfrom ocrmypdf.pdfinfo import PdfInfo\nfrom ocrmypdf.pdfinfo.info import PageInfo\n\nif TYPE_CHECKING:\n    from ocrmypdf._plugin_manager import OcrmypdfPluginManager\n\n\nclass PdfContext:\n    \"\"\"Holds the context for a particular run of the pipeline.\"\"\"\n\n    options: OcrOptions  #: The specified options for processing this PDF.\n    origin: Path  #: The filename of the original input file.\n    pdfinfo: PdfInfo  #: Detailed data for this PDF.\n    plugin_manager: (\n        OcrmypdfPluginManager  #: PluginManager for processing the current PDF.\n    )\n\n    def __init__(\n        self,\n        options: OcrOptions,\n        work_folder: Path,\n        origin: Path,\n        pdfinfo: PdfInfo,\n        plugin_manager,\n    ):\n        self.options = options\n        self.work_folder = work_folder\n        self.origin = origin\n        self.pdfinfo = pdfinfo\n        self.plugin_manager = plugin_manager\n\n    def get_path(self, name: str) -> Path:\n        \"\"\"Generate a ``Path`` for an intermediate file involved in processing.\n\n        The path will be in a temporary folder that is common for all processing\n        of this particular PDF.\n        \"\"\"\n        return self.work_folder / name\n\n    def get_page_contexts(self) -> Iterator[PageContext]:\n        \"\"\"Get all ``PageContext`` for this PDF.\"\"\"\n        npages = len(self.pdfinfo)\n        for n in range(npages):\n            yield PageContext(self, n)\n\n    def get_page_context_args(self) -> Iterator[tuple[PageContext]]:\n        \"\"\"Get all ``PageContext`` for this PDF packaged in tuple for args-splatting.\"\"\"\n        
npages = len(self.pdfinfo)\n        for n in range(npages):\n            yield (PageContext(self, n),)\n\n\nclass PageContext:\n    \"\"\"Holds our context for a page.\n\n    Must be pickle-able, so stores only intrinsic/simple data elements or those\n    capable of their serializing themselves via ``__getstate__``.\n\n    Note: Uses OcrOptions with JSON serialization for multiprocessing compatibility.\n    \"\"\"\n\n    origin: Path  #: The filename of the original input file.\n    pageno: int  #: This page number (zero-based).\n    pageinfo: PageInfo  #: Information on this page.\n    plugin_manager: (\n        OcrmypdfPluginManager  #: PluginManager for processing the current PDF.\n    )\n\n    def __init__(self, pdf_context: PdfContext, pageno):\n        self.work_folder = pdf_context.work_folder\n        self.origin = pdf_context.origin\n        # Store OcrOptions directly instead of Namespace\n        self.options = pdf_context.options\n        self.pageno = pageno\n        self.pageinfo = pdf_context.pdfinfo[pageno]\n        self.plugin_manager = pdf_context.plugin_manager\n        # Ensure no reference to PdfContext which contains OcrOptions\n        self._pdf_context = None\n\n    def get_path(self, name: str) -> Path:\n        \"\"\"Generate a ``Path`` for a file that is part of processing this page.\n\n        The path will be based in a common temporary folder and have a prefix based\n        on the page number.\n        \"\"\"\n        return self.work_folder / f\"{(self.pageno + 1):06d}_{name}\"\n\n    def __getstate__(self):\n        state = self.__dict__.copy()\n\n        options_json = self.options.model_dump_json_safe()\n        state['options_json'] = options_json\n        # Remove the OcrOptions object to avoid pickle issues\n        del state['options']\n\n        # Remove any potential references to Pydantic objects\n        state.pop('_pdf_context', None)\n        return state\n\n    def __setstate__(self, state):\n        
self.__dict__.update(state)\n\n        # Reconstruct OcrOptions from JSON if available\n        if 'options_json' in state:\n            from ocrmypdf._options import OcrOptions\n\n            self.options = OcrOptions.model_validate_json_safe(state['options_json'])\n        # Otherwise, we have a fallback Namespace (shouldn't happen in normal operation)\n        # Leave it as-is for compatibility\n"
  },
  {
    "path": "src/ocrmypdf/_logging.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Logging support classes.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\n\nfrom rich.console import Console\nfrom rich.logging import RichHandler\n\n\nclass PageNumberFilter(logging.Filter):\n    \"\"\"Insert PDF page number that emitted log message to log record.\"\"\"\n\n    def filter(self, record):\n        pageno = getattr(record, 'pageno', None)\n        if isinstance(pageno, int):\n            record.pageno = f'{pageno:5d} '\n        elif pageno is None:\n            record.pageno = ''\n        return True\n\n\nclass RichLoggingHandler(RichHandler):\n    def __init__(self, console: Console, **kwargs):\n        super().__init__(\n            console=console, show_level=False, show_time=False, markup=False, **kwargs\n        )\n"
  },
  {
    "path": "src/ocrmypdf/_metadata.py",
    "content": "# SPDX-FileCopyrightText: 2023 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"OCRmyPDF page processing pipeline functions.\"\"\"\n\nfrom __future__ import annotations\n\nimport datetime as dt\nimport logging\nimport os\nfrom pathlib import Path\nfrom typing import Any\n\nfrom pikepdf import Dictionary, Name, Pdf\nfrom pikepdf import __version__ as PIKEPDF_VERSION\nfrom pikepdf.models.metadata import PdfMetadata, encode_pdf_date\n\nfrom ocrmypdf._defaults import PROGRAM_NAME\nfrom ocrmypdf._jobcontext import PdfContext\nfrom ocrmypdf._version import __version__ as OCRMYPF_VERSION\nfrom ocrmypdf.languages import iso_639_2_from_3\n\nlog = logging.getLogger(__name__)\n\n\ndef get_docinfo(base_pdf: Pdf, context: PdfContext) -> dict[str, str]:\n    \"\"\"Read the document info and store it in a dictionary.\"\"\"\n    options = context.options\n\n    def from_document_info(key):\n        try:\n            s = base_pdf.docinfo[key]\n            return str(s)\n        except (KeyError, TypeError):\n            return ''\n\n    pdfmark = {\n        k: from_document_info(k)\n        for k in ('/Title', '/Author', '/Keywords', '/Subject', '/CreationDate')\n    }\n    if options.title:\n        pdfmark['/Title'] = options.title\n    if options.author:\n        pdfmark['/Author'] = options.author\n    if options.keywords:\n        pdfmark['/Keywords'] = options.keywords\n    if options.subject:\n        pdfmark['/Subject'] = options.subject\n\n    creator_tag = context.plugin_manager.get_ocr_engine(options=options).creator_tag(\n        options\n    )\n\n    pdfmark['/Creator'] = f'{PROGRAM_NAME} {OCRMYPF_VERSION} / {creator_tag}'\n    pdfmark['/Producer'] = f'pikepdf {PIKEPDF_VERSION}'\n    pdfmark['/ModDate'] = encode_pdf_date(dt.datetime.now(dt.UTC))\n    return pdfmark\n\n\ndef report_on_metadata(options, missing):\n    if not missing:\n        return\n    if options.output_type.startswith('pdfa'):\n        log.warning(\n            \"Some input 
metadata could not be copied because it is not \"\n            \"permitted in PDF/A. You may wish to examine the output \"\n            \"PDF's XMP metadata.\"\n        )\n        log.debug(\"The following metadata fields were not copied: %r\", missing)\n    else:\n        log.error(\n            \"Some input metadata could not be copied.\"\n            \"You may wish to examine the output PDF's XMP metadata.\"\n        )\n        log.info(\"The following metadata fields were not copied: %r\", missing)\n\n\ndef repair_docinfo_nuls(pdf):\n    \"\"\"If the DocumentInfo block contains NUL characters, remove them.\n\n    If the DocumentInfo block is malformed, log an error and continue.\n    \"\"\"\n    modified = False\n    try:\n        if not isinstance(pdf.docinfo, Dictionary):\n            raise TypeError(\"DocumentInfo is not a dictionary\")\n        for k, v in pdf.docinfo.items():\n            if isinstance(v, str) and b'\\x00' in bytes(v):\n                pdf.docinfo[k] = bytes(v).replace(b'\\x00', b'')\n                modified = True\n    except TypeError:\n        # TypeError can also be raised if dictionary items are unexpected types\n        log.error(\"File contains a malformed DocumentInfo block - continuing anyway.\")\n    return modified\n\n\ndef should_linearize(working_file: Path, context: PdfContext) -> bool:\n    \"\"\"Determine whether the PDF should be linearized.\n\n    For smaller files, linearization is not worth the effort.\n    \"\"\"\n    filesize = os.stat(working_file).st_size\n    return filesize > (context.options.fast_web_view * 1_000_000)\n\n\ndef _fix_metadata(meta_original: PdfMetadata, meta_pdf: PdfMetadata):\n    # If xmp:CreateDate is missing, set it to the modify date to\n    # ensure consistency with Ghostscript.\n    if 'xmp:CreateDate' not in meta_pdf:\n        meta_pdf['xmp:CreateDate'] = meta_pdf.get('xmp:ModifyDate', '')\n    if meta_pdf.get('dc:title') == 'Untitled' and ('dc:title' not in meta_original):\n        # 
Ghostscript likes to set title to Untitled if omitted from input.\n        # Reverse this, because PDF/A TechNote 0003:Metadata in PDF/A-1\n        # and the XMP Spec do not make this recommendation.\n        del meta_pdf['dc:title']\n\n\ndef _unset_empty_metadata(meta: PdfMetadata, options):\n    \"\"\"Unset metadata fields that were explicitly set to empty strings.\n\n    If the user explicitly specified an empty string for any of the\n    following, they should be unset and not reported as missing in\n    the output pdf. Note that some metadata fields use differing names\n    between PDF/A and PDF.\n    \"\"\"\n    if options.title == '' and 'dc:title' in meta:\n        del meta['dc:title']  # PDF/A and PDF\n    if options.author == '':\n        if 'dc:creator' in meta:\n            del meta['dc:creator']  # PDF/A (Not xmp:CreatorTool)\n        if 'pdf:Author' in meta:\n            del meta['pdf:Author']  # PDF\n    if options.subject == '':\n        if 'dc:description' in meta:\n            del meta['dc:description']  # PDF/A\n        if 'dc:subject' in meta:\n            del meta['dc:subject']  # PDF\n    if options.keywords == '' and 'pdf:Keywords' in meta:\n        del meta['pdf:Keywords']  # PDF/A and PDF\n\n\ndef _set_language(pdf: Pdf, languages: list[str]):\n    \"\"\"Set the language of the PDF.\"\"\"\n    if Name.Lang in pdf.Root or not languages:\n        return  # Already set or can't change\n    primary_language_iso639_3 = languages[0]\n    if not primary_language_iso639_3:\n        return\n    iso639_2 = iso_639_2_from_3(primary_language_iso639_3)\n    if not iso639_2:\n        return\n    pdf.Root.Lang = iso639_2\n\n\nclass MetadataProgress:\n    def __init__(self, progressbar_class, enable: bool = True):\n        self.progressbar_class = progressbar_class\n        self.progressbar = self.progressbar_class(\n            total=100, desc=\"Linearizing\", unit='%', disable=not enable\n        )\n\n    def __enter__(self):\n        
self.progressbar.__enter__()\n        return self\n\n    def __exit__(self, exc_type, exc_value, traceback):\n        return self.progressbar.__exit__(exc_type, exc_value, traceback)\n\n    def __call__(self, percent: int):\n        if not self.progressbar_class:\n            return\n        self.progressbar.update(completed=percent)\n\n\ndef metadata_fixup(\n    working_file: Path, context: PdfContext, pdf_save_settings: dict[str, Any]\n) -> Path:\n    \"\"\"Fix certain metadata fields whether PDF or PDF/A.\n\n    Override some of Ghostscript's metadata choices.\n\n    Also report on metadata in the input file that was not retained during\n    conversion.\n    \"\"\"\n    output_file = context.get_path('metafix.pdf')\n    options = context.options\n\n    pbar_class = context.plugin_manager.get_progressbar_class()\n    with (\n        Pdf.open(context.origin) as original,\n        Pdf.open(working_file) as pdf,\n        MetadataProgress(pbar_class, options.progress_bar) as pbar,\n    ):\n        docinfo = get_docinfo(original, context)\n        with (\n            original.open_metadata(\n                set_pikepdf_as_editor=False, update_docinfo=False, strict=False\n            ) as meta_original,\n            pdf.open_metadata() as meta_pdf,\n        ):\n            meta_pdf.load_from_docinfo(\n                docinfo, delete_missing=False, raise_failure=False\n            )\n            _fix_metadata(meta_original, meta_pdf)\n            _unset_empty_metadata(meta_original, options)\n            _unset_empty_metadata(meta_pdf, options)\n            meta_missing = set(meta_original.keys()) - set(meta_pdf.keys())\n            report_on_metadata(options, meta_missing)\n\n        _set_language(pdf, options.languages)\n        pdf.save(output_file, progress=pbar, **pdf_save_settings)\n\n    return output_file\n"
  },
  {
    "path": "src/ocrmypdf/_options.py",
    "content": "# SPDX-FileCopyrightText: 2024 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Internal options model for OCRmyPDF.\"\"\"\n\nfrom __future__ import annotations\n\nimport json\nimport logging\nimport os\nimport shlex\nimport unicodedata\nfrom collections.abc import Sequence\nfrom enum import StrEnum\nfrom io import IOBase\nfrom pathlib import Path\nfrom typing import Any, BinaryIO\n\nfrom pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator\n\nfrom ocrmypdf._defaults import DEFAULT_LANGUAGE, DEFAULT_ROTATE_PAGES_THRESHOLD\nfrom ocrmypdf.exceptions import BadArgsError\nfrom ocrmypdf.helpers import monotonic\n\n# Import plugin option models - these will be available after plugins are loaded\n# We'll use forward references and handle imports dynamically\n\nlog = logging.getLogger(__name__)\n\n# Module-level registry for plugin option models\n# This is populated by setup_plugin_infrastructure() after plugins are loaded\n_plugin_option_models: dict[str, type] = {}\n\nPathOrIO = BinaryIO | IOBase | Path | str | bytes\n\n\nclass ProcessingMode(StrEnum):\n    \"\"\"OCR processing mode for handling pages with existing text.\n\n    This enum controls how OCRmyPDF handles pages that already contain text:\n\n    - ``default``: Error if text is found (standard OCR behavior)\n    - ``force``: Rasterize all content and run OCR regardless of existing text\n    - ``skip``: Skip OCR on pages that already have text\n    - ``redo``: Re-OCR pages, stripping old invisible text layer\n    \"\"\"\n\n    default = 'default'\n    force = 'force'\n    skip = 'skip'\n    redo = 'redo'\n\n\nclass TaggedPdfMode(StrEnum):\n    \"\"\"Control behavior when encountering a Tagged PDF.\n\n    Tagged PDFs often indicate documents generated from office applications\n    that may not need OCR. 
This enum controls how OCRmyPDF handles them:\n\n    - ``default``: Error if ProcessingMode is default, otherwise warn\n    - ``ignore``: Always warn but continue processing (never error)\n    \"\"\"\n\n    default = 'default'\n    ignore = 'ignore'\n\n\ndef _pages_from_ranges(ranges: str) -> set[int]:\n    \"\"\"Convert page range string to set of page numbers.\"\"\"\n    pages: list[int] = []\n    page_groups = ranges.replace(' ', '').split(',')\n    for group in page_groups:\n        if not group:\n            continue\n        try:\n            start, end = group.split('-')\n        except ValueError:\n            pages.append(int(group) - 1)\n        else:\n            try:\n                new_pages = list(range(int(start) - 1, int(end)))\n                if not new_pages:\n                    raise BadArgsError(\n                        f\"invalid page subrange '{start}-{end}'\"\n                    ) from None\n                pages.extend(new_pages)\n            except ValueError:\n                raise BadArgsError(f\"invalid page subrange '{group}'\") from None\n\n    if not pages:\n        raise BadArgsError(\n            f\"The string of page ranges '{ranges}' did not contain any recognizable \"\n            f\"page ranges.\"\n        )\n\n    if not monotonic(pages):\n        log.warning(\n            \"List of pages to process contains duplicate pages, or pages that are \"\n            \"out of order\"\n        )\n    if any(page < 0 for page in pages):\n        raise BadArgsError(\"pages refers to a page number less than 1\")\n\n    log.debug(\"OCRing only these pages: %s\", pages)\n    return set(pages)\n\n\nclass OcrOptions(BaseModel):\n    \"\"\"Internal options model that can masquerade as argparse.Namespace.\n\n    This model provides proper typing and validation while maintaining\n    compatibility with existing code that expects argparse.Namespace behavior.\n    \"\"\"\n\n    # I/O options\n    input_file: PathOrIO\n    output_file: 
PathOrIO\n    sidecar: PathOrIO | None = None\n    output_folder: Path | None = None\n    work_folder: Path | None = None\n\n    # Core OCR options\n    languages: list[str] = Field(default_factory=lambda: [DEFAULT_LANGUAGE])\n    output_type: str = 'auto'\n    mode: ProcessingMode = ProcessingMode.default\n\n    # Backward compatibility properties for force_ocr, skip_text, redo_ocr\n    @property\n    def force_ocr(self) -> bool:\n        \"\"\"Backward compatibility alias for mode == ProcessingMode.force.\"\"\"\n        return self.mode == ProcessingMode.force\n\n    @property\n    def skip_text(self) -> bool:\n        \"\"\"Backward compatibility alias for mode == ProcessingMode.skip.\"\"\"\n        return self.mode == ProcessingMode.skip\n\n    @property\n    def redo_ocr(self) -> bool:\n        \"\"\"Backward compatibility alias for mode == ProcessingMode.redo.\"\"\"\n        return self.mode == ProcessingMode.redo\n\n    # Job control\n    jobs: int | None = None\n    use_threads: bool = True\n    progress_bar: bool = True\n    quiet: bool = False\n    verbose: int = 0\n    keep_temporary_files: bool = False\n\n    # Image processing\n    image_dpi: int | None = None\n    deskew: bool = False\n    clean: bool = False\n    clean_final: bool = False\n    rotate_pages: bool = False\n    remove_background: bool = False\n    remove_vectors: bool = False\n    oversample: int = 0\n    unpaper_args: list[str] | None = None\n\n    # OCR behavior\n    skip_big: float | None = None\n    pages: str | set[int] | None = None  # Can be string or set after validation\n    invalidate_digital_signatures: bool = False\n    tagged_pdf_mode: TaggedPdfMode = TaggedPdfMode.default\n\n    # Metadata\n    title: str | None = None\n    author: str | None = None\n    subject: str | None = None\n    keywords: str | None = None\n\n    # Optimization\n    optimize: int = 1\n    jpg_quality: int | None = None\n    png_quality: int | None = None\n    jbig2_threshold: float = 0.85\n\n    # 
Compatibility alias for plugins that expect jpeg_quality\n    @property\n    def jpeg_quality(self):\n        \"\"\"Compatibility alias for jpg_quality.\"\"\"\n        return self.jpg_quality\n\n    @jpeg_quality.setter\n    def jpeg_quality(self, value):\n        \"\"\"Compatibility alias for jpg_quality.\"\"\"\n        self.jpg_quality = value\n\n    # Output behavior\n    no_overwrite: bool = False\n\n    # Advanced options\n    max_image_mpixels: float = 250.0\n    pdf_renderer: str = 'auto'\n    ocr_engine: str = 'auto'\n    rasterizer: str = 'auto'\n    rotate_pages_threshold: float = DEFAULT_ROTATE_PAGES_THRESHOLD\n    user_words: os.PathLike | None = None\n    user_patterns: os.PathLike | None = None\n    fast_web_view: float = 1.0\n    continue_on_soft_render_error: bool | None = None\n\n    # Tesseract options - also accessible via options.tesseract.<field>\n    tesseract_config: list[str] = []\n    tesseract_pagesegmode: int | None = None\n    tesseract_oem: int | None = None\n    tesseract_thresholding: int | None = None\n    tesseract_timeout: float | None = None\n    tesseract_non_ocr_timeout: float | None = None\n    tesseract_downsample_above: int = 32767\n    tesseract_downsample_large_images: bool | None = None\n\n    # Ghostscript options - also accessible via options.ghostscript.<field>\n    pdfa_image_compression: str | None = None\n    color_conversion_strategy: str = \"LeaveColorUnchanged\"\n\n    # Optimize/JBIG2 options - also accessible via options.optimize.<field>\n    jbig2_threshold: float = 0.85\n\n    # Plugin system\n    plugins: Sequence[Path | str] | None = None\n\n    # Store any extra attributes (for plugins and dynamic options)\n    extra_attrs: dict[str, Any] = Field(\n        default_factory=dict, exclude=True, alias='_extra_attrs'\n    )\n\n    @field_validator('languages')\n    @classmethod\n    def validate_languages(cls, v):\n        \"\"\"Ensure languages list is not empty.\"\"\"\n        if not v:\n            return 
[DEFAULT_LANGUAGE]\n        return v\n\n    @field_validator('output_type')\n    @classmethod\n    def validate_output_type(cls, v):\n        \"\"\"Validate output type is one of the allowed values.\"\"\"\n        valid_types = {'auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'}\n        if v not in valid_types:\n            raise ValueError(f\"output_type must be one of {valid_types}\")\n        return v\n\n    @field_validator('pdf_renderer')\n    @classmethod\n    def validate_pdf_renderer(cls, v):\n        \"\"\"Validate PDF renderer is one of the allowed values.\"\"\"\n        valid_renderers = {'auto', 'sandwich', 'fpdf2'}\n        # Legacy hocr/hocrdebug are accepted but redirected to fpdf2\n        legacy_renderers = {'hocr', 'hocrdebug'}\n        all_accepted = valid_renderers | legacy_renderers\n        if v not in all_accepted:\n            raise ValueError(f\"pdf_renderer must be one of {all_accepted}\")\n        return v\n\n    @field_validator('rasterizer')\n    @classmethod\n    def validate_rasterizer(cls, v):\n        \"\"\"Validate rasterizer is one of the allowed values.\"\"\"\n        valid_rasterizers = {'auto', 'ghostscript', 'pypdfium'}\n        if v not in valid_rasterizers:\n            raise ValueError(f\"rasterizer must be one of {valid_rasterizers}\")\n        return v\n\n    @field_validator('clean_final')\n    @classmethod\n    def validate_clean_final(cls, v, info):\n        \"\"\"If clean_final is True, also set clean to True.\"\"\"\n        if v and hasattr(info, 'data') and 'clean' in info.data:\n            info.data['clean'] = True\n        return v\n\n    @field_validator('jobs')\n    @classmethod\n    def validate_jobs(cls, v):\n        \"\"\"Validate jobs is a reasonable number.\"\"\"\n        if v is not None and (v < 0 or v > 256):\n            raise ValueError(\"jobs must be between 0 and 256\")\n        return v\n\n    @field_validator('verbose')\n    @classmethod\n    def validate_verbose(cls, v):\n        
\"\"\"Validate verbose level.\"\"\"\n        if v < 0 or v > 2:\n            raise ValueError(\"verbose must be between 0 and 2\")\n        return v\n\n    @field_validator('oversample')\n    @classmethod\n    def validate_oversample(cls, v):\n        \"\"\"Validate oversample DPI.\"\"\"\n        if v < 0 or v > 5000:\n            raise ValueError(\"oversample must be between 0 and 5000\")\n        return v\n\n    @field_validator('max_image_mpixels')\n    @classmethod\n    def validate_max_image_mpixels(cls, v):\n        \"\"\"Validate max image megapixels.\"\"\"\n        if v < 0:\n            raise ValueError(\"max_image_mpixels must be non-negative\")\n        return v\n\n    @field_validator('rotate_pages_threshold')\n    @classmethod\n    def validate_rotate_pages_threshold(cls, v):\n        \"\"\"Validate rotate pages threshold.\"\"\"\n        if v < 0 or v > 1000:\n            raise ValueError(\"rotate_pages_threshold must be between 0 and 1000\")\n        return v\n\n    @field_validator('title', 'author', 'keywords', 'subject')\n    @classmethod\n    def validate_metadata_unicode(cls, v):\n        \"\"\"Validate metadata strings don't contain unsupported Unicode characters.\"\"\"\n        if v is None:\n            return v\n\n        for char in v:\n            if unicodedata.category(char) == 'Co' or ord(char) >= 0x10000:\n                hexchar = hex(ord(char))[2:].upper()\n                raise ValueError(\n                    f\"Metadata string contains unsupported Unicode character: \"\n                    f\"{char} (U+{hexchar})\"\n                )\n        return v\n\n    @field_validator('pages')\n    @classmethod\n    def validate_pages_format(cls, v):\n        \"\"\"Convert page ranges string to set of page numbers.\"\"\"\n        if v is None:\n            return v\n        if isinstance(v, set):\n            return v  # Already processed\n\n        # Convert string ranges to set of page numbers\n        return _pages_from_ranges(v)\n\n    
@field_validator('unpaper_args', mode='before')\n    @classmethod\n    def validate_unpaper_args(cls, v):\n        \"\"\"Normalize unpaper_args from string to list and validate security.\"\"\"\n        if v is None:\n            return v\n        if isinstance(v, str):\n            v = shlex.split(v)\n        if isinstance(v, list):\n            if any(('/' in arg or arg == '.' or arg == '..') for arg in v):\n                raise ValueError('No filenames allowed in --unpaper-args')\n            return v\n        raise ValueError(f'unpaper_args must be a string or list, got {type(v)}')\n\n    @model_validator(mode='before')\n    @classmethod\n    def handle_special_cases(cls, data):\n        \"\"\"Handle special cases for API compatibility and legacy options.\"\"\"\n        if isinstance(data, dict):\n            # For hOCR API, output_file might not be present\n            if 'output_folder' in data and 'output_file' not in data:\n                data['output_file'] = '/dev/null'  # Placeholder\n\n            # Convert legacy boolean options (force_ocr, skip_text, redo_ocr) to mode\n            force = data.pop('force_ocr', None)\n            skip = data.pop('skip_text', None)\n            redo = data.pop('redo_ocr', None)\n\n            # Count how many legacy options are set to True\n            legacy_set = [\n                (force, ProcessingMode.force),\n                (skip, ProcessingMode.skip),\n                (redo, ProcessingMode.redo),\n            ]\n            legacy_true = [(val, mode) for val, mode in legacy_set if val]\n            legacy_count = len(legacy_true)\n\n            # Get current mode value (may be string or enum)\n            current_mode = data.get('mode', ProcessingMode.default)\n            if isinstance(current_mode, str):\n                current_mode = ProcessingMode(current_mode)\n            mode_is_set = current_mode != ProcessingMode.default\n\n            if legacy_count > 1:\n                raise ValueError(\n          
          \"Choose only one of --force-ocr, --skip-text, --redo-ocr.\"\n                )\n\n            if legacy_count == 1:\n                expected_mode = legacy_true[0][1]\n                if mode_is_set and current_mode != expected_mode:\n                    legacy_flag = f\"--{expected_mode.value.replace('_', '-')}-ocr\"\n                    raise ValueError(\n                        f\"Conflicting options: --mode {current_mode.value} \"\n                        f\"cannot be used with {legacy_flag} or similar legacy flag.\"\n                    )\n                # Set mode from legacy option\n                data['mode'] = expected_mode\n\n        return data\n\n    @model_validator(mode='after')\n    def validate_redo_ocr_options(self):\n        \"\"\"Validate options compatible with redo mode.\"\"\"\n        if self.mode == ProcessingMode.redo and (\n            self.deskew or self.clean_final or self.remove_background\n        ):\n            raise ValueError(\n                \"--redo-ocr (or --mode redo) is not currently compatible with \"\n                \"--deskew, --clean-final, and --remove-background\"\n            )\n        return self\n\n    @model_validator(mode='after')\n    def validate_output_type_compatibility(self):\n        \"\"\"Validate output type is compatible with output file.\"\"\"\n        if self.output_type == 'none' and str(self.output_file) not in (\n            os.devnull,\n            '-',\n        ):\n            raise ValueError(\n                \"Since you specified `--output-type none`, the output file \"\n                f\"{self.output_file} cannot be produced. 
Set the output file to \"\n                f\"`-` to suppress this message.\"\n            )\n        return self\n\n    @property\n    def lossless_reconstruction(self):\n        \"\"\"Determine lossless_reconstruction based on other options.\"\"\"\n        lossless = not any(\n            [\n                self.deskew,\n                self.clean_final,\n                self.mode == ProcessingMode.force,\n                self.remove_background,\n            ]\n        )\n        return lossless\n\n    def model_dump_json_safe(self) -> str:\n        \"\"\"Serialize to JSON with special handling for non-serializable types.\"\"\"\n        # Create a copy of the model data for serialization\n        data = self.model_dump()\n\n        # Handle special types that don't serialize to JSON directly\n        def _serialize_value(value):\n            if isinstance(value, Path):\n                return {'__type__': 'Path', 'value': str(value)}\n            elif (\n                isinstance(value, BinaryIO | IOBase)\n                or hasattr(value, 'read')\n                or hasattr(value, 'write')\n            ):\n                # Stream object - replace with placeholder\n                return {'__type__': 'Stream', 'value': 'stream'}\n            elif hasattr(value, '__class__') and 'Iterator' in value.__class__.__name__:\n                # Handle Pydantic serialization iterators\n                return {'__type__': 'Stream', 'value': 'stream'}\n            elif isinstance(value, property):\n                # Handle property objects that shouldn't be serialized\n                return None\n            elif isinstance(value, list | tuple):\n                return [_serialize_value(item) for item in value]\n            elif isinstance(value, dict):\n                return {k: _serialize_value(v) for k, v in value.items()}\n            else:\n                return value\n\n        # Process all fields\n        serializable_data = {}\n        for key, value in 
data.items():\n            serialized_value = _serialize_value(value)\n            if serialized_value is not None:  # Skip None values from properties\n                serializable_data[key] = serialized_value\n\n        # Add extra_attrs, excluding plugin cache entries (they'll be recreated lazily)\n        if self.extra_attrs:\n            filtered_extra = {\n                k: v\n                for k, v in self.extra_attrs.items()\n                if not k.startswith('_plugin_cache_')\n            }\n            if filtered_extra:\n                serializable_data['_extra_attrs'] = _serialize_value(filtered_extra)\n\n        return json.dumps(serializable_data)\n\n    @classmethod\n    def model_validate_json_safe(cls, json_str: str) -> OcrOptions:\n        \"\"\"Reconstruct from JSON with special handling for non-serializable types.\"\"\"\n        data = json.loads(json_str)\n\n        # Handle special types during deserialization\n        def _deserialize_value(value):\n            if isinstance(value, dict) and '__type__' in value:\n                if value['__type__'] == 'Path':\n                    return Path(value['value'])\n                elif value['__type__'] == 'Stream':\n                    # For streams, we'll use a placeholder string\n                    return value['value']\n                else:\n                    return value['value']\n            elif isinstance(value, list):\n                return [_deserialize_value(item) for item in value]\n            elif isinstance(value, dict):\n                return {k: _deserialize_value(v) for k, v in value.items()}\n            else:\n                return value\n\n        # Process all fields\n        deserialized_data = {}\n        extra_attrs = {}\n\n        for key, value in data.items():\n            if key == '_extra_attrs':\n                extra_attrs = _deserialize_value(value)\n            else:\n                deserialized_data[key] = _deserialize_value(value)\n\n        # 
Create instance\n        instance = cls(**deserialized_data)\n        instance.extra_attrs = extra_attrs\n\n        return instance\n\n    model_config = ConfigDict(\n        extra=\"forbid\",  # Force use of extra_attrs for unknown fields\n        arbitrary_types_allowed=True,  # Allow BinaryIO, Path, etc.\n        validate_assignment=True,  # Validate on attribute assignment\n    )\n\n    @classmethod\n    def register_plugin_models(cls, models: dict[str, type]) -> None:\n        \"\"\"Register plugin option model classes for nested access.\n\n        Args:\n            models: Dictionary mapping namespace to model class\n        \"\"\"\n        global _plugin_option_models\n        _plugin_option_models.update(models)\n\n    def _get_plugin_options(self, namespace: str) -> Any:\n        \"\"\"Get or create a plugin options instance for the given namespace.\n\n        This method creates plugin option instances lazily from flat field values.\n\n        Args:\n            namespace: The plugin namespace (e.g., 'tesseract', 'optimize')\n\n        Returns:\n            An instance of the plugin's option model, or None if not registered\n        \"\"\"\n        # Use extra_attrs to cache plugin option instances\n        cache_key = f'_plugin_cache_{namespace}'\n        if cache_key in self.extra_attrs:\n            return self.extra_attrs[cache_key]\n\n        if namespace not in _plugin_option_models:\n            raise AttributeError(\n                f\"Plugin namespace '{namespace}' is not registered. 
\"\n                f\"Ensure setup_plugin_infrastructure() was called.\"\n            )\n\n        model_class = _plugin_option_models[namespace]\n\n        def _convert_value(value):\n            \"\"\"Convert value to be compatible with plugin model fields.\"\"\"\n            if isinstance(value, os.PathLike):\n                return os.fspath(value)\n            return value\n\n        # Build kwargs from flat fields\n        kwargs = {}\n        for field_name in model_class.model_fields:\n            # Try namespace_field pattern first (e.g., tesseract_timeout)\n            flat_name = f\"{namespace}_{field_name}\"\n            if flat_name in OcrOptions.model_fields:\n                value = getattr(self, flat_name)\n                if value is not None:\n                    kwargs[field_name] = _convert_value(value)\n            # Also check direct field name (for fields like jbig2_lossy)\n            elif field_name in OcrOptions.model_fields:\n                value = getattr(self, field_name)\n                if value is not None:\n                    kwargs[field_name] = _convert_value(value)\n            # Check for special mappings\n            elif namespace == 'optimize' and field_name == 'level':\n                # 'optimize' field maps to 'level' in OptimizeOptions\n                if 'optimize' in OcrOptions.model_fields:\n                    value = self.optimize\n                    if value is not None:\n                        kwargs[field_name] = _convert_value(value)\n            elif namespace == 'optimize' and field_name == 'jpeg_quality':\n                # jpg_quality maps to jpeg_quality\n                if 'jpg_quality' in OcrOptions.model_fields:\n                    value = self.jpg_quality\n                    if value is not None:\n                        kwargs[field_name] = _convert_value(value)\n\n        # Create and cache the plugin options instance\n        instance = model_class(**kwargs)\n        self.extra_attrs[cache_key] 
= instance\n        return instance\n\n    def __getattr__(self, name: str) -> Any:\n        \"\"\"Support dynamic access to plugin option namespaces.\n\n        This allows accessing plugin options like:\n            options.tesseract.timeout\n            options.optimize.level\n\n        Plugin models must be registered via register_plugin_models() for\n        namespace access to work. Built-in plugins register their models\n        during initialization.\n\n        Args:\n            name: Attribute name\n\n        Returns:\n            Plugin options instance if name is a registered namespace,\n            otherwise raises AttributeError\n        \"\"\"\n        # Check if this is a plugin namespace\n        if name.startswith('_'):\n            # Private attributes should not trigger plugin lookup\n            raise AttributeError(\n                f\"'{type(self).__name__}' object has no attribute '{name}'\"\n            )\n\n        # Try to get plugin options for this namespace\n        if name in _plugin_option_models:\n            return self._get_plugin_options(name)\n\n        # Check extra_attrs\n        if 'extra_attrs' in self.__dict__ and name in self.extra_attrs:\n            return self.extra_attrs[name]\n\n        raise AttributeError(\n            f\"'{type(self).__name__}' object has no attribute '{name}'\"\n        )\n"
  },
  {
    "path": "src/ocrmypdf/_pipeline.py",
    "content": "# SPDX-FileCopyrightText: 2018-2022 James R. Barlow\n# SPDX-FileCopyrightText: 2019 Martin Wind\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"OCRmyPDF page processing pipeline functions.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport re\nimport sys\nfrom collections.abc import Iterable, Iterator, Sequence\nfrom contextlib import suppress\nfrom io import BytesIO\nfrom pathlib import Path\nfrom shutil import copyfileobj\nfrom typing import TYPE_CHECKING, Any, BinaryIO, TypeVar, cast\n\nif TYPE_CHECKING:\n    from ocrmypdf.hocrtransform import OcrElement\n\nimport img2pdf\nimport pikepdf\nfrom PIL import Image, ImageColor, ImageDraw\n\nfrom ocrmypdf._concurrent import Executor\nfrom ocrmypdf._exec import unpaper\nfrom ocrmypdf._jobcontext import PageContext, PdfContext\nfrom ocrmypdf._metadata import repair_docinfo_nuls\nfrom ocrmypdf._options import OcrOptions, ProcessingMode, TaggedPdfMode\nfrom ocrmypdf.exceptions import (\n    DigitalSignatureError,\n    DpiError,\n    EncryptedPdfError,\n    InputFileError,\n    PriorOcrFoundError,\n    TaggedPDFError,\n    UnsupportedImageFormatError,\n)\nfrom ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution, safe_symlink\nfrom ocrmypdf.pdfa import (\n    file_claims_pdfa,\n    generate_pdfa_ps,\n    speculative_pdfa_conversion,\n)\nfrom ocrmypdf.pdfinfo import Colorspace, Encoding, FloatRect, PageInfo, PdfInfo\nfrom ocrmypdf.pluginspec import GhostscriptRasterDevice, OrientationConfidence\n\ntry:\n    from pi_heif import register_heif_opener\nexcept ImportError:\n\n    def register_heif_opener():\n        pass\n\n\nT = TypeVar(\"T\")\nlog = logging.getLogger(__name__)\n\nVECTOR_PAGE_DPI = 400\n\n\nregister_heif_opener()\n\n\ndef triage_image_file(input_file: Path, output_file: Path, options: OcrOptions) -> None:\n    \"\"\"Triage the input image file.\n\n    If the input file is an image, check its resolution and convert it to PDF.\n\n    Args:\n        input_file: The path to 
the input file.\n        output_file: The path to the output file.\n        options: An object containing the options passed to the OCRmyPDF command.\n\n    Raises:\n        UnsupportedImageFormatError: If the input file is not a supported image format.\n        DpiError: If the input image has no resolution (DPI) in its metadata or if the\n            resolution is not credible.\n    \"\"\"\n    log.info(\"Input file is not a PDF, checking if it is an image...\")\n    try:\n        im = Image.open(input_file)\n    except OSError as e:\n        # Recover the original filename\n        log.error(str(e).replace(str(input_file), str(options.input_file)))\n        if not input_file.exists():\n            log.error(\"Input file does not exist: %s\", input_file)\n        if input_file.is_dir():\n            log.error(\"Input file is a directory: %s\", input_file)\n        if input_file.is_file():\n            log.error(\"Input file is a file: %s\", input_file)\n        if input_file.stat().st_size == 0:\n            log.error(\"Input file is empty: %s\", input_file)\n        raise UnsupportedImageFormatError() from e\n\n    with im:\n        log.info(\"Input file is an image\")\n        if 'dpi' in im.info:\n            if im.info['dpi'] <= (96, 96) and not options.image_dpi:\n                log.info(\"Image size: (%d, %d)\", *im.size)\n                log.info(\"Image resolution: (%d, %d)\", *im.info['dpi'])\n                raise DpiError(\n                    \"Input file is an image, but the resolution (DPI) is \"\n                    \"not credible.  Estimate the resolution at which the \"\n                    \"image was scanned and specify it using --image-dpi.\"\n                )\n        elif not options.image_dpi:\n            log.info(\"Image size: (%d, %d)\", *im.size)\n            raise DpiError(\n                \"Input file is an image, but has no resolution (DPI) \"\n                \"in its metadata.  
Estimate the resolution at which \"\n                \"image was scanned and specify it using --image-dpi.\"\n            )\n\n        if im.mode in ('RGBA', 'LA'):\n            raise UnsupportedImageFormatError(\n                \"The input image has an alpha channel. Remove the alpha \"\n                \"channel first.\"\n            )\n\n        if 'iccprofile' not in im.info:\n            if im.mode == 'RGB':\n                log.info(\"Input image has no ICC profile, assuming sRGB\")\n            elif im.mode == 'CMYK':\n                raise UnsupportedImageFormatError(\n                    \"Input CMYK image has no ICC profile, not usable\"\n                )\n\n    try:\n        log.info(\"Image seems valid. Try converting to PDF...\")\n        layout_fun = img2pdf.default_layout_fun\n        if options.image_dpi:\n            layout_fun = img2pdf.get_fixed_dpi_layout_fun(\n                Resolution(options.image_dpi, options.image_dpi)\n            )\n        with open(output_file, 'wb') as outf:\n            img2pdf.convert(\n                os.fspath(input_file),\n                layout_fun=layout_fun,\n                outputstream=outf,\n                **IMG2PDF_KWARGS,\n            )\n        log.info(\"Successfully converted to PDF, processing...\")\n    except img2pdf.ImageOpenError as e:\n        raise UnsupportedImageFormatError() from e\n\n\ndef _pdf_guess_version(input_file: Path, search_window=1024) -> str:\n    \"\"\"Try to find version signature at start of file.\n\n    Not robust enough to deal with appended files.\n\n    Returns empty string if not found, indicating file is probably not PDF.\n    \"\"\"\n    with open(input_file, 'rb') as f:\n        signature = f.read(search_window)\n    m = re.search(rb'%PDF-(\\d\\.\\d)', signature)\n    if m:\n        return m.group(1).decode('ascii')\n    return ''\n\n\ndef triage(\n    original_filename: str, input_file: Path, output_file: Path, options: OcrOptions\n) -> Path:\n    \"\"\"Triage the 
input file. We can handle PDFs and images.\"\"\"\n    try:\n        if _pdf_guess_version(input_file):\n            if options.image_dpi:\n                log.warning(\n                    \"Argument --image-dpi is being ignored because the \"\n                    \"input file is a PDF, not an image.\"\n                )\n            try:\n                with pikepdf.open(input_file) as pdf:\n                    pdf.save(output_file)\n            except pikepdf.PdfError as e:\n                raise InputFileError() from e\n            except pikepdf.PasswordError as e:\n                raise EncryptedPdfError() from e\n            return output_file\n    except OSError as e:\n        log.debug(f\"Temporary file was at: {input_file}\")\n        msg = str(e).replace(str(input_file), original_filename)\n        raise InputFileError(msg) from e\n\n    triage_image_file(input_file, output_file, options)\n    return output_file\n\n\ndef get_pdfinfo(\n    input_file,\n    *,\n    executor: Executor,\n    detailed_analysis: bool = False,\n    progbar: bool = False,\n    max_workers: int | None = None,\n    use_threads: bool = True,\n    check_pages=None,\n) -> PdfInfo:\n    \"\"\"Get the PDF info.\"\"\"\n    try:\n        return PdfInfo(\n            input_file,\n            detailed_analysis=detailed_analysis,\n            progbar=progbar,\n            max_workers=max_workers,\n            use_threads=use_threads,\n            check_pages=check_pages,\n            executor=executor,\n        )\n    except pikepdf.PasswordError as e:\n        raise EncryptedPdfError() from e\n    except pikepdf.PdfError as e:\n        raise InputFileError() from e\n\n\ndef validate_pdfinfo_options(context: PdfContext) -> None:\n    \"\"\"Validate the PDF info options.\"\"\"\n    pdfinfo = context.pdfinfo\n    options = context.options\n\n    if pdfinfo.needs_rendering:\n        raise InputFileError(\n            \"This PDF contains dynamic XFA forms created by Adobe LiveCycle \"\n         
   \"Designer and can only be read by Adobe Acrobat or Adobe Reader.\"\n        )\n    if pdfinfo.has_signature:\n        if options.invalidate_digital_signatures:\n            log.warning(\"All digital signatures will be invalidated\")\n        else:\n            raise DigitalSignatureError()\n    if pdfinfo.has_acroform:\n        if options.mode == ProcessingMode.redo:\n            raise InputFileError(\n                \"This PDF has a user fillable form. --redo-ocr (or --mode redo) \"\n                \"is not currently possible on such files.\"\n            )\n        else:\n            log.warning(\n                \"This PDF has a fillable form. \"\n                \"Chances are it is a pure digital \"\n                \"document that does not need OCR.\"\n            )\n            if options.mode != ProcessingMode.force:\n                log.info(\n                    \"Use the option --force-ocr (or --mode force) to produce an \"\n                    \"image of the form and all filled form fields. The output PDF \"\n                    \"will be 'flattened' and will no longer be fillable.\"\n                )\n    if pdfinfo.is_tagged:\n        log.warning(\n            \"This PDF is marked as a Tagged PDF. This often indicates \"\n            \"that the PDF was generated from an office document and does \"\n            \"not need OCR. 
PDF pages processed by OCRmyPDF may not be \"\n            \"tagged correctly.\"\n        )\n        if (\n            options.tagged_pdf_mode == TaggedPdfMode.default\n            and options.mode == ProcessingMode.default\n        ):\n            log.info(\"Use --tagged-pdf-mode ignore to ignore Tagged PDFs.\")\n            raise TaggedPDFError()\n    context.plugin_manager.validate(pdfinfo=pdfinfo, options=options)\n\n\ndef _vector_page_dpi(pageinfo: PageInfo) -> int:\n    \"\"\"Get a DPI to use for vector pages, if the page has vector content.\"\"\"\n    return VECTOR_PAGE_DPI if pageinfo.has_vector or pageinfo.has_text else 0\n\n\ndef get_page_square_dpi(\n    page_context: PageContext, image_dpi: Resolution | None = None\n) -> Resolution:\n    \"\"\"Get the DPI when we require xres == yres, scaled to physical units.\n\n    Page DPI includes UserUnit scaling.\n    \"\"\"\n    pageinfo = page_context.pageinfo\n    options = page_context.options\n    if not image_dpi:\n        image_dpi = pageinfo.dpi\n    xres = image_dpi.x or 0.0\n    yres = image_dpi.y or 0.0\n    userunit = float(pageinfo.userunit) or 1.0\n    units = float(\n        max(\n            (xres * userunit) or VECTOR_PAGE_DPI,\n            (yres * userunit) or VECTOR_PAGE_DPI,\n            _vector_page_dpi(pageinfo),\n            options.oversample or 0.0,\n        )\n    )\n    return Resolution(units, units)\n\n\ndef get_canvas_square_dpi(\n    page_context: PageContext, image_dpi: Resolution | None = None\n) -> Resolution:\n    \"\"\"Get the DPI when we require xres == yres, in Postscript units.\n\n    Canvas DPI is independent of PDF UserUnit scaling, which is\n    used to describe situations where the PDF user space is not 1:1 with\n    the physical units of the page.\n    \"\"\"\n    pageinfo = page_context.pageinfo\n    options = page_context.options\n    if not image_dpi:\n        image_dpi = pageinfo.dpi\n    units = float(\n        max(\n            image_dpi.x or VECTOR_PAGE_DPI,\n     
       image_dpi.y or VECTOR_PAGE_DPI,\n            _vector_page_dpi(pageinfo),\n            options.oversample or 0.0,\n        )\n    )\n    return Resolution(units, units)\n\n\ndef is_ocr_required(page_context: PageContext) -> bool:\n    \"\"\"Check if the page needs to be OCR'd.\"\"\"\n    pageinfo = page_context.pageinfo\n    options = page_context.options\n\n    ocr_required = True\n\n    if options.pages and pageinfo.pageno not in options.pages:\n        log.debug(f\"skipped {pageinfo.pageno} as requested by --pages {options.pages}\")\n        ocr_required = False\n    elif pageinfo.has_text:\n        if options.mode == ProcessingMode.default:\n            raise PriorOcrFoundError(\n                \"page already has text! - aborting (use --force-ocr or --mode force \"\n                \"to force OCR; see also help for --skip-text, --redo-ocr, and --mode)\"\n            )\n        elif options.mode == ProcessingMode.force:\n            log.info(\"page already has text! - rasterizing text and running OCR anyway\")\n            ocr_required = True\n        elif options.mode == ProcessingMode.redo:\n            if pageinfo.has_corrupt_text:\n                log.warning(\n                    \"some text on this page cannot be mapped to characters: \"\n                    \"consider using --force-ocr (or --mode force) instead\"\n                )\n            else:\n                log.info(\"redoing OCR\")\n            ocr_required = True\n        elif options.mode == ProcessingMode.skip:\n            log.info(\"skipping all processing on this page\")\n            ocr_required = False\n    elif not pageinfo.images and not options.lossless_reconstruction:\n        # We found a page with no images and no text. That means it may\n        # have vector art that the user wants to OCR. If we determined\n        # lossless reconstruction is not possible then we have to rasterize\n        # the image. 
So if OCR is being forced, take that to mean YES, go\n        # ahead and rasterize. If not forced, then pretend there's no text\n        # on the page at all so we don't lose anything.\n        # This could be made smarter by explicitly searching for vector art.\n        if options.mode == ProcessingMode.force and options.oversample:\n            # The user really wants to reprocess this file\n            log.info(\n                \"page has no images - \"\n                f\"rasterizing at {options.oversample} DPI because \"\n                \"--force-ocr --oversample (or --mode force --oversample) was specified\"\n            )\n        elif options.mode == ProcessingMode.force:\n            # Warn the user they might not want to do this\n            log.warning(\n                \"page has no images - \"\n                \"all vector content will be \"\n                f\"rasterized at {VECTOR_PAGE_DPI} DPI, losing some resolution and \"\n                \"likely increasing file size. Use --oversample to adjust the \"\n                \"DPI.\"\n            )\n        else:\n            log.info(\n                \"page has no images - \"\n                \"skipping all processing on this page to avoid losing detail. 
\"\n                \"Use --force-ocr (or --mode force) if you wish to perform OCR on \"\n                \"pages that have vector content.\"\n            )\n            ocr_required = False\n\n    if ocr_required and options.skip_big and pageinfo.images:\n        pixel_count = pageinfo.width_pixels * pageinfo.height_pixels\n        if pixel_count > (options.skip_big * 1_000_000):\n            ocr_required = False\n            log.warning(\n                \"page too big, skipping OCR \"\n                f\"({(pixel_count / 1_000_000):.1f} MPixels > \"\n                f\"{options.skip_big:.1f} MPixels --skip-big)\"\n            )\n    return ocr_required\n\n\ndef rasterize_preview(input_file: Path, page_context: PageContext) -> Path:\n    \"\"\"Generate a lower quality preview image.\"\"\"\n    output_file = page_context.get_path('rasterize_preview.jpg')\n    canvas_dpi = Resolution(300.0, 300.0).take_min(\n        [get_canvas_square_dpi(page_context)]\n    )\n    page_dpi = Resolution(300.0, 300.0).take_min([get_page_square_dpi(page_context)])\n    page_context.plugin_manager.rasterize_pdf_page(\n        input_file=input_file,\n        output_file=output_file,\n        raster_device=GhostscriptRasterDevice.JPEGGRAY,\n        raster_dpi=canvas_dpi,\n        pageno=page_context.pageinfo.pageno + 1,\n        page_dpi=page_dpi,\n        rotation=0,\n        filter_vector=False,\n        stop_on_soft_error=not page_context.options.continue_on_soft_render_error,\n        options=page_context.options,\n        use_cropbox=False,\n    )\n    return output_file\n\n\ndef describe_rotation(\n    page_context: PageContext, orient_conf: OrientationConfidence, correction: int\n) -> str:\n    \"\"\"Describe the page rotation we are going to perform (or not perform).\"\"\"\n    direction = {0: '⇧', 90: '⇨', 180: '⇩', 270: '⇦'}\n    turns = {0: ' ', 90: '⬏', 180: '↻', 270: '⬑'}\n\n    existing_rotation = page_context.pageinfo.rotation\n    action = ''\n    if 
orient_conf.confidence >= page_context.options.rotate_pages_threshold:\n        if correction != 0:\n            action = 'will rotate ' + turns[correction]\n        else:\n            action = 'rotation appears correct'\n    else:\n        action = \"confidence too low to rotate\" if correction != 0 else \"no change\"\n\n    facing = ''\n\n    if existing_rotation != 0:\n        facing = f\"with existing rotation {direction.get(existing_rotation, '?')}, \"\n    facing += f\"page is facing {direction.get(orient_conf.angle, '?')}\"\n\n    return f\"{facing}, confidence {orient_conf.confidence:.2f} - {action}\"\n\n\ndef get_orientation_correction(preview: Path, page_context: PageContext) -> int:\n    \"\"\"Work out orientation correction for each page.\n\n    We ask Ghostscript to draw a preview page, which will rasterize with the\n    current /Rotate applied, and then ask OCR which way the page is\n    oriented. If the value of /Rotate is correct (e.g., a user already\n    manually fixed rotation), then OCR will say the page is pointing\n    up and the correction is zero. Otherwise, the orientation found by\n    OCR represents the clockwise rotation, or the counterclockwise\n    correction to rotation.\n\n    When we draw the real page for OCR, we rotate it by the CCW correction,\n    which points it (hopefully) upright. 
_graft.py takes care of orienting\n    the image and text layers.\n    \"\"\"\n    ocr_engine = page_context.plugin_manager.get_ocr_engine(\n        options=page_context.options\n    )\n    orient_conf = ocr_engine.get_orientation(preview, page_context.options)\n\n    correction = orient_conf.angle % 360\n    log.info(describe_rotation(page_context, orient_conf, correction))\n    if (\n        orient_conf.confidence >= page_context.options.rotate_pages_threshold\n        and correction != 0\n    ):\n        return correction\n\n    return 0\n\n\ndef calculate_image_dpi(page_context: PageContext) -> Resolution:\n    \"\"\"Calculate the DPI for the page image.\"\"\"\n    pageinfo = page_context.pageinfo\n    dpi_profile = pageinfo.page_dpi_profile()\n    if dpi_profile and dpi_profile.average_to_max_dpi_ratio < 0.8:\n        image_dpi = Resolution(dpi_profile.weighted_dpi, dpi_profile.weighted_dpi)\n    else:\n        image_dpi = pageinfo.dpi\n    return image_dpi\n\n\ndef calculate_raster_dpi(page_context: PageContext):\n    \"\"\"Calculate the DPI for rasterization.\"\"\"\n    # Produce the page image with square resolution or else deskew and OCR\n    # will not work properly.\n    image_dpi = calculate_image_dpi(page_context)\n    dpi_profile = page_context.pageinfo.page_dpi_profile()\n    canvas_dpi = get_canvas_square_dpi(page_context, image_dpi)\n    page_dpi = get_page_square_dpi(page_context, image_dpi)\n    if dpi_profile and dpi_profile.average_to_max_dpi_ratio < 0.8:\n        log.warning(\n            \"Weighted average image DPI is %0.1f, max DPI is %0.1f. \"\n            \"The discrepancy may indicate a high detail region on this page, \"\n            \"but could also indicate a problem with the input PDF file. 
\"\n            \"Page image will be rendered at %0.1f DPI.\",\n            dpi_profile.weighted_dpi,\n            dpi_profile.max_dpi,\n            canvas_dpi.to_scalar(),\n        )\n    return canvas_dpi, page_dpi\n\n\ndef rasterize(\n    input_file: Path,\n    page_context: PageContext,\n    correction: int = 0,\n    output_tag: str = '',\n    remove_vectors: bool | None = None,\n) -> Path:\n    \"\"\"Rasterize a PDF page to a PNG image.\n\n    Args:\n        input_file: The input PDF file path.\n        page_context: The page context object.\n        correction: The orientation correction angle. Defaults to 0.\n        output_tag: The output tag. Defaults to ''.\n        remove_vectors: Whether to remove vectors. Defaults to None, which means\n            the value from the page context options will be used. If the value\n            is True or False, it will override the page context options.\n\n    Returns:\n        Path: The output PNG file path.\n    \"\"\"\n    colorspaces = [\n        GhostscriptRasterDevice.PNGMONO,\n        GhostscriptRasterDevice.PNGGRAY,\n        GhostscriptRasterDevice.PNG256,\n        GhostscriptRasterDevice.PNG16M,\n    ]\n    device_idx = 0\n\n    if remove_vectors is None:\n        remove_vectors = page_context.options.remove_vectors\n\n    output_file = page_context.get_path(f'rasterize{output_tag}.png')\n    pageinfo = page_context.pageinfo\n\n    def at_least(colorspace):\n        return max(device_idx, colorspaces.index(colorspace))\n\n    for image in pageinfo.images:\n        if image.type_ != 'image':\n            continue  # ignore masks\n        if image.bpc > 1:\n            if image.color == Colorspace.index:\n                device_idx = at_least(GhostscriptRasterDevice.PNG256)\n            elif image.color == Colorspace.gray:\n                device_idx = at_least(GhostscriptRasterDevice.PNGGRAY)\n            else:\n                device_idx = at_least(GhostscriptRasterDevice.PNG16M)\n\n    if 
pageinfo.has_vector:\n        log.debug(f\"Page has vector content, using {GhostscriptRasterDevice.PNG16M}\")\n        device_idx = at_least(GhostscriptRasterDevice.PNG16M)\n\n    device = colorspaces[device_idx]\n\n    log.debug(\n        f\"Rasterize with {device}, rotation {correction}, mediabox {pageinfo.mediabox}\"\n    )\n\n    canvas_dpi, page_dpi = calculate_raster_dpi(page_context)\n\n    page_context.plugin_manager.rasterize_pdf_page(\n        input_file=input_file,\n        output_file=output_file,\n        raster_device=device,\n        raster_dpi=canvas_dpi,\n        page_dpi=page_dpi,\n        pageno=pageinfo.pageno + 1,\n        rotation=correction,\n        filter_vector=remove_vectors,\n        stop_on_soft_error=not page_context.options.continue_on_soft_render_error,\n        options=page_context.options,\n        use_cropbox=False,\n    )\n    return output_file\n\n\ndef preprocess_remove_background(input_file: Path, page_context: PageContext) -> Path:\n    \"\"\"Remove the background from the input image (temporarily disabled).\"\"\"\n    if any(image.bpc > 1 for image in page_context.pageinfo.images):\n        raise NotImplementedError(\"--remove-background is temporarily not implemented\")\n        # output_file = page_context.get_path('pp_rm_bg.png')\n        # leptonica.remove_background(input_file, output_file)\n        # return output_file\n    log.info(\"background removal skipped on mono page\")\n    return input_file\n\n\ndef preprocess_deskew(input_file: Path, page_context: PageContext) -> Path:\n    \"\"\"Deskews the input image using the OCR engine and saves the output to a file.\n\n    Args:\n        input_file: The input image file to deskew.\n        page_context: The context of the page being processed.\n\n    Returns:\n        Path: The path to the deskewed image file.\n    \"\"\"\n    output_file = page_context.get_path('pp_deskew.png')\n    dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context))\n\n    
ocr_engine = page_context.plugin_manager.get_ocr_engine(\n        options=page_context.options\n    )\n    deskew_angle_degrees = ocr_engine.get_deskew(input_file, page_context.options)\n\n    with Image.open(input_file) as im:\n        # According to Pillow docs, .rotate() will automatically use Image.NEAREST\n        # resampling if image is mode '1' or 'P'\n        deskewed = im.rotate(\n            deskew_angle_degrees,\n            resample=Image.Resampling.BICUBIC,\n            fillcolor=ImageColor.getcolor('white', mode=im.mode),  # type: ignore\n        )\n        deskewed.save(output_file, dpi=dpi)\n\n    return output_file\n\n\ndef preprocess_clean(input_file: Path, page_context: PageContext) -> Path:\n    \"\"\"Clean the input image using unpaper.\"\"\"\n    output_file = page_context.get_path('pp_clean.png')\n    dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context))\n    return unpaper.clean(\n        input_file,\n        output_file,\n        dpi=dpi.to_scalar(),\n        unpaper_args=page_context.options.unpaper_args,\n    )\n\n\ndef create_ocr_image(image: Path, page_context: PageContext) -> Path:\n    \"\"\"Create the image we send for OCR.\n\n    Might not be the same as the display image depending on preprocessing.\n    This image will never be shown to the user.\n    \"\"\"\n    output_file = page_context.get_path('ocr.png')\n    options = page_context.options\n    with Image.open(image) as im:\n        log.debug('resolution %r', im.info['dpi'])\n\n        if options.mode != ProcessingMode.force:\n            # Do not mask text areas when forcing OCR, because we need to OCR\n            # all text areas\n            mask = None  # Exclude both visible and invisible text from OCR\n            if options.mode == ProcessingMode.redo:\n                mask = True  # Mask visible text, but not invisible text\n\n            draw = ImageDraw.ImageDraw(im)\n            for textarea in page_context.pageinfo.get_textareas(\n            
    visible=mask, corrupt=None\n            ):\n                # Calculate resolution based on the image size and page dimensions\n                # without regard whatever resolution is in pageinfo (may differ or\n                # be None)\n                bbox = [float(v) for v in textarea]\n                xyscale = tuple(float(coord) / 72.0 for coord in im.info['dpi'])\n                pixcoords = (\n                    bbox[0] * xyscale[0],\n                    im.height - bbox[3] * xyscale[1],\n                    bbox[2] * xyscale[0],\n                    im.height - bbox[1] * xyscale[1],\n                )\n                log.debug('blanking %r', pixcoords)\n                draw.rectangle(pixcoords, fill='white')\n                # draw.rectangle(pixcoords, outline='pink')\n\n        filter_im = page_context.plugin_manager.filter_ocr_image(\n            page=page_context, image=im\n        )\n        if filter_im is not None:\n            im = filter_im\n\n        # Pillow requires integer DPI\n        dpi = tuple(round(coord) for coord in im.info['dpi'])\n        im.save(output_file, dpi=dpi)\n    return output_file\n\n\ndef ocr_engine_hocr(input_file: Path, page_context: PageContext) -> tuple[Path, Path]:\n    \"\"\"Run the OCR engine and generate hOCR output.\"\"\"\n    hocr_out = page_context.get_path('ocr_hocr.hocr')\n    hocr_text_out = page_context.get_path('ocr_hocr.txt')\n    options = page_context.options\n\n    ocr_engine = page_context.plugin_manager.get_ocr_engine(options=options)\n    ocr_engine.generate_hocr(\n        input_file=input_file,\n        output_hocr=hocr_out,\n        output_text=hocr_text_out,\n        options=options,\n    )\n    return hocr_out, hocr_text_out\n\n\ndef ocr_engine_direct(\n    input_file: Path, page_context: PageContext\n) -> tuple[OcrElement, Path]:\n    \"\"\"Run the OCR engine and return OcrElement tree directly.\n\n    This is the modern path for OCR engines that support the generate_ocr() API.\n    It 
bypasses hOCR file generation for better performance and richer data.\n\n    Args:\n        input_file: The image file to OCR.\n        page_context: The page context with options and path utilities.\n\n    Returns:\n        A tuple of (OcrElement tree, path to text sidecar file).\n    \"\"\"\n    text_out = page_context.get_path('ocr_direct.txt')\n    options = page_context.options\n\n    ocr_engine = page_context.plugin_manager.get_ocr_engine(options=options)\n    ocr_tree, text_content = ocr_engine.generate_ocr(\n        input_file=input_file,\n        options=options,\n        page_number=page_context.pageno,\n    )\n\n    # Write text sidecar file\n    text_out.write_text(text_content, encoding='utf-8')\n\n    return ocr_tree, text_out\n\n\ndef should_visible_page_image_use_jpg(pageinfo: PageInfo) -> bool:\n    \"\"\"Determines whether the visible page image should be saved as a JPEG.\n\n    If all images were JPEGs originally (including FlateDecode+DCTDecode),\n    permit a JPEG as output.\n\n    Args:\n        pageinfo: The PageInfo object containing information about the page.\n\n    Returns:\n        A boolean indicating whether the visible page image should be saved as a JPEG.\n    \"\"\"\n    return bool(pageinfo.images) and all(\n        im.enc in (Encoding.jpeg, Encoding.flate_jpeg) for im in pageinfo.images\n    )\n\n\ndef create_visible_page_jpg(image: Path, page_context: PageContext) -> Path:\n    \"\"\"Create a visible page image in JPEG format.\n\n    This is intended to be used when all images on the page were originally JPEGs.\n    \"\"\"\n    output_file = page_context.get_path('visible.jpg')\n    with Image.open(image) as im:\n        # At this point the image should be a .png, but deskew, unpaper\n        # might have removed the DPI information. In this case, fall back to\n        # square DPI used to rasterize. 
When the preview image was\n        # rasterized, it was also converted to square resolution, which is\n        # what we want to give to the OCR engine, so keep it square.\n        if 'dpi' in im.info:\n            dpi = Resolution(*im.info['dpi'])\n        else:\n            # Fallback to page-implied DPI\n            dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context))\n\n        # Pillow requires integer DPI\n        im.save(output_file, format='JPEG', dpi=dpi.to_int())\n    return output_file\n\n\ndef create_pdf_page_from_image(\n    image: Path, page_context: PageContext, orientation_correction: int\n) -> Path:\n    \"\"\"Create a PDF page from a page image.\"\"\"\n    # We rasterize a square DPI version of each page because most image\n    # processing tools don't support rectangular DPI. Use the square DPI as it\n    # accurately describes the image. It would be possible to resample the image\n    # at this stage back to non-square DPI to more closely resemble the input,\n    # except that the hocr renderer does not understand non-square DPI. 
The\n    # sandwich renderer would be fine.\n    output_file = page_context.get_path('visible.pdf')\n\n    pageinfo = page_context.pageinfo\n    pagesize = 72.0 * float(pageinfo.width_inches), 72.0 * float(pageinfo.height_inches)\n    effective_rotation = (pageinfo.rotation - orientation_correction) % 360\n    swap_axis = effective_rotation % 180 == 90\n    if swap_axis:\n        pagesize = pagesize[1], pagesize[0]\n\n    # Create a new single page PDF to hold\n    bio = BytesIO()\n    with open(image, 'rb') as imfile:\n        log.debug('convert')\n\n        layout_fun = img2pdf.get_layout_fun(pagesize)\n        img2pdf.convert(\n            imfile,\n            layout_fun=layout_fun,\n            outputstream=bio,\n            engine=img2pdf.Engine.pikepdf,\n            rotation=img2pdf.Rotation.ifvalid,\n        )\n        log.debug('convert done')\n\n    # img2pdf does not generate boxes correctly, so we fix them\n    bio.seek(0)\n    fix_pagepdf_boxes(bio, output_file, page_context, swap_axis=swap_axis)\n\n    output_file = page_context.plugin_manager.filter_pdf_page(\n        page=page_context, image_filename=image, output_pdf=output_file\n    )\n    return output_file\n\n\ndef ocr_engine_textonly_pdf(\n    input_image: Path, page_context: PageContext\n) -> tuple[Path, Path]:\n    \"\"\"Run the OCR engine and generate a text-only PDF (will look blank).\"\"\"\n    output_pdf = page_context.get_path('ocr_tess.pdf')\n    output_text = page_context.get_path('ocr_tess.txt')\n    options = page_context.options\n\n    ocr_engine = page_context.plugin_manager.get_ocr_engine(options=options)\n    ocr_engine.generate_pdf(\n        input_file=input_image,\n        output_pdf=output_pdf,\n        output_text=output_text,\n        options=options,\n    )\n    return output_pdf, output_text\n\n\ndef _offset_rect(rect: tuple[float, float, float, float], offset: tuple[float, float]):\n    \"\"\"Offset a rectangle by a given amount.\"\"\"\n    return (\n        rect[0] + 
offset[0],\n        rect[1] + offset[1],\n        rect[2] + offset[0],\n        rect[3] + offset[1],\n    )\n\n\ndef _adjust_pagebox(\n    page: pikepdf.Page,\n    media_box: FloatRect,\n    name: pikepdf.Name,\n    target_box: FloatRect,\n    offset: tuple[float, float],\n    swap_axis: bool,\n):\n    if media_box == target_box:\n        return\n    box = _offset_rect(target_box, offset)\n    if swap_axis:\n        box = box[1], box[0], box[3], box[2]\n    page[name] = box\n    log.debug(f\"{str(name)} = {target_box}\")\n\n\ndef fix_pagepdf_boxes(\n    infile: Path | BinaryIO,\n    out_file: Path,\n    page_context: PageContext,\n    swap_axis: bool = False,\n) -> Path:\n    \"\"\"Fix the bounding boxes in a single page PDF.\n\n    The single page PDF is created with a normal MediaBox with its lower left corner\n    at (0, 0). infile is the single page PDF. page_context.mediabox has the original\n    file's mediabox, which may have a different origin. We need to adjust the other\n    boxes in the single page PDF to match the effect they had on the original page.\n\n    When correcting page rotation, we create a single page PDF that is correctly\n    rotated instead of an incorrectly rotated one and then setting page.Rotate on it.\n    If rotation is either 90 or 270 degrees, then this function can be called\n    with swap_axis to swap the X and Y coordinates of all the boxes.\n\n    We are not concerned with solving degenerate cases where the boxes overlap\n    or express invalid rectangles. 
We merely pass the boxes, producing a\n    transformation equivalent to the change made by constructing a new page image.\n    \"\"\"\n    with pikepdf.open(infile) as pdf:\n        for page in pdf.pages:\n            log.debug(\n                f\"initial mediabox={page.MediaBox} and pageinfo \"\n                f\"mediabox={page_context.pageinfo.mediabox}\"\n            )\n            mediabox = page_context.pageinfo.mediabox\n            offset = -mediabox[0], -mediabox[1]\n            if swap_axis:\n                mediabox = mediabox[1], mediabox[0], mediabox[3], mediabox[2]\n            boxes = ['CropBox', 'TrimBox', 'ArtBox', 'BleedBox']\n            for box_name in boxes:\n                _adjust_pagebox(\n                    page,\n                    mediabox,\n                    pikepdf.Name(f\"/{box_name}\"),\n                    getattr(page_context.pageinfo, box_name.lower()),\n                    offset,\n                    swap_axis,\n                )\n\n        pdf.save(out_file)\n    return out_file\n\n\ndef generate_postscript_stub(context: PdfContext) -> Path:\n    \"\"\"Generates a PostScript file stub for the given PDF context.\n\n    Args:\n        context: The PDF context to generate the PostScript file stub for.\n\n    Returns:\n        Path: The path to the generated PostScript file stub.\n    \"\"\"\n    output_file = context.get_path('pdfa.ps')\n    generate_pdfa_ps(output_file)\n    return output_file\n\n\ndef convert_to_pdfa(input_pdf: Path, input_ps_stub: Path, context: PdfContext) -> Path:\n    \"\"\"Converts the given PDF to PDF/A.\n\n    Args:\n        input_pdf: The input PDF file path (presumably not PDF/A).\n        input_ps_stub: The input PostScript file path, containing instructions\n            for the PDF/A generator to use.\n        context: The PDF context.\n    \"\"\"\n    options = context.options\n    input_pdfinfo = context.pdfinfo\n    fix_docinfo_file = context.get_path('fix_docinfo.pdf')\n    output_file = 
context.get_path('pdfa.pdf')\n\n    # If the DocumentInfo record contains NUL characters, Ghostscript will\n    # produce XMP metadata which contains invalid XML entities (&#0;).\n    # NULs in DocumentInfo seem to be common since older Acrobats included them.\n    # pikepdf can deal with this, but we make the world a better place by\n    # stamping them out as soon as possible.\n    with pikepdf.open(input_pdf) as pdf_file:\n        if repair_docinfo_nuls(pdf_file):\n            pdf_file.save(fix_docinfo_file)\n        else:\n            safe_symlink(input_pdf, fix_docinfo_file)\n\n    # Extract PDF/A part correctly\n    if options.output_type.startswith('pdfa'):\n        if options.output_type == 'pdfa':\n            pdfa_part = '2'  # Default to PDF/A-2\n        else:\n            pdfa_part = options.output_type.split('-')[\n                -1\n            ]  # Extract number from pdfa-1, pdfa-2, etc.\n    else:\n        pdfa_part = '2'  # Fallback\n\n    context.plugin_manager.generate_pdfa(\n        pdf_version=input_pdfinfo.min_version,\n        pdf_pages=[fix_docinfo_file],\n        pdfmark=input_ps_stub,\n        output_file=output_file,\n        context=context,\n        pdfa_part=pdfa_part,\n        progressbar_class=(\n            context.plugin_manager.get_progressbar_class()\n            if options.progress_bar\n            else None\n        ),\n        stop_on_soft_error=not options.continue_on_soft_render_error,\n    )\n\n    return output_file\n\n\ndef try_speculative_pdfa(input_pdf: Path, context: PdfContext) -> Path | None:\n    \"\"\"Try speculative PDF/A conversion with verapdf validation.\n\n    This attempts a fast PDF/A conversion by adding PDF/A structures\n    directly with pikepdf, then validating with verapdf. If validation\n    passes, returns the converted file. 
If it fails or verapdf is not\n    available, returns None to signal that Ghostscript should be used.\n\n    Args:\n        input_pdf: Path to the PDF to convert\n        context: The PDF context\n\n    Returns:\n        Path to valid PDF/A file, or None if speculative conversion failed\n    \"\"\"\n    from ocrmypdf._exec import verapdf\n\n    options = context.options\n\n    # Skip speculative conversion if user requested specific image compression,\n    # since that requires Ghostscript to apply\n    gs_opts = getattr(options, 'ghostscript', None)\n    if gs_opts is not None:\n        compression = getattr(gs_opts, 'pdfa_image_compression', 'auto')\n        if compression != 'auto':\n            log.debug(\n                'Skipping speculative PDF/A: --pdfa-image-compression=%s requires '\n                'Ghostscript',\n                compression,\n            )\n            return None\n\n    if not verapdf.available():\n        log.debug('verapdf not available, skipping speculative PDF/A conversion')\n        return None\n    output_file = context.get_path('speculative_pdfa.pdf')\n\n    try:\n        speculative_pdfa_conversion(input_pdf, output_file, options.output_type)\n\n        flavour = verapdf.output_type_to_flavour(options.output_type)\n        result = verapdf.validate(output_file, flavour)\n\n        if result.valid:\n            log.info('Speculative PDF/A conversion succeeded - skipping Ghostscript')\n            return output_file\n        else:\n            log.debug(\n                'Speculative PDF/A validation failed (%d rule violations), '\n                'falling back to Ghostscript',\n                result.failed_rules,\n            )\n            return None\n\n    except Exception as e:\n        log.debug('Speculative PDF/A conversion failed: %s', e)\n        return None\n\n\ndef try_auto_pdfa(input_pdf: Path, context: PdfContext) -> tuple[Path, str]:\n    \"\"\"Best-effort PDF/A for 'auto' output type.\n\n    This function attempts 
to produce PDF/A without requiring Ghostscript:\n    1. If verapdf is available, tries speculative conversion with validation\n    2. Without verapdf, passes through as PDF/A if safe (input already PDF/A\n       or force-ocr was used)\n    3. Falls back to regular PDF if neither condition is met\n\n    Args:\n        input_pdf: Path to the PDF to convert\n        context: The PDF context\n\n    Returns:\n        Tuple of (output_path, actual_output_type) where actual_output_type\n        is 'pdfa' if PDF/A was achieved, 'pdf' otherwise\n    \"\"\"\n    from ocrmypdf._exec import verapdf\n\n    # If verapdf available, try speculative conversion with validation\n    if verapdf.available():\n        result = try_speculative_pdfa(input_pdf, context)\n        if result is not None:\n            return (result, 'pdfa')\n        # verapdf validation failed - fall through to regular PDF\n        log.info(\n            'Auto mode: speculative PDF/A validation failed, outputting regular PDF'\n        )\n        return (input_pdf, 'pdf')\n\n    # Without verapdf, check if we can pass through as PDF/A\n    if _is_safe_pdfa(input_pdf, context.options):\n        # Pass through as-is (no modifications needed)\n        log.info('Auto mode: passing through as PDF/A (input already compliant)')\n        return (input_pdf, 'pdfa')\n\n    # Fall through to regular PDF\n    log.info('Auto mode: no verapdf available and input is not PDF/A, outputting PDF')\n    return (input_pdf, 'pdf')\n\n\ndef _is_safe_pdfa(input_pdf: Path, options) -> bool:\n    \"\"\"Check if file can be considered PDF/A without validation.\n\n    These are cases where our modifications don't break PDF/A compliance:\n    1. Input already claims PDF/A (we just grafted OCR text onto it)\n    2. 
We used force-ocr (we rewrote the entire PDF from scratch)\n\n    Args:\n        input_pdf: Path to the PDF to check\n        options: OCR options\n\n    Returns:\n        True if file can safely be considered PDF/A\n    \"\"\"\n    # Safe if input already claims PDF/A\n    pdfa_status = file_claims_pdfa(input_pdf)\n    if pdfa_status['pass']:\n        return True\n\n    # Safe if we rewrote the PDF with force mode\n    return options.mode == ProcessingMode.force\n\n\ndef should_linearize(working_file: Path, context: PdfContext) -> bool:\n    \"\"\"Determine whether the PDF should be linearized.\n\n    For smaller files, linearization is not worth the effort.\n    \"\"\"\n    filesize = os.stat(working_file).st_size\n    return filesize > (context.options.fast_web_view * 1_000_000)\n\n\ndef get_pdf_save_settings(output_type: str) -> dict[str, Any]:\n    \"\"\"Get pikepdf.Pdf.save settings for the given output type.\n\n    Essentially, don't use features that are incompatible with a given\n    PDF/A specification.\n    \"\"\"\n    if output_type == 'pdfa-1':\n        # Trigger recompression to ensure object streams are removed, because\n        # Acrobat complains about them in PDF/A-1b validation.\n        return dict(\n            preserve_pdfa=True,\n            compress_streams=True,\n            stream_decode_level=pikepdf.StreamDecodeLevel.generalized,\n            object_stream_mode=pikepdf.ObjectStreamMode.disable,\n        )\n    else:\n        return dict(\n            preserve_pdfa=True,\n            compress_streams=True,\n            object_stream_mode=(pikepdf.ObjectStreamMode.generate),\n        )\n\n\ndef _file_size_ratio(\n    input_file: Path, output_file: Path\n) -> tuple[float | None, float | None]:\n    \"\"\"Calculate ratio of input to output file sizes and percentage savings.\n\n    Args:\n        input_file (Path): The path to the input file.\n        output_file (Path): The path to the output file.\n\n    Returns:\n        tuple[float | 
None, float | None]: A tuple containing the file size\n        ratio and the percentage savings achieved by the output file size\n        compared to the input file size.\n    \"\"\"\n    input_size = input_file.stat().st_size\n    output_size = output_file.stat().st_size\n    if output_size == 0:\n        return None, None\n    ratio = input_size / output_size\n    savings = 1 - output_size / input_size\n    return ratio, savings\n\n\ndef optimize_pdf(\n    input_file: Path, context: PdfContext, executor: Executor\n) -> tuple[Path, Sequence[str]]:\n    \"\"\"Optimize the given PDF file.\"\"\"\n    output_file = context.get_path('optimize.pdf')\n    output_pdf, messages = context.plugin_manager.optimize_pdf(\n        input_pdf=input_file,\n        output_pdf=output_file,\n        context=context,\n        executor=executor,\n        linearize=should_linearize(input_file, context),\n    )\n\n    ratio, savings = _file_size_ratio(input_file, output_file)\n    if ratio:\n        log.info(f\"Image optimization ratio: {ratio:.2f} savings: {(savings):.1%}\")\n    ratio, savings = _file_size_ratio(context.origin, output_file)\n    if ratio:\n        log.info(f\"Total file size ratio: {ratio:.2f} savings: {(savings):.1%}\")\n    return output_pdf, messages\n\n\ndef enumerate_compress_ranges(\n    iterable: Iterable[T],\n) -> Iterator[tuple[tuple[int, int], T | None]]:\n    \"\"\"Enumerate the ranges of non-empty elements in an iterable.\n\n    Compresses consecutive ranges of length 1 into single elements.\n\n    Args:\n        iterable: An iterable of elements to enumerate.\n\n    Yields:\n        A tuple containing a range of indices and the corresponding element.\n        If the element is None, the range represents a skipped range of indices.\n    \"\"\"\n    skipped_from, index = None, None\n    for index, txt_file in enumerate(iterable):\n        index += 1\n        if txt_file:\n            if skipped_from is not None:\n                yield (skipped_from, index - 
1), None\n                skipped_from = None\n            yield (index, index), txt_file\n        else:\n            if skipped_from is None:\n                skipped_from = index\n    if skipped_from is not None:\n        yield (skipped_from, index), None\n\n\ndef merge_sidecars(txt_files: Iterable[Path | None], context: PdfContext) -> Path:\n    \"\"\"Merge the page sidecar files into a single file.\n\n    Sidecar files are created by the OCR engine and contain the text for each\n    page in the PDF. This function merges the sidecar files into a single file\n    and returns the path to the merged file.\n    \"\"\"\n    output_file = context.get_path('sidecar.txt')\n    with open(output_file, 'w', encoding=\"utf-8\") as stream:\n        for (from_, to_), txt_file in enumerate_compress_ranges(txt_files):\n            if from_ != 1:\n                stream.write('\\f')  # Form feed between pages for all pages after first\n            if txt_file:\n                txt = txt_file.read_text(encoding=\"utf-8\")\n                # Some versions of Tesseract add a form feed at the end and\n                # others don't. 
Remove it if it exists, since we add one manually.\n                stream.write(txt.removesuffix('\\f'))\n            else:\n                pages = f\"{from_}-{to_}\" if from_ != to_ else f\"{from_}\"\n                stream.write(f'[OCR skipped on page(s) {pages}]')\n    return output_file\n\n\ndef copy_final(\n    input_file: Path, output_file: str | Path | BinaryIO, original_file: Path | None\n) -> None:\n    \"\"\"Copy the final temporary file to the output destination.\n\n    Args:\n        input_file (Path): The intermediate input file to copy.\n        output_file (str | Path | BinaryIO): The output file to copy to.\n        original_file: The original file to copy attributes from.\n\n    Returns:\n        None\n    \"\"\"\n    log.debug('%s -> %s', input_file, output_file)\n    with input_file.open('rb') as input_stream:\n        if output_file == '-':\n            copyfileobj(input_stream, sys.stdout.buffer)  # type: ignore[misc]\n            sys.stdout.flush()\n        elif hasattr(output_file, 'writable'):\n            output_stream = cast(BinaryIO, output_file)\n            copyfileobj(input_stream, output_stream)  # type: ignore[misc]\n            with suppress(AttributeError):\n                output_stream.flush()\n        else:\n            # At this point we overwrite the output_file specified by the user\n            # use copyfileobj because then we use open() to create the file and\n            # get the appropriate umask, ownership, etc.\n            with open(output_file, 'w+b') as output_stream:\n                copyfileobj(input_stream, output_stream)\n"
  },
  {
    "path": "src/ocrmypdf/_pipelines/__init__.py",
    "content": "# SPDX-FileCopyrightText: 2023 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\nfrom __future__ import annotations\n"
  },
  {
    "path": "src/ocrmypdf/_pipelines/_common.py",
    "content": "# SPDX-FileCopyrightText: 2023 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport json\nimport logging\nimport logging.handlers\nimport os\nimport shutil\nimport sys\nimport threading\nfrom collections.abc import Callable, Sequence\nfrom concurrent.futures.process import BrokenProcessPool\nfrom concurrent.futures.thread import BrokenThreadPool\nfrom contextlib import contextmanager\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, NamedTuple, cast\n\nif TYPE_CHECKING:\n    from ocrmypdf.hocrtransform import OcrElement\n\nimport PIL\nimport PIL.Image\nfrom pikepdf import Pdf\n\nfrom ocrmypdf._annots import remove_broken_goto_annotations\nfrom ocrmypdf._concurrent import Executor, setup_executor\nfrom ocrmypdf._jobcontext import PageContext, PdfContext\nfrom ocrmypdf._logging import PageNumberFilter\nfrom ocrmypdf._metadata import metadata_fixup\nfrom ocrmypdf._options import OcrOptions\nfrom ocrmypdf._pipeline import (\n    convert_to_pdfa,\n    create_ocr_image,\n    create_pdf_page_from_image,\n    create_visible_page_jpg,\n    generate_postscript_stub,\n    get_orientation_correction,\n    get_pdf_save_settings,\n    get_pdfinfo,\n    optimize_pdf,\n    preprocess_clean,\n    preprocess_deskew,\n    preprocess_remove_background,\n    rasterize,\n    rasterize_preview,\n    should_linearize,\n    should_visible_page_image_use_jpg,\n    try_auto_pdfa,\n    try_speculative_pdfa,\n)\nfrom ocrmypdf._plugin_manager import OcrmypdfPluginManager\nfrom ocrmypdf._validation import (\n    report_output_file_size,\n)\nfrom ocrmypdf.exceptions import ExitCode, ExitCodeException\nfrom ocrmypdf.helpers import (\n    check_pdf,\n    pikepdf_enable_mmap,\n    running_in_docker,\n    running_in_snap,\n    samefile,\n)\nfrom ocrmypdf.pdfa import file_claims_pdfa\nfrom ocrmypdf.pdfinfo import PdfInfo\n\nlog = logging.getLogger(__name__)\ntls = threading.local()\ntls.pageno 
= None\n\n\ndef _set_logging_tls(tls):\n    \"\"\"Inject current page number (when available) into log records.\"\"\"\n    old_factory = logging.getLogRecordFactory()\n\n    def wrapper(*args, **kwargs):\n        record = old_factory(*args, **kwargs)\n        if hasattr(tls, 'pageno'):\n            record.pageno = tls.pageno\n        return record\n\n    logging.setLogRecordFactory(wrapper)\n\n\n_set_logging_tls(tls)\n\n\ndef set_thread_pageno(pageno: int | None):\n    \"\"\"Set page number (1-based) that the current thread is processing.\"\"\"\n    tls.pageno = pageno\n\n\nclass PageResult(NamedTuple):\n    \"\"\"Result when a page is finished processing.\"\"\"\n\n    pageno: int\n    \"\"\"Page number, 0-based.\"\"\"\n\n    pdf_page_from_image: Path | None = None\n    \"\"\"Single page PDF from image.\"\"\"\n\n    ocr: Path | None = None\n    \"\"\"Single page OCR PDF.\"\"\"\n\n    text: Path | None = None\n    \"\"\"Single page text file.\"\"\"\n\n    orientation_correction: int = 0\n    \"\"\"Orientation correction in degrees.\"\"\"\n\n    ocr_tree: OcrElement | None = None\n    \"\"\"Direct OcrElement tree (when using generate_ocr() API).\"\"\"\n\n\nclass HOCRResultEncoder(json.JSONEncoder):\n    def default(self, obj):\n        if isinstance(obj, Path):\n            return {'Path': str(obj)}\n        return super().default(obj)\n\n\nclass HOCRResultDecoder(json.JSONDecoder):\n    def __init__(self, *args, **kwargs):\n        kwargs['object_hook'] = self.dict_to_object\n        super().__init__(*args, **kwargs)\n\n    def dict_to_object(self, d):\n        if 'Path' in d:\n            return Path(d['Path'])\n        return d\n\n\n@dataclass\nclass HOCRResult:\n    \"\"\"Result when hOCR is finished processing.\"\"\"\n\n    pageno: int\n    \"\"\"Page number, 0-based.\"\"\"\n\n    pdf_page_from_image: Path | None = None\n    \"\"\"Single page PDF from image.\"\"\"\n\n    hocr: Path | None = None\n    \"\"\"Single page hOCR file.\"\"\"\n\n    textpdf: Path | None 
= None\n    \"\"\"hOCR file after conversion to PDF.\"\"\"\n\n    orientation_correction: int = 0\n    \"\"\"Orientation correction in degrees.\"\"\"\n\n    ocr_tree: OcrElement | None = None\n    \"\"\"Direct OcrElement tree (when using generate_ocr() API).\"\"\"\n\n    @classmethod\n    def from_json(cls, json_str: str) -> HOCRResult:\n        \"\"\"Create an instance from a dict.\"\"\"\n        return cls(**json.loads(json_str, cls=HOCRResultDecoder))\n\n    def to_json(self) -> str:\n        \"\"\"Serialize to a JSON string.\"\"\"\n        return json.dumps(self.__dict__, cls=HOCRResultEncoder)\n\n\ndef configure_debug_logging(\n    log_filename: Path, prefix: str = ''\n) -> tuple[logging.FileHandler, Callable[[], None]]:\n    \"\"\"Create a debug log file at a specified location.\n\n    Returns the log handler, and a function to remove the handler.\n\n    Args:\n        log_filename: Where to the put the log file.\n        prefix: The logging domain prefix that should be sent to the log.\n    \"\"\"\n    log_file_handler = logging.FileHandler(log_filename, delay=True)\n    log_file_handler.setLevel(logging.DEBUG)\n    formatter = logging.Formatter(\n        '[%(asctime)s] - %(name)s - %(levelname)7s -%(pageno)s %(message)s'\n    )\n    log_file_handler.setFormatter(formatter)\n    log_file_handler.addFilter(PageNumberFilter())\n    logging.getLogger(prefix).addHandler(log_file_handler)\n\n    def remover():\n        try:\n            logging.getLogger(prefix).removeHandler(log_file_handler)\n            log_file_handler.close()\n        except OSError as e:\n            print(e, file=sys.stderr)\n\n    return log_file_handler, remover\n\n\ndef worker_init(max_pixels: int | None) -> None:\n    \"\"\"Initialize a worker thread or process.\"\"\"\n    # In Windows, child process will not inherit our change to this value in\n    # the parent process, so ensure workers get it set. 
Not needed when running\n    # threaded, but harmless to set again.\n    PIL.Image.MAX_IMAGE_PIXELS = max_pixels\n    pikepdf_enable_mmap()\n\n\n@contextmanager\ndef manage_debug_log_handler(\n    *,\n    options: OcrOptions,\n    work_folder: Path,\n):\n    remover = None\n    if (options.keep_temporary_files or options.verbose >= 1) and not os.environ.get(\n        'PYTEST_CURRENT_TEST', ''\n    ):\n        # Debug log for command line interface only with verbose output\n        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this\n        # when pytest is running\n        _debug_log_handler, remover = configure_debug_logging(\n            work_folder / \"debug.log\", prefix=\"\"\n        )  # pragma: no cover\n    try:\n        yield\n    finally:\n        if remover:\n            remover()\n\n\ndef _print_temp_folder_location(work_folder: Path):\n    \"\"\"Print the location of the temporary work folder.\"\"\"\n    msgs = [f\"Temporary working files retained at:\\n{work_folder}\"]\n    if running_in_docker():  # pragma: no cover\n        msgs.append(\n            \"OCRmyPDF is running in a Docker container, \"\n            \"so the files will be inside the container.\"\n        )\n    elif running_in_snap():  # pragma: no cover\n        msgs.append(\n            \"OCRmyPDF is running in a Snap container, \"\n            \"so the files will be inside the container.\"\n        )\n    print('\\n'.join(msgs), file=sys.stderr)\n\n\n@contextmanager\ndef manage_work_folder(*, work_folder: Path, retain: bool, print_location: bool):\n    try:\n        yield work_folder\n    finally:\n        if retain:\n            if print_location:\n                _print_temp_folder_location(work_folder)\n        else:\n            shutil.rmtree(work_folder, ignore_errors=True)\n\n\ndef cli_exception_handler(\n    fn: Callable[[OcrOptions, OcrmypdfPluginManager], ExitCode],\n    options: OcrOptions,\n    plugin_manager: OcrmypdfPluginManager,\n) -> ExitCode:\n  
  \"\"\"Convert exceptions into command line error messages and exit codes.\n\n    When known exceptions are raised, the exception message is printed to stderr\n    and the program exits with a non-zero exit code. When unknown exceptions are\n    raised, the exception traceback is printed to stderr and the program exits\n    with a non-zero exit code.\n    \"\"\"\n    try:\n        # We cannot use a generator and yield here, as would be the usual pattern\n        # for exception handling context managers, because we need to return an exit\n        # code.\n        return fn(options, plugin_manager)\n    except KeyboardInterrupt:\n        if options.verbose >= 1:\n            log.exception(\"KeyboardInterrupt\")\n        else:\n            log.error(\"KeyboardInterrupt\")\n        return ExitCode.ctrl_c\n    except ExitCodeException as e:\n        e = cast(ExitCodeException, e)\n        if options.verbose >= 1:\n            log.exception(\"ExitCodeException\")\n        elif str(e):\n            log.error(\"%s: %s\", type(e).__name__, str(e))\n        else:\n            log.error(type(e).__name__)\n        return e.exit_code\n    except ValueError as e:\n        # Convert Pydantic validation errors to BadArgsError for proper exit code\n        if \"validation error\" in str(e).lower() or \"value error\" in str(e).lower():\n            if options.verbose >= 1:\n                log.exception(\"Validation error\")\n            else:\n                log.error(\"Invalid argument: %s\", str(e))\n            return ExitCode.bad_args\n        # Re-raise other ValueErrors to be caught by the general exception handler\n        raise\n    except PIL.Image.DecompressionBombError:\n        log.exception(\n            \"A decompression bomb error was encountered while executing the \"\n            \"pipeline. 
Use the argument --max-image-mpixels to raise the maximum \"\n            \"image pixel limit.\"\n        )\n        return ExitCode.other_error\n    except (\n        BrokenProcessPool,\n        BrokenThreadPool,\n    ):\n        log.exception(\n            \"A worker process was terminated unexpectedly. This is known to occur if \"\n            \"processing your file takes all available swap space and RAM. It may \"\n            \"help to try again with a smaller number of jobs, using the --jobs \"\n            \"argument.\"\n        )\n        return ExitCode.child_process_error\n    except Exception:  # pylint: disable=broad-except\n        log.exception(\"An exception occurred while executing the pipeline\")\n        return ExitCode.other_error\n\n\ndef setup_pipeline(\n    options: OcrOptions,\n    plugin_manager: OcrmypdfPluginManager,\n) -> Executor:\n    # Any changes to options will not take effect for options that are already\n    # bound to function parameters in the pipeline. 
(For example\n    # options.input_file, options.pdf_renderer are already bound.)\n    # Note: OcrOptions is immutable, so we can't modify options.jobs directly\n    # The jobs field should already be set correctly during OcrOptions creation\n\n    # Apply PIL max image pixels side effect\n    PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1_000_000)\n    if PIL.Image.MAX_IMAGE_PIXELS == 0:\n        PIL.Image.MAX_IMAGE_PIXELS = None  # type: ignore\n\n    pikepdf_enable_mmap()\n    executor = setup_executor(plugin_manager)\n    return executor\n\n\ndef do_get_pdfinfo(pdf_path: Path, executor: Executor, options) -> PdfInfo:\n    # Handle pages field - it might be a string that needs conversion\n    check_pages = options.pages\n    if isinstance(check_pages, str):\n        from ocrmypdf._options import _pages_from_ranges\n\n        check_pages = _pages_from_ranges(check_pages)\n\n    return get_pdfinfo(\n        pdf_path,\n        executor=executor,\n        detailed_analysis=options.redo_ocr,\n        progbar=options.progress_bar,\n        max_workers=options.jobs,\n        use_threads=options.use_threads,\n        check_pages=check_pages,\n    )\n\n\ndef preprocess(\n    page_context: PageContext,\n    image: Path,\n    remove_background: bool,\n    deskew: bool,\n    clean: bool,\n) -> Path:\n    \"\"\"Preprocess an image.\"\"\"\n    if remove_background:\n        image = preprocess_remove_background(image, page_context)\n    if deskew:\n        image = preprocess_deskew(image, page_context)\n    if clean:\n        image = preprocess_clean(image, page_context)\n    return image\n\n\ndef make_intermediate_images(\n    page_context: PageContext, orientation_correction: int\n) -> tuple[Path, Path | None]:\n    \"\"\"Create intermediate and preprocessed images for OCR.\"\"\"\n    options = page_context.options\n\n    ocr_image = preprocess_out = None\n    rasterize_out = rasterize(\n        page_context.origin,\n        page_context,\n        
correction=orientation_correction,\n        remove_vectors=False,\n    )\n\n    if not any([options.clean, options.clean_final, options.remove_vectors]):\n        ocr_image = preprocess_out = preprocess(\n            page_context,\n            rasterize_out,\n            options.remove_background,\n            options.deskew,\n            clean=False,\n        )\n    else:\n        if not options.lossless_reconstruction:\n            preprocess_out = preprocess(\n                page_context,\n                rasterize_out,\n                options.remove_background,\n                options.deskew,\n                clean=options.clean_final,\n            )\n        if options.remove_vectors:\n            rasterize_ocr_out = rasterize(\n                page_context.origin,\n                page_context,\n                correction=orientation_correction,\n                remove_vectors=True,\n                output_tag='_ocr',\n            )\n        else:\n            rasterize_ocr_out = rasterize_out\n\n        if (\n            preprocess_out\n            and rasterize_ocr_out == rasterize_out\n            and options.clean == options.clean_final\n        ):\n            # Optimization: image for OCR is identical to presentation image\n            ocr_image = preprocess_out\n        else:\n            ocr_image = preprocess(\n                page_context,\n                rasterize_ocr_out,\n                options.remove_background,\n                options.deskew,\n                clean=options.clean,\n            )\n    return ocr_image, preprocess_out\n\n\ndef process_page(page_context: PageContext) -> tuple[Path, Path | None, int]:\n    \"\"\"Process page to create OCR image, visible page image and orientation.\"\"\"\n    options = page_context.options\n    orientation_correction = 0\n    if options.rotate_pages:\n        # Rasterize\n        rasterize_preview_out = rasterize_preview(page_context.origin, page_context)\n        orientation_correction = 
get_orientation_correction(\n            rasterize_preview_out, page_context\n        )\n\n    ocr_image, preprocess_out = make_intermediate_images(\n        page_context, orientation_correction\n    )\n    ocr_image_out = create_ocr_image(ocr_image, page_context)\n\n    pdf_page_from_image_out = None\n    if not options.lossless_reconstruction:\n        assert preprocess_out\n        visible_image_out = preprocess_out\n        if should_visible_page_image_use_jpg(page_context.pageinfo):\n            visible_image_out = create_visible_page_jpg(visible_image_out, page_context)\n        filtered_image = page_context.plugin_manager.filter_page_image(\n            page=page_context, image_filename=visible_image_out\n        )\n        if filtered_image is not None:  # None if no hook is present\n            visible_image_out = filtered_image\n        pdf_page_from_image_out = create_pdf_page_from_image(\n            visible_image_out, page_context, orientation_correction\n        )\n    return ocr_image_out, pdf_page_from_image_out, orientation_correction\n\n\ndef postprocess(\n    pdf_file: Path, context: PdfContext, executor: Executor\n) -> tuple[Path, Sequence[str]]:\n    \"\"\"Postprocess the PDF file.\"\"\"\n    # pdf_out = pdf_file\n    with Pdf.open(pdf_file) as pdf:\n        fix_annots = context.get_path('fix_annots.pdf')\n        if remove_broken_goto_annotations(pdf):\n            pdf.save(fix_annots)\n            pdf_out = fix_annots\n        else:\n            pdf_out = pdf_file\n    if context.options.output_type == 'auto':\n        # Best effort PDF/A - never uses Ghostscript\n        pdf_out, actual_type = try_auto_pdfa(pdf_out, context)\n        # Store actual output type for reporting\n        context.options.extra_attrs['_actual_output_type'] = actual_type\n    elif context.options.output_type.startswith('pdfa'):\n        # Required PDF/A - uses Ghostscript as fallback\n        speculative_result = try_speculative_pdfa(pdf_out, context)\n        if 
speculative_result is not None:\n            pdf_out = speculative_result\n        else:\n            # Fall back to Ghostscript conversion\n            ps_stub_out = generate_postscript_stub(context)\n            pdf_out = convert_to_pdfa(pdf_out, ps_stub_out, context)\n\n    optimizing = context.plugin_manager.is_optimization_enabled(context=context)\n    save_settings = get_pdf_save_settings(context.options.output_type)\n    save_settings['linearize'] = not optimizing and should_linearize(pdf_out, context)\n\n    pdf_out = metadata_fixup(pdf_out, context, pdf_save_settings=save_settings)\n    return optimize_pdf(pdf_out, context, executor)\n\n\ndef report_output_pdf(options, start_input_file, optimize_messages) -> ExitCode:\n    if options.output_file == '-':\n        log.info(\"Output sent to stdout\")\n    elif hasattr(options.output_file, 'writable') and options.output_file.writable():\n        log.info(\"Output written to stream\")\n    elif samefile(options.output_file, Path(os.devnull)):\n        pass  # Say nothing when sending to dev null\n    else:\n        if options.output_type == 'auto':\n            # For 'auto' mode, check what we actually produced\n            actual_type = options.extra_attrs.get('_actual_output_type', 'pdf')\n            pdfa_info = file_claims_pdfa(options.output_file)\n            if actual_type == 'pdfa' and pdfa_info['pass']:\n                log.info(\n                    \"Output file is a %s (auto mode achieved PDF/A)\",\n                    pdfa_info['conformance'],\n                )\n            elif pdfa_info['pass']:\n                # Unexpectedly got PDF/A\n                log.info(\"Output file is a %s\", pdfa_info['conformance'])\n            else:\n                # Regular PDF - this is expected for auto mode fallback\n                log.info(\"Output file is a PDF (auto mode)\")\n        elif options.output_type.startswith('pdfa'):\n            pdfa_info = file_claims_pdfa(options.output_file)\n            if 
pdfa_info['pass']:\n                log.info(\"Output file is a %s (as expected)\", pdfa_info['conformance'])\n            else:\n                log.warning(\n                    \"Output file is a valid PDF, but conversion to PDF/A did not \"\n                    \"succeed (issue: %s)\",\n                    pdfa_info['conformance'],\n                )\n                return ExitCode.pdfa_conversion_failed\n        if not check_pdf(options.output_file):\n            log.warning('Output file: The generated PDF is INVALID')\n            return ExitCode.invalid_output_pdf\n        report_output_file_size(\n            options, start_input_file, options.output_file, optimize_messages\n        )\n    return ExitCode.ok\n"
  },
  {
    "path": "src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py",
    "content": "# SPDX-FileCopyrightText: 2019-2023 James R. Barlow\n# SPDX-FileCopyrightText: 2019 Martin Wind\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Implements the concurrent and page synchronous parts of the pipeline.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport logging.handlers\nfrom collections.abc import Sequence\nfrom functools import partial\n\nimport PIL\n\nfrom ocrmypdf._concurrent import Executor\nfrom ocrmypdf._graft import OcrGrafter\nfrom ocrmypdf._jobcontext import PageContext, PdfContext\nfrom ocrmypdf._options import OcrOptions\nfrom ocrmypdf._pipeline import copy_final\nfrom ocrmypdf._pipelines._common import (\n    HOCRResult,\n    do_get_pdfinfo,\n    manage_work_folder,\n    postprocess,\n    report_output_pdf,\n    set_thread_pageno,\n    setup_pipeline,\n    worker_init,\n)\nfrom ocrmypdf._plugin_manager import OcrmypdfPluginManager\nfrom ocrmypdf._progressbar import ProgressBar\nfrom ocrmypdf.exceptions import ExitCode\nfrom ocrmypdf.helpers import available_cpu_count\n\nlog = logging.getLogger(__name__)\n\n\ndef _exec_hocrtransform_sync(page_context: PageContext) -> HOCRResult:\n    \"\"\"Process each page.\"\"\"\n    hocr_json = page_context.get_path('hocr.json')\n    if not hocr_json.exists():\n        # No hOCR file, so no OCR was performed on this page.\n        return HOCRResult(pageno=page_context.pageno)\n    hocr_result = HOCRResult.from_json(hocr_json.read_text())\n    # hOCR path is passed directly to the grafting phase where fpdf2 renders it\n    hocr_result.textpdf = page_context.get_path('ocr_hocr.hocr')\n    return hocr_result\n\n\ndef exec_hocr_to_ocr_pdf(context: PdfContext, executor: Executor) -> Sequence[str]:\n    \"\"\"Convert hOCR files to OCR PDF.\"\"\"\n    # Run exec_page_sync on every page\n    options = context.options\n    jobs = options.jobs or available_cpu_count()\n    max_workers = min(len(context.pdfinfo), jobs)\n    if max_workers > 1:\n        log.info(\"Continue processing %d 
pages concurrently\", max_workers)\n\n    ocrgraft = OcrGrafter(context)\n\n    def graft_page(result: HOCRResult, pbar: ProgressBar):\n        \"\"\"Graft text only PDF on to main PDF's page.\"\"\"\n        try:\n            set_thread_pageno(result.pageno + 1)\n            pbar.update()\n            ocrgraft.graft_page(\n                pageno=result.pageno,\n                image=result.pdf_page_from_image,\n                ocr_output=result.textpdf,\n                ocr_tree=result.ocr_tree,\n                autorotate_correction=result.orientation_correction,\n            )\n            pbar.update()\n        finally:\n            set_thread_pageno(None)\n\n    executor(\n        use_threads=options.use_threads,\n        max_workers=max_workers,\n        progress_kwargs=dict(\n            total=(2 * len(context.pdfinfo)),\n            desc='Grafting hOCR to PDF',\n            unit='page',\n            unit_scale=0.5,\n            disable=not options.progress_bar,\n        ),\n        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),\n        task=_exec_hocrtransform_sync,\n        task_arguments=context.get_page_context_args(),\n        task_finished=graft_page,\n    )\n\n    pdf = ocrgraft.finalize()\n    messages: Sequence[str] = []\n    if options.output_type != 'none':\n        # PDF/A and metadata\n        log.info(\"Postprocessing...\")\n        pdf, messages = postprocess(pdf, context, executor)\n\n        # Copy PDF file to destination (we don't know the input PDF file name)\n        copy_final(pdf, options.output_file, None)\n    return messages\n\n\ndef run_hocr_to_ocr_pdf_pipeline(\n    options: OcrOptions,\n    *,\n    plugin_manager: OcrmypdfPluginManager,\n) -> ExitCode:\n    \"\"\"Run pipeline to convert hOCR to final output PDF.\"\"\"\n    with manage_work_folder(\n        work_folder=options.work_folder, retain=True, print_location=False\n    ) as work_folder:\n        executor = setup_pipeline(options, plugin_manager)\n     
   origin_pdf = work_folder / 'origin.pdf'\n\n        # Gather pdfinfo and create context\n        pdfinfo = do_get_pdfinfo(origin_pdf, executor, options)\n        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)\n        plugin_manager.check_options(options=options)\n        optimize_messages = exec_hocr_to_ocr_pdf(context, executor)\n\n        return report_output_pdf(options, origin_pdf, optimize_messages)\n"
  },
  {
    "path": "src/ocrmypdf/_pipelines/ocr.py",
    "content": "# SPDX-FileCopyrightText: 2019-2023 James R. Barlow\n# SPDX-FileCopyrightText: 2019 Martin Wind\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Implements the concurrent and page synchronous parts of the pipeline.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport logging.handlers\nfrom collections.abc import Sequence\nfrom functools import partial\nfrom pathlib import Path\nfrom tempfile import mkdtemp\n\nimport PIL\n\nfrom ocrmypdf._concurrent import Executor\nfrom ocrmypdf._graft import OcrGrafter\nfrom ocrmypdf._jobcontext import PageContext, PdfContext\nfrom ocrmypdf._options import OcrOptions\nfrom ocrmypdf._pipeline import (\n    copy_final,\n    is_ocr_required,\n    merge_sidecars,\n    ocr_engine_direct,\n    ocr_engine_hocr,\n    ocr_engine_textonly_pdf,\n    triage,\n    validate_pdfinfo_options,\n)\nfrom ocrmypdf._pipelines._common import (\n    PageResult,\n    cli_exception_handler,\n    do_get_pdfinfo,\n    manage_debug_log_handler,\n    manage_work_folder,\n    postprocess,\n    process_page,\n    report_output_pdf,\n    set_thread_pageno,\n    setup_pipeline,\n    worker_init,\n)\nfrom ocrmypdf._plugin_manager import OcrmypdfPluginManager\nfrom ocrmypdf._progressbar import ProgressBar\nfrom ocrmypdf._validation import (\n    check_requested_output_file,\n    create_input_file,\n)\nfrom ocrmypdf.exceptions import ExitCode\nfrom ocrmypdf.helpers import available_cpu_count\nfrom ocrmypdf.models.ocr_element import OcrElement\n\nlog = logging.getLogger(__name__)\n\n\ndef _image_to_ocr_text(\n    page_context: PageContext, ocr_image_out: Path\n) -> tuple[Path | None, Path, OcrElement | None]:\n    \"\"\"Run OCR engine on image to create OCR PDF and text file.\"\"\"\n    options = page_context.options\n    pdf_renderer = options.pdf_renderer\n\n    # fpdf2 is the default renderer (auto resolves to fpdf2)\n    if pdf_renderer in ('auto', 'fpdf2'):\n        # Use generate_ocr() if the engine supports it, otherwise use hOCR 
path\n        ocr_engine = page_context.plugin_manager.get_ocr_engine(options=options)\n        if ocr_engine and ocr_engine.supports_generate_ocr():\n            ocr_tree, text_out = ocr_engine_direct(ocr_image_out, page_context)\n            return None, text_out, ocr_tree\n        ocr_out, text_out = ocr_engine_hocr(ocr_image_out, page_context)\n    elif pdf_renderer == 'sandwich':\n        ocr_out, text_out = ocr_engine_textonly_pdf(ocr_image_out, page_context)\n    else:\n        raise NotImplementedError(f\"pdf_renderer {pdf_renderer}\")\n    return ocr_out, text_out, None\n\n\ndef _exec_page_sync(page_context: PageContext) -> PageResult:\n    \"\"\"Execute a pipeline for a single page synchronously.\"\"\"\n    set_thread_pageno(page_context.pageno + 1)\n\n    if not is_ocr_required(page_context):\n        return PageResult(pageno=page_context.pageno)\n\n    ocr_image_out, pdf_page_from_image_out, orientation_correction = process_page(\n        page_context\n    )\n    ocr_out, text_out, ocr_tree = _image_to_ocr_text(page_context, ocr_image_out)\n    return PageResult(\n        pageno=page_context.pageno,\n        pdf_page_from_image=pdf_page_from_image_out,\n        ocr=ocr_out,\n        text=text_out,\n        orientation_correction=orientation_correction,\n        ocr_tree=ocr_tree,\n    )\n\n\ndef exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]:\n    \"\"\"Execute the OCR pipeline concurrently.\"\"\"\n    options = context.options\n    jobs = options.jobs or available_cpu_count()\n    max_workers = min(len(context.pdfinfo), jobs)\n    if max_workers > 1:\n        log.info(\"Starting processing with %d workers concurrently\", max_workers)\n\n    sidecars: list[Path | None] = [None] * len(context.pdfinfo)\n    ocrgraft = OcrGrafter(context)\n\n    def update_page(result: PageResult, pbar: ProgressBar):\n        \"\"\"After OCR is complete for a page, update the PDF.\"\"\"\n        try:\n            set_thread_pageno(result.pageno + 
1)\n            sidecars[result.pageno] = result.text\n            pbar.update(0.5)\n            ocrgraft.graft_page(\n                pageno=result.pageno,\n                image=result.pdf_page_from_image,\n                ocr_output=result.ocr,\n                ocr_tree=result.ocr_tree,\n                autorotate_correction=result.orientation_correction,\n            )\n            pbar.update(0.5)\n        finally:\n            set_thread_pageno(None)\n\n    executor(\n        use_threads=options.use_threads,\n        max_workers=max_workers,\n        progress_kwargs=dict(\n            total=len(context.pdfinfo),\n            desc='OCR' if options.ocr_engine != 'none' else 'Image processing',\n            unit='page',\n            disable=not options.progress_bar,\n        ),\n        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),\n        task=_exec_page_sync,\n        task_arguments=context.get_page_context_args(),\n        task_finished=update_page,\n    )\n\n    # Output sidecar text\n    if options.sidecar:\n        text = merge_sidecars(sidecars, context)\n        # Copy text file to destination\n        copy_final(text, options.sidecar, options.input_file)\n\n    # Merge layers to one single pdf\n    pdf = ocrgraft.finalize()\n\n    messages: Sequence[str] = []\n    if options.output_type != 'none':\n        # PDF/A and metadata\n        log.info(\"Postprocessing...\")\n        pdf, messages = postprocess(pdf, context, executor)\n\n        # Copy PDF file to destination\n        copy_final(pdf, options.output_file, options.input_file)\n    return messages\n\n\ndef _run_pipeline(\n    options: OcrOptions,\n    plugin_manager: OcrmypdfPluginManager,\n) -> ExitCode:\n    with (\n        manage_work_folder(\n            work_folder=Path(mkdtemp(prefix=\"ocrmypdf.io.\")),\n            retain=options.keep_temporary_files,\n            print_location=options.keep_temporary_files,\n        ) as work_folder,\n        
manage_debug_log_handler(options=options, work_folder=work_folder),\n    ):\n        executor = setup_pipeline(options, plugin_manager)\n        check_requested_output_file(options)\n        start_input_file, original_filename = create_input_file(options, work_folder)\n\n        # Triage image or pdf\n        origin_pdf = triage(\n            original_filename, start_input_file, work_folder / 'origin.pdf', options\n        )\n\n        # Gather pdfinfo and create context\n        pdfinfo = do_get_pdfinfo(origin_pdf, executor, options)\n        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)\n\n        # Validate options are okay for this pdf\n        validate_pdfinfo_options(context)\n\n        # Execute the pipeline\n        optimize_messages = exec_concurrent(context, executor)\n\n        exitcode = report_output_pdf(options, start_input_file, optimize_messages)\n        return exitcode\n\n\ndef run_pipeline_cli(\n    options: OcrOptions,\n    *,\n    plugin_manager: OcrmypdfPluginManager,\n) -> ExitCode:\n    \"\"\"Run the OCR pipeline with command line exception handling.\n\n    Args:\n        options: The parsed OCR options.\n        plugin_manager: The plugin manager to use. If not provided, one will be\n            created.\n    \"\"\"\n    return cli_exception_handler(_run_pipeline, options, plugin_manager)\n\n\ndef run_pipeline(\n    options: OcrOptions,\n    *,\n    plugin_manager: OcrmypdfPluginManager,\n) -> ExitCode:\n    \"\"\"Run the OCR pipeline without command line exception handling.\n\n    Args:\n        options: The parsed OCR options.\n        plugin_manager: The plugin manager to use. If not provided, one will be\n            created.\n    \"\"\"\n    return _run_pipeline(options, plugin_manager)\n"
  },
  {
    "path": "src/ocrmypdf/_pipelines/pdf_to_hocr.py",
    "content": "# SPDX-FileCopyrightText: 2019-2023 James R. Barlow\n# SPDX-FileCopyrightText: 2019 Martin Wind\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Implements the concurrent and page synchronous parts of the pipeline.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport logging.handlers\nimport shutil\nfrom functools import partial\n\nimport PIL\n\nfrom ocrmypdf._concurrent import Executor\nfrom ocrmypdf._jobcontext import PageContext, PdfContext\nfrom ocrmypdf._options import OcrOptions\nfrom ocrmypdf._pipeline import (\n    is_ocr_required,\n    ocr_engine_hocr,\n    validate_pdfinfo_options,\n)\nfrom ocrmypdf._pipelines._common import (\n    HOCRResult,\n    do_get_pdfinfo,\n    manage_work_folder,\n    process_page,\n    set_thread_pageno,\n    setup_pipeline,\n    worker_init,\n)\nfrom ocrmypdf._plugin_manager import OcrmypdfPluginManager\nfrom ocrmypdf.helpers import available_cpu_count\n\nlog = logging.getLogger(__name__)\n\n\ndef _exec_page_hocr_sync(page_context: PageContext) -> HOCRResult:\n    \"\"\"Execute a pipeline for a single page hOCR.\"\"\"\n    set_thread_pageno(page_context.pageno + 1)\n\n    if not is_ocr_required(page_context):\n        return HOCRResult(pageno=page_context.pageno)\n\n    ocr_image_out, pdf_page_from_image_out, orientation_correction = process_page(\n        page_context\n    )\n    hocr_out, _ = ocr_engine_hocr(ocr_image_out, page_context)\n\n    result = HOCRResult(\n        pageno=page_context.pageno,\n        pdf_page_from_image=pdf_page_from_image_out,\n        hocr=hocr_out,\n        orientation_correction=orientation_correction,\n    )\n    page_context.get_path('hocr.json').write_text(result.to_json())\n    return result\n\n\ndef exec_pdf_to_hocr(context: PdfContext, executor: Executor) -> None:\n    \"\"\"Execute the OCR pipeline concurrently and output hOCR.\"\"\"\n    # Run exec_page_sync on every page\n    options = context.options\n    jobs = options.jobs or available_cpu_count()\n    
max_workers = min(len(context.pdfinfo), jobs)\n    if max_workers > 1:\n        log.info(\"Starting processing with %d workers concurrently\", max_workers)\n\n    executor(\n        use_threads=options.use_threads,\n        max_workers=max_workers,\n        progress_kwargs=dict(\n            total=(2 * len(context.pdfinfo)),\n            desc='hOCR',\n            unit='page',\n            unit_scale=0.5,\n            disable=not options.progress_bar,\n        ),\n        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),\n        task=_exec_page_hocr_sync,\n        task_arguments=context.get_page_context_args(),\n    )\n\n\ndef run_hocr_pipeline(\n    options: OcrOptions,\n    *,\n    plugin_manager: OcrmypdfPluginManager,\n) -> None:\n    \"\"\"Run pipeline to output hOCR.\"\"\"\n    if options.output_folder is None:\n        raise ValueError(\"output_folder must be specified for hOCR pipeline\")\n    with manage_work_folder(\n        work_folder=options.output_folder, retain=True, print_location=False\n    ) as work_folder:\n        executor = setup_pipeline(options, plugin_manager)\n        origin_pdf = work_folder / 'origin.pdf'\n        shutil.copy2(options.input_file, origin_pdf)\n\n        # Gather pdfinfo and create context\n        pdfinfo = do_get_pdfinfo(origin_pdf, executor, options)\n        context = PdfContext(\n            options, work_folder, options.input_file, pdfinfo, plugin_manager\n        )\n        # Validate options are okay for this pdf\n        validate_pdfinfo_options(context)\n        exec_pdf_to_hocr(context, executor)\n"
  },
  {
    "path": "src/ocrmypdf/_plugin_manager.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Plugin manager using pluggy with type-safe interface.\"\"\"\n\nfrom __future__ import annotations\n\nimport importlib\nimport importlib.util\nimport pkgutil\nimport sys\nfrom argparse import ArgumentParser\nfrom collections.abc import Sequence\nfrom logging import Handler\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING\n\nimport pluggy\nfrom pydantic import BaseModel\n\nimport ocrmypdf.builtin_plugins\nfrom ocrmypdf import Executor, PdfContext, pluginspec\nfrom ocrmypdf._options import OcrOptions\nfrom ocrmypdf._progressbar import ProgressBar\nfrom ocrmypdf.helpers import Resolution\nfrom ocrmypdf.pluginspec import OcrEngine\n\nif TYPE_CHECKING:\n    from PIL import Image\n\n    from ocrmypdf._jobcontext import PageContext\n    from ocrmypdf.pdfinfo import PdfInfo\n\n\nclass OcrmypdfPluginManager:\n    \"\"\"Type-safe wrapper around pluggy.PluginManager.\n\n    Capable of reconstructing itself in child workers via pickle.\n\n    This class provides type-safe methods for all hooks defined in pluginspec.py,\n    removing the need for unsafe `hook.method_name()` calls.\n    \"\"\"\n\n    def __init__(\n        self,\n        *args,\n        plugins: Sequence[str | Path],\n        builtins: bool = True,\n        **kwargs,\n    ):\n        self._init_args = args\n        self._init_kwargs = kwargs\n        self._plugins = plugins\n        self._builtins = builtins\n        self._pm = pluggy.PluginManager(*args, **kwargs)\n        self._setup_plugins()\n\n    @property\n    def pluggy(self) -> pluggy.PluginManager:\n        \"\"\"Access the underlying pluggy.PluginManager for advanced use cases.\n\n        This is useful for plugins that need to call methods like set_blocked()\n        in their initialize hook.\n        \"\"\"\n        return self._pm\n\n    def __getstate__(self):\n        state = dict(\n            init_args=self._init_args,\n         
   plugins=self._plugins,\n            builtins=self._builtins,\n            init_kwargs=self._init_kwargs,\n        )\n        return state\n\n    def __setstate__(self, state):\n        self.__init__(\n            *state['init_args'],\n            plugins=state['plugins'],\n            builtins=state['builtins'],\n            **state['init_kwargs'],\n        )\n\n    def _setup_plugins(self):\n        self._pm.add_hookspecs(pluginspec)\n\n        # 1. Register builtins\n        if self._builtins:\n            for module in sorted(\n                pkgutil.iter_modules(ocrmypdf.builtin_plugins.__path__)\n            ):\n                name = f'ocrmypdf.builtin_plugins.{module.name}'\n                module = importlib.import_module(name)\n                self._pm.register(module)\n\n        # 2. Register setuptools plugins\n        self._pm.load_setuptools_entrypoints('ocrmypdf')\n\n        # 3. Register plugins specified on command line\n        for name in self._plugins:\n            if isinstance(name, Path) or name.endswith('.py'):\n                # Import by filename\n                module_name = Path(name).stem\n                spec = importlib.util.spec_from_file_location(module_name, name)\n                module = importlib.util.module_from_spec(spec)\n                sys.modules[module_name] = module\n                spec.loader.exec_module(module)\n            else:\n                # Import by dotted module name\n                module = importlib.import_module(name)\n            self._pm.register(module)\n\n    # =========================================================================\n    # Type-safe hook methods\n    # =========================================================================\n\n    # --- firstresult hooks ---\n\n    def get_logging_console(self) -> Handler | None:\n        \"\"\"Returns a custom logging handler for progress bar compatibility.\"\"\"\n        return self._pm.hook.get_logging_console()\n\n    def get_executor(self, 
*, progressbar_class: type[ProgressBar]) -> Executor | None:\n        \"\"\"Returns an executor for parallel processing.\"\"\"\n        return self._pm.hook.get_executor(progressbar_class=progressbar_class)\n\n    def get_progressbar_class(self) -> type[ProgressBar] | None:\n        \"\"\"Returns a progress bar class.\"\"\"\n        return self._pm.hook.get_progressbar_class()\n\n    def rasterize_pdf_page(\n        self,\n        *,\n        input_file: Path,\n        output_file: Path,\n        raster_device: str,\n        raster_dpi: Resolution,\n        pageno: int,\n        page_dpi: Resolution | None,\n        rotation: int | None,\n        filter_vector: bool,\n        stop_on_soft_error: bool,\n        options: OcrOptions | None,\n        use_cropbox: bool,\n    ) -> Path | None:\n        \"\"\"Rasterize one page of a PDF at specified resolution.\"\"\"\n        return self._pm.hook.rasterize_pdf_page(\n            input_file=input_file,\n            output_file=output_file,\n            raster_device=raster_device,\n            raster_dpi=raster_dpi,\n            pageno=pageno,\n            page_dpi=page_dpi,\n            rotation=rotation,\n            filter_vector=filter_vector,\n            stop_on_soft_error=stop_on_soft_error,\n            options=options,\n            use_cropbox=use_cropbox,\n        )\n\n    def filter_ocr_image(\n        self, *, page: PageContext, image: Image.Image\n    ) -> Image.Image | None:\n        \"\"\"Filter the image before it is sent to OCR.\"\"\"\n        return self._pm.hook.filter_ocr_image(page=page, image=image)\n\n    def filter_page_image(\n        self, *, page: PageContext, image_filename: Path\n    ) -> Path | None:\n        \"\"\"Filter the whole page image before it is inserted into the PDF.\"\"\"\n        return self._pm.hook.filter_page_image(page=page, image_filename=image_filename)\n\n    def filter_pdf_page(\n        self, *, page: PageContext, image_filename: Path, output_pdf: Path\n    ) -> Path:\n   
     \"\"\"Convert a filtered whole page image into a PDF.\"\"\"\n        result = self._pm.hook.filter_pdf_page(\n            page=page, image_filename=image_filename, output_pdf=output_pdf\n        )\n        if result is None:\n            raise ValueError('No PDF produced')\n        if result != output_pdf:\n            raise ValueError('filter_pdf_page must return output_pdf')\n        return result\n\n    def get_ocr_engine(self, *, options: OcrOptions | None = None) -> OcrEngine:\n        \"\"\"Returns an OcrEngine to use for processing.\n\n        Args:\n            options: OcrOptions to pass to the hook for engine selection.\n        \"\"\"\n        result = self._pm.hook.get_ocr_engine(options=options)\n        if result is None:\n            raise ValueError('No OCR engine selected')\n        return result\n\n    def generate_pdfa(\n        self,\n        *,\n        pdf_pages: list[Path],\n        pdfmark: Path,\n        output_file: Path,\n        context: PdfContext,\n        pdf_version: str,\n        pdfa_part: str,\n        progressbar_class: type[ProgressBar] | None,\n        stop_on_soft_error: bool,\n    ) -> Path | None:\n        \"\"\"Generate a PDF/A file.\"\"\"\n        return self._pm.hook.generate_pdfa(\n            pdf_pages=pdf_pages,\n            pdfmark=pdfmark,\n            output_file=output_file,\n            context=context,\n            pdf_version=pdf_version,\n            pdfa_part=pdfa_part,\n            progressbar_class=progressbar_class,\n            stop_on_soft_error=stop_on_soft_error,\n        )\n\n    def optimize_pdf(\n        self,\n        *,\n        input_pdf: Path,\n        output_pdf: Path,\n        context: PdfContext,\n        executor: Executor,\n        linearize: bool,\n    ) -> tuple[Path, Sequence[str]]:\n        \"\"\"Optimize a PDF after OCR processing.\"\"\"\n        result = self._pm.hook.optimize_pdf(\n            input_pdf=input_pdf,\n            output_pdf=output_pdf,\n            
context=context,\n            executor=executor,\n            linearize=linearize,\n        )\n        if result is None:\n            return input_pdf, []\n        return result\n\n    def is_optimization_enabled(self, *, context: PdfContext) -> bool | None:\n        \"\"\"Returns whether optimization is enabled for given context.\"\"\"\n        return self._pm.hook.is_optimization_enabled(context=context)\n\n    # --- non-firstresult hooks ---\n\n    def initialize(self, *, plugin_manager: pluggy.PluginManager) -> list[None]:\n        \"\"\"Called when plugins are first loaded.\n\n        Args:\n            plugin_manager: The underlying pluggy.PluginManager, allowing\n                plugins to call methods like set_blocked().\n        \"\"\"\n        return self._pm.hook.initialize(plugin_manager=plugin_manager)\n\n    def add_options(self, *, parser: ArgumentParser) -> list[None]:\n        \"\"\"Allows plugins to add command line and API arguments.\"\"\"\n        return self._pm.hook.add_options(parser=parser)\n\n    def register_options(self) -> list[dict[str, type[BaseModel]]]:\n        \"\"\"Returns plugin option models keyed by namespace.\"\"\"\n        return self._pm.hook.register_options()\n\n    def check_options(self, *, options: OcrOptions) -> list[None]:\n        \"\"\"Called to validate options after parsing.\"\"\"\n        return self._pm.hook.check_options(options=options)\n\n    def validate(self, *, pdfinfo: PdfInfo, options: OcrOptions) -> list[None]:\n        \"\"\"Called to validate options and pdfinfo after PDF is loaded.\"\"\"\n        return self._pm.hook.validate(pdfinfo=pdfinfo, options=options)\n\n\ndef get_plugin_manager(\n    plugins: Sequence[str | Path] | None = None, builtins=True\n) -> OcrmypdfPluginManager:\n    return OcrmypdfPluginManager(\n        project_name='ocrmypdf',\n        plugins=plugins if plugins is not None else [],\n        builtins=builtins,\n    )\n\n\n__all__ = ['OcrmypdfPluginManager', 'get_plugin_manager']\n"
  },
  {
    "path": "src/ocrmypdf/_plugin_registry.py",
    "content": "# SPDX-FileCopyrightText: 2024 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Plugin option registry for dynamic model composition.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\n\nfrom pydantic import BaseModel\n\nlog = logging.getLogger(__name__)\n\n\nclass PluginOptionRegistry:\n    \"\"\"Registry for plugin option models.\n\n    This registry collects option models from plugins during initialization.\n    Plugin options can be accessed via nested namespaces on OcrOptions\n    (e.g., options.tesseract.timeout) or via flat field names for backward\n    compatibility (e.g., options.tesseract_timeout).\n    \"\"\"\n\n    def __init__(self):\n        self._option_models: dict[str, type[BaseModel]] = {}\n\n    def register_option_model(\n        self, namespace: str, model_class: type[BaseModel]\n    ) -> None:\n        \"\"\"Register a plugin's option model.\n\n        Args:\n            namespace: The namespace for the plugin options (e.g., 'tesseract')\n            model_class: The Pydantic model class for the plugin options\n        \"\"\"\n        if namespace in self._option_models:\n            log.warning(\n                f\"Plugin option namespace '{namespace}' already registered, overriding\"\n            )\n\n        self._option_models[namespace] = model_class\n\n        log.debug(\n            f\"Registered plugin option model for namespace '{namespace}': \"\n            f\"{model_class.__name__}\"\n        )\n\n    def get_registered_models(self) -> dict[str, type[BaseModel]]:\n        \"\"\"Get all registered plugin option models.\"\"\"\n        return self._option_models.copy()\n"
  },
  {
    "path": "src/ocrmypdf/_progressbar.py",
    "content": "# SPDX-FileCopyrightText: 2023 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Defines progress bar API.\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import Protocol\n\nfrom rich.console import Console\nfrom rich.progress import (\n    BarColumn,\n    MofNCompleteColumn,\n    Progress,\n    TaskProgressColumn,\n    TextColumn,\n    TimeRemainingColumn,\n)\nfrom rich.table import Column\n\n\nclass ProgressBar(Protocol):\n    \"\"\"The protocol that OCRmyPDF expects progress bar classes to be compatible with.\n\n    In practice this could be used for any type of monitoring, not just a progress bar.\n\n    Calling the class should return a new progress bar object, which is activated\n    with ``__enter__`` and terminated with ``__exit__``. An update method is called\n    whenever the progress bar is updated. Progress bar objects will not be reused;\n    a new one will be created for each group of tasks.\n\n    The progress bar is held in the main process/thread and not updated by child\n    process/threads. When a child notifies the parent of completed work, the\n    parent updates the progress bar.\n    Progress bars should never write to ``sys.stdout``, or they will corrupt the\n    output if OCRmyPDF writes a PDF to standard output.\n\n    Note:\n        The type of events that OCRmyPDF reports to a progress bar may change in\n        minor releases.\n\n    Args:\n        total (int | float | None):\n            The total number of work units expected. If ``None``, the total is unknown.\n            For example, if you are processing pages, this might be the number of pages,\n            or if you are measuring overall progress in percent, this might be 100.\n        desc (str | None):\n            A brief description of the current step (e.g. \"Scanning contents\",\n            \"OCR\", \"PDF/A conversion\"). 
OCRmyPDF updates this before each major step.\n        unit (str | None):\n            A short label for the type of work being tracked\n            (e.g. \"page\", \"%\", \"image\").\n        disable (bool):\n            If ``True``, progress updates are suppressed (no output).\n            Defaults to ``False``.\n        **kwargs:\n            Future or extra parameters that OCRmyPDF might pass. Implementations\n            should accept and ignore unrecognized keywords gracefully.\n\n    Example:\n        A simple plugin implementation could look like this:\n\n        .. code-block:: python\n\n            from ocrmypdf.pluginspec import ProgressBar\n            from ocrmypdf import hookimpl\n\n            class ConsoleProgressBar(ProgressBar):\n                def __init__(self, *, total=None, desc=None, unit=None, disable=False,\n                             **kwargs):\n                    self.total = total\n                    self.desc = desc\n                    self.unit = unit\n                    self.disable = disable\n                    self.current = 0\n\n                def __enter__(self):\n                    if not self.disable:\n                        print(f\"Starting {self.desc or 'an OCR task'} \"\n                              f\"(total={self.total} {self.unit})\"\n                        )\n                    return self\n\n                def __exit__(self, exc_type, exc_value, traceback):\n                    if not self.disable:\n                        if exc_type is None:\n                            print(\"Completed successfully.\")\n                        else:\n                            print(f\"Task ended with error: {exc_value}\")\n                    return False  # Let OCRmyPDF raise any exceptions\n\n                def update(self, n=1, *, completed=None):\n                    if completed is not None:\n                        # If 'completed' is given, set self.current\n                        # but let's just read it 
to show usage\n                        print(f\"Absolute completion reported: {completed}\")\n                    # Otherwise, we increment by 'n'\n                    self.current += n\n                    if not self.disable:\n                        if self.total:\n                            percent = (self.current / self.total) * 100\n                            print(\n                                f\"{self.desc}: {self.current}\"\n                                f\"/{self.total} ({percent:.1f}%)\"\n                            )\n                        else:\n                            print(f\"{self.desc}: {self.current} units done\")\n\n            @hookimpl\n            def get_progressbar_class():\n                return ConsoleProgressBar\n\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        total: int | float | None,\n        desc: str | None,\n        unit: str | None,\n        disable: bool = False,\n        **kwargs,\n    ):\n        \"\"\"Initialize a progress bar.\n\n        This is called once before any work is done. OCRmyPDF supplies the total\n        number of units (or None if unknown), a description of the work, and the\n        type of units. The ``disable`` parameter can be used to turn off progress\n        reporting. Unrecognized keyword arguments should be ignored.\n\n        Args:\n            total (int | float | None):\n                The total amount of work. If ``None``, the total is unknown.\n            desc (str | None):\n                A description of the current task. 
May change for different stages.\n            unit (str | None):\n                A short label for the unit of work.\n            disable (bool):\n                If ``True``, no output or logging should be displayed.\n            **kwargs:\n                Extra parameters that may be passed by OCRmyPDF in future versions.\n        \"\"\"\n\n    def __enter__(self):\n        \"\"\"Enter a progress bar context.\"\"\"\n\n    def __exit__(self, *args):\n        \"\"\"Exit a progress bar context.\"\"\"\n\n    def update(self, n: float = 1, *, completed: float | None = None):\n        \"\"\"Increment the progress bar by ``n`` units, or set an absolute completion.\n\n        OCRmyPDF calls this method repeatedly while processing pages or other tasks.\n        If your total is known and you track it, you might do something like:\n\n        .. code-block:: python\n\n            self.current += n\n            percent = (self.current / total) * 100\n\n        The ``completed`` argument can indicate an absolute position, which is\n        particularly helpful if you're tracking a percentage of work (e.g., 0 to 100)\n        and want precise updates. In contrast, the incremental parameter ``n`` is\n        often more useful for page-based increments.\n\n        Args:\n            n (float, optional):\n                The amount to increment the progress by. Defaults to 1. May be\n                fractional if OCRmyPDF performs partial steps. If you are tracking\n                pages, this is typically how many pages have been processed in the\n                most recent step.\n            completed (float | None, optional):\n                The absolute amount of work completed so far. This can override or\n                supplement the simple increment logic. 
It's particularly useful\n                for percentage-based tracking (e.g., when ``total`` is 100).\n        \"\"\"\n\n\nclass NullProgressBar:\n    \"\"\"Progress bar API that takes no actions.\"\"\"\n\n    def __init__(self, **kwargs):\n        pass\n\n    def __enter__(self):\n        return self\n\n    def __exit__(self, exc_type, exc_value, traceback):\n        return False\n\n    def update(self, _arg=None, *, completed=None):\n        return\n\n\nclass RichProgressBar:\n    \"\"\"Display progress bar using rich.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        console: Console,\n        desc: str,\n        total: float | None = None,\n        unit: str | None = None,\n        unit_scale: float | None = 1.0,\n        disable: bool = False,\n        **kwargs,\n    ):\n        self._entered = False\n        self.progress = Progress(\n            TextColumn(\n                \"[progress.description]{task.description}\",\n                table_column=Column(min_width=20),\n            ),\n            BarColumn(),\n            TaskProgressColumn(),\n            MofNCompleteColumn(),\n            TimeRemainingColumn(),\n            console=console,\n            auto_refresh=True,\n            redirect_stderr=True,\n            redirect_stdout=False,\n            disable=disable,\n            **kwargs,\n        )\n        self.unit_scale = unit_scale\n        self.progress_bar = self.progress.add_task(\n            desc,\n            total=total * self.unit_scale\n            if total is not None and self.unit_scale is not None\n            else None,\n            unit=unit,\n        )\n\n    def __enter__(self):\n        self.progress.start()\n        self._entered = True\n        return self\n\n    def __exit__(self, exc_type, exc_value, traceback):\n        self.progress.refresh()\n        self.progress.stop()\n        return False\n\n    def update(self, n=1, *, completed=None):\n        assert self._entered, \"Progress bar must be entered 
before updating\"\n        if completed is None:\n            advance = self.unit_scale if n is None else n\n            self.progress.update(self.progress_bar, advance=advance)\n        else:\n            self.progress.update(self.progress_bar, completed=completed)\n"
  },
  {
    "path": "src/ocrmypdf/_validation.py",
    "content": "#!/usr/bin/env python3\n# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Validate a work order from API or command line.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport sys\nfrom collections.abc import Sequence\nfrom pathlib import Path\nfrom shutil import copyfileobj\n\nimport pikepdf\n\nfrom ocrmypdf._defaults import DEFAULT_ROTATE_PAGES_THRESHOLD\nfrom ocrmypdf._exec import unpaper\nfrom ocrmypdf._options import OcrOptions\nfrom ocrmypdf._plugin_manager import OcrmypdfPluginManager\nfrom ocrmypdf.exceptions import (\n    BadArgsError,\n    InputFileError,\n    MissingDependencyError,\n    OutputFileAccessError,\n)\nfrom ocrmypdf.helpers import (\n    is_file_writable,\n    running_in_docker,\n    running_in_snap,\n    safe_symlink,\n)\nfrom ocrmypdf.subprocess import check_external_program\n\nlog = logging.getLogger(__name__)\n\n\ndef check_platform() -> None:\n    if sys.maxsize <= 2**32:  # pragma: no cover\n        log.warning(\n            \"You are running OCRmyPDF in a 32-bit (x86) Python interpreter. \"\n            \"This is not supported. 32-bit does not have enough address space \"\n            \"to process large files. 
\"\n            \"Please use a 64-bit (x86-64) version of Python.\"\n        )\n\n\ndef check_options_languages(\n    options: OcrOptions, ocr_engine_languages: list[str]\n) -> None:\n    # Check for blocked languages first, before checking if they're installed\n    DENIED_LANGUAGES = {'equ', 'osd'}\n    blocked = DENIED_LANGUAGES & set(options.languages)\n    if blocked:\n        raise BadArgsError(\n            \"The following languages are for Tesseract's internal use and \"\n            \"should not be issued explicitly: \"\n            f\"{', '.join(blocked)}\\n\"\n            \"Remove them from the -l/--language argument.\"\n        )\n\n    if not ocr_engine_languages:\n        return\n\n    missing_languages = set(options.languages) - set(ocr_engine_languages)\n    if missing_languages:\n        lang_text = '\\n'.join(lang for lang in missing_languages)\n        msg = (\n            \"OCR engine does not have language data for the following \"\n            \"requested languages: \\n\"\n            f\"{lang_text}\\n\"\n            \"Please install the appropriate language data for your OCR engine.\\n\"\n            \"\\n\"\n            \"See the online documentation for instructions:\\n\"\n            \"    https://ocrmypdf.readthedocs.io/en/latest/languages.html\\n\"\n            \"\\n\"\n            \"Note: most languages are identified by a 3-letter ISO 639-2 Code.\\n\"\n            \"For example, English is 'eng', German is 'deu', and Spanish is 'spa'.\\n\"\n            \"Simplified Chinese is 'chi_sim' and Traditional Chinese is 'chi_tra'.\"\n            \"\\n\"\n        )\n        raise MissingDependencyError(msg)\n\n\ndef check_options_sidecar(options: OcrOptions) -> None:\n    if options.sidecar == '\\0':\n        if options.output_file == '-':\n            raise BadArgsError(\"--sidecar filename needed when output file is stdout.\")\n        elif options.output_file == os.devnull:\n            raise BadArgsError(\n                \"--sidecar 
filename needed when output file is /dev/null or NUL.\"\n            )\n        options.sidecar = options.output_file + '.txt'\n    if options.sidecar == options.input_file or options.sidecar == options.output_file:\n        raise BadArgsError(\n            \"--sidecar file must be different from the input and output files\"\n        )\n\n\ndef check_options_preprocessing(options: OcrOptions) -> None:\n    if options.clean_final:\n        options.clean = True\n    if options.unpaper_args and not options.clean:\n        raise BadArgsError(\"--clean is required for --unpaper-args\")\n    if (\n        options.rotate_pages_threshold != DEFAULT_ROTATE_PAGES_THRESHOLD\n        and not options.rotate_pages\n    ):\n        raise BadArgsError(\"--rotate-pages is required for --rotate-pages-threshold\")\n    if options.clean:\n        check_external_program(\n            program='unpaper',\n            package='unpaper',\n            version_checker=unpaper.version,\n            need_version='6.1',\n            required_for=\"--clean, --clean-final\",\n        )\n\n\ndef _check_plugin_invariant_options(options: OcrOptions) -> None:\n    check_platform()\n    check_options_sidecar(options)\n    check_options_preprocessing(options)\n\n\ndef _check_plugin_options(\n    options: OcrOptions, plugin_manager: OcrmypdfPluginManager\n) -> None:\n    # First, let plugins check their external dependencies\n    plugin_manager.check_options(options=options)\n\n    # Then check OCR engine language support\n    ocr_engine_languages = plugin_manager.get_ocr_engine(options=options).languages(\n        options\n    )\n    check_options_languages(options, ocr_engine_languages)\n\n    # Finally, run comprehensive validation using the coordinator\n    from ocrmypdf._validation_coordinator import ValidationCoordinator\n\n    coordinator = ValidationCoordinator(plugin_manager)\n    coordinator.validate_all_options(options)\n\n\ndef check_options(options: OcrOptions, plugin_manager: 
OcrmypdfPluginManager) -> None:\n    \"\"\"Check options for validity and consistency.\n\n    This function coordinates validation across the entire system:\n    1. Core validation (platform, files, preprocessing)\n    2. Plugin external dependency validation\n    3. Plugin-specific validation (handled by plugin models)\n    4. Cross-cutting validation (handled by validation coordinator)\n    \"\"\"\n    _check_plugin_invariant_options(options)\n    _check_plugin_options(options, plugin_manager)\n\n\ndef create_input_file(options: OcrOptions, work_folder: Path) -> tuple[Path, str]:\n    if options.input_file == '-':\n        # stdin\n        log.info('reading file from standard input')\n        target = work_folder / 'stdin'\n        with open(target, 'wb') as stream_buffer:\n            copyfileobj(sys.stdin.buffer, stream_buffer)\n        return target, \"stdin\"\n    elif hasattr(options.input_file, 'readable'):\n        if not options.input_file.readable():\n            raise InputFileError(\"Input file stream is not readable\")\n        log.info('reading file from input stream')\n        target = work_folder / 'stream'\n        with open(target, 'wb') as stream_buffer:\n            copyfileobj(options.input_file, stream_buffer)\n        return target, \"stream\"\n    else:\n        try:\n            target = work_folder / 'origin'\n            safe_symlink(options.input_file, target)\n            return target, os.fspath(options.input_file)\n        except FileNotFoundError as e:\n            msg = f\"File not found - {options.input_file}\"\n            if running_in_docker():  # pragma: no cover\n                msg += (\n                    \"\\nDocker cannot access your working directory unless you \"\n                    \"explicitly share it with the Docker container and set up\"\n                    \"permissions correctly.\\n\"\n                    \"You may find it easier to use stdin/stdout:\"\n                    \"\\n\"\n                    
\"\\tdocker run -i --rm jbarlow83/ocrmypdf - - <input.pdf >output.pdf\"\n                    \"\\n\"\n                )\n            elif running_in_snap():  # pragma: no cover\n                msg += (\n                    \"\\nSnap applications cannot access files outside of \"\n                    \"your home directory unless you explicitly allow it. \"\n                    \"You may find it easier to use stdin/stdout:\"\n                    \"\\n\"\n                    \"\\tsnap run ocrmypdf - - <input.pdf >output.pdf\"\n                    \"\\n\"\n                )\n            raise InputFileError(msg) from e\n\n\ndef check_requested_output_file(options: OcrOptions) -> None:\n    if options.output_file == '-':\n        if sys.stdout.isatty():\n            raise BadArgsError(\n                \"Output was set to stdout '-' but it looks like stdout \"\n                \"is connected to a terminal.  Please redirect stdout to a \"\n                \"file.\"\n            )\n    elif hasattr(options.output_file, 'writable'):\n        if not options.output_file.writable():\n            raise OutputFileAccessError(\"Output stream is not writable\")\n    elif not is_file_writable(options.output_file):\n        raise OutputFileAccessError(\n            f\"Output file location ({options.output_file}) is not a writable file.\"\n        )\n\n    if (\n        options.no_overwrite\n        and not hasattr(options.output_file, 'writable')\n        and options.output_file != '-'\n        and Path(str(options.output_file)).exists()\n    ):\n        raise OutputFileAccessError(\n            f\"Output file already exists: {options.output_file}\\n\"\n            \"To overwrite it, omit the --no-overwrite / -n option.\"\n        )\n\n\ndef report_output_file_size(\n    options: OcrOptions,\n    input_file: Path,\n    output_file: Path,\n    optimize_messages: Sequence[str] | None = None,\n    file_overhead: int = 4000,\n    page_overhead: int = 3000,\n) -> None:\n    if 
optimize_messages is None:\n        optimize_messages = []\n    try:\n        output_size = Path(output_file).stat().st_size\n        input_size = Path(input_file).stat().st_size\n    except FileNotFoundError:\n        return  # Outputting to stream or something\n    with pikepdf.open(output_file) as p:\n        # Overhead constants obtained by estimating amount of data added by OCR\n        # PDF/A conversion, and possible XMP metadata addition, with compression\n        reasonable_overhead = file_overhead + page_overhead * len(p.pages)\n    ratio = output_size / input_size\n    reasonable_ratio = output_size / (input_size + reasonable_overhead)\n    if reasonable_ratio < 1.35 or input_size < 25000:\n        return  # Seems fine\n\n    reasons = []\n    image_preproc = {\n        'deskew',\n        'clean_final',\n        'remove_background',\n        'oversample',\n    }\n    for arg in image_preproc:\n        if getattr(options, arg, False):\n            reasons.append(\n                f\"--{arg.replace('_', '-')} was issued, causing transcoding.\"\n            )\n    # Check force_ocr via the backward-compatible property\n    if options.force_ocr:\n        reasons.append(\"--force-ocr (or --mode force) was issued, causing transcoding.\")\n\n    reasons.extend(optimize_messages)\n\n    if options.output_type.startswith('pdfa'):\n        reasons.append(\"PDF/A conversion was enabled. (Try `--output-type pdf`.)\")\n    if options.plugins:\n        reasons.append(\"Plugins were used.\")\n\n    if reasons:\n        explanation = \"Possible reasons for this include:\\n\" + '\\n'.join(reasons) + \"\\n\"\n    else:\n        explanation = \"No reason for this increase is known.  Please report this issue.\"\n\n    log.warning(\n        f\"The output file size is {ratio:.2f}× larger than the input file.\\n\"\n        f\"{explanation}\"\n    )\n"
  },
  {
    "path": "src/ocrmypdf/_validation_coordinator.py",
    "content": "# SPDX-FileCopyrightText: 2024 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Validation coordinator for plugin options and cross-cutting concerns.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n    import pluggy\n\n    from ocrmypdf._options import OcrOptions\n\nlog = logging.getLogger(__name__)\n\n\nclass ValidationCoordinator:\n    \"\"\"Coordinates validation across plugin models and core options.\"\"\"\n\n    def __init__(self, plugin_manager: pluggy.PluginManager):\n        self.plugin_manager = plugin_manager\n        self.registry = getattr(plugin_manager, '_option_registry', None)\n\n    def validate_all_options(self, options: OcrOptions) -> None:\n        \"\"\"Run comprehensive validation on all options.\n\n        This runs validation in the correct order:\n        1. Plugin self-validation (already done by Pydantic)\n        2. Plugin context validation (requires external context)\n        3. Cross-cutting validation (between plugins and core)\n\n        Args:\n            options: The options to validate\n        \"\"\"\n        # Step 1: Plugin context validation\n        self._validate_plugin_contexts(options)\n\n        # Step 2: Cross-cutting validation\n        self._validate_cross_cutting_concerns(options)\n\n    def _validate_plugin_contexts(self, options: OcrOptions) -> None:\n        \"\"\"Validate plugin options that require external context.\"\"\"\n        # For now, we'll run the plugin validation directly since the models\n        # are still being integrated. 
This ensures the validation warnings\n        # and checks still work as expected.\n\n        # Run Tesseract validation\n        self._validate_tesseract_options(options)\n\n        # Run Optimize validation\n        self._validate_optimize_options(options)\n\n    def _validate_tesseract_options(self, options: OcrOptions) -> None:\n        \"\"\"Validate Tesseract options.\"\"\"\n        # Check pagesegmode warning\n        if options.tesseract.pagesegmode in (0, 2):\n            log.warning(\n                \"The tesseract-pagesegmode you selected will disable OCR. \"\n                \"This may cause processing to fail.\"\n            )\n\n        # Check downsample consistency\n        if (\n            options.tesseract.downsample_above != 32767\n            and not options.tesseract.downsample_large_images\n        ):\n            log.warning(\n                \"The --tesseract-downsample-above argument will have no effect unless \"\n                \"--tesseract-downsample-large-images is also given.\"\n            )\n\n        # Note: blocked languages (equ, osd) are checked earlier in\n        # check_options_languages() to ensure the check runs before\n        # the missing language check.\n\n    def _validate_optimize_options(self, options: OcrOptions) -> None:\n        \"\"\"Validate optimization options.\"\"\"\n        # Check optimization consistency\n        if options.optimize == 0 and any(\n            [\n                options.png_quality and options.png_quality > 0,\n                options.jpeg_quality and options.jpeg_quality > 0,\n            ]\n        ):\n            log.warning(\n                \"The arguments --png-quality and --jpeg-quality \"\n                \"will be ignored because --optimize=0.\"\n            )\n\n    def _validate_cross_cutting_concerns(self, options: OcrOptions) -> None:\n        \"\"\"Validate cross-cutting concerns that span multiple plugins.\"\"\"\n        from ocrmypdf._options import ProcessingMode\n\n      
  # Handle deprecated pdf_renderer values\n        self._handle_deprecated_pdf_renderer(options)\n\n        # Note: Mutual exclusivity of force_ocr/skip_text/redo_ocr is now enforced\n        # by the ProcessingMode enum - only one mode can be active at a time.\n\n        # Validate redo mode compatibility\n        if options.mode == ProcessingMode.redo and (\n            options.deskew or options.clean_final or options.remove_background\n        ):\n            raise ValueError(\n                \"--redo-ocr (or --mode redo) is not currently compatible with \"\n                \"--deskew, --clean-final, and --remove-background\"\n            )\n\n        # Validate output type compatibility\n        if options.output_type == 'none' and str(options.output_file) not in (\n            os.devnull,\n            '-',\n        ):\n            raise ValueError(\n                \"Since you specified `--output-type none`, the output file \"\n                f\"{options.output_file} cannot be produced. Set the output file to \"\n                \"`-` to suppress this message.\"\n            )\n\n        # Validate PDF/A image compression compatibility\n        if (\n            options.ghostscript.pdfa_image_compression\n            and options.ghostscript.pdfa_image_compression != 'auto'\n            and not options.output_type.startswith('pdfa')\n        ):\n            log.warning(\n                \"--pdfa-image-compression argument only applies when \"\n                \"--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'\"\n            )\n\n    def _handle_deprecated_pdf_renderer(self, options: OcrOptions) -> None:\n        \"\"\"Handle deprecated pdf_renderer values by redirecting to fpdf2.\"\"\"\n        if options.pdf_renderer in ('hocr', 'hocrdebug'):\n            log.info(\n                \"The '%s' PDF renderer has been removed. 
Using 'fpdf2' instead, \"\n                \"which provides full international language support, proper RTL \"\n                \"rendering, and improved text positioning.\",\n                options.pdf_renderer,\n            )\n            # Modify the options object to use fpdf2\n            object.__setattr__(options, 'pdf_renderer', 'fpdf2')\n"
  },
  {
    "path": "src/ocrmypdf/_version.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n__version__ = \"17.3.0\"\n"
  },
  {
    "path": "src/ocrmypdf/api.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Python API for OCRmyPDF.\n\nThis module provides the main Python API for OCRmyPDF, allowing you to perform\nOCR operations programmatically without using the command line interface.\n\nMain Functions:\n    ocr(): The primary function for OCR processing. Takes an input PDF or image\n        file and produces an OCR'd PDF with searchable text.\n\n    configure_logging(): Set up logging to match the command line interface\n        behavior, with support for progress bars and colored output.\n\nExperimental Functions:\n    _pdf_to_hocr(): Extract text from PDF pages and save as hOCR files for\n        manual editing before final PDF generation.\n\n    _hocr_to_ocr_pdf(): Convert hOCR files back to a searchable PDF after\n        manual text corrections.\n\nThe API maintains thread safety through internal locking since OCRmyPDF uses\nglobal state for plugins. Only one OCR operation can run per Python process\nat a time. 
For parallel processing, use multiple Python processes.\n\nExample:\n    import ocrmypdf\n\n    # Configure logging (optional)\n    ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)\n\n    # Perform OCR\n    ocrmypdf.ocr('input.pdf', 'output.pdf', language='eng')\n\nFor detailed parameter documentation, see the ocr() function docstring and\nthe equivalent command line parameters in the OCRmyPDF documentation.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport sys\nimport threading\nfrom collections.abc import Iterable, Sequence\nfrom enum import IntEnum\nfrom io import IOBase\nfrom pathlib import Path\nfrom typing import BinaryIO, overload\nfrom warnings import warn\n\nfrom ocrmypdf._logging import PageNumberFilter\nfrom ocrmypdf._options import OcrOptions\nfrom ocrmypdf._pipelines.hocr_to_ocr_pdf import run_hocr_to_ocr_pdf_pipeline\nfrom ocrmypdf._pipelines.ocr import run_pipeline, run_pipeline_cli\nfrom ocrmypdf._pipelines.pdf_to_hocr import run_hocr_pipeline\nfrom ocrmypdf._plugin_manager import OcrmypdfPluginManager, get_plugin_manager\nfrom ocrmypdf._validation import check_options\nfrom ocrmypdf.cli import ArgumentParser, get_parser\nfrom ocrmypdf.exceptions import ExitCode\n\nStrPath = Path | str | bytes\nPathOrIO = BinaryIO | StrPath\n\n# Installing plugins affects the global state of the Python interpreter,\n# so we need to use a lock to prevent multiple threads from installing\n# plugins at the same time.\n_api_lock = threading.Lock()\n\n\ndef setup_plugin_infrastructure(\n    plugins: Sequence[Path | str] | None = None,\n    plugin_manager: OcrmypdfPluginManager | None = None,\n) -> OcrmypdfPluginManager:\n    \"\"\"Set up plugin infrastructure with proper initialization.\n\n    This function handles:\n    1. Creating or validating the plugin manager\n    2. Calling plugin initialization hooks\n    3. 
Setting up plugin option registry\n\n    Args:\n        plugins: List of plugin paths/names to load\n        plugin_manager: Existing plugin manager (if any)\n\n    Returns:\n        Properly initialized plugin manager\n\n    Raises:\n        ValueError: If both plugins and plugin_manager are provided\n    \"\"\"\n    if plugins and plugin_manager:\n        raise ValueError(\"plugins= and plugin_manager are mutually exclusive\")\n\n    if not plugins:\n        plugins = []\n    elif isinstance(plugins, str | Path):\n        plugins = [plugins]\n    else:\n        plugins = list(plugins)\n\n    # Create plugin manager if not provided\n    if not plugin_manager:\n        plugin_manager = get_plugin_manager(plugins)\n\n    # Initialize plugins (pass the underlying pluggy manager)\n    plugin_manager.initialize(plugin_manager=plugin_manager.pluggy)\n\n    # Initialize plugin option registry\n    from ocrmypdf._plugin_registry import PluginOptionRegistry\n\n    registry = PluginOptionRegistry()\n\n    # Let plugins register their option models\n    option_models = plugin_manager.register_options()\n    all_plugin_models: dict[str, type] = {}\n    for plugin_options in option_models:\n        if plugin_options:  # Skip None returns\n            for namespace, model_class in plugin_options.items():\n                registry.register_option_model(namespace, model_class)\n                all_plugin_models[namespace] = model_class\n\n    # Register plugin models with OcrOptions for dynamic nested access\n    OcrOptions.register_plugin_models(all_plugin_models)\n\n    # Store registry in plugin manager for later access\n    plugin_manager._option_registry = registry\n\n    return plugin_manager\n\n\nclass Verbosity(IntEnum):\n    \"\"\"Verbosity level for configure_logging.\"\"\"\n\n    # pylint: disable=invalid-name\n    quiet = -1  #: Suppress most messages\n    default = 0  #: Default level of logging\n    debug = 1  #: Output ocrmypdf debug messages\n    debug_all = 2  #: 
More detailed debugging from ocrmypdf and dependent modules\n\n\ndef configure_logging(\n    verbosity: Verbosity,\n    *,\n    progress_bar_friendly: bool = True,\n    manage_root_logger: bool = False,\n    plugin_manager: OcrmypdfPluginManager | None = None,\n):\n    \"\"\"Set up logging.\n\n    Before calling :func:`ocrmypdf.ocr()`, you can use this function to\n    configure logging if you want ocrmypdf's output to look like the ocrmypdf\n    command line interface. It will register log handlers, log filters, and\n    formatters, configure color logging to standard error, and adjust the log\n    levels of third party libraries. Details of this are fine-tuned and subject\n    to change. The ``verbosity`` argument is equivalent to the argument\n    ``--verbose`` and applies those settings. If you have a wrapper\n    script for ocrmypdf and you want it to be very similar to ocrmypdf, use this\n    function; if you are using ocrmypdf as part of an application that manages\n    its own logging, you probably do not want this function.\n\n    If this function is not called, ocrmypdf will not configure logging, and it\n    is up to the caller of ``ocrmypdf.ocr()`` to set up logging as it wishes using\n    the Python standard library's logging module. If this function is called,\n    the caller may of course make further adjustments to logging.\n\n    Regardless of whether this function is called, ocrmypdf will perform all of\n    its logging under the ``\"ocrmypdf\"`` logging namespace. In addition,\n    ocrmypdf imports pdfminer, which logs under ``\"pdfminer\"``. A library user\n    may wish to configure both; note that pdfminer is extremely chatty at the\n    log level ``logging.INFO``.\n\n    This function does not set up the ``debug.log`` log file that the command\n    line interface does at certain verbosity levels. 
Applications should configure\n    their own debug logging.\n\n    Args:\n        verbosity: Verbosity level.\n        progress_bar_friendly: If True (the default), install a custom log handler\n            that is compatible with progress bars and colored output.\n        manage_root_logger: Configure the process's root logger.\n        plugin_manager: The plugin manager, used for obtaining the custom log handler.\n\n    Returns:\n        The toplevel logger for ocrmypdf (or the root logger, if we are managing it).\n    \"\"\"\n    prefix = '' if manage_root_logger else 'ocrmypdf'\n\n    log = logging.getLogger(prefix)\n    log.setLevel(logging.DEBUG)\n\n    console = None\n    if plugin_manager and progress_bar_friendly:\n        console = plugin_manager.get_logging_console()\n\n    if not console:\n        console = logging.StreamHandler(stream=sys.stderr)\n\n    if verbosity < 0:\n        console.setLevel(logging.ERROR)\n    elif verbosity >= 1:\n        console.setLevel(logging.DEBUG)\n    else:\n        console.setLevel(logging.INFO)\n\n    console.addFilter(PageNumberFilter())\n\n    if verbosity >= 2:\n        fmt = '%(levelname)7s %(name)s -%(pageno)s %(message)s'\n    else:\n        fmt = '%(pageno)s%(message)s'\n\n    formatter = None\n\n    if not formatter:\n        formatter = logging.Formatter(fmt=fmt)\n\n    console.setFormatter(formatter)\n    log.addHandler(console)\n\n    if verbosity <= 1:\n        pdfminer_log = logging.getLogger('pdfminer')\n        pdfminer_log.setLevel(logging.ERROR)\n        pil_log = logging.getLogger('PIL')\n        pil_log.setLevel(logging.INFO)\n        fonttools_log = logging.getLogger('fontTools')\n        fonttools_log.setLevel(logging.ERROR)\n\n    if manage_root_logger:\n        logging.captureWarnings(True)\n\n    return log\n\n\ndef _check_no_conflicting_ocr_params(\n    locals_dict: dict,\n    kwargs: dict,\n    excluded: set[str] | None = None,\n) -> None:\n    \"\"\"Check that no individual OCR parameters 
conflict with OcrOptions.\n\n    When a user passes an OcrOptions object, they should not also pass\n    individual OCR parameters (except plugins/plugin_manager which are\n    handled separately).\n\n    Args:\n        locals_dict: The locals() dict from the calling function.\n        kwargs: The **kwargs dict from the calling function.\n        excluded: Parameter names to exclude from conflict checking.\n\n    Raises:\n        ValueError: If conflicting parameters are found.\n    \"\"\"\n    if excluded is None:\n        excluded = set()\n\n    # Parameters that are allowed alongside OcrOptions\n    allowed_with_options = {\n        'input_file_or_options',\n        'options',  # The OcrOptions object itself after assignment\n        'plugins',\n        'plugin_manager',\n        'kwargs',\n    } | excluded\n\n    # Check all locals that are OCR parameters (not None and not allowed)\n    conflicts = [\n        name\n        for name, value in locals_dict.items()\n        if value is not None and name not in allowed_with_options\n    ]\n\n    # Check kwargs\n    conflicts.extend(kwargs.keys())\n\n    if conflicts:\n        raise ValueError(\n            f\"When passing OcrOptions as the first argument, do not pass \"\n            f\"additional OCR parameters. Conflicting parameters: \"\n            f\"{', '.join(sorted(conflicts))}. \"\n            f\"Set these values in OcrOptions instead.\"\n        )\n\n\ndef _remap_language_to_languages(options_kwargs: dict) -> None:\n    \"\"\"Map the public API 'language' parameter to OcrOptions 'languages' field.\n\n    The public API uses 'language' (matching CLI --language) but OcrOptions\n    uses 'languages' (plural). This also coerces a bare string to a list\n    and splits '+'-separated language codes (e.g. 
'eng+deu' -> ['eng', 'deu'])\n    to match the CLI behavior.\n    \"\"\"\n    if 'language' in options_kwargs and 'languages' not in options_kwargs:\n        lang = options_kwargs.pop('language')\n        if lang is None:\n            return\n        if isinstance(lang, str):\n            lang = lang.split('+')\n        else:\n            # Flatten any '+'-separated entries in the list\n            expanded: list[str] = []\n            for item in lang:\n                if isinstance(item, str) and '+' in item:\n                    expanded.extend(item.split('+'))\n                else:\n                    expanded.append(item)\n            lang = expanded\n        options_kwargs['languages'] = lang\n    elif 'language' in options_kwargs:\n        del options_kwargs['language']\n\n\ndef create_options(\n    *, input_file: PathOrIO, output_file: PathOrIO, parser: ArgumentParser, **kwargs\n) -> OcrOptions:\n    \"\"\"Construct an options object from the input/output files and keyword arguments.\n\n    Args:\n        input_file: Input file path or file object.\n        output_file: Output file path or file object.\n        parser: ArgumentParser object (kept for compatibility,\n            may be used for plugin validation).\n        **kwargs: Keyword arguments.\n\n    Returns:\n        OcrOptions: An options object containing the parsed arguments.\n\n    Raises:\n        TypeError: If the type of a keyword argument is not supported.\n    \"\"\"\n    # Prepare kwargs for direct OcrOptions construction\n    options_kwargs = kwargs.copy()\n\n    # Map API parameter 'language' to OcrOptions field 'languages'\n    _remap_language_to_languages(options_kwargs)\n\n    # Set input and output files\n    options_kwargs['input_file'] = input_file\n    options_kwargs['output_file'] = output_file\n\n    # Handle special stream cases for sidecar\n    if 'sidecar' in options_kwargs and isinstance(\n        options_kwargs['sidecar'], BinaryIO | IOBase\n    ):\n        # Keep the 
stream object as-is - OcrOptions can handle it\n        pass\n\n    # Remove None values to let OcrOptions use its defaults\n    options_kwargs = {k: v for k, v in options_kwargs.items() if v is not None}\n\n    # Remove any kwargs that aren't OcrOptions fields and store in extra_attrs\n    extra_attrs = {}\n    ocr_fields = set(OcrOptions.model_fields.keys())\n    # Legacy mode flags are handled by OcrOptions model validator\n    legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'}\n\n    # Known extra attributes that should be preserved\n    known_extra = {'progress_bar', 'plugins'}\n\n    for key in list(options_kwargs.keys()):\n        if key in ocr_fields or key in legacy_mode_flags or key in known_extra:\n            continue\n        extra_attrs[key] = options_kwargs.pop(key)\n\n    # Create OcrOptions directly\n    try:\n        options = OcrOptions(**options_kwargs)\n        # Add any extra attributes\n        if extra_attrs:\n            options.extra_attrs.update(extra_attrs)\n        return options\n    except Exception as e:\n        # If direct construction fails, provide a helpful error message\n        raise TypeError(f\"Failed to create OcrOptions: {e}\") from e\n\n\n@overload\ndef ocr(\n    options: OcrOptions,\n    /,\n    *,\n    plugins: Iterable[Path | str] | None = None,\n    plugin_manager: OcrmypdfPluginManager | None = None,\n) -> ExitCode: ...\n\n\n@overload\ndef ocr(\n    input_file_or_options: PathOrIO,\n    output_file: PathOrIO,\n    *,\n    language: Iterable[str] | None = None,\n    image_dpi: int | None = None,\n    output_type: str | None = None,\n    sidecar: PathOrIO | None = None,\n    jobs: int | None = None,\n    use_threads: bool | None = None,\n    title: str | None = None,\n    author: str | None = None,\n    subject: str | None = None,\n    keywords: str | None = None,\n    rotate_pages: bool | None = None,\n    remove_background: bool | None = None,\n    deskew: bool | None = None,\n    clean: bool | None = None,\n 
   clean_final: bool | None = None,\n    unpaper_args: str | None = None,\n    oversample: int | None = None,\n    remove_vectors: bool | None = None,\n    mode: str | None = None,\n    force_ocr: bool | None = None,\n    skip_text: bool | None = None,\n    redo_ocr: bool | None = None,\n    skip_big: float | None = None,\n    optimize: int | None = None,\n    jpg_quality: int | None = None,\n    png_quality: int | None = None,\n    jbig2_lossy: bool | None = None,\n    jbig2_page_group_size: int | None = None,\n    jbig2_threshold: float | None = None,\n    pages: str | None = None,\n    max_image_mpixels: float | None = None,\n    tesseract_config: Iterable[str] | None = None,\n    tesseract_pagesegmode: int | None = None,\n    tesseract_oem: int | None = None,\n    tesseract_thresholding: int | None = None,\n    pdf_renderer: str | None = None,\n    rasterizer: str | None = None,\n    tesseract_timeout: float | None = None,\n    tesseract_non_ocr_timeout: float | None = None,\n    tesseract_downsample_above: int | None = None,\n    tesseract_downsample_large_images: bool | None = None,\n    rotate_pages_threshold: float | None = None,\n    pdfa_image_compression: str | None = None,\n    color_conversion_strategy: str | None = None,\n    user_words: os.PathLike | None = None,\n    user_patterns: os.PathLike | None = None,\n    fast_web_view: float | None = None,\n    continue_on_soft_render_error: bool | None = None,\n    invalidate_digital_signatures: bool | None = None,\n    tagged_pdf_mode: str | None = None,\n    no_overwrite: bool | None = None,\n    plugins: Iterable[Path | str] | None = None,\n    plugin_manager: OcrmypdfPluginManager | None = None,\n    keep_temporary_files: bool | None = None,\n    progress_bar: bool | None = None,\n    **kwargs,\n) -> ExitCode: ...\n\n\ndef ocr(  # noqa: D417\n    input_file_or_options: PathOrIO | OcrOptions,\n    output_file: PathOrIO | None = None,\n    *,\n    language: Iterable[str] | None = None,\n    image_dpi: 
int | None = None,\n    output_type: str | None = None,\n    sidecar: PathOrIO | None = None,\n    jobs: int | None = None,\n    use_threads: bool | None = None,\n    title: str | None = None,\n    author: str | None = None,\n    subject: str | None = None,\n    keywords: str | None = None,\n    rotate_pages: bool | None = None,\n    remove_background: bool | None = None,\n    deskew: bool | None = None,\n    clean: bool | None = None,\n    clean_final: bool | None = None,\n    unpaper_args: str | None = None,\n    oversample: int | None = None,\n    remove_vectors: bool | None = None,\n    mode: str | None = None,\n    force_ocr: bool | None = None,  # Legacy, use mode='force' instead\n    skip_text: bool | None = None,  # Legacy, use mode='skip' instead\n    redo_ocr: bool | None = None,  # Legacy, use mode='redo' instead\n    skip_big: float | None = None,\n    optimize: int | None = None,\n    jpg_quality: int | None = None,\n    png_quality: int | None = None,\n    jbig2_lossy: bool | None = None,  # Deprecated, ignored\n    jbig2_page_group_size: int | None = None,  # Deprecated, ignored\n    jbig2_threshold: float | None = None,\n    pages: str | None = None,\n    max_image_mpixels: float | None = None,\n    tesseract_config: Iterable[str] | None = None,\n    tesseract_pagesegmode: int | None = None,\n    tesseract_oem: int | None = None,\n    tesseract_thresholding: int | None = None,\n    pdf_renderer: str | None = None,\n    rasterizer: str | None = None,\n    tesseract_timeout: float | None = None,\n    tesseract_non_ocr_timeout: float | None = None,\n    tesseract_downsample_above: int | None = None,\n    tesseract_downsample_large_images: bool | None = None,\n    rotate_pages_threshold: float | None = None,\n    pdfa_image_compression: str | None = None,\n    color_conversion_strategy: str | None = None,\n    user_words: os.PathLike | None = None,\n    user_patterns: os.PathLike | None = None,\n    fast_web_view: float | None = None,\n    
continue_on_soft_render_error: bool | None = None,\n    invalidate_digital_signatures: bool | None = None,\n    tagged_pdf_mode: str | None = None,\n    no_overwrite: bool | None = None,\n    plugins: Iterable[Path | str] | None = None,\n    plugin_manager: OcrmypdfPluginManager | None = None,\n    keep_temporary_files: bool | None = None,\n    progress_bar: bool | None = None,\n    **kwargs,\n) -> ExitCode:\n    \"\"\"Run OCRmyPDF on one PDF or image.\n\n    This function supports two calling conventions:\n\n    **New style (recommended):**\n        >>> from ocrmypdf import ocr\n        >>> from ocrmypdf._options import OcrOptions\n        >>> options = OcrOptions(\n        ...     input_file=\"input.pdf\",\n        ...     output_file=\"output.pdf\",\n        ...     languages=[\"eng\"],\n        ... )\n        >>> ocr(options)\n\n    **Old style:**\n        >>> ocr(\"input.pdf\", \"output.pdf\", language=[\"eng\"])\n\n    For most arguments, see documentation for the equivalent command line parameter.\n\n    This API takes a threading lock, because OCRmyPDF uses global state in particular\n    for the plugin system. The jobs parameter will be used to create a pool of\n    worker threads or processes at different times, subject to change. A Python\n    process can only run one OCRmyPDF task at a time.\n\n    To run parallelize instances OCRmyPDF, use separate Python processes to scale\n    horizontally. Generally speaking you should set jobs=sqrt(cpu_count) and run\n    sqrt(cpu_count) processes as a starting point. If you have files with a high page\n    count, run fewer processes and more jobs per process. If you have a lot of short\n    files, run more processes and fewer jobs per process.\n\n    A few specific arguments are discussed here:\n\n    Args:\n        input_file_or_options: Either an OcrOptions object containing all settings,\n            or a path/stream for the input file (old-style API).\n        output_file: Output file path or stream. 
Required when using old-style API\n            with input_file as first argument. Must be None when passing OcrOptions.\n        use_threads: Use worker threads instead of processes. This reduces\n            performance but may make debugging easier since it is easier to set\n            breakpoints.\n        plugins: List of plugin paths to load. Can be passed alongside OcrOptions.\n        plugin_manager: Pre-configured plugin manager. Can be passed alongside\n            OcrOptions.\n\n        For input_file (old-style API): If a :class:`pathlib.Path`, ``str`` or\n            ``bytes``, this is interpreted as file system path to the input file.\n            If the object appears to be a readable stream (with methods such as\n            ``.read()`` and ``.seek()``), the object will be read in its entirety\n            and saved to a temporary file. If ``input_file`` is ``\"-\"``, standard\n            input will be read.\n\n        For output_file (old-style API): If a :class:`pathlib.Path`, ``str`` or\n            ``bytes``, this is interpreted as file system path to the output file.\n            If the object appears to be a writable stream (with methods such as\n            ``.write()`` and ``.seek()``), the output will be written to this\n            stream. If ``output_file`` is ``\"-\"``, the output will be written to\n            ``sys.stdout`` (provided that standard output does not seem to be a\n            terminal device). 
When a stream is used as output, whether via a\n            writable object or ``\"-\"``, some final validation steps are not\n            performed (we do not read back the stream after it is written).\n\n    Raises:\n        ocrmypdf.MissingDependencyError: If a required dependency program is missing or\n            was not found on PATH.\n        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that\n            could not be read, or some other file type that is not a PDF.\n        ocrmypdf.DpiError: If the input file is an image, but the resolution of the\n            image is not credible (allowing it to proceed would cause poor OCR).\n        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output\n            file failed.\n        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital\n            text already, and settings did not tell us to proceed.\n        ocrmypdf.InputFileError: Any other problem with the input file.\n        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.\n        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).\n            OCRmyPDF does not remove passwords.\n        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not\n            valid.\n        ValueError: If OcrOptions is passed along with other OCR parameters, or if\n            both plugins and plugin_manager are provided.\n        TypeError: If output_file is missing when using the old-style API.\n\n    Returns:\n        :class:`ocrmypdf.ExitCode`\n    \"\"\"\n    # Detect calling convention: OcrOptions object vs individual parameters\n    if isinstance(input_file_or_options, OcrOptions):\n        # New-style API: OcrOptions passed directly\n        options = input_file_or_options\n\n        # Check for conflicting parameters\n        # (all should be None except plugins/plugin_manager)\n        
_check_no_conflicting_ocr_params(locals(), kwargs)\n\n        # plugins and plugin_manager can still be passed alongside OcrOptions\n        if plugins and plugin_manager:\n            raise ValueError(\"plugins= and plugin_manager are mutually exclusive\")\n\n        # Use plugins from OcrOptions if not explicitly passed\n        if plugins is None:\n            plugins = options.plugins or []\n\n        if isinstance(plugins, str | Path):\n            plugins = [plugins]\n        else:\n            plugins = list(plugins) if plugins else []\n\n        # Run the pipeline with the OcrOptions\n        with _api_lock:\n            plugin_manager = setup_plugin_infrastructure(\n                plugins=plugins, plugin_manager=plugin_manager\n            )\n\n            parser = get_parser()\n            plugin_manager.add_options(parser=parser)\n\n            check_options(options, plugin_manager)\n            return run_pipeline(options=options, plugin_manager=plugin_manager)\n\n    else:\n        # Old-style API: positional arguments\n        input_file = input_file_or_options\n\n        if output_file is None:\n            raise TypeError(\n                \"ocr() missing required argument: 'output_file'. 
\"\n                \"Either pass output_file as the second argument, or pass \"\n                \"an OcrOptions object as the first argument.\"\n            )\n\n        if plugins and plugin_manager:\n            raise ValueError(\"plugins= and plugin_manager are mutually exclusive\")\n\n        if not plugins:\n            plugins = []\n        elif isinstance(plugins, str | Path):\n            plugins = [plugins]\n        else:\n            plugins = list(plugins)\n\n        # No new variable names should be assigned until these two steps are run\n        create_options_kwargs = {\n            k: v\n            for k, v in locals().items()\n            if k\n            not in {\n                'input_file_or_options',\n                'input_file',\n                'output_file',\n                'kwargs',\n                'plugin_manager',\n            }\n        }\n        create_options_kwargs.update(kwargs)\n\n        parser = get_parser()\n        with _api_lock:\n            # Set up plugin infrastructure with proper initialization\n            plugin_manager = setup_plugin_infrastructure(\n                plugins=plugins, plugin_manager=plugin_manager\n            )\n\n            # Get parser and let plugins add their options\n            parser = get_parser()\n            plugin_manager.add_options(parser=parser)\n\n            if 'verbose' in kwargs:\n                warn(\n                    \"ocrmypdf.ocr(verbose=) is ignored. \"\n                    \"Use ocrmypdf.configure_logging().\"\n                )\n\n            # Warn about deprecated jbig2 options and remove from kwargs\n            if jbig2_lossy:\n                warn(\n                    \"jbig2_lossy is deprecated and will be ignored. 
\"\n                    \"Lossy JBIG2 has been removed due to character substitution risks.\"\n                )\n                create_options_kwargs.pop('jbig2_lossy', None)\n            if jbig2_page_group_size:\n                warn(\"jbig2_page_group_size is deprecated and will be ignored.\")\n                create_options_kwargs.pop('jbig2_page_group_size', None)\n\n            options = create_options(\n                input_file=input_file,\n                output_file=output_file,\n                parser=parser,\n                **create_options_kwargs,\n            )\n            check_options(options, plugin_manager)\n            return run_pipeline(options=options, plugin_manager=plugin_manager)\n\n\ndef _pdf_to_hocr(  # noqa: D417\n    input_pdf: Path,\n    output_folder: Path,\n    *,\n    language: Iterable[str] | None = None,\n    image_dpi: int | None = None,\n    jobs: int | None = None,\n    use_threads: bool | None = None,\n    title: str | None = None,\n    author: str | None = None,\n    subject: str | None = None,\n    keywords: str | None = None,\n    rotate_pages: bool | None = None,\n    remove_background: bool | None = None,\n    deskew: bool | None = None,\n    clean: bool | None = None,\n    clean_final: bool | None = None,\n    unpaper_args: str | None = None,\n    oversample: int | None = None,\n    remove_vectors: bool | None = None,\n    mode: str | None = None,\n    force_ocr: bool | None = None,  # Legacy, use mode='force' instead\n    skip_text: bool | None = None,  # Legacy, use mode='skip' instead\n    redo_ocr: bool | None = None,  # Legacy, use mode='redo' instead\n    skip_big: float | None = None,\n    pages: str | None = None,\n    max_image_mpixels: float | None = None,\n    tesseract_config: Iterable[str] | None = None,\n    tesseract_pagesegmode: int | None = None,\n    tesseract_oem: int | None = None,\n    tesseract_thresholding: int | None = None,\n    tesseract_timeout: float | None = None,\n    
tesseract_non_ocr_timeout: float | None = None,\n    tesseract_downsample_above: int | None = None,\n    tesseract_downsample_large_images: bool | None = None,\n    rotate_pages_threshold: float | None = None,\n    rasterizer: str | None = None,\n    user_words: os.PathLike | None = None,\n    user_patterns: os.PathLike | None = None,\n    continue_on_soft_render_error: bool | None = None,\n    invalidate_digital_signatures: bool | None = None,\n    plugin_manager=None,\n    plugins: Sequence[Path | str] | None = None,\n    keep_temporary_files: bool | None = None,\n    **kwargs,\n):\n    \"\"\"Partially run OCRmyPDF and produces an output folder containing hOCR files.\n\n    Given a PDF file, this function will run OCRmyPDF up to the point where\n    the PDF is rasterized to images, OCRed, and the hOCR files are produced,\n    all of which are saved to the output folder. This is useful for applications\n    that want to provide an interface for users to edit the text before\n    rendering the final PDF.\n\n    Use :func:`hocr_to_ocr_pdf` to produce the final PDF.\n\n    For arguments not explicitly documented here, see documentation for the\n    equivalent command line parameter.\n\n    This API is **experimental** and subject to change.\n\n    Args:\n        input_pdf: Input PDF file path.\n        output_folder: Output folder path.\n        **kwargs: Keyword arguments.\n    \"\"\"\n    if plugins and plugin_manager:\n        raise ValueError(\"plugins= and plugin_manager are mutually exclusive\")\n\n    if not plugins:\n        plugins = []\n    elif isinstance(plugins, str | Path):\n        plugins = [plugins]\n    else:\n        plugins = list(plugins)\n\n    # Prepare kwargs for direct OcrOptions construction\n    options_kwargs = kwargs.copy()\n\n    # Set input file and handle special output_folder case\n    options_kwargs['input_file'] = input_pdf\n    options_kwargs['output_file'] = '/dev/null'  # Placeholder for hOCR pipeline\n\n    # Add all the 
function parameters\n    for param_name, param_value in locals().items():\n        if (\n            param_name\n            not in {'input_pdf', 'output_folder', 'kwargs', 'plugin_manager', 'plugins'}\n            and param_value is not None\n        ):\n            options_kwargs[param_name] = param_value\n\n    # Map API parameter 'language' to OcrOptions field 'languages'\n    _remap_language_to_languages(options_kwargs)\n\n    # Handle plugins\n    if plugins:\n        options_kwargs['plugins'] = plugins\n\n    # Remove None values to let OcrOptions use its defaults\n    options_kwargs = {k: v for k, v in options_kwargs.items() if v is not None}\n\n    # Add output_folder to options_kwargs since it's now a proper field\n    options_kwargs['output_folder'] = output_folder\n\n    # Remove any kwargs that aren't OcrOptions fields and store in extra_attrs\n    extra_attrs = {}\n    ocr_fields = set(OcrOptions.model_fields.keys())\n    # Legacy mode flags are handled by OcrOptions model validator\n    legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'}\n    known_extra = {'progress_bar', 'plugins'}\n\n    for key in list(options_kwargs.keys()):\n        if key in ocr_fields or key in legacy_mode_flags or key in known_extra:\n            continue\n        extra_attrs[key] = options_kwargs.pop(key)\n\n    with _api_lock:\n        # Set up plugin infrastructure with proper initialization\n        plugin_manager = setup_plugin_infrastructure(\n            plugins=plugins, plugin_manager=plugin_manager\n        )\n\n        plugin_manager.add_options(parser=get_parser())\n\n        # Create OcrOptions directly\n        try:\n            options = OcrOptions(**options_kwargs)\n            # Add any extra attributes\n            if extra_attrs:\n                options.extra_attrs.update(extra_attrs)\n        except Exception as e:\n            raise TypeError(\n                f\"Failed to create OcrOptions for hOCR pipeline: {e}\"\n            ) from e\n\n        
return run_hocr_pipeline(options=options, plugin_manager=plugin_manager)\n\n\ndef _hocr_to_ocr_pdf(  # noqa: D417\n    work_folder: Path,\n    output_file: Path,\n    *,\n    jobs: int | None = None,\n    use_threads: bool | None = None,\n    optimize: int | None = None,\n    jpg_quality: int | None = None,\n    png_quality: int | None = None,\n    jbig2_lossy: bool | None = None,  # Deprecated, ignored\n    jbig2_page_group_size: int | None = None,  # Deprecated, ignored\n    jbig2_threshold: float | None = None,\n    pdfa_image_compression: str | None = None,\n    color_conversion_strategy: str | None = None,\n    fast_web_view: float | None = None,\n    plugin_manager=None,\n    plugins: Sequence[Path | str] | None = None,\n    **kwargs,\n):\n    \"\"\"Run OCRmyPDF on a work folder and produce an output PDF.\n\n    After running :func:`pdf_to_hocr`, this function will run OCRmyPDF on the work\n    folder to produce an output PDF. This function consolidates any changes made\n    to the hOCR files in the work folder and produces a final PDF.\n\n    For arguments not explicitly documented here, see documentation for the\n    equivalent command line parameter.\n\n    This API is **experimental** and subject to change.\n\n    Args:\n        work_folder: Work folder path, as generated by :func:`pdf_to_hocr`.\n        output_file: Output PDF file path.\n        **kwargs: Keyword arguments.\n    \"\"\"\n    if plugins and plugin_manager:\n        raise ValueError(\"plugins= and plugin_manager are mutually exclusive\")\n\n    if not plugins:\n        plugins = []\n    elif isinstance(plugins, str | Path):\n        plugins = [plugins]\n    else:\n        plugins = list(plugins)\n\n    # Prepare kwargs for direct OcrOptions construction\n    options_kwargs = kwargs.copy()\n\n    # Set output file and handle special work_folder case\n    options_kwargs['input_file'] = '/dev/null'  # Placeholder for hOCR to PDF pipeline\n    options_kwargs['output_file'] = output_file\n\n    
# Add all the function parameters\n    for param_name, param_value in locals().items():\n        if (\n            param_name\n            not in {'work_folder', 'output_file', 'kwargs', 'plugin_manager', 'plugins'}\n            and param_value is not None\n        ):\n            options_kwargs[param_name] = param_value\n\n    # Handle plugins\n    if plugins:\n        options_kwargs['plugins'] = plugins\n\n    # Remove None values to let OcrOptions use its defaults\n    options_kwargs = {k: v for k, v in options_kwargs.items() if v is not None}\n\n    # Warn about deprecated jbig2 options and remove from kwargs\n    if jbig2_lossy:\n        warn(\n            \"jbig2_lossy is deprecated and will be ignored. \"\n            \"Lossy JBIG2 has been removed due to character substitution risks.\"\n        )\n        options_kwargs.pop('jbig2_lossy', None)\n    if jbig2_page_group_size:\n        warn(\"jbig2_page_group_size is deprecated and will be ignored.\")\n        options_kwargs.pop('jbig2_page_group_size', None)\n\n    # Add work_folder to options_kwargs since it's now a proper field\n    options_kwargs['work_folder'] = work_folder\n\n    # Remove any kwargs that aren't OcrOptions fields and store in extra_attrs\n    extra_attrs = {}\n    ocr_fields = set(OcrOptions.model_fields.keys())\n    # Legacy mode flags are handled by OcrOptions model validator\n    legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'}\n    known_extra = {'progress_bar', 'plugins'}\n\n    for key in list(options_kwargs.keys()):\n        if key in ocr_fields or key in legacy_mode_flags or key in known_extra:\n            continue\n        extra_attrs[key] = options_kwargs.pop(key)\n\n    with _api_lock:\n        # Set up plugin infrastructure with proper initialization\n        plugin_manager = setup_plugin_infrastructure(\n            plugins=plugins, plugin_manager=plugin_manager\n        )\n\n        plugin_manager.add_options(parser=get_parser())\n\n        # Create OcrOptions 
directly\n        try:\n            options = OcrOptions(**options_kwargs)\n            # Add any extra attributes\n            if extra_attrs:\n                options.extra_attrs.update(extra_attrs)\n        except Exception as e:\n            raise TypeError(\n                f\"Failed to create OcrOptions for hOCR to PDF pipeline: {e}\"\n            ) from e\n\n        return run_hocr_to_ocr_pdf_pipeline(\n            options=options, plugin_manager=plugin_manager\n        )\n\n\n__all__ = [\n    'PageNumberFilter',\n    'Verbosity',\n    'check_options',\n    'configure_logging',\n    'create_options',\n    'get_parser',\n    'get_plugin_manager',\n    'ocr',\n    'run_pipeline',\n    'run_pipeline_cli',\n    'setup_plugin_infrastructure',\n]\n"
  },
  {
    "path": "src/ocrmypdf/builtin_plugins/__init__.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Plugins in this package are automatically loaded by ocrmypdf.\"\"\"\n\nfrom __future__ import annotations\n"
  },
  {
    "path": "src/ocrmypdf/builtin_plugins/concurrency.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"OCRmyPDF's multiprocessing/multithreading abstraction layer.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport logging.handlers\nimport multiprocessing\nimport multiprocessing.queues\nimport os\nimport queue\nimport signal\nimport sys\nimport threading\nfrom collections.abc import Callable, Iterable\nfrom concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed\nfrom contextlib import suppress\nfrom typing import TYPE_CHECKING\n\nfrom rich.console import Console as RichConsole\n\nfrom ocrmypdf import Executor, hookimpl\nfrom ocrmypdf._logging import RichLoggingHandler\nfrom ocrmypdf._progressbar import RichProgressBar\nfrom ocrmypdf.exceptions import InputFileError\nfrom ocrmypdf.helpers import remove_all_log_handlers\n\nif TYPE_CHECKING:\n    from typing import TypeAlias\n\n    Queue: TypeAlias = multiprocessing.queues.Queue | queue.Queue\n    UserInit: TypeAlias = Callable[[], None]\n    WorkerInit: TypeAlias = Callable[[Queue, UserInit, int], None]\n\nFuturesExecutorClass = type[ThreadPoolExecutor] | type[ProcessPoolExecutor]\n\n\ndef log_listener(q: Queue):\n    \"\"\"Listen to the worker processes and forward the messages to logging.\n\n    For simplicity this is a thread rather than a process. 
Only one process\n    should actually write to sys.stderr or whatever we're using, so if this is\n    made into a process the main application needs to be directed to it.\n\n    See:\n    https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes\n    \"\"\"\n    while True:\n        try:\n            record = q.get()\n            if record is None:\n                break\n            logger = logging.getLogger(record.name)\n            logger.handle(record)\n        except Exception:  # pylint: disable=broad-except\n            import traceback  # pylint: disable=import-outside-toplevel\n\n            print(\"Logging problem\", file=sys.stderr)\n            traceback.print_exc(file=sys.stderr)\n\n\ndef process_sigbus(*args):\n    \"\"\"Handle SIGBUS signal at the worker level.\"\"\"\n    raise InputFileError(\"A worker process lost access to an input file\")\n\n\ndef process_init(q: Queue, user_init: UserInit, loglevel) -> None:\n    \"\"\"Initialize a process pool worker.\"\"\"\n    # Ignore SIGINT (our parent process will kill us gracefully)\n    signal.signal(signal.SIGINT, signal.SIG_IGN)\n\n    # Install SIGBUS handler (so our parent process can abort somewhat gracefully)\n    with suppress(AttributeError):  # Windows and Cygwin do not have SIGBUS\n        # Windows and Cygwin do not have pthread_sigmask or SIGBUS\n        signal.signal(signal.SIGBUS, process_sigbus)\n\n    # Remove any log handlers inherited from the parent process\n    root = logging.getLogger()\n    remove_all_log_handlers(root)\n\n    # Set up our single log handler to forward messages to the parent\n    root.setLevel(loglevel)\n    root.addHandler(logging.handlers.QueueHandler(q))\n\n    user_init()\n    return\n\n\ndef thread_init(q: Queue, user_init: UserInit, loglevel) -> None:\n    \"\"\"Begin a thread pool worker.\"\"\"\n    del q  # unused but required argument\n    del loglevel  # unused but required argument\n    # As a thread, block 
SIGBUS so the main thread deals with it...\n    with suppress(AttributeError):\n        signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGBUS})\n\n    user_init()\n    return\n\n\ndef setup_executor(use_threads: bool) -> tuple[Queue, Executor, WorkerInit]:\n    if not use_threads:\n        # Some execution environments like AWS Lambda and Termux do not support\n        # semaphores. Check if semaphore support is available, and if not, fall back\n        # to using threads.\n        try:\n            # pylint: disable=import-outside-toplevel\n            from multiprocessing.synchronize import SemLock\n\n            del SemLock\n        except ImportError:\n            use_threads = True\n\n    if use_threads:\n        loq_queue = queue.Queue(-1)\n        executor_class = ThreadPoolExecutor\n        initializer = thread_init\n    else:\n        loq_queue = multiprocessing.Queue(-1)\n        executor_class = ProcessPoolExecutor\n        initializer = process_init\n\n    return loq_queue, executor_class, initializer\n\n\nclass StandardExecutor(Executor):\n    \"\"\"Standard OCRmyPDF concurrent task executor.\"\"\"\n\n    def _execute(\n        self,\n        *,\n        use_threads: bool,\n        max_workers: int,\n        progress_kwargs: dict,\n        worker_initializer: Callable,\n        task: Callable,\n        task_arguments: Iterable,\n        task_finished: Callable,\n    ):\n        log_queue, executor_class, initializer = setup_executor(use_threads)\n\n        # Regardless of whether we use_threads for worker processes, the log_listener\n        # must be a thread. Make sure we create the listener after the worker pool,\n        # so that it does not get forked into the workers.\n        # If use_threads is False, we are currently guilty of creating a thread before\n        # forking on Linux, which is not recommended. However, we take a big\n        # performance hit in pdfinfo if we can't fork. 
Long term solution is to\n        # replace most of this with an asyncio implementation, and probably to\n        # migrate some of pdfinfo into C++ or Rust.\n        listener = threading.Thread(target=log_listener, args=(log_queue,))\n        listener.start()\n\n        with (\n            self.pbar_class(**progress_kwargs) as pbar,\n            executor_class(\n                max_workers=max_workers,\n                initializer=initializer,\n                initargs=(log_queue, worker_initializer, logging.getLogger(\"\").level),\n            ) as executor,\n        ):\n            futures = [executor.submit(task, *args) for args in task_arguments]\n            try:\n                for future in as_completed(futures):\n                    result = future.result()\n                    task_finished(result, pbar)\n            except KeyboardInterrupt:\n                # Terminate pool so we exit instantly\n                executor.shutdown(wait=False, cancel_futures=True)\n                raise\n            except Exception:\n                if not os.environ.get(\"PYTEST_CURRENT_TEST\", \"\"):\n                    # Normally we shutdown without waiting for other child workers\n                    # on error, because there is no point in waiting for them. Their\n                    # results will be discard. But if the condition above is True,\n                    # then we are running in pytest, and we want everything to exit\n                    # as cleanly as possible so that we get good error messages.\n                    executor.shutdown(wait=False, cancel_futures=True)\n                raise\n            finally:\n                # Terminate log listener\n                log_queue.put_nowait(None)\n\n        # When the above succeeds, wait for the listener thread to exit. 
(If\n        # an exception occurs, we don't try to join, in case it deadlocks.)\n        listener.join()\n\n\n@hookimpl\ndef get_executor(progressbar_class):\n    \"\"\"Return the default executor.\"\"\"\n    return StandardExecutor(pbar_class=progressbar_class)\n\n\nRICH_CONSOLE = RichConsole(stderr=True)\n\n\n@hookimpl\ndef get_progressbar_class():\n    \"\"\"Return the default progress bar class.\"\"\"\n\n    def partial_RichProgressBar(*args, **kwargs):\n        return RichProgressBar(*args, **kwargs, console=RICH_CONSOLE)\n\n    return partial_RichProgressBar\n\n\n@hookimpl\ndef get_logging_console():\n    \"\"\"Return the default logging console handler.\"\"\"\n    return RichLoggingHandler(console=RICH_CONSOLE)\n"
  },
  {
    "path": "src/ocrmypdf/builtin_plugins/default_filters.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"OCRmyPDF automatically installs these filters as plugins.\"\"\"\n\nfrom __future__ import annotations\n\nfrom ocrmypdf import hookimpl\n\n\n@hookimpl\ndef filter_pdf_page(page, image_filename, output_pdf):  # pylint: disable=unused-argument\n    return output_pdf\n"
  },
  {
    "path": "src/ocrmypdf/builtin_plugins/ghostscript.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"Built-in plugin to implement PDF page rasterization and PDF/A production.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom enum import StrEnum\nfrom pathlib import Path\nfrom typing import Annotated\n\nfrom packaging.version import Version\nfrom pikepdf import Name, Pdf, Stream\nfrom pydantic import BaseModel, Field\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf._exec import ghostscript\nfrom ocrmypdf._options import ProcessingMode\nfrom ocrmypdf.exceptions import MissingDependencyError\nfrom ocrmypdf.subprocess import check_external_program\n\nlog = logging.getLogger(__name__)\n\n# Currently all blacklisted versions are lower than 9.55, so none need to\n# be added here. If a future version is blacklisted, add it here.\nBLACKLISTED_GS_VERSIONS: frozenset[Version] = frozenset()\n\n\nclass ColorConversionStrategy(StrEnum):\n    \"\"\"Ghostscript color conversion strategies.\"\"\"\n\n    CMYK = 'CMYK'\n    GRAY = 'Gray'\n    LEAVE_COLOR_UNCHANGED = 'LeaveColorUnchanged'\n    RGB = 'RGB'\n    USE_DEVICE_INDEPENDENT_COLOR = 'UseDeviceIndependentColor'\n\n\nclass PdfaImageCompression(StrEnum):\n    \"\"\"PDF/A image compression methods.\"\"\"\n\n    AUTO = 'auto'\n    JPEG = 'jpeg'\n    LOSSLESS = 'lossless'\n\n\nclass GhostscriptOptions(BaseModel):\n    \"\"\"Options specific to Ghostscript operations.\"\"\"\n\n    color_conversion_strategy: Annotated[\n        ColorConversionStrategy,\n        Field(description=\"Ghostscript color conversion strategy\"),\n    ] = ColorConversionStrategy.LEAVE_COLOR_UNCHANGED\n    pdfa_image_compression: Annotated[\n        PdfaImageCompression, Field(description=\"PDF/A image compression method\")\n    ] = PdfaImageCompression.AUTO\n\n    @classmethod\n    def add_arguments_to_parser(cls, parser, namespace: str = 'ghostscript'):\n        \"\"\"Add Ghostscript-specific arguments to the argument parser.\n\n        
Args:\n            parser: The argument parser to add arguments to\n            namespace: The namespace prefix for argument names (not used for ghostscript\n                for backward compatibility)\n        \"\"\"\n        gs = parser.add_argument_group(\"Ghostscript\", \"Advanced control of Ghostscript\")\n        gs.add_argument(\n            '--color-conversion-strategy',\n            action='store',\n            type=str,\n            choices=[ccs.value for ccs in ColorConversionStrategy],\n            default=ColorConversionStrategy.LEAVE_COLOR_UNCHANGED.value,\n            help=\"Set Ghostscript color conversion strategy\",\n        )\n        gs.add_argument(\n            '--pdfa-image-compression',\n            choices=[pc.value for pc in PdfaImageCompression],\n            default=PdfaImageCompression.AUTO.value,\n            help=\"Specify how to compress images in the output PDF/A. 'auto' lets \"\n            \"OCRmyPDF decide.  'jpeg' changes all grayscale and color images to \"\n            \"JPEG compression.  'lossless' uses PNG-style lossless compression \"\n            \"for all images.  Monochrome images are always compressed using a \"\n            \"lossless codec.  Compression settings \"\n            \"are applied to all pages, including those for which OCR was \"\n            \"skipped.  
Not supported for --output-type=pdf ; that setting \"\n            \"preserves the original compression of all images.\",\n        )\n\n\n@hookimpl\ndef register_options():\n    \"\"\"Register Ghostscript option model.\"\"\"\n    return {'ghostscript': GhostscriptOptions}\n\n\n@hookimpl\ndef add_options(parser):\n    # Use the model's CLI generation method\n    GhostscriptOptions.add_arguments_to_parser(parser)\n\n\n@hookimpl\ndef check_options(options):\n    \"\"\"Check that the options are valid for this plugin.\"\"\"\n    # Only require Ghostscript for pdfa* output types (not 'auto' or 'pdf')\n    # 'auto' mode uses best-effort PDF/A without Ghostscript fallback\n    if options.output_type.startswith('pdfa'):\n        check_external_program(\n            program='gs',\n            package='ghostscript',\n            version_checker=ghostscript.version,\n            need_version='9.54',  # RHEL 9's version; Ubuntu 22.04 has 9.55\n        )\n        gs_version = ghostscript.version()\n        if gs_version in BLACKLISTED_GS_VERSIONS:\n            raise MissingDependencyError(\n                f\"Ghostscript {gs_version} contains serious regressions and is not \"\n                \"supported. Please upgrade to a newer version.\"\n            )\n        if Version('10.0.0') <= gs_version < Version('10.02.1') and (\n            options.mode in (ProcessingMode.skip, ProcessingMode.redo)\n        ):\n            raise MissingDependencyError(\n                f\"Ghostscript 10.0.0 through 10.02.0 (your version: {gs_version}) \"\n                \"contain serious regressions that corrupt PDFs with existing text, \"\n                \"such as those processed using --skip-text or --redo-ocr \"\n                \"(or --mode skip/redo). 
Please upgrade to a newer version, or use \"\n                \"--output-type pdf to avoid Ghostscript, or use --force-ocr \"\n                \"(or --mode force) to discard existing text.\"\n            )\n        if gs_version >= Version('10.6.0'):\n            log.warning(\n                \"Ghostscript 10.6.x contains JPEG encoding errors that may corrupt \"\n                \"images. OCRmyPDF will attempt to mitigate, but this version is \"\n                \"strongly not recommended. Please upgrade to a newer version. \"\n                \"As of 2025-12, 10.6.0 is the latest version of Ghostscript.\"\n            )\n        if options.output_type == 'pdfa':\n            options.output_type = 'pdfa-2'\n\n    if (\n        options.ghostscript.color_conversion_strategy\n        not in ghostscript.COLOR_CONVERSION_STRATEGIES\n    ):\n        raise ValueError(\n            f\"Invalid color conversion strategy: \"\n            f\"{options.ghostscript.color_conversion_strategy}\"\n        )\n    if (\n        options.ghostscript.pdfa_image_compression != 'auto'\n        and options.output_type not in ('auto', 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3')\n    ):\n        log.warning(\n            \"--pdfa-image-compression argument only applies when \"\n            \"--output-type is 'auto' or one of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'\"\n        )\n\n\n@hookimpl\ndef rasterize_pdf_page(\n    input_file,\n    output_file,\n    raster_device,\n    raster_dpi,\n    pageno,\n    page_dpi,\n    rotation,\n    filter_vector,\n    stop_on_soft_error,\n    options,\n    use_cropbox,\n):\n    \"\"\"Rasterize a single page of a PDF file using Ghostscript.\"\"\"\n    # Check if user explicitly requested a different rasterizer\n    if options is not None and options.rasterizer == 'pypdfium':\n        # Let pypdfium handle it (it will error in check_options if unavailable)\n        return None\n\n    ghostscript.rasterize_pdf(\n        input_file,\n        output_file,\n        
raster_device=raster_device,\n        raster_dpi=raster_dpi,\n        pageno=pageno,\n        page_dpi=page_dpi,\n        rotation=rotation,\n        filter_vector=filter_vector,\n        stop_on_error=stop_on_soft_error,\n        use_cropbox=use_cropbox,\n    )\n    return output_file\n\n\ndef _collect_dctdecode_images(pdf: Pdf) -> dict[tuple, list[tuple[Stream, bytes]]]:\n    \"\"\"Collect all DCTDecode (JPEG) images from a PDF.\n\n    Returns a dict mapping image signatures to a list of (stream, raw_bytes) tuples.\n    The signature is (Width, Height, Filter, BitsPerComponent, ColorSpace).\n    \"\"\"\n    images: dict[tuple, list[tuple[Stream, bytes]]] = {}\n\n    def get_colorspace_key(obj):\n        \"\"\"Get a hashable key for the colorspace.\"\"\"\n        cs = obj.get(Name.ColorSpace)\n        if cs is None:\n            return None\n        if isinstance(cs, Name):\n            return str(cs)\n        # For array colorspaces like [/ICCBased ...], use the first element\n        try:\n            return str(cs[0]) if len(cs) > 0 else str(cs)\n        except (TypeError, KeyError):\n            return str(cs)\n\n    def process_xobject_dict(xobjects, depth=0):\n        \"\"\"Process an XObject dictionary for DCTDecode images.\"\"\"\n        if xobjects is None:\n            return\n        if depth > 10:\n            log.warning(\"Recursion depth exceeded in _collect_dctdecode_images\")\n            return\n        for key in xobjects.keys():\n            obj = xobjects[key]\n            if obj is None:\n                continue\n            # Check if it's an image with DCTDecode\n            if obj.get(Name.Subtype) == Name.Image:\n                filt = obj.get(Name.Filter)\n                if filt == Name.DCTDecode:\n                    sig = (\n                        int(obj.get(Name.Width, 0)),\n                        int(obj.get(Name.Height, 0)),\n                        str(filt),\n                        int(obj.get(Name.BitsPerComponent, 0)),\n    
                    get_colorspace_key(obj),\n                    )\n                    raw_bytes = obj.read_raw_bytes()\n                    if sig not in images:\n                        images[sig] = []\n                    images[sig].append((obj, raw_bytes))\n            # Recurse into Form XObjects\n            elif obj.get(Name.Subtype) == Name.Form:\n                if Name.Resources in obj:\n                    res = obj[Name.Resources]\n                    if Name.XObject in res:\n                        process_xobject_dict(res[Name.XObject], depth=depth + 1)\n\n    for page in pdf.pages:\n        if Name.Resources not in page:\n            continue\n        resources = page[Name.Resources]\n        if Name.XObject not in resources:\n            continue\n        process_xobject_dict(resources[Name.XObject])\n\n    return images\n\n\ndef _repair_gs106_jpeg_corruption(\n    input_pdf_path: Path,\n    output_pdf_path: Path,\n) -> bool:\n    \"\"\"Repair JPEG corruption caused by Ghostscript 10.6.\n\n    Ghostscript 10.6 has a bug that truncates JPEG data by 1-15 bytes.\n    This function detects and repairs such corruption by copying the\n    original JPEG bytes from the input PDF.\n\n    Returns True if any repairs were made.\n    \"\"\"\n    repaired_count = 0\n    first_error_logged = False\n\n    with (\n        Pdf.open(input_pdf_path) as input_pdf,\n        Pdf.open(output_pdf_path, allow_overwriting_input=True) as output_pdf,\n    ):\n        # Collect all DCTDecode images from both PDFs\n        input_images = _collect_dctdecode_images(input_pdf)\n        output_images = _collect_dctdecode_images(output_pdf)\n\n        # For each output image, try to find a corresponding input image\n        for sig, output_list in output_images.items():\n            if sig not in input_images:\n                continue\n            input_list = input_images[sig]\n\n            for output_stream, output_bytes in output_list:\n                # Try to find a 
matching input image\n                for _input_stream, input_bytes in input_list:\n                    input_len = len(input_bytes)\n                    output_len = len(output_bytes)\n\n                    # Check if output is 1-15 bytes shorter\n                    diff = input_len - output_len\n                    if not (1 <= diff <= 15):\n                        continue\n\n                    # Check if the bytes are identical up to the truncation point\n                    if output_bytes != input_bytes[:output_len]:\n                        continue\n\n                    # This is a corrupt image - repair it\n                    if not first_error_logged:\n                        log.error(\n                            \"Ghostscript 10.6 JPEG corruption detected. \"\n                            \"Repairing damaged images from original PDF.\"\n                        )\n                        first_error_logged = True\n                    log.warning(\n                        f\"Replacing corrupt JPEG image \"\n                        f\"({sig[0]}x{sig[1]}, {diff} bytes truncated)\"\n                    )\n\n                    # Write the original bytes back to the output stream\n                    output_stream.write(\n                        input_bytes,\n                        filter=Name.DCTDecode,\n                    )\n                    repaired_count += 1\n                    break  # Move to next output image\n\n        if repaired_count > 0:\n            output_pdf.save(output_pdf_path)\n            log.info(\n                f\"Repaired {repaired_count} JPEG image(s) corrupted by Ghostscript\"\n            )\n\n    return repaired_count > 0\n\n\n@hookimpl\ndef generate_pdfa(\n    pdf_pages,\n    pdfmark,\n    output_file,\n    context,\n    pdf_version,\n    pdfa_part,\n    progressbar_class,\n    stop_on_soft_error,\n):\n    \"\"\"Generate a PDF/A from the list of PDF pages and PDF/A metadata.\"\"\"\n    # Normalize output_type at point 
of use\n    output_type = context.options.output_type\n    if output_type == 'pdfa':\n        output_type = 'pdfa-2'\n\n    ghostscript.generate_pdfa(\n        pdf_pages=[pdfmark, *pdf_pages],\n        output_file=output_file,\n        compression=context.options.ghostscript.pdfa_image_compression,\n        color_conversion_strategy=context.options.ghostscript.color_conversion_strategy,\n        pdf_version=pdf_version,\n        pdfa_part=pdfa_part,\n        progressbar_class=progressbar_class,\n        stop_on_error=stop_on_soft_error,\n    )\n\n    # Repair JPEG corruption caused by Ghostscript 10.6.x\n    gs_version = ghostscript.version()\n    if gs_version >= Version('10.6.0') and len(pdf_pages) == 1:\n        input_pdf = Path(pdf_pages[0])\n        _repair_gs106_jpeg_corruption(input_pdf, Path(output_file))\n\n    return output_file\n"
  },
  {
    "path": "src/ocrmypdf/builtin_plugins/null_ocr.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Built-in plugin implementing a null OCR engine (no OCR).\n\nThis plugin provides an OCR engine that produces no text output. It is useful\nwhen users want OCRmyPDF's image processing, PDF/A conversion, or optimization\nfeatures without performing actual OCR.\n\nUsage:\n    ocrmypdf --ocr-engine none input.pdf output.pdf\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING\n\nfrom PIL import Image\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf.hocrtransform import BoundingBox, OcrClass, OcrElement\nfrom ocrmypdf.pluginspec import OcrEngine, OrientationConfidence\n\nif TYPE_CHECKING:\n    from ocrmypdf._options import OcrOptions\n\n\nclass NullOcrEngine(OcrEngine):\n    \"\"\"A no-op OCR engine that produces no text output.\n\n    Use this when you want OCRmyPDF's image processing, PDF/A conversion,\n    or optimization features without performing actual OCR.\n    \"\"\"\n\n    @staticmethod\n    def version() -> str:\n        \"\"\"Return version string.\"\"\"\n        return \"none\"\n\n    @staticmethod\n    def creator_tag(options: OcrOptions) -> str:\n        \"\"\"Return creator tag for PDF metadata.\"\"\"\n        return \"OCRmyPDF (no OCR)\"\n\n    def __str__(self) -> str:\n        \"\"\"Return human-readable engine name.\"\"\"\n        return \"No OCR engine\"\n\n    @staticmethod\n    def languages(options: OcrOptions) -> set[str]:\n        \"\"\"Return supported languages (empty set for null engine).\"\"\"\n        return set()\n\n    @staticmethod\n    def get_orientation(input_file: Path, options: OcrOptions) -> OrientationConfidence:\n        \"\"\"Return neutral orientation (no rotation detected).\"\"\"\n        return OrientationConfidence(angle=0, confidence=0.0)\n\n    @staticmethod\n    def get_deskew(input_file: Path, options: OcrOptions) -> float:\n        \"\"\"Return zero deskew 
angle.\"\"\"\n        return 0.0\n\n    @staticmethod\n    def supports_generate_ocr() -> bool:\n        \"\"\"Return True - this engine supports the generate_ocr() API.\"\"\"\n        return True\n\n    @staticmethod\n    def generate_ocr(\n        input_file: Path,\n        options: OcrOptions,\n        page_number: int = 0,\n    ) -> tuple[OcrElement, str]:\n        \"\"\"Generate empty OCR results.\n\n        Args:\n            input_file: The image file (used to get dimensions).\n            options: OCR options (ignored).\n            page_number: Page number (stored in result).\n\n        Returns:\n            A tuple of (empty OcrElement page, empty string).\n        \"\"\"\n        # Get image dimensions\n        with Image.open(input_file) as img:\n            width, height = img.size\n            dpi_info = img.info.get('dpi', (72, 72))\n            dpi = dpi_info[0] if isinstance(dpi_info, tuple) else dpi_info\n\n        # Create empty page element with correct dimensions\n        page = OcrElement(\n            ocr_class=OcrClass.PAGE,\n            bbox=BoundingBox(left=0, top=0, right=width, bottom=height),\n            dpi=float(dpi),\n            page_number=page_number,\n        )\n\n        return page, \"\"\n\n    @staticmethod\n    def generate_hocr(\n        input_file: Path,\n        output_hocr: Path,\n        output_text: Path,\n        options: OcrOptions,\n    ) -> None:\n        \"\"\"Generate empty hOCR file.\n\n        Creates minimal valid hOCR output with no text content.\n        \"\"\"\n        # Get image dimensions for hOCR bbox\n        with Image.open(input_file) as img:\n            width, height = img.size\n\n        hocr_content = f'''<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n<head>\n    <title>OCRmyPDF - No 
OCR</title>\n    <meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"/>\n    <meta name='ocr-system' content='OCRmyPDF null engine'/>\n</head>\n<body>\n    <div class='ocr_page' title='bbox 0 0 {width} {height}'>\n    </div>\n</body>\n</html>\n'''\n        output_hocr.write_text(hocr_content, encoding='utf-8')\n        output_text.write_text('', encoding='utf-8')\n\n    @staticmethod\n    def generate_pdf(\n        input_file: Path,\n        output_pdf: Path,\n        output_text: Path,\n        options: OcrOptions,\n    ) -> None:\n        \"\"\"NullOcrEngine cannot generate PDFs directly.\n\n        Use pdf_renderer='fpdf2' instead of 'sandwich'.\n        \"\"\"\n        raise NotImplementedError(\n            \"NullOcrEngine cannot generate PDFs directly. \"\n            \"Use --pdf-renderer fpdf2 instead of sandwich mode.\"\n        )\n\n\n@hookimpl\ndef get_ocr_engine(options):\n    \"\"\"Return NullOcrEngine when --ocr-engine none is selected.\"\"\"\n    if options is not None:\n        ocr_engine = getattr(options, 'ocr_engine', 'auto')\n        if ocr_engine != 'none':\n            return None\n    return NullOcrEngine()\n"
  },
  {
    "path": "src/ocrmypdf/builtin_plugins/optimize.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"Built-in plugin to implement PDF page optimization.\"\"\"\n\nfrom __future__ import annotations\n\nimport argparse\nimport logging\nfrom collections.abc import Sequence\nfrom pathlib import Path\nfrom typing import Annotated\n\nfrom pydantic import BaseModel, Field, model_validator\n\nfrom ocrmypdf import Executor, PdfContext, hookimpl\nfrom ocrmypdf._exec import jbig2enc, pngquant\nfrom ocrmypdf._pipeline import get_pdf_save_settings\nfrom ocrmypdf.cli import numeric\nfrom ocrmypdf.optimize import optimize\nfrom ocrmypdf.subprocess import check_external_program\n\nlog = logging.getLogger(__name__)\n\n\nclass OptimizeOptions(BaseModel):\n    \"\"\"Options specific to PDF optimization.\"\"\"\n\n    level: Annotated[\n        int,\n        Field(\n            ge=0,\n            le=3,\n            description=\"Optimization level (0=none, 1=safe, 2=lossy, 3=aggressive)\",\n        ),\n    ] = 1\n    jpeg_quality: Annotated[\n        int, Field(ge=0, le=100, description=\"JPEG quality level for optimization\")\n    ] = 0\n    png_quality: Annotated[\n        int, Field(ge=0, le=100, description=\"PNG quality level for optimization\")\n    ] = 0\n    jbig2_threshold: Annotated[\n        float,\n        Field(ge=0.4, le=0.9, description=\"JBIG2 symbol classification threshold\"),\n    ] = 0.85\n\n    @classmethod\n    def add_arguments_to_parser(cls, parser, namespace: str = 'optimize'):\n        \"\"\"Add optimization-specific arguments to the argument parser.\n\n        Args:\n            parser: The argument parser to add arguments to\n            namespace: The namespace prefix for argument names\n                (not used for optimize for backward compatibility)\n        \"\"\"\n        optimizing = parser.add_argument_group(\n            \"Optimization options\", \"Control how the PDF is optimized after OCR\"\n        )\n        optimizing.add_argument(\n       
     '-O',\n            '--optimize',\n            type=int,\n            choices=range(0, 4),\n            default=1,\n            help=(\n                \"Control how PDF is optimized after processing:\"\n                \"0 - do not optimize; \"\n                \"1 - do safe, lossless optimizations (default); \"\n                \"2 - do lossy JPEG and JPEG2000 optimizations; \"\n                \"3 - do more aggressive lossy JPEG and JPEG2000 optimizations. \"\n                \"To enable lossy JBIG2, see --jbig2-lossy.\"\n            ),\n        )\n        optimizing.add_argument(\n            '--jpeg-quality',\n            type=numeric(int, 0, 100),\n            default=0,\n            metavar='Q',\n            help=(\n                \"Adjust JPEG quality level for JPEG optimization. \"\n                \"100 is best quality and largest output size; \"\n                \"1 is lowest quality and smallest output; \"\n                \"0 uses the default.\"\n            ),\n        )\n        optimizing.add_argument(\n            '--jpg-quality',\n            type=numeric(int, 0, 100),\n            default=0,\n            metavar='Q',\n            dest='jpeg_quality',\n            help=argparse.SUPPRESS,  # Alias for --jpeg-quality\n        )\n        optimizing.add_argument(\n            '--png-quality',\n            type=numeric(int, 0, 100),\n            default=0,\n            metavar='Q',\n            help=(\n                \"Adjust PNG quality level to use when quantizing PNGs. 
\"\n                \"Values have same meaning as with --jpeg-quality\"\n            ),\n        )\n        # Deprecated arguments - kept for backward compatibility, emit warnings\n        optimizing.add_argument(\n            '--jbig2-lossy',\n            action='store_true',\n            help=argparse.SUPPRESS,  # Deprecated, hidden from help\n        )\n        optimizing.add_argument(\n            '--jbig2-page-group-size',\n            type=numeric(int, 1, 10000),\n            default=0,\n            metavar='N',\n            help=argparse.SUPPRESS,  # Deprecated, hidden from help\n        )\n        optimizing.add_argument(\n            '--jbig2-threshold',\n            type=numeric(float, 0.4, 0.9),\n            default=0.85,\n            metavar='T',\n            help=(\n                \"Adjust JBIG2 symbol code classification threshold \"\n                \"(default 0.85), range 0.4 to 0.9.\"\n            ),\n        )\n\n    @model_validator(mode='after')\n    def validate_optimization_consistency(self):\n        \"\"\"Validate optimization options are consistent.\"\"\"\n        if self.level == 0 and any([self.png_quality > 0, self.jpeg_quality > 0]):\n            log.warning(\n                \"The arguments --png-quality and --jpeg-quality \"\n                \"will be ignored because --optimize=0.\"\n            )\n        return self\n\n    def validate_with_context(\n        self, external_programs_available: dict[str, bool]\n    ) -> None:\n        \"\"\"Validate options that require external context.\n\n        Args:\n            external_programs_available: Dict of program name -> availability\n        \"\"\"\n        if self.level >= 2:\n            if not external_programs_available.get('pngquant', False):\n                log.warning(\n                    \"pngquant is not available, so PNG optimization will be limited\"\n                )\n            if not external_programs_available.get('jbig2enc', False):\n                log.warning(\n  
                  \"jbig2enc is not available, so JBIG2 optimization will be limited\"\n                )\n\n\n@hookimpl\ndef register_options():\n    \"\"\"Register optimization option model.\"\"\"\n    return {'optimize': OptimizeOptions}\n\n\n@hookimpl\ndef add_options(parser):\n    # Use the model's CLI generation method\n    OptimizeOptions.add_arguments_to_parser(parser)\n\n\n@hookimpl\ndef check_options(options):\n    \"\"\"Check external dependencies for optimization.\"\"\"\n    # Warn about deprecated options\n    if getattr(options, 'jbig2_lossy', False):\n        log.warning(\n            \"The --jbig2-lossy option is deprecated and will be ignored. \"\n            \"Lossy JBIG2 compression has been removed due to risks of \"\n            \"character substitution errors.\"\n        )\n    if getattr(options, 'jbig2_page_group_size', 0) not in (0, None):\n        log.warning(\n            \"The --jbig2-page-group-size option is deprecated and will be ignored.\"\n        )\n\n    if options.optimize >= 2:\n        check_external_program(\n            program='pngquant',\n            package='pngquant',\n            version_checker=pngquant.version,\n            need_version='2.12.2',\n            required_for='--optimize {2,3}',\n        )\n\n    if options.optimize >= 2:\n        # Although we use JBIG2 for optimize=1, don't nag about it unless the\n        # user is asking for more optimization\n        check_external_program(\n            program='jbig2',\n            package='jbig2enc',\n            version_checker=jbig2enc.version,\n            need_version='0.28',\n            required_for='--optimize {2,3}',\n            recommended=True,\n        )\n\n\n@hookimpl\ndef optimize_pdf(\n    input_pdf: Path,\n    output_pdf: Path,\n    context: PdfContext,\n    executor: Executor,\n    linearize: bool,\n) -> tuple[Path, Sequence[str]]:\n    save_settings = dict(\n        linearize=linearize,\n        
**get_pdf_save_settings(context.options.output_type),\n    )\n    result_path = optimize(input_pdf, output_pdf, context, save_settings, executor)\n    messages = []\n    if context.options.optimize == 0:\n        messages.append(\"Optimization was disabled.\")\n    else:\n        image_optimizers = {\n            'jbig2': jbig2enc.available(),\n            'pngquant': pngquant.available(),\n        }\n        for name, available in image_optimizers.items():\n            if not available:\n                messages.append(\n                    f\"The optional dependency '{name}' was not found, so some image \"\n                    f\"optimizations could not be attempted.\"\n                )\n    return result_path, messages\n\n\n@hookimpl\ndef is_optimization_enabled(context: PdfContext) -> bool:\n    return context.options.optimize != 0\n"
  },
  {
    "path": "src/ocrmypdf/builtin_plugins/pypdfium.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"Built-in plugin to implement PDF page rasterization using pypdfium2.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport threading\nfrom contextlib import closing\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Literal\n\nif TYPE_CHECKING:\n    import pypdfium2 as pdfium\nelse:\n    try:\n        import pypdfium2 as pdfium\n    except ImportError:\n        pdfium = None\nfrom PIL import Image\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf.exceptions import MissingDependencyError\nfrom ocrmypdf.helpers import Resolution\n\nlog = logging.getLogger(__name__)\n\n# pypdfium2/PDFium is not thread-safe. All calls to the library must be serialized.\n# See: https://pypdfium2.readthedocs.io/en/stable/python_api.html#incompatibility-with-threading\n# When using process-based parallelism (use_threads=False), each process has its own\n# pdfium instance, so locking is not needed across processes.\n_pdfium_lock = threading.Lock()\n\n\n@hookimpl\ndef check_options(options):\n    \"\"\"Check that pypdfium2 is available if explicitly requested.\"\"\"\n    if options.rasterizer == 'pypdfium' and pdfium is None:\n        raise MissingDependencyError(\n            \"The --rasterizer pypdfium option requires the pypdfium2 package. \"\n            \"Install it with: pip install pypdfium2\"\n        )\n\n\ndef _open_pdf_document(input_file: Path):\n    \"\"\"Open a PDF document using pypdfium2.\"\"\"\n    assert pdfium is not None, \"pypdfium2 must be available to call this function\"\n    return pdfium.PdfDocument(input_file)\n\n\ndef _calculate_mediabox_crop(page) -> tuple[float, float, float, float]:\n    \"\"\"Calculate crop values to expand rendering from CropBox to MediaBox.\n\n    By default pypdfium2 renders to the CropBox. 
To render the full MediaBox,\n    we need negative crop values to expand the rendering area.\n\n    Returns:\n        Tuple of (left, bottom, right, top) crop values. Negative values\n        expand the rendering area beyond the CropBox to the MediaBox.\n    \"\"\"\n    mediabox = page.get_mediabox()  # (left, bottom, right, top)\n    cropbox = page.get_cropbox()  # (left, bottom, right, top), defaults to mediabox\n\n    # Calculate how much to expand from cropbox to mediabox\n    # Negative values = expand, positive = shrink\n    return (\n        mediabox[0] - cropbox[0],  # Expand left\n        mediabox[1] - cropbox[1],  # Expand bottom\n        cropbox[2] - mediabox[2],  # Expand right\n        cropbox[3] - mediabox[3],  # Expand top\n    )\n\n\ndef _render_page_to_bitmap(\n    page: pdfium.PdfPage,\n    raster_device: str,\n    raster_dpi: Resolution,\n    rotation: int | None,\n    use_cropbox: bool,\n) -> tuple[pdfium.PdfBitmap, int, int]:\n    \"\"\"Render a PDF page to a bitmap.\"\"\"\n    # Round DPI to match Ghostscript's precision\n    raster_dpi = raster_dpi.round(6)\n\n    # Get page dimensions BEFORE applying rotation\n    page_width_pts, page_height_pts = page.get_size()\n\n    # Calculate expected output dimensions using separate x/y DPI\n    expected_width = int(round(page_width_pts * raster_dpi.x / 72.0))\n    expected_height = int(round(page_height_pts * raster_dpi.y / 72.0))\n\n    # Calculate the scale factor based on DPI\n    # pypdfium2 uses points (72 DPI) as base unit\n    scale = raster_dpi.to_scalar() / 72.0\n\n    # Apply rotation if specified\n    if rotation:\n        # pypdfium2 rotation is in degrees, same as our input\n        # we track rotation in CCW, and pypdfium2 expects CW, so negate\n        page.set_rotation(-rotation % 360)\n        # When rotation is 90 or 270, dimensions are swapped in output\n        if rotation % 180 == 90:\n            expected_width, expected_height = expected_height, expected_width\n\n    # Render 
the page to a bitmap\n    # The scale parameter controls the resolution\n    # Render in grayscale for mono and gray devices (better input for 1-bit conversion)\n    grayscale = raster_device.lower() in ('pngmono', 'pnggray', 'jpeggray')\n\n    # Calculate crop to render the appropriate box\n    # Default (use_cropbox=False) renders MediaBox for consistency with Ghostscript\n    crop = (0, 0, 0, 0) if use_cropbox else _calculate_mediabox_crop(page)\n\n    bitmap = page.render(\n        scale=scale,\n        rotation=0,  # We already set rotation on the page\n        crop=crop,\n        may_draw_forms=True,\n        draw_annots=True,\n        grayscale=grayscale,\n        # Note: pypdfium2 doesn't have a direct equivalent to filter_vector\n        # This would require more complex implementation if needed\n    )\n    return bitmap, expected_width, expected_height\n\n\ndef _process_image_for_output(\n    pil_image: Image.Image,\n    raster_device: str,\n    raster_dpi: Resolution,\n    page_dpi: Resolution | None,\n    stop_on_soft_error: bool,\n    expected_width: int | None = None,\n    expected_height: int | None = None,\n) -> tuple[Image.Image, Literal['PNG', 'TIFF', 'JPEG']]:\n    \"\"\"Process PIL image for output format and set DPI metadata.\"\"\"\n    # Correct dimensions if slightly off (within 2 pixels tolerance)\n    if expected_width and expected_height:\n        actual_width, actual_height = pil_image.width, pil_image.height\n        width_diff = abs(actual_width - expected_width)\n        height_diff = abs(actual_height - expected_height)\n\n        # Only resize if off by small amount (1-2 pixels)\n        if (width_diff <= 2 or height_diff <= 2) and (\n            width_diff > 0 or height_diff > 0\n        ):\n            log.debug(\n                f\"Adjusting rendered dimensions from \"\n                f\"{actual_width}x{actual_height} to expected \"\n                f\"{expected_width}x{expected_height}\"\n            )\n            pil_image = 
pil_image.resize(\n                (expected_width, expected_height), Image.Resampling.LANCZOS\n            )\n\n    # Set the DPI metadata if page_dpi is specified\n    if page_dpi:\n        # PIL expects DPI as a tuple\n        dpi_tuple = (float(page_dpi.x), float(page_dpi.y))\n        pil_image.info['dpi'] = dpi_tuple\n    else:\n        # Use the raster DPI\n        dpi_tuple = (float(raster_dpi.x), float(raster_dpi.y))\n        pil_image.info['dpi'] = dpi_tuple\n\n    # Convert image mode to match raster_device\n    # This ensures pypdfium output matches Ghostscript's native device output\n    raster_device_lower = raster_device.lower()\n\n    if raster_device_lower == 'pngmono':\n        # Convert to 1-bit black and white (matches Ghostscript pngmono device)\n        if pil_image.mode != '1':\n            if pil_image.mode not in ('L', '1'):\n                pil_image = pil_image.convert('L')\n            pil_image = pil_image.convert('1')\n    elif raster_device_lower in ('pnggray', 'jpeggray'):\n        # Convert to 8-bit grayscale\n        if pil_image.mode not in ('L', '1'):\n            pil_image = pil_image.convert('L')\n    elif raster_device_lower == 'png256':\n        # Convert to 8-bit indexed color (256 colors)\n        if pil_image.mode != 'P':\n            if pil_image.mode not in ('RGB', 'RGBA'):\n                pil_image = pil_image.convert('RGB')\n            pil_image = pil_image.quantize(colors=256)\n    elif raster_device_lower in ('png16m', 'jpeg'):\n        # Convert to RGB\n        if pil_image.mode == 'RGBA':\n            background = Image.new('RGB', pil_image.size, (255, 255, 255))\n            background.paste(pil_image, mask=pil_image.split()[-1])\n            pil_image = background\n        elif pil_image.mode not in ('RGB',):\n            pil_image = pil_image.convert('RGB')\n    # pngalpha: keep RGBA as-is\n\n    # Determine output format based on raster_device\n    png_devices = ('png', 'pngmono', 'pnggray', 'png256', 
'png16m', 'pngalpha')\n    if raster_device_lower in png_devices:\n        format_name = 'PNG'\n    elif raster_device_lower in ('jpeg', 'jpeggray', 'jpg'):\n        format_name = 'JPEG'\n    elif raster_device_lower in ('tiff', 'tif'):\n        format_name = 'TIFF'\n    else:\n        # Default to PNG for unknown formats\n        format_name = 'PNG'\n        if stop_on_soft_error:\n            raise ValueError(f\"Unsupported raster device: {raster_device}\")\n        else:\n            log.warning(f\"Unsupported raster device {raster_device}, using PNG\")\n\n    return pil_image, format_name\n\n\ndef _save_image(pil_image: Image.Image, output_file: Path, format_name: str) -> None:\n    \"\"\"Save PIL image to file with appropriate DPI metadata.\"\"\"\n    save_kwargs = {}\n    if (\n        format_name in ('PNG', 'TIFF')\n        and 'dpi' in pil_image.info\n        or format_name == 'JPEG'\n        and 'dpi' in pil_image.info\n    ):\n        save_kwargs['dpi'] = pil_image.info['dpi']\n\n    pil_image.save(output_file, format=format_name, **save_kwargs)\n\n\n@hookimpl\ndef rasterize_pdf_page(\n    input_file: Path,\n    output_file: Path,\n    raster_device: str,\n    raster_dpi: Resolution,\n    pageno: int,\n    page_dpi: Resolution | None,\n    rotation: int | None,\n    filter_vector: bool,\n    stop_on_soft_error: bool,\n    options,\n    use_cropbox: bool,\n) -> Path | None:\n    \"\"\"Rasterize a single page of a PDF file using pypdfium2.\n\n    Returns None if pypdfium2 is not available or if the user has selected\n    a different rasterizer, allowing Ghostscript to be used.\n    \"\"\"\n    # Check if user explicitly requested a different rasterizer\n    if options is not None and options.rasterizer == 'ghostscript':\n        return None  # Let Ghostscript handle it\n\n    if pdfium is None:\n        return None  # Fall back to Ghostscript\n\n    # Acquire lock to ensure thread-safe access to pypdfium2\n    with (\n        _pdfium_lock,\n        
closing(_open_pdf_document(input_file)) as pdf,\n        closing(pdf[pageno - 1]) as page,\n    ):\n        # Render the page to a bitmap\n        bitmap, expected_width, expected_height = _render_page_to_bitmap(\n            page, raster_device, raster_dpi, rotation, use_cropbox\n        )\n        with closing(bitmap):\n            # Convert to PIL Image\n            pil_image = bitmap.to_pil()\n\n    # Process and save image outside the lock (PIL operations are thread-safe)\n    pil_image, format_name = _process_image_for_output(\n        pil_image,\n        raster_device,\n        raster_dpi,\n        page_dpi,\n        stop_on_soft_error,\n        expected_width,\n        expected_height,\n    )\n\n    _save_image(pil_image, output_file, format_name)\n\n    return output_file\n"
  },
  {
    "path": "src/ocrmypdf/builtin_plugins/tesseract_ocr.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"Built-in plugin to implement OCR using Tesseract.\"\"\"\n\nfrom __future__ import annotations\n\nimport argparse\nimport logging\nimport os\nfrom typing import Annotated\n\nfrom PIL import Image\nfrom pydantic import BaseModel, Field, field_validator, model_validator\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf._exec import tesseract\nfrom ocrmypdf._exec.tesseract import ThresholdingMethod\nfrom ocrmypdf._jobcontext import PageContext\nfrom ocrmypdf.cli import numeric\nfrom ocrmypdf.exceptions import BadArgsError, MissingDependencyError\nfrom ocrmypdf.helpers import available_cpu_count, clamp\nfrom ocrmypdf.imageops import calculate_downsample, downsample_image\nfrom ocrmypdf.pluginspec import OcrEngine\nfrom ocrmypdf.subprocess import check_external_program\n\nlog = logging.getLogger(__name__)\n\n\ndef _thresholding_method_converter(value: str) -> ThresholdingMethod:\n    \"\"\"Convert string argument to ThresholdingMethod enum.\n\n    Args:\n        value: String name of thresholding method (auto, otsu, adaptive-otsu, sauvola)\n\n    Returns:\n        ThresholdingMethod enum value\n\n    Raises:\n        argparse.ArgumentTypeError: If value is not a valid thresholding method\n    \"\"\"\n    method_map = {\n        'auto': ThresholdingMethod.AUTO,\n        'otsu': ThresholdingMethod.OTSU,\n        'adaptive-otsu': ThresholdingMethod.ADAPTIVE_OTSU,\n        'sauvola': ThresholdingMethod.SAUVOLA,\n    }\n    if value.lower() not in method_map:\n        import argparse\n\n        valid = ', '.join(method_map.keys())\n        raise argparse.ArgumentTypeError(\n            f\"Invalid thresholding method '{value}'. 
Must be one of: {valid}\"\n        )\n    return method_map[value.lower()]\n\n\nclass TesseractOptions(BaseModel):\n    \"\"\"Options specific to Tesseract OCR engine.\"\"\"\n\n    config: Annotated[\n        list[str], Field(description=\"Additional Tesseract configuration files\")\n    ] = []\n    pagesegmode: Annotated[\n        int | None,\n        Field(ge=0, le=13, description=\"Set Tesseract page segmentation mode\"),\n    ] = None\n    oem: Annotated[\n        int | None, Field(ge=0, le=3, description=\"Set Tesseract OCR engine mode\")\n    ] = None\n    thresholding: Annotated[\n        ThresholdingMethod,\n        Field(description=\"Set Tesseract input image thresholding mode\"),\n    ] = ThresholdingMethod.AUTO\n    timeout: Annotated[\n        float, Field(ge=0, description=\"Timeout for OCR operations in seconds\")\n    ] = 180.0\n    non_ocr_timeout: Annotated[\n        float, Field(ge=0, description=\"Timeout for non-OCR operations in seconds\")\n    ] = 180.0\n    downsample_large_images: Annotated[\n        bool, Field(description=\"Downsample large images before OCR\")\n    ] = True\n    downsample_above: Annotated[\n        int,\n        Field(\n            ge=100,\n            le=32767,\n            description=\"Downsample images larger than this pixel size\",\n        ),\n    ] = 32767\n    user_words: Annotated[\n        str | None, Field(description=\"Path to Tesseract user words file\")\n    ] = None\n    user_patterns: Annotated[\n        str | None, Field(description=\"Path to Tesseract user patterns file\")\n    ] = None\n    omp_thread_limit: Annotated[\n        int | None,\n        Field(\n            description=\"Calculated OMP_THREAD_LIMIT for Tesseract subprocesses\",\n            exclude=True,\n        ),\n    ] = None\n\n    @classmethod\n    def add_arguments_to_parser(cls, parser, namespace: str = 'tesseract'):\n        \"\"\"Add Tesseract-specific arguments to the argument parser.\n\n        Args:\n            parser: The 
argument parser to add arguments to\n            namespace: The namespace prefix for argument names\n        \"\"\"\n        tess = parser.add_argument_group(\n            \"Tesseract\", \"Advanced control of Tesseract OCR\"\n        )\n\n        tess.add_argument(\n            f'--{namespace}-config',\n            action='append',\n            metavar='CFG',\n            default=[],\n            dest=f'{namespace}_config',\n            help=\"Additional Tesseract configuration files -- see documentation.\",\n        )\n\n        tess.add_argument(\n            f'--{namespace}-pagesegmode',\n            action='store',\n            type=int,\n            metavar='PSM',\n            choices=range(0, 14),\n            dest=f'{namespace}_pagesegmode',\n            help=\"Set Tesseract page segmentation mode (see tesseract --help).\",\n        )\n\n        tess.add_argument(\n            f'--{namespace}-oem',\n            action='store',\n            type=int,\n            metavar='MODE',\n            choices=range(0, 4),\n            dest=f'{namespace}_oem',\n            help=(\n                \"Set Tesseract 4+ OCR engine mode: \"\n                \"0 - original Tesseract only; \"\n                \"1 - neural nets LSTM only; \"\n                \"2 - Tesseract + LSTM; \"\n                \"3 - default.\"\n            ),\n        )\n\n        tess.add_argument(\n            f'--{namespace}-thresholding',\n            action='store',\n            type=_thresholding_method_converter,\n            default='auto',\n            dest=f'{namespace}_thresholding',\n            help=(\n                \"Set Tesseract 5.0+ input image thresholding mode. This may improve \"\n                \"OCR results on low quality images or those that contain high \"\n                \"contrast color. Options: auto, otsu, adaptive-otsu, sauvola. 
\"\n                \"auto/otsu is the Tesseract default (legacy Otsu); adaptive-otsu \"\n                \"is an improved Otsu algorithm with improved sort for background \"\n                \"color changes; sauvola is based on local standard deviation.\"\n            ),\n        )\n\n        tess.add_argument(\n            f'--{namespace}-timeout',\n            default=180.0,\n            type=numeric(float, 0),\n            metavar='SECONDS',\n            dest=f'{namespace}_timeout',\n            help=(\n                \"Give up on OCR after the timeout, but copy the preprocessed page \"\n                \"into the final output. This timeout is only used when using Tesseract \"\n                \"for OCR. When Tesseract is used for other operations such as \"\n                \"deskewing and orientation, the timeout is controlled by \"\n                f\"--{namespace}-non-ocr-timeout.\"\n            ),\n        )\n\n        tess.add_argument(\n            f'--{namespace}-non-ocr-timeout',\n            default=180.0,\n            type=numeric(float, 0),\n            metavar='SECONDS',\n            dest=f'{namespace}_non_ocr_timeout',\n            help=(\n                \"Give up on non-OCR operations such as deskewing and orientation \"\n                f\"after timeout. This is a separate timeout from --{namespace}-timeout \"\n                \"because these operations are not as expensive as OCR.\"\n            ),\n        )\n\n        tess.add_argument(\n            f'--{namespace}-downsample-large-images',\n            action=argparse.BooleanOptionalAction,\n            default=True,\n            dest=f'{namespace}_downsample_large_images',\n            help=(\n                \"Downsample large images before OCR. Tesseract has \"\n                \"an upper limit on the size images it will support.\"\n                \" If this argument is given, OCRmyPDF will \"\n                \"downsample large images to fit Tesseract. 
This \"\n                \"may reduce OCR quality, on large images the most\"\n                \" desirable text is usually larger. If this \"\n                \"parameter is not supplied, Tesseract will error \"\n                \"out and produce no OCR on the page in question. \"\n                \"This argument should be used with a high value \"\n                f\"of --{namespace}-timeout to ensure Tesseract \"\n                \"has enough to time.\"\n            ),\n        )\n\n        tess.add_argument(\n            f'--{namespace}-downsample-above',\n            action='store',\n            type=numeric(int, 100, 32767),\n            default=32767,\n            dest=f'{namespace}_downsample_above',\n            help=(\n                \"Downsample images larger than this size pixel size (either dimension) \"\n                f\"before OCR. --{namespace}-downsample-large-images downsamples when \"\n                \"an image exceeds Tesseract's internal limits. This argument causes \"\n                \"downsampling to occur when an image exceeds the given size. This may \"\n                \"reduce OCR quality, but on large images the most desirable text is \"\n                \"usually larger.\"\n            ),\n        )\n\n        tess.add_argument(\n            '--user-words',\n            metavar='FILE',\n            dest='user_words',\n            help=\"Specify the location of the Tesseract user words file. This is a \"\n            \"list of words Tesseract should consider while performing OCR in \"\n            \"addition to its standard language dictionaries. 
This can improve \"\n            \"OCR quality especially for specialized and technical documents.\",\n        )\n        tess.add_argument(\n            '--user-patterns',\n            metavar='FILE',\n            dest='user_patterns',\n            help=\"Specify the location of the Tesseract user patterns file.\",\n        )\n\n    @field_validator('timeout', 'non_ocr_timeout')\n    @classmethod\n    def validate_timeout_reasonable(cls, v):\n        \"\"\"Validate timeout values are reasonable.\"\"\"\n        if v > 3600:  # 1 hour\n            log.warning(f\"Timeout of {v} seconds is very long and may cause issues\")\n        return v\n\n    @field_validator('pagesegmode')\n    @classmethod\n    def validate_pagesegmode_warning(cls, v):\n        \"\"\"Validate page segmentation mode and warn about problematic values.\"\"\"\n        if v in (0, 2):\n            log.warning(\n                \"The tesseract-pagesegmode you selected will disable OCR. \"\n                \"This may cause processing to fail.\"\n            )\n        return v\n\n    @model_validator(mode='after')\n    def validate_downsample_consistency(self):\n        \"\"\"Validate downsample options are consistent.\"\"\"\n        if self.downsample_above != 32767 and not self.downsample_large_images:\n            log.warning(\n                \"The --tesseract-downsample-above argument will have no effect unless \"\n                \"--tesseract-downsample-large-images is also given.\"\n            )\n        return self\n\n    def validate_with_context(self, languages: list[str]) -> None:\n        \"\"\"Validate options that require external context.\n\n        Args:\n            languages: List of languages being used for OCR\n        \"\"\"\n        # Validate languages are not internal Tesseract languages\n        DENIED_LANGUAGES = {'equ', 'osd'}\n        if DENIED_LANGUAGES & set(languages):\n            raise BadArgsError(\n                \"The following languages are for Tesseract's 
internal use \"\n                \"and should not be issued explicitly: \"\n                f\"{', '.join(DENIED_LANGUAGES & set(languages))}\\n\"\n                \"Remove them from the -l/--language argument.\"\n            )\n\n\n@hookimpl\ndef register_options():\n    \"\"\"Register Tesseract option model.\"\"\"\n    return {'tesseract': TesseractOptions}\n\n\n@hookimpl\ndef add_options(parser):\n    # Use the model's CLI generation method - it now handles all Tesseract options\n    TesseractOptions.add_arguments_to_parser(parser)\n\n\n@hookimpl\ndef check_options(options):\n    \"\"\"Check external dependencies and version compatibility for Tesseract.\"\"\"\n    check_external_program(\n        program='tesseract',\n        package={'linux': 'tesseract-ocr'},\n        version_checker=tesseract.version,\n        need_version='4.1.1',  # Ubuntu 22.04 version (also 20.04)\n        version_parser=tesseract.TesseractVersion,\n    )\n    tess_version = tesseract.version()\n    if tess_version == tesseract.TesseractVersion('5.4.0'):\n        raise MissingDependencyError(\n            \"Tesseract 5.4.0 is not supported due to regressions in this version. \"\n            \"Please upgrade to a newer or supported older version.\"\n        )\n\n    # Check version-specific feature compatibility\n    if (\n        not tesseract.has_thresholding()\n        and options.tesseract.thresholding != ThresholdingMethod.AUTO\n    ):\n        log.warning(\n            \"The installed version of Tesseract does not support changes to its \"\n            \"thresholding method. The --tesseract-threshold argument will be \"\n            \"ignored.\"\n        )\n\n\n@hookimpl\ndef validate(pdfinfo, options):\n    # Tesseract 4.x can be multithreaded, and we also run multiple workers. 
We want\n    # to manage how many threads it uses to avoid creating total threads than cores.\n    # Performance testing shows we're better off\n    # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we\n    # get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the\n    # input file is small, then we allow Tesseract to use threads, subject to the\n    # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers.\n    # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system.\n    if not os.environ.get('OMP_THREAD_LIMIT', '').isnumeric():\n        jobs = options.jobs or available_cpu_count()\n        tess_threads = clamp(jobs // len(pdfinfo), 1, 3)\n    else:\n        tess_threads = int(os.environ['OMP_THREAD_LIMIT'])\n    # Store the thread limit in options - it will be passed to subprocess env\n    options.tesseract.omp_thread_limit = tess_threads\n    log.debug(\"Using Tesseract OpenMP thread limit %d\", tess_threads)\n\n    if (\n        options.tesseract.downsample_above != 32767\n        and not options.tesseract.downsample_large_images\n    ):\n        log.warning(\n            \"The --tesseract-downsample-above argument will have no effect unless \"\n            \"--tesseract-downsample-large-images is also given.\"\n        )\n\n\n@hookimpl\ndef filter_ocr_image(page: PageContext, image: Image.Image) -> Image.Image:\n    \"\"\"Filter the image before OCR.\n\n    Tesseract cannot handle images with more than 32767 pixels in either axis,\n    or more than 2**31 bytes. 
This function resizes the image to fit within\n    those limits.\n    \"\"\"\n    options = page.options\n    if getattr(options, 'tesseract', None) is None:\n        return image\n    threshold = min(options.tesseract.downsample_above, 32767)\n\n    if options.tesseract.downsample_large_images:\n        size = calculate_downsample(\n            image, max_size=(threshold, threshold), max_bytes=(2**31) - 1\n        )\n        image = downsample_image(image, size)\n    return image\n\n\nclass TesseractOcrEngine(OcrEngine):\n    \"\"\"Implements OCR with Tesseract.\"\"\"\n\n    @staticmethod\n    def version():\n        return str(tesseract.version())\n\n    @staticmethod\n    def _determine_renderer(options):\n        \"\"\"Determine the PDF renderer to use based on options and languages.\"\"\"\n        if options.pdf_renderer == 'auto':\n            return 'fpdf2'\n        return options.pdf_renderer\n\n    @staticmethod\n    def creator_tag(options):\n        renderer = TesseractOcrEngine._determine_renderer(options)\n        match renderer:\n            case 'hocr':\n                return f\"OCRmyPDF hOCR + Tesseract OCR {TesseractOcrEngine.version()}\"\n            case 'fpdf2':\n                return f\"OCRmyPDF fpdf2 + Tesseract OCR {TesseractOcrEngine.version()}\"\n            case \"sandwich\":\n                return f\"Tesseract OCR + PDF {TesseractOcrEngine.version()}\"\n            case _:\n                return f\"Tesseract OCR {TesseractOcrEngine.version()}\"\n\n    def __str__(self):\n        return f\"Tesseract OCR {TesseractOcrEngine.version()}\"\n\n    @staticmethod\n    def languages(options):\n        return tesseract.get_languages()\n\n    @staticmethod\n    def get_orientation(input_file, options):\n        return tesseract.get_orientation(\n            input_file,\n            engine_mode=options.tesseract.oem,\n            timeout=options.tesseract.non_ocr_timeout,\n            omp_thread_limit=options.tesseract.omp_thread_limit,\n        
)\n\n    @staticmethod\n    def get_deskew(input_file, options) -> float:\n        return tesseract.get_deskew(\n            input_file,\n            languages=options.languages,\n            engine_mode=options.tesseract.oem,\n            timeout=options.tesseract.non_ocr_timeout,\n            omp_thread_limit=options.tesseract.omp_thread_limit,\n        )\n\n    @staticmethod\n    def generate_hocr(input_file, output_hocr, output_text, options):\n        tesseract.generate_hocr(\n            input_file=input_file,\n            output_hocr=output_hocr,\n            output_text=output_text,\n            languages=options.languages,\n            engine_mode=options.tesseract.oem,\n            tessconfig=options.tesseract.config,\n            timeout=options.tesseract.timeout,\n            pagesegmode=options.tesseract.pagesegmode,\n            thresholding=options.tesseract.thresholding,\n            user_words=options.tesseract.user_words,\n            user_patterns=options.tesseract.user_patterns,\n            omp_thread_limit=options.tesseract.omp_thread_limit,\n        )\n\n    @staticmethod\n    def generate_pdf(input_file, output_pdf, output_text, options):\n        tesseract.generate_pdf(\n            input_file=input_file,\n            output_pdf=output_pdf,\n            output_text=output_text,\n            languages=options.languages,\n            engine_mode=options.tesseract.oem,\n            tessconfig=options.tesseract.config,\n            timeout=options.tesseract.timeout,\n            pagesegmode=options.tesseract.pagesegmode,\n            thresholding=options.tesseract.thresholding,\n            user_words=options.tesseract.user_words,\n            user_patterns=options.tesseract.user_patterns,\n            omp_thread_limit=options.tesseract.omp_thread_limit,\n        )\n\n\n@hookimpl\ndef get_ocr_engine(options):\n    \"\"\"Return TesseractOcrEngine when selected or as default.\"\"\"\n    if options is not None:\n        ocr_engine = 
getattr(options, 'ocr_engine', 'auto')\n        # Tesseract is selected if explicitly requested or if 'auto'\n        if ocr_engine not in ('auto', 'tesseract'):\n            return None\n    return TesseractOcrEngine()\n"
  },
  {
    "path": "src/ocrmypdf/cli.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Command line interface customization and validation.\"\"\"\n\nfrom __future__ import annotations\n\nimport argparse\nfrom argparse import ArgumentParser\nfrom collections.abc import Callable, Mapping\nfrom typing import Any, TypeVar\n\nfrom ocrmypdf._defaults import DEFAULT_ROTATE_PAGES_THRESHOLD\nfrom ocrmypdf._defaults import PROGRAM_NAME as _PROGRAM_NAME\nfrom ocrmypdf._options import OcrOptions, ProcessingMode, TaggedPdfMode\nfrom ocrmypdf._plugin_manager import OcrmypdfPluginManager\nfrom ocrmypdf._version import __version__ as _VERSION\n\nT = TypeVar('T', int, float)\n\n\ndef numeric(basetype: Callable[[Any], T], min_: T | None = None, max_: T | None = None):\n    \"\"\"Validator for numeric command line parameters.\n\n    Stipulates that the value must be of type basetype (typically int or float), and\n    optionally, within the range [min_, max_].\n    \"\"\"\n    min_ = basetype(min_) if min_ is not None else None\n    max_ = basetype(max_) if max_ is not None else None\n\n    def _numeric(s: str) -> T:\n        value = basetype(s)\n        if (min_ is not None and value < min_) or (max_ is not None and value > max_):\n            raise argparse.ArgumentTypeError(\n                f\"{s!r} not in valid range {(min_, max_)!r}\"\n            )\n        return value\n\n    _numeric.__name__ = basetype.__name__\n    return _numeric\n\n\ndef str_to_int(mapping: Mapping[str, int]):\n    \"\"\"Accept text on command line and convert to integer.\"\"\"\n\n    def _str_to_int(s: str) -> int:\n        try:\n            return mapping[s]\n        except KeyError:\n            raise argparse.ArgumentTypeError(\n                f\"{s!r} must be one of: {', '.join(mapping.keys())}\"\n            ) from None\n\n    return _str_to_int\n\n\nclass LanguageSetAction(argparse.Action):\n    \"\"\"Manages a list of languages.\"\"\"\n\n    def __init__(self, option_strings, 
dest, default=None, **kwargs):\n        \"\"\"Initialize the action.\"\"\"\n        if default is None:\n            default = list()\n        super().__init__(option_strings, dest, default=default, **kwargs)\n\n    def __call__(self, parser, namespace, values, option_string=None):\n        \"\"\"Add a language to the set.\"\"\"\n        dest = getattr(namespace, self.dest)\n        if isinstance(values, str) and '+' in values:\n            [dest.append(lang) for lang in values.split('+')]\n        else:\n            dest.append(values)\n\n\ndef get_parser():\n    \"\"\"Get the main CLI parser.\"\"\"\n    parser = ArgumentParser(\n        prog=_PROGRAM_NAME,\n        allow_abbrev=True,\n        fromfile_prefix_chars='@',\n        formatter_class=argparse.RawDescriptionHelpFormatter,\n        description=\"\"\"\\\nGenerates a searchable PDF or PDF/A from a regular PDF.\n\nOCRmyPDF rasterizes each page of the input PDF, optionally corrects page\nrotation and performs image processing, runs the Tesseract OCR engine on the\nimage, and then creates a PDF from the OCR information.\n\"\"\",\n        epilog=\"\"\"\\\nOCRmyPDF attempts to keep the output file at about the same size.  If a file\ncontains losslessly compressed images, and images in the output file will be\nlosslessly compressed as well.\n\nPDF is a page description file that attempts to preserve a layout exactly.\nA PDF can contain vector objects (such as text or lines) and raster objects\n(images).  A page might have multiple images.  OCRmyPDF is prepared to deal\nwith the wide variety of PDFs that exist in the wild.\n\nWhen a PDF page contains text, OCRmyPDF assumes that the page has already\nbeen OCRed or is a \"born digital\" page that should not be OCRed.  The default\nbehavior is to exit in this case without producing a file.  
You can use the\noption --skip-text to ignore pages with text, or --force-ocr to rasterize\nall objects on the page and produce an image-only PDF as output.\n\n    ocrmypdf --skip-text file_with_some_text_pages.pdf output.pdf\n\n    ocrmypdf --force-ocr word_document.pdf output.pdf\n\nIf you are concerned about long-term archiving of PDFs, use the default option\n--output-type pdfa which converts the PDF to a standardized PDF/A-2b.  This\nremoves some features from the PDF such as Javascript or forms. If you want to\nminimize the number of changes made to your PDF, use --output-type pdf.\n\nIf OCRmyPDF is given an image file as input, it will attempt to convert the\nimage to a PDF before processing.  For more control over the conversion of\nimages to PDF, use the Python package img2pdf or other image to PDF software.\n\nFor example, this command uses img2pdf to convert all .png files beginning\nwith the 'page' prefix to a PDF, fitting each image on A4-sized paper, and\nsending the result to OCRmyPDF through a pipe.\n\n    img2pdf --pagesize A4 page*.png | ocrmypdf - myfile.pdf\n\nOnline documentation is located at:\n    https://ocrmypdf.readthedocs.io/en/latest/introduction.html\n\n\"\"\",\n    )\n\n    parser.add_argument(\n        'input_file',\n        metavar=\"input_pdf_or_image\",\n        help=\"PDF file containing the images to be OCRed (or '-' to read from \"\n        \"standard input)\",\n    )\n    parser.add_argument(\n        'output_file',\n        metavar=\"output_pdf\",\n        help=\"Output searchable PDF file (or '-' to write to standard output). \"\n        \"Existing files will be overwritten (use --no-overwrite to prevent this). 
\"\n        \"If same as input file, the input file will be updated only if \"\n        \"processing is successful.\",\n    )\n    parser.add_argument(\n        '-l',\n        '--language',\n        dest='languages',\n        action=LanguageSetAction,\n        help=\"Language(s) of the file to be OCRed (see tesseract --list-langs for \"\n        \"all language packs installed in your system). Use -l eng+deu for \"\n        \"multiple languages.\",\n    )\n    parser.add_argument(\n        '--image-dpi',\n        metavar='DPI',\n        type=int,\n        help=\"When the input file is an image, not a PDF, use this DPI instead \"\n        \"of the DPI claimed by the input file. If the input does not claim a \"\n        \"sensible DPI, this option will be required.\",\n    )\n    parser.add_argument(\n        '--output-type',\n        choices=['auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'],\n        default='auto',\n        help=\"Choose output type. 'auto' (default) produces best-effort PDF/A \"\n        \"without requiring Ghostscript - uses verapdf validation when available, \"\n        \"otherwise passes through as PDF/A if safe (input already PDF/A or \"\n        \"force-ocr was used), or falls back to regular PDF. 'pdfa' creates a \"\n        \"PDF/A-2b compliant file for long term archiving (requires Ghostscript \"\n        \"as fallback). 'pdf' minimizes changes to the input file. 'pdfa-1' \"\n        \"creates a PDF/A-1b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' \"\n        \"creates a PDF/A-3b file. 
'none' will produce no output, which may be \"\n        \"helpful if only the --sidecar is desired.\",\n    )\n\n    # Use null string '\\0' as sentinel to indicate the user supplied no argument,\n    # since that is the only invalid character for filepaths on all platforms\n    # bool('\\0') is True in Python\n    parser.add_argument(\n        '--sidecar',\n        nargs='?',\n        const='\\0',\n        default=None,\n        metavar='FILE',\n        help=\"Generate sidecar text files that contain the same text recognized \"\n        \"by Tesseract. This may be useful for building an OCR text database. \"\n        \"If FILE is omitted, the sidecar file will be named {output_file}.txt; \"\n        \"the next argument must NOT be the name of the input PDF. \"\n        \"If FILE is set to '-', the sidecar is written to stdout (a \"\n        \"convenient way to preview OCR quality). The output file and sidecar \"\n        \"may not both use stdout at the same time.\",\n    )\n\n    parser.add_argument(\n        '-n',\n        '--no-overwrite',\n        action='store_true',\n        default=False,\n        help=\"If the output file already exists, exit with an error instead of \"\n        \"overwriting it.\",\n    )\n\n    parser.add_argument(\n        '--version',\n        action='version',\n        version=_VERSION,\n        help=\"Print program version and exit\",\n    )\n\n    jobcontrol = parser.add_argument_group(\"Job control options\")\n    jobcontrol.add_argument(\n        '-j',\n        '--jobs',\n        metavar='N',\n        type=numeric(int, 0, 256),\n        help=\"Use up to N CPU cores simultaneously (default: use all).\",\n    )\n    jobcontrol.add_argument(\n        '-q', '--quiet', action='store_true', help=\"Suppress INFO messages\"\n    )\n    jobcontrol.add_argument(\n        '-v',\n        '--verbose',\n        type=numeric(int, 0, 2),\n        default=0,\n        const=1,\n        nargs='?',\n        help=\"Print more verbose messages for each 
additional verbose level. Use \"\n        \"`-v 1` typically for much more detailed logging. Higher numbers \"\n        \"are probably only useful in debugging.\",\n    )\n    jobcontrol.add_argument(\n        '--no-progress-bar',\n        action='store_false',\n        dest='progress_bar',\n        help=argparse.SUPPRESS,\n    )\n    jobcontrol.add_argument(\n        '--use-threads', action='store_true', default=True, help=argparse.SUPPRESS\n    )\n    jobcontrol.add_argument(\n        '--no-use-threads',\n        action='store_false',\n        dest='use_threads',\n        help=argparse.SUPPRESS,\n    )\n\n    metadata = parser.add_argument_group(\n        \"Metadata options\",\n        \"Set output PDF/A metadata (default: copy input document's metadata)\",\n    )\n    metadata.add_argument(\n        '--title', type=str, help=\"Set document title (place multiple words in quotes)\"\n    )\n    metadata.add_argument('--author', type=str, help=\"Set document author\")\n    metadata.add_argument(\n        '--subject', type=str, help=\"Set document subject description\"\n    )\n    metadata.add_argument('--keywords', type=str, help=\"Set document keywords\")\n\n    preprocessing = parser.add_argument_group(\n        \"Image preprocessing options\",\n        \"Options to improve the quality of the final PDF and OCR\",\n    )\n    preprocessing.add_argument(\n        '-r',\n        '--rotate-pages',\n        action='store_true',\n        help=\"Automatically rotate pages based on detected text orientation\",\n    )\n    preprocessing.add_argument(\n        '--remove-background',\n        action='store_true',\n        help=\"Attempt to remove background from gray or color pages, setting it \"\n        \"to white \",\n    )\n    preprocessing.add_argument(\n        '-d',\n        '--deskew',\n        action='store_true',\n        help=\"Deskew each page before performing OCR\",\n    )\n    preprocessing.add_argument(\n        '-c',\n        '--clean',\n        
action='store_true',\n        help=\"Clean pages from scanning artifacts before performing OCR, and send \"\n        \"the cleaned page to OCR, but do not include the cleaned page in \"\n        \"the output\",\n    )\n    preprocessing.add_argument(\n        '-i',\n        '--clean-final',\n        action='store_true',\n        help=\"Clean page as above, and incorporate the cleaned image in the final \"\n        \"PDF.  Might remove desired content.\",\n    )\n    preprocessing.add_argument(\n        '--unpaper-args',\n        type=str,\n        default=None,\n        help=\"A quoted string of arguments to pass to unpaper. Requires --clean. \"\n        \"Example: --unpaper-args '--layout double'.\",\n    )\n    preprocessing.add_argument(\n        '--oversample',\n        metavar='DPI',\n        type=numeric(int, 0, 5000),\n        default=0,\n        help=\"Oversample images to at least the specified DPI, to improve OCR \"\n        \"results slightly\",\n    )\n    preprocessing.add_argument(\n        '--remove-vectors',\n        action='store_true',\n        help=\"EXPERIMENTAL. Mask out any vector objects in the PDF so that they \"\n        \"will not be included in OCR. This can eliminate false characters.\",\n    )\n\n    ocrsettings = parser.add_argument_group(\"OCR options\", \"Control how OCR is applied\")\n    ocrsettings.add_argument(\n        '-m',\n        '--mode',\n        choices=[mode.value for mode in ProcessingMode],\n        default=ProcessingMode.default.value,\n        help=\"Processing mode for pages with existing text. \"\n        \"'default' errors if text is found. \"\n        \"'force' rasterizes all content and runs OCR (same as --force-ocr). \"\n        \"'skip' skips pages with existing text (same as --skip-text). 
\"\n        \"'redo' re-OCRs pages, replacing old invisible text (same as --redo-ocr).\",\n    )\n    # Legacy flags for backward compatibility - these set the mode internally\n    ocrsettings.add_argument(\n        '-f',\n        '--force-ocr',\n        action='store_true',\n        help=\"Rasterize any text or vector objects on each page, apply OCR, and \"\n        \"save the rastered output (this rewrites the PDF). \"\n        \"Equivalent to --mode force.\",\n    )\n    ocrsettings.add_argument(\n        '-s',\n        '--skip-text',\n        action='store_true',\n        help=\"Skip OCR on any pages that already contain text, but include the \"\n        \"page in final output; useful for PDFs that contain a mix of \"\n        \"images, text pages, and/or previously OCRed pages. \"\n        \"Equivalent to --mode skip.\",\n    )\n    ocrsettings.add_argument(\n        '--redo-ocr',\n        action='store_true',\n        help=\"Attempt to detect and remove the hidden OCR layer from files that \"\n        \"were previously OCRed with OCRmyPDF or another program. Apply OCR \"\n        \"to text found in raster images. Existing visible text objects will \"\n        \"not be changed. If there is no existing OCR, OCR will be added. \"\n        \"Equivalent to --mode redo.\",\n    )\n    ocrsettings.add_argument(\n        '--skip-big',\n        type=numeric(float, 0, 5000),\n        metavar='MPixels',\n        help=\"Skip OCR on pages larger than the specified amount of megapixels, \"\n        \"but include skipped pages in final output\",\n    )\n    ocrsettings.add_argument(\n        '--invalidate-digital-signatures',\n        action='store_true',\n        help=\"Normally, OCRmyPDF will refuse to OCR a PDF that has a digital \"\n        \"signature. 
This option allows OCR to proceed, but the digital signature \"\n        \"will be invalidated.\",\n    )\n    ocrsettings.add_argument(\n        '--tagged-pdf-mode',\n        choices=[mode.value for mode in TaggedPdfMode],\n        default=TaggedPdfMode.default.value,\n        help=\"Control behavior when a Tagged PDF is encountered. \"\n        \"'default' errors if --mode is default, otherwise warns. \"\n        \"'ignore' always warns but continues processing.\",\n    )\n\n    advanced = parser.add_argument_group(\n        \"Advanced\", \"Advanced options to control OCRmyPDF\"\n    )\n    advanced.add_argument(\n        '--pages',\n        type=str,\n        help=(\n            \"Limit OCR to the specified pages (ranges or comma separated), \"\n            \"skipping others\"\n        ),\n    )\n    advanced.add_argument(\n        '--max-image-mpixels',\n        action='store',\n        type=numeric(float, 0),\n        metavar='MPixels',\n        help=\"Set maximum number of megapixels to unpack before treating an image as a \"\n        \"decompression bomb\",\n        default=250.0,\n    )\n    advanced.add_argument(\n        '--pdf-renderer',\n        choices=['auto', 'hocr', 'sandwich', 'hocrdebug', 'fpdf2'],\n        default='auto',\n        help=\"Choose OCR PDF renderer. 'auto' (recommended) uses fpdf2, which \"\n        \"provides full international language support including RTL scripts, \"\n        \"proper text positioning, and invisible text that becomes visible when \"\n        \"selected. 'sandwich' renders text as a background layer. Legacy 'hocr' \"\n        \"and 'hocrdebug' options are deprecated and will use fpdf2.\",\n    )\n    advanced.add_argument(\n        '--ocr-engine',\n        choices=['auto', 'tesseract', 'none'],\n        default='auto',\n        help=\"OCR engine to use. 'auto' (default) selects the best available engine. \"\n        \"'tesseract' uses Tesseract OCR. 
\"\n        \"'none' skips OCR entirely, useful for PDF/A conversion or image processing \"\n        \"without text recognition.\",\n    )\n    advanced.add_argument(\n        '--rasterizer',\n        choices=['auto', 'ghostscript', 'pypdfium'],\n        default='auto',\n        help=\"Choose PDF page rasterizer. 'auto' prefers pypdfium when available, \"\n        \"falling back to Ghostscript. 'pypdfium' is faster but requires the \"\n        \"pypdfium2 package. 'ghostscript' uses the traditional Ghostscript rasterizer.\",\n    )\n    advanced.add_argument(\n        '--rotate-pages-threshold',\n        default=DEFAULT_ROTATE_PAGES_THRESHOLD,\n        type=numeric(float, 0, 1000),\n        metavar='CONFIDENCE',\n        help=\"Only rotate pages when confidence is above this value (arbitrary \"\n        \"units reported by tesseract)\",\n    )\n    advanced.add_argument(\n        '--fast-web-view',\n        type=numeric(float, 0),\n        default=1.0,\n        metavar=\"MEGABYTES\",\n        help=\"If the size of the file is more than this threshold (in MB), then \"\n        \"linearize the PDF for fast web viewing. This allows the PDF to be \"\n        \"displayed before it is fully downloaded in web browsers, but increases \"\n        \"the space required slightly. By default we skip this for small files \"\n        \"which do not benefit. If the threshold is 0 it will be applied to all files. \"\n        \"Set the threshold very high to disable.\",\n    )\n    advanced.add_argument(\n        '--continue-on-soft-render-error',\n        action='store_true',\n        help=\"Continue processing pages after a recoverable PDF rendering error. \"\n        \"A recoverable error is one that does not prevent the page from being \"\n        \"rendered, but may result in visual differences compared to the input \"\n        \"file. 
Missing fonts are a typical source of these errors.\",\n    )\n    advanced.add_argument(\n        '--plugin',\n        dest='plugins',\n        action='append',\n        default=[],\n        help=\"Name of plugin to import. Argument may be issued multiple times to \"\n        \"import multiple plugins. Plugins may be specified as module names in \"\n        \"Python syntax, provided they are installed in the same Python (virtual) \"\n        \"environment as ocrmypdf; or you may give the path to the Python file that \"\n        \"contains the plugin. Plugins must conform to the specification in the \"\n        \"OCRmyPDF documentation.\",\n    )\n\n    debugging = parser.add_argument_group(\n        \"Debugging\", \"Arguments to help with troubleshooting and debugging\"\n    )\n    debugging.add_argument(\n        '-k',\n        '--keep-temporary-files',\n        action='store_true',\n        help=\"Keep temporary files (helpful for debugging)\",\n    )\n    return parser\n\n\nplugins_only_parser = ArgumentParser(\n    prog=_PROGRAM_NAME, fromfile_prefix_chars='@', add_help=False, allow_abbrev=False\n)\nplugins_only_parser.add_argument(\n    '--plugin',\n    dest='plugins',\n    action='append',\n    default=[],\n    help=\"Name of plugin to import.\",\n)\n\n\ndef namespace_to_options(ns) -> OcrOptions:\n    \"\"\"Convert argparse.Namespace to OcrOptions.\n\n    This function encapsulates CLI-specific knowledge of how command line\n    arguments map to our internal options model.\n    \"\"\"\n    # Extract known fields\n    known_fields = {}\n    extra_attrs = {}\n\n    # Legacy boolean flags that map to mode - handled by OcrOptions model validator\n    legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'}\n\n    for key, value in vars(ns).items():\n        if key in OcrOptions.model_fields:\n            known_fields[key] = value\n        elif key in legacy_mode_flags:\n            # Pass legacy flags to OcrOptions for conversion to mode\n            
known_fields[key] = value\n        else:\n            extra_attrs[key] = value\n\n    # Handle special cases for hOCR API\n    if 'output_folder' in extra_attrs and 'output_file' not in known_fields:\n        known_fields['output_file'] = '/dev/null'  # Placeholder\n\n    # Handle case where input_file is missing (e.g., in _hocr_to_ocr_pdf)\n    if 'work_folder' in extra_attrs and 'input_file' not in known_fields:\n        known_fields['input_file'] = '/dev/null'  # Placeholder\n\n    instance = OcrOptions(**known_fields)\n    instance.extra_attrs = extra_attrs\n    return instance\n\n\ndef get_options_and_plugins(\n    args=None,\n) -> tuple[OcrOptions, OcrmypdfPluginManager]:\n    \"\"\"Parse command line arguments and return OcrOptions and plugin manager.\n\n    This is the main entry point for CLI argument processing. It handles\n    plugin discovery, argument parsing, and conversion to our internal\n    options model.\n\n    Args:\n        args: Command line arguments. If None, uses sys.argv.\n\n    Returns:\n        Tuple of (OcrOptions, PluginManager)\n    \"\"\"\n    # Import here to avoid circular imports\n    from ocrmypdf.api import setup_plugin_infrastructure\n\n    # First pass: get plugins so we can register their options\n    pre_options, _unused = plugins_only_parser.parse_known_args(args=args)\n\n    # Set up plugin infrastructure with proper initialization\n    plugin_manager = setup_plugin_infrastructure(plugins=pre_options.plugins)\n\n    # Get parser and let plugins add their options\n    parser = get_parser()\n    plugin_manager.add_options(parser=parser)\n\n    # Parse all arguments\n    namespace = parser.parse_args(args=args)\n\n    # Convert to OcrOptions\n    options = namespace_to_options(namespace)\n\n    return options, plugin_manager\n"
  },
  {
    "path": "src/ocrmypdf/data/__init__.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"Data files used to generate certain PDFs.\"\"\"\n\nfrom __future__ import annotations\n"
  },
  {
    "path": "src/ocrmypdf/exceptions.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"OCRmyPDF's exceptions.\"\"\"\n\nfrom __future__ import annotations\n\nfrom enum import IntEnum\nfrom textwrap import dedent\n\n\nclass ExitCode(IntEnum):\n    \"\"\"OCRmyPDF's exit codes.\"\"\"\n\n    # pylint: disable=invalid-name\n    ok = 0\n    bad_args = 1\n    input_file = 2\n    missing_dependency = 3\n    invalid_output_pdf = 4\n    file_access_error = 5\n    already_done_ocr = 6\n    child_process_error = 7\n    encrypted_pdf = 8\n    invalid_config = 9\n    pdfa_conversion_failed = 10\n    other_error = 15\n    ctrl_c = 130\n\n\nclass ExitCodeException(Exception):\n    \"\"\"An exception which should return an exit code with sys.exit().\"\"\"\n\n    exit_code = ExitCode.other_error\n    message = \"\"\n\n    def __str__(self):\n        \"\"\"Return a string representation of the exception.\"\"\"\n        super_msg = super().__str__()  # Don't do str(super())\n        if self.message:\n            return self.message.format(super_msg)\n        return super_msg\n\n\nclass BadArgsError(ExitCodeException):\n    \"\"\"Invalid arguments on the command line or API.\"\"\"\n\n    exit_code = ExitCode.bad_args\n\n\nclass MissingDependencyError(ExitCodeException):\n    \"\"\"A third-party dependency is missing.\"\"\"\n\n    exit_code = ExitCode.missing_dependency\n\n\nclass UnsupportedImageFormatError(ExitCodeException):\n    \"\"\"The image format is not supported.\"\"\"\n\n    exit_code = ExitCode.input_file\n\n\nclass DpiError(ExitCodeException):\n    \"\"\"Missing information about input image DPI.\"\"\"\n\n    exit_code = ExitCode.input_file\n\n\nclass OutputFileAccessError(ExitCodeException):\n    \"\"\"Cannot access the intended output file path.\"\"\"\n\n    exit_code = ExitCode.file_access_error\n\n\nclass PriorOcrFoundError(ExitCodeException):\n    \"\"\"This file already has OCR.\"\"\"\n\n    exit_code = ExitCode.already_done_ocr\n\n\nclass 
InputFileError(ExitCodeException):\n    \"\"\"Something is wrong with the input file.\"\"\"\n\n    exit_code = ExitCode.input_file\n\n\nclass SubprocessOutputError(ExitCodeException):\n    \"\"\"A subprocess returned an unexpected error.\"\"\"\n\n    exit_code = ExitCode.child_process_error\n\n\nclass EncryptedPdfError(ExitCodeException):\n    \"\"\"Input PDF is encrypted.\"\"\"\n\n    exit_code = ExitCode.encrypted_pdf\n    message = dedent(\n        \"\"\"\\\n        Input PDF is encrypted. The encryption must be removed to\n        perform OCR.\n\n        For information about this PDF's security use\n            qpdf --show-encryption infilename\n\n        You can remove the encryption using\n            qpdf --decrypt [--password=[password]] infilename\n        \"\"\"\n    )\n\n\nclass TesseractConfigError(ExitCodeException):\n    \"\"\"Tesseract config can't be parsed.\"\"\"\n\n    exit_code = ExitCode.invalid_config\n    message = \"Error occurred while parsing a Tesseract configuration file\"\n\n\nclass DigitalSignatureError(InputFileError):\n    \"\"\"PDF has a digital signature.\"\"\"\n\n    message = dedent(\n        \"\"\"\\\n        Input PDF has a digital signature. OCR would alter the document,\n        invalidating the signature.\n        \"\"\"\n    )\n\n\nclass TaggedPDFError(InputFileError):\n    \"\"\"PDF is tagged.\"\"\"\n\n    message = dedent(\n        \"\"\"\\\n        This PDF is marked as a Tagged PDF. This often indicates\n        that the PDF was generated from an office document and does\n        not need OCR. Use --force-ocr, --skip-text or --redo-ocr to\n        override this error.\n        \"\"\"\n    )\n\n\nclass ColorConversionNeededError(BadArgsError):\n    \"\"\"PDF needs color conversion.\"\"\"\n\n    message = dedent(\n        \"\"\"\\\n        The input PDF has an unusual color space. 
Use\n        --color-conversion-strategy to convert to a common color space\n        such as RGB, or use --output-type pdf to skip PDF/A conversion\n        and retain the original color space.\n        \"\"\"\n    )\n"
  },
  {
    "path": "src/ocrmypdf/extra_plugins/__init__.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n#\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Extra plugins. These are not automatically inserted when ocrmypdf is run.\"\"\"\n"
  },
  {
    "path": "src/ocrmypdf/extra_plugins/semfree.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"Semaphore-free alternate executor.\n\nThere are two popular environments that do not fully support the standard Python\nmultiprocessing module: AWS Lambda, and Termux (a terminal emulator for Android).\n\nThis alternate executor divvies up work among worker processes before processing,\nrather than having each worker consume work from a shared queue when they finish\ntheir task. This means workers have no need to coordinate with each other. Each\nworker communicates only with the main process.\n\nThis is not without drawbacks. If the tasks are not \"even\" in size, which cannot\nbe guaranteed, some workers may end up with too much work while others are idle.\nIt is less efficient than the standard implementation, so not the default.\n\nThis module is deprecated and will be removed in a future release. The standard\nexecutor will fall back to threads in these environments.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport logging.handlers\nimport signal\nimport warnings\nfrom collections.abc import Callable, Iterable, Iterator\nfrom contextlib import suppress\nfrom enum import Enum, auto\nfrom itertools import islice, repeat, takewhile, zip_longest\nfrom multiprocessing import Pipe, Process\nfrom multiprocessing.connection import Connection, wait\n\nfrom ocrmypdf import Executor, hookimpl\nfrom ocrmypdf._concurrent import NullProgressBar\nfrom ocrmypdf.exceptions import InputFileError\nfrom ocrmypdf.helpers import remove_all_log_handlers\n\nwarnings.warn(\n    \"semfree.py is deprecated and will be removed in a future release.\",\n    DeprecationWarning,\n)\n\n\nclass MessageType(Enum):\n    \"\"\"Implement basic IPC messaging.\"\"\"\n\n    exception = auto()  # pylint: disable=invalid-name\n    result = auto()  # pylint: disable=invalid-name\n    complete = auto()  # pylint: disable=invalid-name\n\n\ndef split_every(n: int, iterable: 
Iterable) -> Iterator:\n    \"\"\"Split iterable into groups of n.\n\n    >>> list(split_every(3, range(10)))\n    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]\n\n    https://stackoverflow.com/a/22919323\n    \"\"\"\n    iterator = iter(iterable)\n    return takewhile(bool, (list(islice(iterator, n)) for _ in repeat(None)))\n\n\ndef process_sigbus(*args):\n    \"\"\"Handle SIGBUS signal at the worker level.\"\"\"\n    raise InputFileError(\"A worker process lost access to an input file\")\n\n\nclass ConnectionLogHandler(logging.handlers.QueueHandler):\n    \"\"\"Handler used by child processes to forward log messages to parent.\"\"\"\n\n    def __init__(self, conn: Connection) -> None:\n        \"\"\"Initialize the handler.\"\"\"\n        # sets the parent's queue to None - parent only touches queue\n        # in enqueue() which we override\n        super().__init__(None)  # type: ignore\n        self.conn = conn\n\n    def enqueue(self, record):\n        \"\"\"Enqueue a log message.\"\"\"\n        self.conn.send(('log', record))\n\n\ndef process_loop(\n    conn: Connection, user_init: Callable[[], None], loglevel, task, task_args\n):\n    \"\"\"Initialize a process pool worker.\"\"\"\n    # Install SIGBUS handler (so our parent process can abort somewhat gracefully)\n    with suppress(AttributeError):  # Windows and Cygwin do not have SIGBUS\n        # Windows and Cygwin do not have pthread_sigmask or SIGBUS\n        signal.signal(signal.SIGBUS, process_sigbus)\n\n    # Reconfigure the root logger for this process to send all messages to a queue\n    h = ConnectionLogHandler(conn)\n    root = logging.getLogger()\n    remove_all_log_handlers(root)\n    root.setLevel(loglevel)\n    root.addHandler(h)\n\n    user_init()\n\n    for args in task_args:\n        try:\n            result = task(*args)\n        except Exception as e:  # pylint: disable=broad-except\n            conn.send((MessageType.exception, e))\n            break\n        else:\n            
conn.send((MessageType.result, result))\n\n    conn.send((MessageType.complete, None))\n    conn.close()\n    return\n\n\nclass LambdaExecutor(Executor):\n    \"\"\"Executor for AWS Lambda or similar environments that lack semaphores.\"\"\"\n\n    def _execute(\n        self,\n        *,\n        use_threads: bool,\n        max_workers: int,\n        progress_kwargs: dict,\n        worker_initializer: Callable,\n        task: Callable,\n        task_arguments: Iterable,\n        task_finished: Callable,\n    ):\n        if use_threads and max_workers == 1:\n            with self.pbar_class(**progress_kwargs) as pbar:\n                for args in task_arguments:\n                    result = task(*args)\n                    task_finished(result, pbar)\n            return\n\n        task_arguments = list(task_arguments)\n        grouped_args = list(\n            zip_longest(*list(split_every(max_workers, task_arguments)))\n        )\n        if not grouped_args:\n            return\n\n        processes: list[Process] = []\n        connections: list[Connection] = []\n        for chunk in grouped_args:\n            parent_conn, child_conn = Pipe()\n\n            worker_args = [args for args in chunk if args is not None]\n            process = Process(\n                target=process_loop,\n                args=(\n                    child_conn,\n                    worker_initializer,\n                    logging.getLogger(\"\").level,\n                    task,\n                    worker_args,\n                ),\n            )\n            process.daemon = True\n            processes.append(process)\n            connections.append(parent_conn)\n\n        for process in processes:\n            process.start()\n\n        with self.pbar_class(**progress_kwargs) as pbar:\n            while connections:\n                for result in wait(connections):\n                    if not isinstance(result, Connection):\n                        raise NotImplementedError(\"We only 
support Connection()\")\n                    try:\n                        msg_type, msg = result.recv()\n                    except EOFError:\n                        connections.remove(result)\n                        continue\n\n                    if msg_type == MessageType.result:\n                        task_finished(msg, pbar)\n                    elif msg_type == 'log':\n                        record = msg\n                        logger = logging.getLogger(record.name)\n                        logger.handle(record)\n                    elif msg_type == MessageType.complete:\n                        connections.remove(result)\n                    elif msg_type == MessageType.exception:\n                        for process in processes:\n                            process.terminate()\n                        raise msg\n\n        for process in processes:\n            process.join()\n\n\n@hookimpl\ndef get_executor(progressbar_class):\n    \"\"\"Return a LambdaExecutor instance.\"\"\"\n    return LambdaExecutor(pbar_class=progressbar_class)\n\n\n@hookimpl\ndef get_logging_console():\n    \"\"\"Return a logging.StreamHandler instance.\"\"\"\n    return logging.StreamHandler()\n\n\n@hookimpl\ndef get_progressbar_class():\n    \"\"\"Return the NullProgressBar class.\n\n    This executor cannot use a progress bar.\n    \"\"\"\n    return NullProgressBar\n"
  },
  {
    "path": "src/ocrmypdf/font/__init__.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Font management for OCRmyPDF PDF rendering.\n\nThis module provides font infrastructure for the fpdf2 PDF renderer. It includes:\n\n- FontManager: Base class for font loading and glyph checking\n- FontProvider: Protocol and implementations for font discovery\n- MultiFontManager: Automatic font selection for multilingual documents\n- SystemFontProvider: System font discovery\n\"\"\"\nfrom __future__ import annotations\n\nfrom ocrmypdf.font.font_manager import FontManager\nfrom ocrmypdf.font.font_provider import (\n    BuiltinFontProvider,\n    ChainedFontProvider,\n    FontProvider,\n)\nfrom ocrmypdf.font.multi_font_manager import MultiFontManager\nfrom ocrmypdf.font.system_font_provider import SystemFontProvider\n\n__all__ = [\n    \"FontManager\",\n    \"FontProvider\",\n    \"BuiltinFontProvider\",\n    \"ChainedFontProvider\",\n    \"MultiFontManager\",\n    \"SystemFontProvider\",\n]\n"
  },
  {
    "path": "src/ocrmypdf/font/font_manager.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Base font management for PDF rendering.\n\nThis module provides the base FontManager class that handles font loading\nand glyph checking using uharfbuzz.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom pathlib import Path\n\nimport uharfbuzz as hb\n\n\nclass FontManager:\n    \"\"\"Manages font loading and glyph checking for PDF rendering.\n\n    This base class handles loading fonts with uharfbuzz for glyph checking\n    and text shaping. Renderer-specific subclasses should extend this to\n    add their own font objects.\n\n    Attributes:\n        font_path: Path to the font file\n        font_data: Raw font file bytes\n        font_index: Index within TTC collection (0 for single-font files)\n        hb_face: uharfbuzz Face object\n        hb_font: uharfbuzz Font object\n    \"\"\"\n\n    def __init__(self, font_path: Path, font_index: int = 0):\n        \"\"\"Initialize font manager.\n\n        Args:\n            font_path: Path to TrueType/OpenType font file\n            font_index: Index of font within a TTC collection (default 0).\n                        For single-font files (.ttf, .otf), use 0.\n        \"\"\"\n        self.font_path = font_path\n        self.font_index = font_index\n\n        # Load font data\n        self.font_data = font_path.read_bytes()\n\n        # Load font with uharfbuzz for glyph checking and text measurement\n        # Note: uharfbuzz Face also supports font_index for TTC files\n        self.hb_face = hb.Face(self.font_data, font_index)\n        self.hb_font = hb.Font(self.hb_face)\n\n    def get_hb_font(self) -> hb.Font:\n        \"\"\"Get uharfbuzz Font object for text measurement.\n\n        Returns:\n            UHarfBuzz Font instance\n        \"\"\"\n        return self.hb_font\n\n    def has_glyph(self, codepoint: int) -> bool:\n        \"\"\"Check if font has a glyph for given codepoint.\n\n        Args:\n    
        codepoint: Unicode codepoint\n\n        Returns:\n            True if font has a real glyph (not .notdef)\n        \"\"\"\n        glyph_id = self.hb_font.get_nominal_glyph(codepoint)\n        return glyph_id is not None and glyph_id != 0\n\n    def get_font_metrics(self) -> tuple[float, float, float]:\n        \"\"\"Get normalized font metrics (ascent, descent, units_per_em).\n\n        Returns:\n            Tuple of (ascent, descent, units_per_em) where ascent and descent\n            are in font units. Ascent is positive (above baseline), descent\n            is typically negative (below baseline).\n        \"\"\"\n        extents = self.hb_font.get_font_extents('ltr')\n        units_per_em = self.hb_face.upem\n        return (extents.ascender, extents.descender, units_per_em)\n\n    def get_left_side_bearing(self, char: str, font_size: float) -> float:\n        \"\"\"Get the left side bearing of a character at a given font size.\n\n        The left side bearing (lsb) is the horizontal distance from the glyph\n        origin (x=0) to the leftmost pixel of the glyph. A positive lsb means\n        there's whitespace before the glyph starts.\n\n        Args:\n            char: Single character to get lsb for\n            font_size: Font size in points\n\n        Returns:\n            Left side bearing in points. 
Returns 0 if character not found.\n        \"\"\"\n        if not char:\n            return 0.0\n\n        codepoint = ord(char)\n        glyph_id = self.hb_font.get_nominal_glyph(codepoint)\n        if glyph_id is None or glyph_id == 0:\n            return 0.0\n\n        # Get glyph extents which include left/right bearing info\n        extents = self.hb_font.get_glyph_extents(glyph_id)\n        if extents is None:\n            return 0.0\n\n        # x_bearing is the left side bearing in font units\n        units_per_em = self.hb_face.upem\n        lsb_units = extents.x_bearing\n        lsb_pt = lsb_units * font_size / units_per_em\n\n        return lsb_pt\n"
  },
  {
    "path": "src/ocrmypdf/font/font_provider.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Font provider protocol and implementations for PDF rendering.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\nfrom typing import Protocol\n\nfrom ocrmypdf.font.font_manager import FontManager\n\nlog = logging.getLogger(__name__)\n\n\nclass FontProvider(Protocol):\n    \"\"\"Protocol for providing fonts to MultiFontManager.\n\n    Implementations are responsible for knowing where fonts are located\n    and loading them. MultiFontManager asks for fonts by name and uses\n    them for glyph coverage checking.\n    \"\"\"\n\n    def get_font(self, font_name: str) -> FontManager | None:\n        \"\"\"Get a FontManager for the named font.\n\n        Args:\n            font_name: Logical font name (e.g., 'NotoSans-Regular')\n\n        Returns:\n            FontManager if font is available, None otherwise\n        \"\"\"\n        ...\n\n    def get_available_fonts(self) -> list[str]:\n        \"\"\"Get list of available font names.\n\n        Returns:\n            List of font names that can be retrieved with get_font()\n        \"\"\"\n        ...\n\n    def get_fallback_font(self) -> FontManager:\n        \"\"\"Get the glyphless fallback font.\n\n        This font must always be available and handles any codepoint.\n\n        Returns:\n            FontManager for the glyphless fallback font (Occulta.ttf)\n        \"\"\"\n        ...\n\n\nclass BuiltinFontProvider:\n    \"\"\"Font provider using builtin fonts from ocrmypdf/data directory.\"\"\"\n\n    # Mapping of logical font names to filenames\n    # Only Latin (NotoSans) and the glyphless fallback (Occulta.ttf) are bundled.\n    # All other scripts (Arabic, Devanagari, CJK, etc.) 
are discovered from\n    # system fonts by SystemFontProvider to reduce package size.\n    FONT_FILES = {\n        'NotoSans-Regular': 'NotoSans-Regular.ttf',\n        'Occulta': 'Occulta.ttf',\n    }\n\n    def __init__(self, font_dir: Path | None = None):\n        \"\"\"Initialize builtin font provider.\n\n        Args:\n            font_dir: Directory containing font files. If None, uses\n                      the default ocrmypdf/data directory.\n        \"\"\"\n        if font_dir is None:\n            font_dir = Path(__file__).parent.parent / \"data\"\n        self.font_dir = font_dir\n        self._fonts: dict[str, FontManager] = {}\n        self._load_fonts()\n\n    def _load_fonts(self) -> None:\n        \"\"\"Load available fonts, logging warnings for missing ones.\"\"\"\n        for font_name, font_file in self.FONT_FILES.items():\n            font_path = self.font_dir / font_file\n            if not font_path.exists():\n                if font_name == 'Occulta':\n                    raise FileNotFoundError(\n                        f\"Required fallback font not found: {font_path}\"\n                    )\n                log.warning(\n                    \"Font %s not found at %s - OCR output quality for some \"\n                    \"scripts may be affected\",\n                    font_name,\n                    font_path,\n                )\n                continue\n\n            try:\n                self._fonts[font_name] = FontManager(font_path)\n            except Exception as e:\n                if font_name == 'Occulta':\n                    raise ValueError(\n                        f\"Failed to load required fallback font {font_file}: {e}\"\n                    ) from e\n                log.warning(\n                    \"Failed to load font %s: %s - OCR output quality may be affected\",\n                    font_name,\n                    e,\n                )\n\n    def get_font(self, font_name: str) -> FontManager | None:\n        
\"\"\"Get a FontManager for the named font.\"\"\"\n        return self._fonts.get(font_name)\n\n    def get_available_fonts(self) -> list[str]:\n        \"\"\"Get list of available font names.\"\"\"\n        return list(self._fonts.keys())\n\n    def get_fallback_font(self) -> FontManager:\n        \"\"\"Get the glyphless fallback font.\"\"\"\n        return self._fonts['Occulta']\n\n\nclass ChainedFontProvider:\n    \"\"\"Font provider that tries multiple providers in order.\n\n    This allows combining builtin fonts with system fonts, trying\n    the builtin provider first and falling back to system fonts\n    for fonts not bundled with the package.\n    \"\"\"\n\n    def __init__(self, providers: list[FontProvider]):\n        \"\"\"Initialize chained font provider.\n\n        Args:\n            providers: List of font providers to try in order.\n                       The first provider that returns a font wins.\n        \"\"\"\n        if not providers:\n            raise ValueError(\"At least one provider is required\")\n        self.providers = providers\n\n    def get_font(self, font_name: str) -> FontManager | None:\n        \"\"\"Get a FontManager for the named font.\n\n        Tries each provider in order until one returns a font.\n\n        Args:\n            font_name: Logical font name (e.g., 'NotoSans-Regular')\n\n        Returns:\n            FontManager if any provider has the font, None otherwise\n        \"\"\"\n        for provider in self.providers:\n            if font := provider.get_font(font_name):\n                return font\n        return None\n\n    def get_available_fonts(self) -> list[str]:\n        \"\"\"Get list of available font names from all providers.\n\n        Returns:\n            Combined list of font names (deduplicated, order preserved)\n        \"\"\"\n        seen: set[str] = set()\n        result: list[str] = []\n        for provider in self.providers:\n            for name in provider.get_available_fonts():\n           
     if name not in seen:\n                    seen.add(name)\n                    result.append(name)\n        return result\n\n    def get_fallback_font(self) -> FontManager:\n        \"\"\"Get the glyphless fallback font.\n\n        Tries each provider until one provides a fallback font.\n\n        Returns:\n            FontManager for the fallback font\n\n        Raises:\n            RuntimeError: If no provider can provide a fallback font\n        \"\"\"\n        for provider in self.providers:\n            try:\n                return provider.get_fallback_font()\n            except (NotImplementedError, AttributeError, KeyError):\n                continue\n        raise RuntimeError(\"No fallback font available from any provider\")\n"
  },
  {
    "path": "src/ocrmypdf/font/multi_font_manager.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Multi-font management for PDF rendering.\n\nProvides automatic font selection for multilingual documents based on\nlanguage hints and glyph coverage analysis.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\n\nfrom ocrmypdf.font.font_manager import FontManager\nfrom ocrmypdf.font.font_provider import (\n    BuiltinFontProvider,\n    ChainedFontProvider,\n    FontProvider,\n)\nfrom ocrmypdf.font.system_font_provider import SystemFontProvider\n\nlog = logging.getLogger(__name__)\n\n\nclass MultiFontManager:\n    \"\"\"Manages multiple fonts with automatic selection and fallback.\n\n    This class orchestrates multiple FontManager instances to provide\n    word-level font selection for multilingual documents. It uses a hybrid\n    approach combining language hints from hOCR with glyph coverage analysis.\n\n    Font selection strategy:\n    1. Try language-preferred font (if language hint available)\n    2. Try fallback fonts in order by glyph coverage\n    3. 
Fall back to Occulta.ttf (glyphless fallback)\n    \"\"\"\n\n    # Language to font mapping\n    # Keys are ISO 639-2/3 codes or Tesseract language codes\n    LANGUAGE_FONT_MAP = {\n        # Arabic script\n        'ara': 'NotoSansArabic-Regular',  # Arabic\n        'per': 'NotoSansArabic-Regular',  # Persian (uses Arabic script)\n        'fas': 'NotoSansArabic-Regular',  # Farsi (alternative code for Persian)\n        'urd': 'NotoSansArabic-Regular',  # Urdu (uses Arabic script)\n        'pus': 'NotoSansArabic-Regular',  # Pashto\n        'kur': 'NotoSansArabic-Regular',  # Kurdish (Arabic script variant)\n        # Devanagari script\n        'hin': 'NotoSansDevanagari-Regular',  # Hindi\n        'san': 'NotoSansDevanagari-Regular',  # Sanskrit\n        'mar': 'NotoSansDevanagari-Regular',  # Marathi\n        'nep': 'NotoSansDevanagari-Regular',  # Nepali\n        'kok': 'NotoSansDevanagari-Regular',  # Konkani\n        'bho': 'NotoSansDevanagari-Regular',  # Bhojpuri\n        'mai': 'NotoSansDevanagari-Regular',  # Maithili\n        # CJK\n        'chi': 'NotoSansCJK-Regular',  # Chinese (generic)\n        'zho': 'NotoSansCJK-Regular',  # Chinese (ISO 639-3)\n        'chi_sim': 'NotoSansCJK-Regular',  # Chinese Simplified (Tesseract)\n        'chi_tra': 'NotoSansCJK-Regular',  # Chinese Traditional (Tesseract)\n        'jpn': 'NotoSansCJK-Regular',  # Japanese\n        'kor': 'NotoSansCJK-Regular',  # Korean\n        # Thai\n        'tha': 'NotoSansThai-Regular',  # Thai\n        # Hebrew\n        'heb': 'NotoSansHebrew-Regular',  # Hebrew\n        'yid': 'NotoSansHebrew-Regular',  # Yiddish (uses Hebrew script)\n        # Bengali script\n        'ben': 'NotoSansBengali-Regular',  # Bengali\n        'asm': 'NotoSansBengali-Regular',  # Assamese (uses Bengali script)\n        # Tamil\n        'tam': 'NotoSansTamil-Regular',  # Tamil\n        # Gujarati\n        'guj': 'NotoSansGujarati-Regular',  # Gujarati\n        # Telugu\n        'tel': 
'NotoSansTelugu-Regular',  # Telugu\n        # Kannada\n        'kan': 'NotoSansKannada-Regular',  # Kannada\n        # Malayalam\n        'mal': 'NotoSansMalayalam-Regular',  # Malayalam\n        # Myanmar (Burmese)\n        'mya': 'NotoSansMyanmar-Regular',  # Myanmar\n        # Khmer (Cambodian)\n        'khm': 'NotoSansKhmer-Regular',  # Khmer\n        # Lao\n        'lao': 'NotoSansLao-Regular',  # Lao\n        # Georgian\n        'kat': 'NotoSansGeorgian-Regular',  # Georgian\n        'geo': 'NotoSansGeorgian-Regular',  # Georgian (alternative)\n        # Armenian\n        'hye': 'NotoSansArmenian-Regular',  # Armenian\n        'arm': 'NotoSansArmenian-Regular',  # Armenian (alternative)\n        # Ethiopic\n        'amh': 'NotoSansEthiopic-Regular',  # Amharic\n        'tir': 'NotoSansEthiopic-Regular',  # Tigrinya\n        # Sinhala\n        'sin': 'NotoSansSinhala-Regular',  # Sinhala\n        # Gurmukhi (Punjabi)\n        'pan': 'NotoSansGurmukhi-Regular',  # Punjabi\n        'pnb': 'NotoSansGurmukhi-Regular',  # Western Punjabi\n        # Oriya\n        'ori': 'NotoSansOriya-Regular',  # Oriya\n        'ory': 'NotoSansOriya-Regular',  # Oriya (alternative)\n        # Tibetan\n        'bod': 'NotoSansTibetan-Regular',  # Tibetan\n        'tib': 'NotoSansTibetan-Regular',  # Tibetan (alternative)\n    }\n\n    # Ordered fallback chain for fonts (after language-preferred font)\n    # Order matters: most common scripts first for faster matching\n    FALLBACK_FONTS = [\n        'NotoSans-Regular',  # Latin, Greek, Cyrillic\n        'NotoSansArabic-Regular',\n        'NotoSansDevanagari-Regular',\n        'NotoSansCJK-Regular',\n        'NotoSansThai-Regular',\n        'NotoSansHebrew-Regular',\n        'NotoSansBengali-Regular',\n        'NotoSansTamil-Regular',\n        'NotoSansGujarati-Regular',\n        'NotoSansTelugu-Regular',\n        'NotoSansKannada-Regular',\n        'NotoSansMalayalam-Regular',\n        'NotoSansMyanmar-Regular',\n        
'NotoSansKhmer-Regular',\n        'NotoSansLao-Regular',\n        'NotoSansGeorgian-Regular',\n        'NotoSansArmenian-Regular',\n        'NotoSansEthiopic-Regular',\n        'NotoSansSinhala-Regular',\n        'NotoSansGurmukhi-Regular',\n        'NotoSansOriya-Regular',\n        'NotoSansTibetan-Regular',\n    ]\n\n    def __init__(\n        self,\n        font_dir: Path | None = None,\n        *,\n        font_provider: FontProvider | None = None,\n    ):\n        \"\"\"Initialize multi-font manager.\n\n        Args:\n            font_dir: Directory containing font files. If font_provider is\n                      not specified, this is passed to BuiltinFontProvider.\n            font_provider: Provider for loading fonts. If None, uses a\n                           ChainedFontProvider that tries builtin fonts first,\n                           then searches system fonts.\n        \"\"\"\n        if font_provider is not None:\n            self.font_provider = font_provider\n        else:\n            # Use chained provider: try builtin fonts first, then system fonts\n            self.font_provider = ChainedFontProvider(\n                [\n                    BuiltinFontProvider(font_dir),\n                    SystemFontProvider(),\n                ]\n            )\n\n        # Font selection cache: (word_text, language) -> font_name\n        self._selection_cache: dict[tuple[str, str | None], str] = {}\n        # Track whether we've warned about missing fonts (warn once per script)\n        self._warned_scripts: set[str] = set()\n\n    @property\n    def fonts(self) -> dict[str, FontManager]:\n        \"\"\"Get all loaded fonts (backward compatibility).\"\"\"\n        return self.get_all_fonts()\n\n    def _try_font(\n        self, font_name: str, word_text: str, cache_key: tuple[str, str | None]\n    ) -> FontManager | None:\n        \"\"\"Try to use a font for the given word.\n\n        Args:\n            font_name: Name of font to try\n            
word_text: Text content to check\n            cache_key: Cache key for storing successful result\n\n        Returns:\n            FontManager if font exists and has all glyphs, None otherwise\n        \"\"\"\n        font = self.font_provider.get_font(font_name)\n        if font is None:\n            return None\n        if self._has_all_glyphs(font, word_text):\n            self._selection_cache[cache_key] = font_name\n            return font\n        return None\n\n    def select_font_for_word(\n        self, word_text: str, line_language: str | None\n    ) -> FontManager:\n        \"\"\"Select appropriate font for a word.\n\n        Uses a hybrid approach:\n        1. Language-based selection (if language hint available)\n        2. Ordered fallback through available fonts by glyph coverage\n        3. Final fallback to Occulta.ttf (glyphless)\n\n        Args:\n            word_text: The text content of the word\n            line_language: Language code from hOCR (e.g., 'ara', 'eng')\n\n        Returns:\n            FontManager instance to use for rendering this word\n        \"\"\"\n        cache_key = (word_text, line_language)\n        if cache_key in self._selection_cache:\n            cached_name = self._selection_cache[cache_key]\n            font = self.font_provider.get_font(cached_name)\n            if font:\n                return font\n\n        tried_fonts: set[str] = set()\n\n        # Phase 1: Try language-preferred font\n        if line_language and line_language in self.LANGUAGE_FONT_MAP:\n            preferred = self.LANGUAGE_FONT_MAP[line_language]\n            tried_fonts.add(preferred)\n            if result := self._try_font(preferred, word_text, cache_key):\n                return result\n\n        # Phase 2: Try fallback fonts in order\n        for font_name in self.FALLBACK_FONTS:\n            if font_name in tried_fonts:\n                continue\n            if result := self._try_font(font_name, word_text, cache_key):\n                
return result\n\n        # Phase 3: Glyphless fallback (always succeeds)\n        # Warn if we're falling back for non-ASCII text (likely missing font)\n        self._warn_missing_font(word_text, line_language)\n        self._selection_cache[cache_key] = 'Occulta'\n        return self.font_provider.get_fallback_font()\n\n    def _warn_missing_font(self, word_text: str, line_language: str | None) -> None:\n        \"\"\"Warn user about missing font for non-Latin text.\n\n        Only warns once per language/script to avoid log spam.\n        \"\"\"\n        # Determine a key for deduplication (language or 'non-ascii')\n        warn_key = line_language if line_language else 'unknown'\n\n        # Only warn for non-ASCII text and only once per key\n        if warn_key in self._warned_scripts:\n            return\n\n        # Check if text contains non-ASCII characters\n        if not any(ord(c) > 127 for c in word_text):\n            return\n\n        self._warned_scripts.add(warn_key)\n\n        if line_language and line_language in self.LANGUAGE_FONT_MAP:\n            font_name = self.LANGUAGE_FONT_MAP[line_language]\n            log.warning(\n                \"No font found with glyphs for '%s' text. \"\n                \"Install %s for better rendering. \"\n                \"See https://fonts.google.com/noto\",\n                line_language,\n                font_name,\n            )\n        else:\n            log.warning(\n                \"No font found with glyphs for some text. \"\n                \"Install Noto fonts for better rendering. 
\"\n                \"See https://fonts.google.com/noto\"\n            )\n\n    def _has_all_glyphs(self, font: FontManager, text: str) -> bool:\n        \"\"\"Check if a font has glyphs for all characters in text.\n\n        Args:\n            font: FontManager instance to check\n            text: Text to verify coverage for\n\n        Returns:\n            True if font has real glyphs for all characters (not .notdef)\n        \"\"\"\n        if not text:\n            return True\n\n        hb_font = font.get_hb_font()\n\n        for char in text:\n            codepoint = ord(char)\n            glyph_id = hb_font.get_nominal_glyph(codepoint)\n            if glyph_id is None or glyph_id == 0:  # 0 = .notdef glyph\n                return False\n\n        return True\n\n    def has_font(self, font_name: str) -> bool:\n        \"\"\"Check if a named font is available.\n\n        Args:\n            font_name: Name of font to check\n\n        Returns:\n            True if font is available\n        \"\"\"\n        return self.font_provider.get_font(font_name) is not None\n\n    def has_all_glyphs(self, font_name: str, text: str) -> bool:\n        \"\"\"Check if a named font has glyphs for all characters in text.\n\n        Args:\n            font_name: Name of font to check\n            text: Text to verify coverage for\n\n        Returns:\n            True if font has real glyphs for all characters (not .notdef)\n        \"\"\"\n        font = self.font_provider.get_font(font_name)\n        if font is None:\n            return False\n        return self._has_all_glyphs(font, text)\n\n    def get_all_fonts(self) -> dict[str, FontManager]:\n        \"\"\"Get all loaded font managers.\n\n        Returns:\n            Dictionary mapping font names to FontManager instances\n        \"\"\"\n        result = {}\n        for name in self.font_provider.get_available_fonts():\n            font = self.font_provider.get_font(name)\n            if font is not None:\n                
result[name] = font\n        return result\n"
  },
  {
    "path": "src/ocrmypdf/font/system_font_provider.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"System font discovery for PDF rendering.\n\nProvides lazy discovery of Noto fonts installed on the system across\nLinux, macOS, and Windows platforms.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport sys\nfrom pathlib import Path\n\nfrom ocrmypdf.font.font_manager import FontManager\n\nlog = logging.getLogger(__name__)\n\n\nclass SystemFontProvider:\n    \"\"\"Discovers and provides system-installed Noto fonts with lazy scanning.\n\n    This provider searches standard system font directories for Noto fonts.\n    Scanning is performed lazily - only when a font is actually requested\n    and not found in the builtin fonts. Results are cached for the lifetime\n    of the provider instance.\n    \"\"\"\n\n    # System font directories by platform\n    SYSTEM_FONT_DIRS: dict[str, list[Path]] = {\n        'linux': [\n            Path('/usr/share/fonts'),\n            Path('/usr/local/share/fonts'),\n            Path.home() / '.fonts',\n            Path.home() / '.local/share/fonts',\n        ],\n        'freebsd': [\n            Path('/usr/local/share/fonts'),\n            Path.home() / '.fonts',\n        ],\n        'darwin': [\n            Path('/Library/Fonts'),\n            Path('/System/Library/Fonts'),\n            Path.home() / 'Library/Fonts',\n        ],\n        # Windows is handled dynamically in _get_font_dirs()\n    }\n\n    # Noto font logical names → possible filenames (priority order)\n    # The first match found will be used\n    NOTO_FONT_PATTERNS: dict[str, list[str]] = {\n        'NotoSans-Regular': [\n            'NotoSans-Regular.ttf',\n            'NotoSans-Regular.otf',\n        ],\n        'NotoSansArabic-Regular': [\n            'NotoSansArabic-Regular.ttf',\n            'NotoSansArabic-Regular.otf',\n        ],\n        'NotoSansDevanagari-Regular': [\n            'NotoSansDevanagari-Regular.ttf',\n       
     'NotoSansDevanagari-Regular.otf',\n        ],\n        'NotoSansCJK-Regular': [\n            # Language-specific variants (any will work for CJK)\n            'NotoSansCJKsc-Regular.otf',  # Simplified Chinese\n            'NotoSansCJKtc-Regular.otf',  # Traditional Chinese\n            'NotoSansCJKjp-Regular.otf',  # Japanese\n            'NotoSansCJKkr-Regular.otf',  # Korean\n            # TTC collections (common on Linux distros)\n            'NotoSansCJK-Regular.ttc',\n            'NotoSansCJKsc-Regular.ttc',\n            # Variable fonts\n            'NotoSansCJKsc-VF.otf',\n        ],\n        'NotoSansThai-Regular': [\n            'NotoSansThai-Regular.ttf',\n            'NotoSansThai-Regular.otf',\n        ],\n        'NotoSansHebrew-Regular': [\n            'NotoSansHebrew-Regular.ttf',\n            'NotoSansHebrew-Regular.otf',\n        ],\n        'NotoSansBengali-Regular': [\n            'NotoSansBengali-Regular.ttf',\n            'NotoSansBengali-Regular.otf',\n        ],\n        'NotoSansTamil-Regular': [\n            'NotoSansTamil-Regular.ttf',\n            'NotoSansTamil-Regular.otf',\n        ],\n        'NotoSansGujarati-Regular': [\n            'NotoSansGujarati-Regular.ttf',\n            'NotoSansGujarati-Regular.otf',\n        ],\n        'NotoSansTelugu-Regular': [\n            'NotoSansTelugu-Regular.ttf',\n            'NotoSansTelugu-Regular.otf',\n        ],\n        'NotoSansKannada-Regular': [\n            'NotoSansKannada-Regular.ttf',\n            'NotoSansKannada-Regular.otf',\n        ],\n        'NotoSansMalayalam-Regular': [\n            'NotoSansMalayalam-Regular.ttf',\n            'NotoSansMalayalam-Regular.otf',\n        ],\n        'NotoSansMyanmar-Regular': [\n            'NotoSansMyanmar-Regular.ttf',\n            'NotoSansMyanmar-Regular.otf',\n        ],\n        'NotoSansKhmer-Regular': [\n            'NotoSansKhmer-Regular.ttf',\n            'NotoSansKhmer-Regular.otf',\n        ],\n        'NotoSansLao-Regular': 
[\n            'NotoSansLao-Regular.ttf',\n            'NotoSansLao-Regular.otf',\n        ],\n        'NotoSansGeorgian-Regular': [\n            'NotoSansGeorgian-Regular.ttf',\n            'NotoSansGeorgian-Regular.otf',\n        ],\n        'NotoSansArmenian-Regular': [\n            'NotoSansArmenian-Regular.ttf',\n            'NotoSansArmenian-Regular.otf',\n        ],\n        'NotoSansEthiopic-Regular': [\n            'NotoSansEthiopic-Regular.ttf',\n            'NotoSansEthiopic-Regular.otf',\n        ],\n        'NotoSansSinhala-Regular': [\n            'NotoSansSinhala-Regular.ttf',\n            'NotoSansSinhala-Regular.otf',\n        ],\n        'NotoSansGurmukhi-Regular': [\n            'NotoSansGurmukhi-Regular.ttf',\n            'NotoSansGurmukhi-Regular.otf',\n        ],\n        'NotoSansOriya-Regular': [\n            'NotoSansOriya-Regular.ttf',\n            'NotoSansOriya-Regular.otf',\n        ],\n        'NotoSansTibetan-Regular': [\n            'NotoSansTibetan-Regular.ttf',\n            'NotoSansTibetan-Regular.otf',\n        ],\n    }\n\n    def __init__(self) -> None:\n        \"\"\"Initialize system font provider with empty caches.\"\"\"\n        # Cache: font_name -> FontManager (successfully loaded fonts)\n        self._font_cache: dict[str, FontManager] = {}\n        # Negative cache: font names we've searched for but not found\n        self._not_found: set[str] = set()\n        # Cached font directories (computed lazily)\n        self._font_dirs: list[Path] | None = None\n\n    def _get_platform(self) -> str:\n        \"\"\"Get the current platform identifier.\n\n        Returns:\n            Platform string: 'linux', 'darwin', 'windows', or 'freebsd'\n        \"\"\"\n        if sys.platform == 'win32':\n            return 'windows'\n        elif sys.platform == 'darwin':\n            return 'darwin'\n        elif 'freebsd' in sys.platform:\n            return 'freebsd'\n        else:\n            return 'linux'\n\n    def 
_get_font_dirs(self) -> list[Path]:\n        \"\"\"Get font directories for the current platform.\n\n        Returns:\n            List of paths to search for fonts (may include non-existent paths)\n        \"\"\"\n        if self._font_dirs is not None:\n            return self._font_dirs\n\n        platform = self._get_platform()\n\n        if platform == 'windows':\n            # Get Windows font directories from environment\n            windir = os.environ.get('WINDIR', r'C:\\Windows')\n            self._font_dirs = [Path(windir) / 'Fonts']\n            # User-installed fonts (Windows 10+)\n            localappdata = os.environ.get('LOCALAPPDATA')\n            if localappdata:\n                self._font_dirs.append(\n                    Path(localappdata) / 'Microsoft' / 'Windows' / 'Fonts'\n                )\n        else:\n            self._font_dirs = list(self.SYSTEM_FONT_DIRS.get(platform, []))\n\n        return self._font_dirs\n\n    def _find_font_file(self, font_name: str) -> Path | None:\n        \"\"\"Search system directories for a font file.\n\n        Args:\n            font_name: Logical font name (e.g., 'NotoSansCJK-Regular')\n\n        Returns:\n            Path to font file if found, None otherwise\n        \"\"\"\n        if font_name not in self.NOTO_FONT_PATTERNS:\n            return None\n\n        patterns = self.NOTO_FONT_PATTERNS[font_name]\n\n        for font_dir in self._get_font_dirs():\n            if not font_dir.exists():\n                continue\n\n            for pattern in patterns:\n                # Search recursively for the font file\n                try:\n                    matches = list(font_dir.rglob(pattern))\n                    if matches:\n                        log.debug(\n                            \"Found system font %s at %s\", font_name, matches[0]\n                        )\n                        return matches[0]\n                except PermissionError:\n                    # Skip directories we can't 
read\n                    continue\n\n        return None\n\n    def get_font(self, font_name: str) -> FontManager | None:\n        \"\"\"Get a FontManager for the named font (lazy loading).\n\n        This method implements lazy scanning: fonts are only searched for\n        when first requested. Results (both positive and negative) are\n        cached for subsequent calls.\n\n        Args:\n            font_name: Logical font name (e.g., 'NotoSansCJK-Regular')\n\n        Returns:\n            FontManager if font is found and loadable, None otherwise\n        \"\"\"\n        # Check positive cache first\n        if font_name in self._font_cache:\n            return self._font_cache[font_name]\n\n        # Check negative cache (already searched, not found)\n        if font_name in self._not_found:\n            return None\n\n        # Lazy scan for this specific font\n        font_path = self._find_font_file(font_name)\n        if font_path is not None:\n            try:\n                fm = FontManager(font_path)\n                self._font_cache[font_name] = fm\n                return fm\n            except Exception as e:\n                log.warning(\n                    \"Found font %s at %s but failed to load: %s\",\n                    font_name,\n                    font_path,\n                    e,\n                )\n\n        # Cache negative result\n        self._not_found.add(font_name)\n        return None\n\n    def get_available_fonts(self) -> list[str]:\n        \"\"\"Get list of font names this provider can potentially find.\n\n        Note: This returns all font names we know patterns for, not\n        necessarily fonts that are actually installed. 
Use get_font()\n        to check if a specific font is available.\n\n        Returns:\n            List of logical font names\n        \"\"\"\n        return list(self.NOTO_FONT_PATTERNS.keys())\n\n    def get_fallback_font(self) -> FontManager:\n        \"\"\"Get the glyphless fallback font.\n\n        Raises:\n            NotImplementedError: System provider doesn't provide fallback.\n                Use BuiltinFontProvider for the fallback font.\n        \"\"\"\n        raise NotImplementedError(\n            \"SystemFontProvider does not provide a fallback font. \"\n            \"Use BuiltinFontProvider for Occulta.ttf fallback.\"\n        )\n"
  },
  {
    "path": "src/ocrmypdf/fpdf_renderer/__init__.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"fpdf2-based PDF renderer for OCR text layers.\n\nThis module provides the PDF renderer using fpdf2 for creating\nsearchable OCR text layers.\n\"\"\"\nfrom __future__ import annotations\n\nfrom ocrmypdf.fpdf_renderer.renderer import (\n    DebugRenderOptions,\n    Fpdf2MultiPageRenderer,\n    Fpdf2PdfRenderer,\n)\n\n__all__ = [\n    \"DebugRenderOptions\",\n    \"Fpdf2PdfRenderer\",\n    \"Fpdf2MultiPageRenderer\",\n]\n"
  },
  {
    "path": "src/ocrmypdf/fpdf_renderer/renderer.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"fpdf2-based PDF renderer for OCR text layers.\n\nThis module provides PDF rendering using fpdf2 for creating searchable\nOCR text layers.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom dataclasses import dataclass\nfrom math import atan, cos, degrees, radians, sin, sqrt\nfrom pathlib import Path\n\nfrom fpdf import FPDF\nfrom fpdf.enums import PDFResourceType, TextMode\nfrom pikepdf import Matrix, Rectangle\n\nfrom ocrmypdf.font import FontManager, MultiFontManager\nfrom ocrmypdf.models.ocr_element import OcrClass, OcrElement\n\nlog = logging.getLogger(__name__)\n\n\ndef transform_point(matrix: Matrix, x: float, y: float) -> tuple[float, float]:\n    \"\"\"Transform a point (x, y) by a matrix.\n\n    Args:\n        matrix: pikepdf Matrix to apply\n        x: X coordinate\n        y: Y coordinate\n\n    Returns:\n        Tuple of (transformed_x, transformed_y)\n    \"\"\"\n    # Use a degenerate rectangle to transform a single point\n    rect = Rectangle(x, y, x, y)\n    transformed = matrix.transform(rect)\n    return (transformed.llx, transformed.lly)\n\n\ndef transform_box(\n    matrix: Matrix, left: float, top: float, right: float, bottom: float\n) -> tuple[float, float, float, float]:\n    \"\"\"Transform a bounding box by a matrix.\n\n    Args:\n        matrix: pikepdf Matrix to apply\n        left: Left edge of box\n        top: Top edge of box\n        right: Right edge of box\n        bottom: Bottom edge of box\n\n    Returns:\n        Tuple of (llx, lly, width, height) of the transformed box\n    \"\"\"\n    rect = Rectangle(left, top, right, bottom)\n    transformed = matrix.transform(rect)\n    return (\n        transformed.llx,\n        transformed.lly,\n        transformed.width,\n        transformed.height,\n    )\n\n\n@dataclass\nclass DebugRenderOptions:\n    \"\"\"Options for debug visualization during rendering.\n\n    
When enabled, draws colored lines/shapes to visualize OCR structure.\n    \"\"\"\n\n    render_baseline: bool = False  # Magenta lines along baselines\n    render_line_bbox: bool = False  # Blue rectangles around lines\n    render_word_bbox: bool = False  # Green rectangles around words\n\n\nclass CoordinateTransform:\n    \"\"\"Manages coordinate transformations for fpdf2 rendering.\n\n    Handles conversion from OCR pixel coordinates (top-left origin) to\n    PDF points. fpdf2 uses top-left origin like hOCR, so no Y-flip needed.\n    \"\"\"\n\n    def __init__(self, dpi: float, page_width_px: float, page_height_px: float):\n        \"\"\"Initialize coordinate transform.\"\"\"\n        self.dpi = dpi\n        self.page_width_px = page_width_px\n        self.page_height_px = page_height_px\n\n    @property\n    def page_width_pt(self) -> float:\n        \"\"\"Page width in PDF points.\"\"\"\n        return self.page_width_px * 72.0 / self.dpi\n\n    @property\n    def page_height_pt(self) -> float:\n        \"\"\"Page height in PDF points.\"\"\"\n        return self.page_height_px * 72.0 / self.dpi\n\n    def px_to_pt(self, value: float) -> float:\n        \"\"\"Convert pixels to PDF points.\"\"\"\n        return value * 72.0 / self.dpi\n\n    def bbox_to_pt(self, bbox) -> tuple[float, float, float, float]:\n        \"\"\"Convert BoundingBox from pixels to points.\"\"\"\n        return (\n            self.px_to_pt(bbox.left),\n            self.px_to_pt(bbox.top),\n            self.px_to_pt(bbox.right),\n            self.px_to_pt(bbox.bottom),\n        )\n\n\nclass Fpdf2PdfRenderer:\n    \"\"\"Renders OcrElement trees to PDF using fpdf2.\n\n    This class provides the core rendering logic for converting OCR output\n    into PDF text layers using fpdf2's text drawing capabilities.\n    \"\"\"\n\n    def __init__(\n        self,\n        page: OcrElement,\n        dpi: float,\n        multi_font_manager: MultiFontManager,\n        invisible_text: bool = True,\n        
image: Path | None = None,\n        debug_render_options: DebugRenderOptions | None = None,\n    ):\n        \"\"\"Initialize renderer.\n\n        Args:\n            page: Root OcrElement (must be ocr_page)\n            dpi: Source image DPI\n            multi_font_manager: MultiFontManager instance\n            invisible_text: If True, render text as invisible (text mode 3)\n            image: Optional path to image to overlay on top of the text layer,\n                creating a sandwich PDF (text underneath, image on top)\n            debug_render_options: Options for debug visualization\n\n        Raises:\n            ValueError: If page is not an ocr_page or lacks a bounding box\n        \"\"\"\n        if page.ocr_class != OcrClass.PAGE:\n            raise ValueError(\"Root element must be ocr_page\")\n        if page.bbox is None:\n            raise ValueError(\"Page must have bounding box\")\n\n        self.page = page\n        self.dpi = dpi\n        self.multi_font_manager = multi_font_manager\n        self.invisible_text = invisible_text\n        self.image = image\n        self.debug_options = debug_render_options or DebugRenderOptions()\n\n        # Setup coordinate transform\n        self.coord_transform = CoordinateTransform(\n            dpi=dpi,\n            page_width_px=page.bbox.width,\n            page_height_px=page.bbox.height,\n        )\n\n        # Registered fonts: font_path -> fpdf_family_name\n        self._registered_fonts: dict[str, str] = {}\n        # Track whether we've already logged the info-level suppression message\n        self._logged_aspect_ratio_suppression = False\n\n    def render(self, output_path: Path) -> None:\n        \"\"\"Render page to PDF file.\n\n        Args:\n            output_path: Output PDF file path\n        \"\"\"\n        # Create PDF with custom page size\n        pdf = FPDF(\n            unit=\"pt\",\n            format=(\n                self.coord_transform.page_width_pt,\n                
self.coord_transform.page_height_pt,\n            ),\n        )\n        pdf.set_auto_page_break(auto=False)\n\n        # Enable text shaping for complex scripts\n        pdf.set_text_shaping(True)\n\n        # Disable cell margin to ensure precise text positioning\n        # fpdf2's cell() adds c_margin padding by default, which shifts text\n        pdf.c_margin = 0\n\n        # Set text mode for invisible text\n        if self.invisible_text:\n            pdf.text_mode = TextMode.INVISIBLE\n        else:\n            pdf.text_mode = TextMode.FILL\n\n        # Render content to PDF\n        self.render_to_pdf(pdf)\n\n        # Write PDF\n        pdf.output(str(output_path))\n\n    def render_to_pdf(self, pdf: FPDF) -> None:\n        \"\"\"Render page content to an existing FPDF instance.\n\n        This method adds a page and renders all content. Used by both\n        single-page rendering and multi-page rendering.\n\n        Args:\n            pdf: FPDF instance to render into\n        \"\"\"\n        # Add page with correct dimensions\n        pdf.add_page(\n            format=(\n                self.coord_transform.page_width_pt,\n                self.coord_transform.page_height_pt,\n            )\n        )\n\n        # Render all paragraphs\n        for para in self.page.paragraphs:\n            self._render_paragraph(pdf, para)\n\n        # If no paragraphs, render lines directly\n        if not self.page.paragraphs:\n            for line in self.page.lines:\n                self._render_line(pdf, line)\n\n        # Place image on top of text layer (sandwich mode)\n        if self.image is not None:\n            pdf.image(\n                str(self.image),\n                x=0,\n                y=0,\n                w=self.coord_transform.page_width_pt,\n                h=self.coord_transform.page_height_pt,\n            )\n\n    def _register_font(self, pdf: FPDF, font_manager: FontManager) -> str:\n        \"\"\"Register font with fpdf2 if not already 
registered.\n\n        Args:\n            pdf: FPDF instance\n            font_manager: FontManager containing the font\n\n        Returns:\n            Font family name to use with pdf.set_font()\n        \"\"\"\n        font_path_str = str(font_manager.font_path)\n\n        if font_path_str not in self._registered_fonts:\n            # Use the font filename stem as the family name\n            family_name = font_manager.font_path.stem\n            pdf.add_font(family=family_name, fname=font_path_str)\n            self._registered_fonts[font_path_str] = family_name\n\n        return self._registered_fonts[font_path_str]\n\n    def _render_paragraph(self, pdf: FPDF, para: OcrElement) -> None:\n        \"\"\"Render a paragraph element.\n\n        Args:\n            pdf: FPDF instance\n            para: Paragraph OCR element\n        \"\"\"\n        for line in para.children:\n            if line.ocr_class in OcrClass.LINE_TYPES:\n                self._render_line(pdf, line)\n\n    def _render_line(self, pdf: FPDF, line: OcrElement) -> None:\n        \"\"\"Render a line element with baseline support.\n\n        Strategy (following pikepdf reference implementation):\n        1. Create a baseline_matrix that transforms from hOCR coordinates to\n           a coordinate system aligned with the text baseline\n        2. For each word, transform its hOCR bbox using baseline_matrix.inverse()\n           to get its position in the baseline coordinate system\n        3. 
Render words along the baseline with horizontal scaling\n\n        Args:\n            pdf: FPDF instance\n            line: Line OCR element\n        \"\"\"\n        if line.bbox is None:\n            return\n\n        # Validate line bbox\n        if line.bbox.height <= 0:\n            log.error(\n                \"line box is invalid so we cannot render it: box=%s text=%s\",\n                line.bbox,\n                line.text if hasattr(line, 'text') else '',\n            )\n            return\n\n        # Convert line bbox to PDF points\n        line_left_pt = self.coord_transform.px_to_pt(line.bbox.left)\n        line_top_pt = self.coord_transform.px_to_pt(line.bbox.top)\n        line_right_pt = self.coord_transform.px_to_pt(line.bbox.right)\n        line_bottom_pt = self.coord_transform.px_to_pt(line.bbox.bottom)\n        # Note: line_width_pt and line_height_pt not needed since we compute\n        # dimensions in the un-rotated coordinate system via matrix transform\n\n        # Debug rendering: draw line bbox (in page coordinates)\n        if self.debug_options.render_line_bbox:\n            self._render_debug_line_bbox(\n                pdf, line_left_pt, line_top_pt, line_right_pt, line_bottom_pt\n            )\n\n        # Get textangle (rotation of the entire line)\n        textangle = line.textangle or 0.0\n\n        # Read baseline early so we can detect rotation from steep slopes.\n        # When Tesseract doesn't report textangle for rotated text, the\n        # rotation gets encoded as a very steep baseline slope instead.\n        slope = 0.0\n        intercept_pt = 0.0\n        has_meaningful_baseline = False\n        if line.baseline is not None:\n            slope = line.baseline.slope\n            intercept_pt = self.coord_transform.px_to_pt(line.baseline.intercept)\n            if abs(slope) < 0.005:\n                slope = 0.0\n            has_meaningful_baseline = True\n\n        # Detect text rotation from steep baseline slope.\n        
# A slope magnitude > 1.0 corresponds to > 45° from horizontal,\n        # which indicates the line is rotated, not merely skewed.\n        if textangle == 0.0 and abs(slope) > 1.0:\n            textangle = degrees(atan(slope))\n            # The original baseline slope and intercept are not meaningful\n            # after extracting rotation; recalculate intercept from font\n            # metrics below.\n            slope = 0.0\n            has_meaningful_baseline = False\n\n        # Build line_size_aabb_matrix: transforms from page coords to un-rotated\n        # line coords. The hOCR bbox is the minimum axis-aligned bounding box\n        # enclosing the rotated text.\n        # Start at top-left corner of line bbox, then rotate by -textangle\n        line_size_aabb_matrix = (\n            Matrix()\n            .translated(line_left_pt, line_top_pt)\n            .rotated(-textangle)  # textangle is counter-clockwise per hOCR spec\n        )\n\n        # Get the line dimensions in the un-rotated coordinate system\n        # Transform line bbox corners to get the un-rotated dimensions\n        inv_line_matrix = line_size_aabb_matrix.inverse()\n        # Transform bottom-right corner to get line dimensions in rotated space\n        _, _, line_size_width, line_size_height = transform_box(\n            inv_line_matrix, line_left_pt, line_top_pt, line_right_pt, line_bottom_pt\n        )\n\n        # Get baseline intercept\n        if not has_meaningful_baseline:\n            # No baseline provided or baseline was used for rotation detection:\n            # calculate intercept from font metrics\n            default_font_manager = self.multi_font_manager.fonts['NotoSans-Regular']\n            ascent, descent, units_per_em = default_font_manager.get_font_metrics()\n            ascent_norm = ascent / units_per_em\n            descent_norm = descent / units_per_em\n            # Baseline intercept based on font metrics\n            intercept_pt = (\n                
-abs(descent_norm)\n                * line_size_height\n                / (ascent_norm + abs(descent_norm))\n            )\n\n        slope_angle_deg = degrees(atan(slope)) if slope != 0.0 else 0.0\n\n        # Build baseline_matrix: transforms from page coords to baseline coords\n        # 1. Start with line_size_aabb_matrix (translates to line corner, rotates)\n        # 2. Translate down to bottom of un-rotated line (line_size_height)\n        # 3. Apply baseline intercept offset\n        # 4. Rotate by baseline slope\n        baseline_matrix = (\n            line_size_aabb_matrix.translated(\n                0, line_size_height\n            )  # Move to bottom of line\n            .translated(0, intercept_pt)  # Apply baseline intercept\n            .rotated(slope_angle_deg)  # Rotate by baseline slope\n        )\n\n        # Calculate font size: height from baseline to top of line\n        font_size = line_size_height + intercept_pt\n        if font_size < 1.0:\n            font_size = line_size_height * 0.8\n\n        # Total rotation for rendering (textangle + slope)\n        total_rotation_deg = -textangle + slope_angle_deg\n\n        # Debug rendering: draw baseline\n        if self.debug_options.render_baseline:\n            # Baseline starts at origin in baseline coords, extends line width\n            baseline_start = transform_point(baseline_matrix, 0, 0)\n            baseline_end = transform_point(baseline_matrix, line_size_width, 0)\n            pdf.set_draw_color(255, 0, 255)  # Magenta\n            pdf.set_line_width(0.75)\n            pdf.line(\n                baseline_start[0], baseline_start[1], baseline_end[0], baseline_end[1]\n            )\n\n        # Extract line language for font selection\n        line_language = line.language\n\n        # Get inverse of baseline_matrix for transforming word bboxes\n        inv_baseline_matrix = baseline_matrix.inverse()\n\n        # Collect words to render\n        words: list[OcrElement | None] = [\n   
         w for w in line.children if w.ocr_class == OcrClass.WORD and w.text\n        ]\n\n        # Suppress lines where the text aspect ratio is implausible.\n        # This catches cases where Tesseract failed to detect rotation\n        # entirely (slope=0, no textangle) and produced garbage text in a\n        # bounding box whose shape doesn't match the text content at all.\n        if not self._check_aspect_ratio_plausible(\n            pdf, words, font_size, slope_angle_deg,\n            line_size_width, line_size_height, line_language,\n        ):\n            return\n\n        # Collect word rendering data: (text, x_baseline, font_family, word_tz)\n        word_render_data: list[tuple[str, float, str, float]] = []\n        for word in words:\n            if word is None or not word.text or word.bbox is None:\n                continue\n\n            word_left_pt = self.coord_transform.px_to_pt(word.bbox.left)\n            word_top_pt = self.coord_transform.px_to_pt(word.bbox.top)\n            word_right_pt = self.coord_transform.px_to_pt(word.bbox.right)\n            word_bottom_pt = self.coord_transform.px_to_pt(word.bbox.bottom)\n            word_width_pt = word_right_pt - word_left_pt\n\n            # Debug rendering: draw word bbox (in page coordinates)\n            if self.debug_options.render_word_bbox:\n                self._render_debug_word_bbox(\n                    pdf, word_left_pt, word_top_pt, word_right_pt, word_bottom_pt\n                )\n\n            # Get x position in baseline coordinate system\n            box_llx, _, _, _ = transform_box(\n                inv_baseline_matrix,\n                word_left_pt,\n                word_top_pt,\n                word_right_pt,\n                word_bottom_pt,\n            )\n\n            # Select font and compute word-only Tz\n            font_manager = self.multi_font_manager.select_font_for_word(\n                word.text, line_language\n            )\n            font_family = 
self._register_font(pdf, font_manager)\n            pdf.set_font(font_family, size=font_size)\n            natural_width = pdf.get_string_width(word.text)\n            if natural_width > 0 and word_width_pt > 0:\n                word_tz = (word_width_pt / natural_width) * 100\n            else:\n                word_tz = 100.0\n\n            word_render_data.append((word.text, box_llx, font_family, word_tz))\n\n        if not word_render_data:\n            return\n\n        # Emit single BT block for the entire line using raw PDF operators.\n        # This avoids a poppler bug where Tz (horizontal scaling) is not\n        # carried across BT/ET boundaries, affecting all poppler-based tools\n        # and viewers (Evince, pdftotext, etc.). By keeping all words in a\n        # single BT block with relative Td positioning and per-word Tz, we\n        # ensure correct inter-word spacing.\n        self._emit_line_bt_block(\n            pdf,\n            word_render_data,\n            baseline_matrix,\n            font_size,\n            total_rotation_deg,\n        )\n\n    def _check_aspect_ratio_plausible(\n        self,\n        pdf: FPDF,\n        words: list[OcrElement | None],\n        font_size: float,\n        slope_angle_deg: float,\n        line_size_width: float,\n        line_size_height: float,\n        line_language: str | None,\n    ) -> bool:\n        \"\"\"Check whether the line's aspect ratio is plausible for its text.\n\n        Compares the aspect ratio of the OCR bounding box to the aspect ratio\n        the text would have if rendered normally (accounting for baseline\n        slope). 
A large mismatch indicates Tesseract misread rotated text\n        without detecting the rotation.\n\n        Returns:\n            True if plausible (rendering should proceed), False to suppress.\n        \"\"\"\n        if line_size_width <= 0 or line_size_height <= 0 or font_size <= 0:\n            return True\n\n        # Fast path: most lines are wider than they are tall, which is\n        # the normal shape for horizontal text. Only tall-narrow boxes\n        # (height > width) need the expensive font measurement check.\n        if line_size_width >= line_size_height:\n            return True\n\n        line_text = ' '.join(\n            w.text for w in words if w is not None and w.text\n        )\n        if not line_text:\n            return True\n\n        # Measure the natural rendered width of the line text\n        font_manager = self.multi_font_manager.select_font_for_word(\n            line_text, line_language\n        )\n        font_family = self._register_font(pdf, font_manager)\n        pdf.set_font(font_family, size=round(font_size))\n        natural_width = pdf.get_string_width(line_text)\n\n        if natural_width <= 0:\n            return True\n\n        # Compute the AABB the text would occupy considering baseline slope\n        theta = radians(abs(slope_angle_deg))\n        expected_w = natural_width * cos(theta) + font_size * sin(theta)\n        expected_h = natural_width * sin(theta) + font_size * cos(theta)\n\n        if expected_h <= 0:\n            return True\n\n        actual_aspect = line_size_width / line_size_height\n        expected_aspect = expected_w / expected_h\n        ratio = actual_aspect / expected_aspect\n\n        if ratio >= 0.1:\n            return True\n\n        # Implausible aspect ratio — suppress this line\n        log.debug(\n            \"Suppressing text with improbable aspect ratio: \"\n            \"actual=%.3f expected=%.3f ratio=%.4f text=%r\",\n            actual_aspect,\n            expected_aspect,\n    
        ratio,\n            line_text[:80],\n        )\n        if not self._logged_aspect_ratio_suppression:\n            log.info(\n                \"Suppressing OCR output text with improbable aspect ratio\"\n            )\n            self._logged_aspect_ratio_suppression = True\n        return False\n\n    def _emit_line_bt_block(\n        self,\n        pdf: FPDF,\n        word_render_data: list[tuple[str, float, str, float]],\n        baseline_matrix: Matrix,\n        font_size: float,\n        total_rotation_deg: float,\n    ) -> None:\n        \"\"\"Emit a single BT block for the entire line using raw PDF operators.\n\n        Writes all words in a single BT..ET block with relative Td positioning\n        and per-word Tz. Each non-last word gets a trailing space appended, with\n        Tz calculated so the rendered width of \"word \" spans from the current\n        word's start to the next word's start. This works around a poppler bug\n        where Tz is not carried across BT/ET boundaries, which affects all\n        poppler-based viewers and tools (Evince, pdftotext, etc.).\n\n        Args:\n            pdf: FPDF instance\n            word_render_data: List of (text, x_baseline, font_family, word_tz)\n                tuples, one per word on this line\n            baseline_matrix: Transform from baseline coords to page coords\n            font_size: Font size in points\n            total_rotation_deg: Total rotation angle (textangle + slope)\n        \"\"\"\n        page_height = self.coord_transform.page_height_pt\n\n        # Compute baseline direction in PDF coordinates for rotation\n        has_rotation = abs(total_rotation_deg) > 0.01\n        bx0, by0_fpdf = transform_point(baseline_matrix, 0, 0)\n        by0_pdf = page_height - by0_fpdf\n\n        ops: list[str] = []\n\n        if has_rotation:\n            # Compute direction vector along the baseline in PDF coordinates\n            bx1, by1_fpdf = transform_point(baseline_matrix, 100, 0)\n        
    by1_pdf = page_height - by1_fpdf\n            dx = bx1 - bx0\n            dy = by1_pdf - by0_pdf\n            length = sqrt(dx * dx + dy * dy)\n            if length > 0:\n                cos_a = dx / length\n                sin_a = dy / length\n            else:\n                cos_a = 1.0\n                sin_a = 0.0\n\n            # Save graphics state, apply rotation+translation via cm.\n            # The cm maps local coordinates (baseline-aligned, x along text)\n            # to PDF page coordinates.\n            ops.append('q')\n            ops.append(\n                f'{cos_a:.6f} {sin_a:.6f} {-sin_a:.6f} {cos_a:.6f} '\n                f'{bx0:.2f} {by0_pdf:.2f} cm'\n            )\n\n        # Begin text object\n        ops.append('BT')\n\n        # Text render mode: 3 = invisible, 0 = fill\n        tr = 3 if self.invisible_text else 0\n        ops.append(f'{tr} Tr')\n\n        # Initial text position\n        first_x_baseline = word_render_data[0][1]\n        if has_rotation:\n            # In the cm-transformed space, origin is at the baseline start\n            ops.append(f'{first_x_baseline:.2f} 0 Td')\n        else:\n            # Direct PDF coordinates\n            page_x, page_y_fpdf = transform_point(\n                baseline_matrix, first_x_baseline, 0\n            )\n            page_y_pdf = page_height - page_y_fpdf\n            ops.append(f'{page_x:.2f} {page_y_pdf:.2f} Td')\n\n        prev_font_family: str | None = None\n        prev_x_baseline = first_x_baseline\n\n        for i, (text, x_baseline, font_family, word_tz) in enumerate(\n            word_render_data\n        ):\n            is_last = i == len(word_render_data) - 1\n\n            # Set font if changed\n            if font_family != prev_font_family:\n                pdf.set_font(font_family, size=font_size)\n                # Register font resource on this page\n                pdf._resource_catalog.add(\n                    PDFResourceType.FONT, pdf.current_font.i, 
pdf.page\n                )\n                ops.append(\n                    f'/F{pdf.current_font.i} {pdf.font_size_pt:.2f} Tf'\n                )\n                prev_font_family = font_family\n\n            # Relative positioning (for words after the first)\n            if i > 0:\n                if has_rotation:\n                    # In rotated space, advance is purely along x-axis\n                    dx_baseline = x_baseline - prev_x_baseline\n                    ops.append(f'{dx_baseline:.2f} 0 Td')\n                else:\n                    # Non-rotated: compute delta in PDF coordinates\n                    px_prev, py_prev_f = transform_point(\n                        baseline_matrix, prev_x_baseline, 0\n                    )\n                    px_curr, py_curr_f = transform_point(\n                        baseline_matrix, x_baseline, 0\n                    )\n                    dx_pdf = px_curr - px_prev\n                    # Flip y delta for PDF coordinates (y-up)\n                    dy_pdf = -(py_curr_f - py_prev_f)\n                    ops.append(f'{dx_pdf:.2f} {dy_pdf:.2f} Td')\n\n            # Determine text to render\n            if not is_last:\n                next_text, next_x_baseline, _, _ = word_render_data[i + 1]\n                advance = next_x_baseline - x_baseline\n\n                # Add trailing space for text extraction unless both are CJK\n                if (\n                    advance > 0\n                    and not (\n                        self._is_cjk_only(text)\n                        and self._is_cjk_only(next_text)\n                    )\n                ):\n                    text_to_render = text + ' '\n                else:\n                    text_to_render = text\n            else:\n                text_to_render = text\n\n            # Use word_tz (fits word into its hOCR bbox) — Td handles\n            # inter-word gaps, so Tz should not stretch to fill them.\n            render_tz = word_tz\n\n         
   ops.append(f'{render_tz:.2f} Tz')\n            ops.append(self._encode_shaped_text(pdf, text_to_render))\n\n            prev_x_baseline = x_baseline\n\n        # End text object\n        ops.append('ET')\n\n        if has_rotation:\n            ops.append('Q')\n\n        pdf._out('\\n'.join(ops))\n\n        # Reset fpdf2's internal stretching tracking so subsequent API calls\n        # don't think Tz is still set from our raw operators\n        pdf.font_stretching = 100\n\n    def _encode_shaped_text(self, pdf: FPDF, text: str) -> str:\n        \"\"\"Encode text using HarfBuzz text shaping for complex script support.\n\n        Unlike font.encode_text() which maps unicode characters one-by-one to\n        glyph IDs, this uses HarfBuzz to handle BiDi reordering, Arabic joining\n        forms, Devanagari conjuncts, and other complex script shaping. Falls\n        back to encode_text() when text shaping is not enabled.\n        \"\"\"\n        font = pdf.current_font\n        if pdf.text_shaping and pdf.text_shaping.get(\"use_shaping_engine\"):\n            shaped = font.shape_text(text, pdf.font_size_pt, pdf.text_shaping)\n            if shaped:\n                mapped = \"\".join(\n                    chr(ti[\"mapped_char\"])\n                    for ti in shaped\n                    if ti[\"mapped_char\"] is not None\n                )\n                if mapped:\n                    return f\"({font.escape_text(mapped)}) Tj\"\n        return font.encode_text(text)\n\n    def _is_cjk_only(self, text: str) -> bool:\n        \"\"\"Check if text contains only CJK characters.\n\n        CJK scripts don't use spaces between words, so we should not insert\n        spaces between adjacent CJK words.\n\n        Args:\n            text: Text to check\n\n        Returns:\n            True if text contains only CJK characters\n        \"\"\"\n        for char in text:\n            cp = ord(char)\n            # Check if character is in CJK ranges\n            if not (\n     
           0x4E00 <= cp <= 0x9FFF  # CJK Unified Ideographs\n                or 0x3400 <= cp <= 0x4DBF  # CJK Extension A\n                or 0x20000 <= cp <= 0x2A6DF  # CJK Extension B\n                or 0x2A700 <= cp <= 0x2B73F  # CJK Extension C\n                or 0x2B740 <= cp <= 0x2B81F  # CJK Extension D\n                or 0x2B820 <= cp <= 0x2CEAF  # CJK Extension E\n                or 0x2CEB0 <= cp <= 0x2EBEF  # CJK Extension F\n                or 0x30000 <= cp <= 0x3134F  # CJK Extension G\n                or 0x3040 <= cp <= 0x309F  # Hiragana\n                or 0x30A0 <= cp <= 0x30FF  # Katakana\n                or 0x31F0 <= cp <= 0x31FF  # Katakana Phonetic Extensions\n                or 0xAC00 <= cp <= 0xD7AF  # Hangul Syllables\n                or 0x1100 <= cp <= 0x11FF  # Hangul Jamo\n                or 0x3130 <= cp <= 0x318F  # Hangul Compatibility Jamo\n                or 0xA960 <= cp <= 0xA97F  # Hangul Jamo Extended-A\n                or 0xD7B0 <= cp <= 0xD7FF  # Hangul Jamo Extended-B\n                or 0x3000 <= cp <= 0x303F  # CJK Symbols and Punctuation\n                or 0xFF00 <= cp <= 0xFFEF  # Halfwidth and Fullwidth Forms\n            ):\n                return False\n        return True\n\n    def _render_debug_line_bbox(\n        self,\n        pdf: FPDF,\n        left: float,\n        top: float,\n        right: float,\n        bottom: float,\n    ) -> None:\n        \"\"\"Draw a blue box around the line bbox.\"\"\"\n        pdf.set_draw_color(0, 0, 255)  # Blue\n        pdf.set_line_width(0.5)\n        pdf.rect(left, top, right - left, bottom - top)\n\n    def _render_debug_baseline(\n        self,\n        pdf: FPDF,\n        x: float,\n        y: float,\n        width: float,\n        rotation_deg: float,\n    ) -> None:\n        \"\"\"Draw a magenta line along the baseline.\"\"\"\n        pdf.set_draw_color(255, 0, 255)  # Magenta\n        pdf.set_line_width(0.75)\n\n        if abs(rotation_deg) > 0.1:\n            with 
pdf.rotation(rotation_deg, x=x, y=y):\n                pdf.line(x, y, x + width, y)\n        else:\n            pdf.line(x, y, x + width, y)\n\n    def _render_debug_word_bbox(\n        self,\n        pdf: FPDF,\n        left: float,\n        top: float,\n        right: float,\n        bottom: float,\n    ) -> None:\n        \"\"\"Draw a green box around the word bbox.\"\"\"\n        pdf.set_draw_color(0, 255, 0)  # Green\n        pdf.set_line_width(0.3)\n        pdf.rect(left, top, right - left, bottom - top)\n\n\nclass Fpdf2MultiPageRenderer:\n    \"\"\"Renders multiple OcrElement pages into a single PDF.\n\n    This class handles multi-page documents by delegating to Fpdf2PdfRenderer\n    for each page while sharing a single FPDF instance and font registration.\n    \"\"\"\n\n    def __init__(\n        self,\n        pages_data: list[tuple[int, OcrElement, float]],\n        multi_font_manager: MultiFontManager,\n        invisible_text: bool = True,\n        debug_render_options: DebugRenderOptions | None = None,\n    ):\n        \"\"\"Initialize multi-page renderer.\n\n        Args:\n            pages_data: List of (pageno, ocr_tree, dpi) tuples\n            multi_font_manager: Shared multi-font manager for all pages\n            invisible_text: Whether to render invisible text\n            debug_render_options: Options for debug visualization\n        \"\"\"\n        self.pages_data = pages_data\n        self.multi_font_manager = multi_font_manager\n        self.invisible_text = invisible_text\n        self.debug_options = debug_render_options or DebugRenderOptions()\n\n    def render(self, output_path: Path) -> None:\n        \"\"\"Render all pages to a single multi-page PDF.\n\n        Args:\n            output_path: Output PDF file path\n        \"\"\"\n        if not self.pages_data:\n            raise ValueError(\"No pages to render\")\n\n        # Create PDF (page size will be set per-page)\n        pdf = FPDF(unit=\"pt\")\n        
pdf.set_auto_page_break(auto=False)\n        pdf.set_text_shaping(True)\n\n        # Disable cell margin to ensure precise text positioning\n        # fpdf2's cell() adds c_margin padding by default, which shifts text\n        pdf.c_margin = 0\n\n        # Set text mode for invisible text\n        if self.invisible_text:\n            pdf.text_mode = TextMode.INVISIBLE\n        else:\n            pdf.text_mode = TextMode.FILL\n\n        # Shared font registration across all pages\n        shared_registered_fonts: dict[str, str] = {}\n\n        # Render each page using Fpdf2PdfRenderer\n        for _pageno, page, dpi in self.pages_data:\n            if page.bbox is None:\n                continue\n\n            # Create a renderer for this page\n            page_renderer = Fpdf2PdfRenderer(\n                page=page,\n                dpi=dpi,\n                multi_font_manager=self.multi_font_manager,\n                invisible_text=self.invisible_text,\n                debug_render_options=self.debug_options,\n            )\n\n            # Share font registration to avoid re-registering fonts\n            page_renderer._registered_fonts = shared_registered_fonts\n\n            # Render page content to the shared PDF\n            page_renderer.render_to_pdf(pdf)\n\n        # Write PDF\n        pdf.output(str(output_path))\n"
  },
  {
    "path": "src/ocrmypdf/helpers.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Support functions.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport multiprocessing\nimport os\nimport shutil\nimport warnings\nfrom collections.abc import Callable, Iterable, Sequence\nfrom contextlib import suppress\nfrom decimal import Decimal\nfrom io import StringIO\nfrom math import isclose, isfinite\nfrom pathlib import Path\nfrom statistics import harmonic_mean\nfrom typing import (\n    Any,\n    Generic,\n    TypeVar,\n)\n\nimport img2pdf\nimport pikepdf\n\nlog = logging.getLogger(__name__)\n\nIMG2PDF_KWARGS = dict(engine=img2pdf.Engine.pikepdf, rotation=img2pdf.Rotation.ifvalid)\n\n\nT = TypeVar('T', float, int, Decimal)\n\n\nclass Resolution(Generic[T]):\n    \"\"\"The number of pixels per inch in each 2D direction.\n\n    Resolution objects are considered \"equal\" for == purposes if they are\n    equal to a reasonable tolerance.\n    \"\"\"\n\n    x: T\n    y: T\n\n    __slots__ = ('x', 'y')\n\n    def __init__(self, x: T, y: T):\n        \"\"\"Construct a Resolution object.\"\"\"\n        self.x = x\n        self.y = y\n\n    # rel_tol after converting from dpi to pixels per meter and saving\n    # as integer with rounding, as many file formats\n    CONVERSION_ERROR = 0.002\n\n    def round(self, ndigits: int) -> Resolution:\n        \"\"\"Round to ndigits after the decimal point.\"\"\"\n        return Resolution(round(self.x, ndigits), round(self.y, ndigits))\n\n    def to_int(self) -> Resolution[int]:\n        \"\"\"Round to nearest integer.\"\"\"\n        return Resolution(int(round(self.x)), int(round(self.y)))\n\n    @classmethod\n    def _isclose(cls, a, b):\n        return isclose(a, b, rel_tol=cls.CONVERSION_ERROR)\n\n    @property\n    def is_square(self) -> bool:\n        \"\"\"True if the resolution is square (x == y).\"\"\"\n        return self._isclose(self.x, self.y)\n\n    @property\n    def is_finite(self) 
-> bool:\n        \"\"\"True if both x and y are finite numbers.\"\"\"\n        return isfinite(self.x) and isfinite(self.y)\n\n    def to_scalar(self) -> float:\n        \"\"\"Return the harmonic mean of x and y as a 1D approximation.\n\n        In most cases, Resolution is 2D, but typically it is \"square\" (x == y) and\n        can be approximated as a single number. When not square, the harmonic mean\n        is used to approximate the 2D resolution as a single number.\n        \"\"\"\n        return harmonic_mean([float(self.x), float(self.y)])\n\n    def _take_minmax(\n        self, vals: Iterable[Any], yvals: Iterable[Any] | None, cmp: Callable\n    ) -> Resolution:\n        \"\"\"Return a new Resolution object with the maximum resolution of inputs.\"\"\"\n        if yvals is not None:\n            return Resolution(cmp(self.x, *vals), cmp(self.y, *yvals))\n        cmp_x, cmp_y = self.x, self.y\n        for x, y in vals:\n            cmp_x = cmp(x, cmp_x)\n            cmp_y = cmp(y, cmp_y)\n        return Resolution(cmp_x, cmp_y)\n\n    def take_max(\n        self, vals: Iterable[Any], yvals: Iterable[Any] | None = None\n    ) -> Resolution:\n        \"\"\"Return a new Resolution object with the maximum resolution of inputs.\"\"\"\n        return self._take_minmax(vals, yvals, max)\n\n    def take_min(\n        self, vals: Iterable[Any], yvals: Iterable[Any] | None = None\n    ) -> Resolution:\n        \"\"\"Return a new Resolution object with the minimum resolution of inputs.\"\"\"\n        return self._take_minmax(vals, yvals, min)\n\n    def flip_axis(self) -> Resolution[T]:\n        \"\"\"Return a new Resolution object with x and y swapped.\"\"\"\n        return Resolution(self.y, self.x)\n\n    def __getitem__(self, idx: int | slice) -> T:\n        \"\"\"Support [0] and [1] indexing.\"\"\"\n        return (self.x, self.y)[idx]\n\n    def __str__(self):\n        \"\"\"Return a string representation of the resolution.\"\"\"\n        return 
f\"{self.x:f}×{self.y:f}\"\n\n    def __repr__(self):  # pragma: no cover\n        \"\"\"Return a repr() of the resolution.\"\"\"\n        return f\"Resolution({self.x!r}, {self.y!r})\"\n\n    def __eq__(self, other):\n        \"\"\"Return True if the resolution is equal to another resolution.\"\"\"\n        if isinstance(other, tuple) and len(other) == 2:\n            other = Resolution(*other)\n        if not isinstance(other, Resolution):\n            return NotImplemented\n        return self._isclose(self.x, other.x) and self._isclose(self.y, other.y)\n\n\ndef safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike) -> None:\n    \"\"\"Create a symbolic link at ``soft_link_name``, which references ``input_file``.\n\n    Think of this as copying ``input_file`` to ``soft_link_name`` with less overhead.\n\n    Use symlinks safely. Self-linking loops are prevented. On Windows, file copy is\n    used since symlinks may require administrator privileges. An existing link at the\n    destination is removed.\n    \"\"\"\n    input_file = os.fspath(input_file)\n    soft_link_name = os.fspath(soft_link_name)\n\n    # Guard against soft linking to oneself\n    if input_file == soft_link_name:\n        log.warning(\n            \"No symbolic link created. 
You are using the original data directory \"\n            \"as the working directory.\"\n        )\n        return\n\n    # Soft link already exists: delete for relink?\n    if os.path.lexists(soft_link_name):\n        # do not delete or overwrite real (non-soft link) file\n        if not os.path.islink(soft_link_name):\n            raise FileExistsError(f\"{soft_link_name} exists and is not a link\")\n        os.unlink(soft_link_name)\n\n    if not os.path.exists(input_file):\n        raise FileNotFoundError(f\"trying to create a broken symlink to {input_file}\")\n\n    if os.name == 'nt':\n        # Don't actually use symlinks on Windows due to permission issues\n        shutil.copyfile(input_file, soft_link_name)\n        return\n\n    log.debug(\"os.symlink(%s, %s)\", input_file, soft_link_name)\n\n    # Create symbolic link using absolute path\n    os.symlink(os.path.abspath(input_file), soft_link_name)\n\n\ndef samefile(file1: os.PathLike, file2: os.PathLike) -> bool:\n    \"\"\"Return True if two files are the same file.\n\n    Attempts to account for different relative paths to the same file.\n    \"\"\"\n    if os.name == 'nt':\n        return file1 == file2\n    else:\n        return os.path.samefile(file1, file2)\n\n\ndef is_iterable_notstr(thing: Any) -> bool:\n    \"\"\"Is this is an iterable type, other than a string?\"\"\"\n    return isinstance(thing, Iterable) and not isinstance(thing, str)\n\n\ndef monotonic(seq: Sequence) -> bool:\n    \"\"\"Does this sequence increase monotonically?\"\"\"\n    return all(b > a for a, b in zip(seq, seq[1:], strict=False))\n\n\ndef page_number(input_file: os.PathLike) -> int:\n    \"\"\"Get one-based page number implied by filename (000002.pdf -> 2).\"\"\"\n    return int(os.path.basename(os.fspath(input_file))[0:6])\n\n\ndef available_cpu_count() -> int:\n    \"\"\"Returns number of CPUs in the system.\"\"\"\n    try:\n        return multiprocessing.cpu_count()\n    except NotImplementedError:\n        pass\n    
warnings.warn(\n        \"Could not get CPU count. Assuming one (1) CPU. Use -j N to set manually.\"\n    )\n    return 1\n\n\ndef is_file_writable(test_file: os.PathLike) -> bool:\n    \"\"\"Intentionally racy test if target is writable.\n\n    We intend to write to the output file if and only if we succeed and\n    can replace it atomically. Before doing the OCR work, make sure\n    the location is writable.\n    \"\"\"\n    try:\n        p = Path(test_file)\n        if p.is_symlink():\n            p = p.resolve(strict=False)\n\n        # p.is_file() throws an exception in some cases\n        if p.exists() and (p.is_file() or p.samefile(os.devnull)):\n            return os.access(\n                os.fspath(p),\n                os.W_OK,\n                effective_ids=(os.access in os.supports_effective_ids),\n            )\n\n        try:\n            fp = p.open('wb')\n        except OSError:\n            return False\n        else:\n            fp.close()\n            with suppress(OSError):\n                p.unlink()\n        return True\n    except (OSError, RuntimeError) as e:\n        log.debug(e)\n        log.error(str(e))\n        return False\n\n\ndef check_pdf(input_file: Path) -> bool:\n    \"\"\"Check if a PDF complies with the PDF specification.\n\n    Checks for proper formatting and proper linearization. 
Uses pikepdf (which in\n    turn, uses QPDF) to perform the checks.\n    \"\"\"\n    try:\n        pdf = pikepdf.open(input_file)\n    except pikepdf.PdfError as e:\n        log.error(e)\n        return False\n    else:\n        with pdf:\n            with warnings.catch_warnings():\n                warnings.filterwarnings('ignore', message=r'pikepdf.*JBIG2.*')\n                messages = pdf.check_pdf_syntax()\n            success = True\n            for msg in messages:\n                if 'error' in msg.lower():\n                    log.error(msg)\n                    success = False\n                elif (\n                    \"/DecodeParms: operation for dictionary attempted on object \"\n                    \"of type null\" in msg\n                ):\n                    pass  # Ignore/spurious warning\n                else:\n                    log.warning(msg)\n                    success = False\n\n            sio = StringIO()\n            linearize_msgs = ''\n            try:\n                # If linearization is missing entirely, we do not complain. We do\n                # complain if linearization is present but incorrect.\n                pdf.check_linearization(sio)\n            except (RuntimeError, pikepdf.ForeignObjectError):\n                pass\n            else:\n                linearize_msgs = sio.getvalue()\n                if linearize_msgs:\n                    log.warning(linearize_msgs)\n\n            return bool(success and not linearize_msgs)\n\n\ndef clamp(n: T, smallest: T, largest: T) -> T:\n    \"\"\"Clamps the value of ``n`` to between ``smallest`` and ``largest``.\"\"\"\n    return max(smallest, min(n, largest))\n\n\ndef remove_all_log_handlers(logger: logging.Logger) -> None:\n    \"\"\"Remove all log handlers, usually used in a child process.\n\n    The child process inherits the log handlers from the parent process when\n    a fork occurs. 
Typically we want to remove all log handlers in the child\n    process so that the child process can set up a single queue handler to\n    forward log messages to the parent process.\n    \"\"\"\n    for handler in logger.handlers[:]:\n        logger.removeHandler(handler)\n        handler.close()  # To ensure handlers with opened resources are released\n\n\ndef pikepdf_enable_mmap() -> None:\n    \"\"\"Enable pikepdf memory mapping.\"\"\"\n    try:\n        pikepdf._core.set_access_default_mmap(True)\n        log.debug(\n            \"pikepdf mmap \"\n            + (\n                'enabled'\n                if pikepdf._core.get_access_default_mmap()  # type: ignore[attr-defined]\n                else 'disabled'\n            )\n        )\n    except AttributeError:\n        log.debug(\"pikepdf mmap not available\")\n\n\ndef running_in_docker() -> bool:\n    \"\"\"Returns True if we seem to be running in a Docker container.\"\"\"\n    return Path('/.dockerenv').exists()\n\n\ndef running_in_snap() -> bool:\n    \"\"\"Returns True if we seem to be running in a Snap container.\"\"\"\n    try:\n        cgroup_text = Path('/proc/self/cgroup').read_text()\n        return 'snap.ocrmypdf' in cgroup_text\n    except FileNotFoundError:\n        return False\n"
  },
  {
    "path": "src/ocrmypdf/hocrtransform/__init__.py",
    "content": "# SPDX-FileCopyrightText: 2023-2025 James R. Barlow\n# SPDX-License-Identifier: MIT\n\n\"\"\"Transform OCR output to text-only PDFs.\n\nThis package provides tools for:\n1. Parsing OCR output (hOCR format) into generic OcrElement structures\n2. Rendering OcrElement structures to searchable PDF text layers\n\nThe architecture separates parsing from rendering, allowing:\n- Support for multiple OCR input formats (hOCR, ALTO, custom engines)\n- Independent improvements to text rendering\n- Reuse of the OcrElement data model for other purposes\n\nMain components:\n- OcrElement: Generic dataclass representing OCR output structure\n- HocrParser: Parses hOCR files into OcrElement trees\n- Fpdf2PdfRenderer: Renders OcrElement trees to PDF text layers (via fpdf2)\n\nFor PDF rendering, use the ocrmypdf.fpdf_renderer module:\n    from ocrmypdf.fpdf_renderer import Fpdf2PdfRenderer, DebugRenderOptions\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom ocrmypdf.hocrtransform.hocr_parser import (\n    HocrParseError,\n    HocrParser,\n)\nfrom ocrmypdf.models.ocr_element import (\n    Baseline,\n    BoundingBox,\n    FontInfo,\n    OcrClass,\n    OcrElement,\n)\n\n__all__ = (\n    # hOCR parsing\n    'HocrParser',\n    'HocrParseError',\n    # OCR element data model\n    'OcrElement',\n    'OcrClass',\n    'BoundingBox',\n    'Baseline',\n    'FontInfo',\n)\n"
  },
  {
    "path": "src/ocrmypdf/hocrtransform/__main__.py",
    "content": "# SPDX-FileCopyrightText: 2023-2025 James R. Barlow\n# SPDX-License-Identifier: MIT\n\n\"\"\"Simple CLI for testing HOCR to PDF conversion using fpdf2 renderer.\"\"\"\n\nfrom __future__ import annotations\n\nimport argparse\nfrom pathlib import Path\n\nfrom ocrmypdf.font import MultiFontManager\nfrom ocrmypdf.fpdf_renderer import DebugRenderOptions, Fpdf2PdfRenderer\nfrom ocrmypdf.hocrtransform.hocr_parser import HocrParser\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(description='Convert hocr file to PDF')\n    parser.add_argument(\n        '-b',\n        '--boundingboxes',\n        action=\"store_true\",\n        default=False,\n        help='Show bounding boxes borders (debug mode)',\n    )\n    parser.add_argument(\n        '-r',\n        '--resolution',\n        type=int,\n        default=300,\n        help='Resolution of the image that was OCRed',\n    )\n    parser.add_argument(\n        '-i',\n        '--image',\n        default=None,\n        help='Path to the image to overlay on top of the text layer',\n    )\n    parser.add_argument('hocrfile', help='Path to the hocr file to be parsed')\n    parser.add_argument('outputfile', help='Path to the PDF file to be generated')\n    args = parser.parse_args()\n\n    # Parse hOCR file\n    hocr_parser = HocrParser(args.hocrfile)\n    ocr_page = hocr_parser.parse()\n\n    # Use DPI from hOCR if available, otherwise use command-line resolution\n    dpi = ocr_page.dpi or args.resolution\n\n    # Setup debug render options if requested\n    debug_options = None\n    if args.boundingboxes:\n        debug_options = DebugRenderOptions(\n            render_line_bbox=True,\n            render_word_bbox=True,\n            render_baseline=True,\n        )\n\n    # Create multi-font manager with default font directory\n    font_dir = Path(__file__).parent.parent / \"data\"\n    multi_font_manager = MultiFontManager(font_dir)\n\n    # Render to PDF using fpdf2\n    image_path = 
Path(args.image) if args.image else None\n    renderer = Fpdf2PdfRenderer(\n        page=ocr_page,\n        dpi=dpi,\n        multi_font_manager=multi_font_manager,\n        invisible_text=bool(args.image),\n        image=image_path,\n        debug_render_options=debug_options,\n    )\n    renderer.render(Path(args.outputfile))\n"
  },
  {
    "path": "src/ocrmypdf/hocrtransform/hocr_parser.py",
    "content": "# SPDX-FileCopyrightText: 2010 Jonathan Brinley\n# SPDX-FileCopyrightText: 2013-2014 Julien Pfefferkorn\n# SPDX-FileCopyrightText: 2023-2025 James R. Barlow\n# SPDX-License-Identifier: MIT\n\n\"\"\"Parser for hOCR format files.\n\nThis module provides functionality to parse hOCR files (HTML-based OCR format)\nand convert them to the engine-agnostic OcrElement tree structure.\n\nFor details of the hOCR format, see:\nhttp://kba.github.io/hocr-spec/1.2/\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport re\nimport unicodedata\nfrom pathlib import Path\nfrom typing import Literal, cast\nfrom xml.etree import ElementTree as ET\n\nfrom ocrmypdf.models.ocr_element import (\n    Baseline,\n    BoundingBox,\n    FontInfo,\n    OcrClass,\n    OcrElement,\n)\n\nTextDirection = Literal[\"ltr\", \"rtl\"]\n\nlog = logging.getLogger(__name__)\n\nElement = ET.Element\n\n\nclass HocrParseError(Exception):\n    \"\"\"Error while parsing hOCR file.\"\"\"\n\n\nclass HocrParser:\n    \"\"\"Parser for hOCR format files.\n\n    Converts hOCR XML/HTML files into OcrElement trees.\n\n    The hOCR format uses HTML with special class attributes (ocr_page, ocr_line,\n    ocrx_word, etc.) 
and a title attribute containing properties like bbox,\n    baseline, and confidence scores.\n    \"\"\"\n\n    # Regex patterns for parsing hOCR title attributes\n    _bbox_pattern = re.compile(\n        r'''\n        bbox \\s+\n        (\\d+) \\s+   # left: uint\n        (\\d+) \\s+   # top: uint\n        (\\d+) \\s+   # right: uint\n        (\\d+)       # bottom: uint\n        ''',\n        re.VERBOSE,\n    )\n\n    _baseline_pattern = re.compile(\n        r'''\n        baseline \\s+\n        ([\\-\\+]?\\d*\\.?\\d*) \\s+  # slope: +/- decimal float\n        ([\\-\\+]?\\d+)            # intercept: +/- int\n        ''',\n        re.VERBOSE,\n    )\n\n    _textangle_pattern = re.compile(\n        r'''\n        textangle \\s+\n        ([\\-\\+]?\\d*\\.?\\d*)  # angle: +/- decimal float\n        ''',\n        re.VERBOSE,\n    )\n\n    _x_wconf_pattern = re.compile(\n        r'''\n        x_wconf \\s+\n        (\\d+)  # confidence: uint (0-100)\n        ''',\n        re.VERBOSE,\n    )\n\n    _x_fsize_pattern = re.compile(\n        r'''\n        x_fsize \\s+\n        (\\d*\\.?\\d+)  # font size: float\n        ''',\n        re.VERBOSE,\n    )\n\n    _x_font_pattern = re.compile(\n        r'''\n        x_font \\s+\n        ([^\\s;]+)  # font name: non-whitespace, non-semicolon string\n        ''',\n        re.VERBOSE,\n    )\n\n    _ppageno_pattern = re.compile(\n        r'''\n        ppageno \\s+\n        (\\d+)  # page number: uint\n        ''',\n        re.VERBOSE,\n    )\n\n    _scan_res_pattern = re.compile(\n        r'''\n        scan_res \\s+\n        (\\d+) \\s+  # x resolution\n        (\\d+)      # y resolution\n        ''',\n        re.VERBOSE,\n    )\n\n    def __init__(self, hocr_file: str | Path):\n        \"\"\"Initialize the parser with an hOCR file.\n\n        Args:\n            hocr_file: Path to the hOCR file to parse\n\n        Raises:\n            HocrParseError: If the file cannot be parsed\n        \"\"\"\n        self._hocr_path = 
Path(hocr_file)\n        try:\n            self._tree = ET.parse(os.fspath(hocr_file))\n        except ET.ParseError as e:\n            raise HocrParseError(f\"Failed to parse hOCR file: {e}\") from e\n\n        # Detect XML namespace\n        root_tag = self._tree.getroot().tag\n        matches = re.match(r'({.*})html', root_tag)\n        self._xmlns = matches.group(1) if matches else ''\n\n    def parse(self) -> OcrElement:\n        \"\"\"Parse the hOCR file and return an OcrElement tree.\n\n        Returns:\n            The root OcrElement (ocr_page) containing the document structure\n\n        Raises:\n            HocrParseError: If no ocr_page element is found\n        \"\"\"\n        # Find the first ocr_page element\n        page_div = self._tree.find(self._xpath('div', 'ocr_page'))\n        if page_div is None:\n            raise HocrParseError(\"No ocr_page element found in hOCR file\")\n\n        return self._parse_page(page_div)\n\n    def _xpath(self, html_tag: str, html_class: str | None = None) -> str:\n        \"\"\"Build an XPath expression for finding elements.\n\n        Args:\n            html_tag: HTML tag name (e.g., 'div', 'span', 'p')\n            html_class: Optional class attribute to match\n\n        Returns:\n            XPath expression string\n        \"\"\"\n        xpath = f\".//{self._xmlns}{html_tag}\"\n        if html_class:\n            xpath += f\"[@class='{html_class}']\"\n        return xpath\n\n    def _parse_page(self, page_elem: Element) -> OcrElement:\n        \"\"\"Parse an ocr_page element.\n\n        Args:\n            page_elem: The XML element with class=\"ocr_page\"\n\n        Returns:\n            OcrElement representing the page\n        \"\"\"\n        title = page_elem.attrib.get('title', '')\n\n        bbox = self._parse_bbox(title)\n        if bbox is None:\n            raise HocrParseError(\"ocr_page missing bbox\")\n\n        # Parse page-level properties\n        page_number = self._parse_ppageno(title)\n     
   dpi = self._parse_scan_res(title)\n\n        page = OcrElement(\n            ocr_class=OcrClass.PAGE,\n            bbox=bbox,\n            page_number=page_number,\n            dpi=dpi,\n        )\n\n        # Parse child paragraphs\n        for par_elem in page_elem.iterfind(self._xpath('p', 'ocr_par')):\n            paragraph = self._parse_paragraph(par_elem)\n            if paragraph is not None:\n                page.children.append(paragraph)\n\n        # If no paragraphs found, check for words directly under page\n        # (some Tesseract output structures)\n        if not page.children:\n            for word_elem in page_elem.iterfind(self._xpath('span', 'ocrx_word')):\n                word = self._parse_word(word_elem)\n                if word is not None:\n                    page.children.append(word)\n\n        return page\n\n    def _parse_paragraph(self, par_elem: Element) -> OcrElement | None:\n        \"\"\"Parse an ocr_par element.\n\n        Args:\n            par_elem: The XML element with class=\"ocr_par\"\n\n        Returns:\n            OcrElement representing the paragraph, or None if empty\n        \"\"\"\n        title = par_elem.attrib.get('title', '')\n        bbox = self._parse_bbox(title)\n\n        # Get direction and language from attributes\n        dir_attr = par_elem.attrib.get('dir')\n        direction: TextDirection | None = (\n            cast(TextDirection, dir_attr) if dir_attr in ('ltr', 'rtl') else None\n        )\n\n        language = par_elem.attrib.get('lang')\n\n        paragraph = OcrElement(\n            ocr_class=OcrClass.PARAGRAPH,\n            bbox=bbox,\n            direction=direction,\n            language=language,\n        )\n\n        # Parse child lines\n        line_classes = {\n            'ocr_line',\n            'ocr_header',\n            'ocr_footer',\n            'ocr_caption',\n            'ocr_textfloat',\n        }\n        for span_elem in par_elem.iterfind(self._xpath('span')):\n            
elem_class = span_elem.attrib.get('class', '')\n            if elem_class in line_classes:\n                line = self._parse_line(span_elem, elem_class, direction, language)\n                if line is not None:\n                    paragraph.children.append(line)\n\n        # Return None if paragraph is empty\n        if not paragraph.children:\n            return None\n\n        return paragraph\n\n    def _parse_line(\n        self,\n        line_elem: Element,\n        ocr_class: str,\n        parent_direction: TextDirection | None,\n        parent_language: str | None,\n    ) -> OcrElement | None:\n        \"\"\"Parse a line element (ocr_line, ocr_header, etc.).\n\n        Args:\n            line_elem: The XML element representing the line\n            ocr_class: The hOCR class of the line\n            parent_direction: Text direction inherited from parent\n            parent_language: Language inherited from parent\n\n        Returns:\n            OcrElement representing the line, or None if empty\n        \"\"\"\n        title = line_elem.attrib.get('title', '')\n        bbox = self._parse_bbox(title)\n\n        if bbox is None:\n            return None\n\n        baseline = self._parse_baseline(title)\n        textangle = self._parse_textangle(title)\n\n        # Inherit direction and language from parent if not specified\n        dir_attr = line_elem.attrib.get('dir')\n        if dir_attr in ('ltr', 'rtl'):\n            direction: TextDirection | None = cast(TextDirection, dir_attr)\n        else:\n            direction = parent_direction\n\n        language = line_elem.attrib.get('lang') or parent_language\n\n        line = OcrElement(\n            ocr_class=ocr_class,\n            bbox=bbox,\n            baseline=baseline,\n            textangle=textangle,\n            direction=direction,\n            language=language,\n        )\n\n        # Parse child words\n        for word_elem in line_elem.iterfind(self._xpath('span', 'ocrx_word')):\n           
 word = self._parse_word(word_elem)\n            if word is not None:\n                line.children.append(word)\n\n        # Return None if line has no words\n        if not line.children:\n            return None\n\n        return line\n\n    def _parse_word(self, word_elem: Element) -> OcrElement | None:\n        \"\"\"Parse an ocrx_word element.\n\n        Args:\n            word_elem: The XML element with class=\"ocrx_word\"\n\n        Returns:\n            OcrElement representing the word, or None if empty\n        \"\"\"\n        title = word_elem.attrib.get('title', '')\n        bbox = self._parse_bbox(title)\n\n        # Get the text content\n        text = self._get_element_text(word_elem)\n        text = self._normalize_text(text)\n\n        if not text:\n            return None\n\n        # Parse confidence (x_wconf is 0-100, convert to 0.0-1.0)\n        confidence = self._parse_x_wconf(title)\n        if confidence is not None:\n            confidence = confidence / 100.0\n\n        # Parse font info\n        font = self._parse_font_info(title)\n\n        return OcrElement(\n            ocr_class=OcrClass.WORD,\n            bbox=bbox,\n            text=text,\n            confidence=confidence,\n            font=font,\n        )\n\n    def _get_element_text(self, element: Element) -> str:\n        \"\"\"Get the full text content of an element including children.\n\n        Args:\n            element: XML element\n\n        Returns:\n            Combined text content\n        \"\"\"\n        text = element.text if element.text is not None else ''\n        for child in element:\n            text += self._get_element_text(child)\n        text += element.tail if element.tail is not None else ''\n        return text\n\n    @staticmethod\n    def _normalize_text(text: str) -> str:\n        \"\"\"Normalize text using NFKC normalization.\n\n        This splits ligatures and combines diacritics.\n\n        Args:\n            text: Raw text\n\n        Returns:\n 
           Normalized text, stripped of leading/trailing whitespace\n        \"\"\"\n        return unicodedata.normalize(\"NFKC\", text).strip()\n\n    def _parse_bbox(self, title: str) -> BoundingBox | None:\n        \"\"\"Parse a bbox from an hOCR title attribute.\n\n        Args:\n            title: The title attribute value\n\n        Returns:\n            BoundingBox or None if not found\n        \"\"\"\n        match = self._bbox_pattern.search(title)\n        if not match:\n            return None\n\n        try:\n            return BoundingBox(\n                left=float(match.group(1)),\n                top=float(match.group(2)),\n                right=float(match.group(3)),\n                bottom=float(match.group(4)),\n            )\n        except ValueError:\n            return None\n\n    def _parse_baseline(self, title: str) -> Baseline | None:\n        \"\"\"Parse baseline from an hOCR title attribute.\n\n        Args:\n            title: The title attribute value\n\n        Returns:\n            Baseline or None if not found\n        \"\"\"\n        match = self._baseline_pattern.search(title)\n        if not match:\n            return None\n\n        try:\n            return Baseline(\n                slope=float(match.group(1)) if match.group(1) else 0.0,\n                intercept=float(match.group(2)),\n            )\n        except ValueError:\n            return None\n\n    def _parse_textangle(self, title: str) -> float | None:\n        \"\"\"Parse textangle from an hOCR title attribute.\n\n        Args:\n            title: The title attribute value\n\n        Returns:\n            Angle in degrees or None if not found\n        \"\"\"\n        match = self._textangle_pattern.search(title)\n        if not match:\n            return None\n\n        try:\n            return float(match.group(1))\n        except ValueError:\n            return None\n\n    def _parse_x_wconf(self, title: str) -> float | None:\n        \"\"\"Parse word 
confidence from an hOCR title attribute.\n\n        Args:\n            title: The title attribute value\n\n        Returns:\n            Confidence (0-100) or None if not found\n        \"\"\"\n        match = self._x_wconf_pattern.search(title)\n        if not match:\n            return None\n\n        try:\n            return float(match.group(1))\n        except ValueError:\n            return None\n\n    def _parse_ppageno(self, title: str) -> int | None:\n        \"\"\"Parse physical page number from an hOCR title attribute.\n\n        Args:\n            title: The title attribute value\n\n        Returns:\n            Page number or None if not found\n        \"\"\"\n        match = self._ppageno_pattern.search(title)\n        if not match:\n            return None\n\n        try:\n            return int(match.group(1))\n        except ValueError:\n            return None\n\n    def _parse_scan_res(self, title: str) -> float | None:\n        \"\"\"Parse scan resolution (DPI) from an hOCR title attribute.\n\n        Args:\n            title: The title attribute value\n\n        Returns:\n            DPI (using first value if x and y differ) or None if not found\n        \"\"\"\n        match = self._scan_res_pattern.search(title)\n        if not match:\n            return None\n\n        try:\n            # Use the first (x) resolution value\n            return float(match.group(1))\n        except ValueError:\n            return None\n\n    def _parse_font_info(self, title: str) -> FontInfo | None:\n        \"\"\"Parse font information from an hOCR title attribute.\n\n        Args:\n            title: The title attribute value\n\n        Returns:\n            FontInfo or None if no font info found\n        \"\"\"\n        font_match = self._x_font_pattern.search(title)\n        size_match = self._x_fsize_pattern.search(title)\n\n        if not font_match and not size_match:\n            return None\n\n        return FontInfo(\n            
name=font_match.group(1) if font_match else None,\n            size=float(size_match.group(1)) if size_match else None,\n        )\n"
  },
  {
    "path": "src/ocrmypdf/imageops.py",
    "content": "# SPDX-FileCopyrightText: 2023 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"OCR-related image manipulation.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom math import floor, sqrt\n\nfrom PIL import Image\n\nlog = logging.getLogger(__name__)\n\n\ndef bytes_per_pixel(mode: str) -> int:\n    \"\"\"Return the number of padded bytes per pixel for a given PIL image mode.\n\n    In RGB mode we assume 4 bytes per pixel, which is the case for most\n    consumers.\n    \"\"\"\n    if mode in ('1', 'L', 'P'):\n        return 1\n    if mode in ('LA', 'PA', 'La') or mode.startswith('I;16'):\n        return 2\n    return 4\n\n\ndef _calculate_downsample(\n    image_size: tuple[int, int],\n    bytes_per_pixel: int,\n    *,\n    max_size: tuple[int, int] | None = None,\n    max_pixels: int | None = None,\n    max_bytes: int | None = None,\n) -> tuple[int, int]:\n    \"\"\"Calculate image size required to downsample an image to fit limits.\n\n    If no limit is exceeded, the input image's size is returned.\n\n    Args:\n        image_size: Dimensions of image.\n        bytes_per_pixel: Number of bytes per pixel.\n        max_size: The maximum width and height of the image.\n        max_pixels: The maximum number of pixels in the image. Some image consumers\n            limit the total number of pixels as some value other than width*height.\n        max_bytes: The maximum number of bytes in the image. 
RGB is counted as 4\n            bytes; all other modes are counted as 1 byte.\n    \"\"\"\n    size = image_size\n\n    if max_size is not None:\n        overage = max_size[0] / size[0], max_size[1] / size[1]\n        size_factor = min(overage)\n        if size_factor < 1.0:\n            log.debug(\"Resizing image to fit image dimensions limit\")\n            size = floor(size[0] * size_factor), floor(size[1] * size_factor)\n            if size[0] == 0:\n                size = 1, min(size[1], max_size[1])\n            elif size[1] == 0:\n                size = min(size[0], max_size[0]), 1\n\n    if max_pixels is not None and size[0] * size[1] > max_pixels:\n        log.debug(\"Resizing image to fit image pixel limit\")\n        pixels_factor = sqrt(max_pixels / (size[0] * size[1]))\n        size = floor(size[0] * pixels_factor), floor(size[1] * pixels_factor)\n\n    if max_bytes is not None:\n        bpp = bytes_per_pixel\n        # stride = bytes per line\n        stride = size[0] * bpp\n        height = size[1]\n        if stride * height > max_bytes:\n            log.debug(\"Resizing image to fit image byte size limit\")\n            bytes_factor = sqrt(max_bytes / (stride * height))\n            scaled_stride = floor(stride * bytes_factor)\n            scaled_height = floor(height * bytes_factor)\n            if scaled_stride == 0:\n                scaled_stride = bpp\n                scaled_height = min(max_bytes // bpp, scaled_height)\n            if scaled_height == 0:\n                scaled_height = 1\n                scaled_stride = min(max_bytes // scaled_height, scaled_stride)\n            size = floor(scaled_stride / bpp), scaled_height\n\n    return size\n\n\ndef calculate_downsample(\n    image: Image.Image,\n    *,\n    max_size: tuple[int, int] | None = None,\n    max_pixels: int | None = None,\n    max_bytes: int | None = None,\n) -> tuple[int, int]:\n    \"\"\"Calculate image size required to downsample an image to fit limits.\n\n    If no limit 
is exceeded, the input image's size is returned.\n\n    Args:\n        image: The image to downsample.\n        max_size: The maximum width and height of the image.\n        max_pixels: The maximum number of pixels in the image. Some image consumers\n            cap the total pixel count at a value other than the product of their\n            maximum width and maximum height.\n        max_bytes: The maximum number of bytes in the image. Bytes per pixel come from\n            bytes_per_pixel(): 1 for modes '1', 'L' and 'P'; 2 for 'LA', 'PA', 'La'\n            and 'I;16' variants; 4 for all other modes, including RGB.\n    \"\"\"\n    return _calculate_downsample(\n        image.size,\n        bytes_per_pixel(image.mode),\n        max_size=max_size,\n        max_pixels=max_pixels,\n        max_bytes=max_bytes,\n    )\n\n\ndef downsample_image(\n    image: Image.Image,\n    new_size: tuple[int, int],\n    *,\n    resample_mode: Image.Resampling = Image.Resampling.BICUBIC,\n    reducing_gap: int = 3,\n) -> Image.Image:\n    \"\"\"Resize an image to the given size, adjusting its DPI to match.\n\n    The DPI is adjusted to match the new size, which is how we can ensure the\n    OCR is positioned correctly. If new_size equals the image's current size,\n    the image is returned unchanged.\n\n    Args:\n        image: The image to downsample. Its info must contain a 'dpi' entry.\n        new_size: The new size of the image.\n        resample_mode: The resampling mode to use when downsampling.\n        reducing_gap: The reducing gap to use when downsampling (for larger\n            reductions).\n    \"\"\"\n    if new_size == image.size:\n        return image\n\n    original_size = image.size\n    original_dpi = image.info['dpi']\n    image = image.resize(\n        new_size,\n        resample=resample_mode,\n        reducing_gap=reducing_gap,\n    )\n    image.info['dpi'] = (\n        round(original_dpi[0] * new_size[0] / original_size[0]),\n        round(original_dpi[1] * new_size[1] / original_size[1]),\n    )\n    log.debug(f\"Rescaled image to {image.size} pixels and {image.info['dpi']} dpi\")\n    return image\n"
  },
  {
    "path": "src/ocrmypdf/languages.py",
    "content": "# SPDX-FileCopyrightText: 2023 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Language codes and names from ISO 639.\n\nDerived from\nhttps://www.loc.gov/standards/iso639-2/ascii_8bits.html\n\"\"\"\nfrom __future__ import annotations\n\nfrom typing import NamedTuple\n\n\nclass ISOCodeData(NamedTuple):\n    \"\"\"Data for a single ISO 639 code.\"\"\"\n\n    alt: str\n    alpha_2: str\n    english: str\n    french: str\n\n\nISO_639_3 = {\n    'aar': ISOCodeData('', 'aa', 'Afar', 'afar'),\n    'abk': ISOCodeData('', 'ab', 'Abkhazian', 'abkhaze'),\n    'ace': ISOCodeData('', '', 'Achinese', 'aceh'),\n    'ach': ISOCodeData('', '', 'Acoli', 'acoli'),\n    'ada': ISOCodeData('', '', 'Adangme', 'adangme'),\n    'ady': ISOCodeData('', '', 'Adyghe; Adygei', 'adyghé'),\n    'afa': ISOCodeData(\n        '',\n        '',\n        'Afro-Asiatic languages',\n        'afro-asiatiques, langues',\n    ),\n    'afh': ISOCodeData('', '', 'Afrihili', 'afrihili'),\n    'afr': ISOCodeData('', 'af', 'Afrikaans', 'afrikaans'),\n    'ain': ISOCodeData('', '', 'Ainu', 'aïnou'),\n    'aka': ISOCodeData('', 'ak', 'Akan', 'akan'),\n    'akk': ISOCodeData('', '', 'Akkadian', 'akkadien'),\n    'alb': ISOCodeData('sqi', 'sq', 'Albanian', 'albanais'),\n    'ale': ISOCodeData('', '', 'Aleut', 'aléoute'),\n    'alg': ISOCodeData(\n        '',\n        '',\n        'Algonquian languages',\n        'algonquines, langues',\n    ),\n    'alt': ISOCodeData('', '', 'Southern Altai', 'altai du Sud'),\n    'amh': ISOCodeData('', 'am', 'Amharic', 'amharique'),\n    'ang': ISOCodeData(\n        '',\n        '',\n        'English, Old (ca.450-1100)',\n        'anglo-saxon (ca.450-1100)',\n    ),\n    'anp': ISOCodeData('', '', 'Angika', 'angika'),\n    'apa': ISOCodeData('', '', 'Apache languages', 'apaches, langues'),\n    'ara': ISOCodeData('', 'ar', 'Arabic', 'arabe'),\n    'arc': ISOCodeData(\n        '',\n        '',\n        'Official Aramaic (700-300 BCE); Imperial Aramaic 
(700-300 BCE)',\n        \"araméen d'empire (700-300 BCE)\",\n    ),\n    'arg': ISOCodeData('', 'an', 'Aragonese', 'aragonais'),\n    'arm': ISOCodeData('hye', 'hy', 'Armenian', 'arménien'),\n    'arn': ISOCodeData(\n        '',\n        '',\n        'Mapudungun; Mapuche',\n        'mapudungun; mapuche; mapuce',\n    ),\n    'arp': ISOCodeData('', '', 'Arapaho', 'arapaho'),\n    'art': ISOCodeData(\n        '',\n        '',\n        'Artificial languages',\n        'artificielles, langues',\n    ),\n    'arw': ISOCodeData('', '', 'Arawak', 'arawak'),\n    'asm': ISOCodeData('', 'as', 'Assamese', 'assamais'),\n    'ast': ISOCodeData(\n        '',\n        '',\n        'Asturian; Bable; Leonese; Asturleonese',\n        'asturien; bable; léonais; asturoléonais',\n    ),\n    'ath': ISOCodeData(\n        '',\n        '',\n        'Athapascan languages',\n        'athapascanes, langues',\n    ),\n    'aus': ISOCodeData(\n        '',\n        '',\n        'Australian languages',\n        'australiennes, langues',\n    ),\n    'ava': ISOCodeData('', 'av', 'Avaric', 'avar'),\n    'ave': ISOCodeData('', 'ae', 'Avestan', 'avestique'),\n    'awa': ISOCodeData('', '', 'Awadhi', 'awadhi'),\n    'aym': ISOCodeData('', 'ay', 'Aymara', 'aymara'),\n    'aze': ISOCodeData('', 'az', 'Azerbaijani', 'azéri'),\n    'bad': ISOCodeData('', '', 'Banda languages', 'banda, langues'),\n    'bai': ISOCodeData('', '', 'Bamileke languages', 'bamiléké, langues'),\n    'bak': ISOCodeData('', 'ba', 'Bashkir', 'bachkir'),\n    'bal': ISOCodeData('', '', 'Baluchi', 'baloutchi'),\n    'bam': ISOCodeData('', 'bm', 'Bambara', 'bambara'),\n    'ban': ISOCodeData('', '', 'Balinese', 'balinais'),\n    'baq': ISOCodeData('eus', 'eu', 'Basque', 'basque'),\n    'bas': ISOCodeData('', '', 'Basa', 'basa'),\n    'bat': ISOCodeData('', '', 'Baltic languages', 'baltes, langues'),\n    'bej': ISOCodeData('', '', 'Beja; Bedawiyet', 'bedja'),\n    'bel': ISOCodeData('', 'be', 'Belarusian', 'biélorusse'),\n    'bem': 
ISOCodeData('', '', 'Bemba', 'bemba'),\n    'ben': ISOCodeData('', 'bn', 'Bengali', 'bengali'),\n    'ber': ISOCodeData('', '', 'Berber languages', 'berbères, langues'),\n    'bho': ISOCodeData('', '', 'Bhojpuri', 'bhojpuri'),\n    'bih': ISOCodeData('', 'bh', 'Bihari languages', 'langues biharis'),\n    'bik': ISOCodeData('', '', 'Bikol', 'bikol'),\n    'bin': ISOCodeData('', '', 'Bini; Edo', 'bini; edo'),\n    'bis': ISOCodeData('', 'bi', 'Bislama', 'bichlamar'),\n    'bla': ISOCodeData('', '', 'Siksika', 'blackfoot'),\n    'bnt': ISOCodeData('', '', 'Bantu languages', 'bantou, langues'),\n    'bos': ISOCodeData('', 'bs', 'Bosnian', 'bosniaque'),\n    'bra': ISOCodeData('', '', 'Braj', 'braj'),\n    'bre': ISOCodeData('', 'br', 'Breton', 'breton'),\n    'btk': ISOCodeData('', '', 'Batak languages', 'batak, langues'),\n    'bua': ISOCodeData('', '', 'Buriat', 'bouriate'),\n    'bug': ISOCodeData('', '', 'Buginese', 'bugi'),\n    'bul': ISOCodeData('', 'bg', 'Bulgarian', 'bulgare'),\n    'bur': ISOCodeData('mya', 'my', 'Burmese', 'birman'),\n    'byn': ISOCodeData('', '', 'Blin; Bilin', 'blin; bilen'),\n    'cad': ISOCodeData('', '', 'Caddo', 'caddo'),\n    'cai': ISOCodeData(\n        '',\n        '',\n        'Central American Indian languages',\n        \"amérindiennes de L'Amérique centrale, langues\",\n    ),\n    'car': ISOCodeData('', '', 'Galibi Carib', 'karib; galibi; carib'),\n    'cat': ISOCodeData('', 'ca', 'Catalan; Valencian', 'catalan; valencien'),\n    'cau': ISOCodeData(\n        '',\n        '',\n        'Caucasian languages',\n        'caucasiennes, langues',\n    ),\n    'ceb': ISOCodeData('', '', 'Cebuano', 'cebuano'),\n    'cel': ISOCodeData(\n        '',\n        '',\n        'Celtic languages',\n        'celtiques, langues; celtes, langues',\n    ),\n    'cha': ISOCodeData('', 'ch', 'Chamorro', 'chamorro'),\n    'chb': ISOCodeData('', '', 'Chibcha', 'chibcha'),\n    'che': ISOCodeData('', 'ce', 'Chechen', 'tchétchène'),\n    'chg': 
ISOCodeData('', '', 'Chagatai', 'djaghataï'),\n    'chi': ISOCodeData('zho', 'zh', 'Chinese', 'chinois'),\n    'chk': ISOCodeData('', '', 'Chuukese', 'chuuk'),\n    'chm': ISOCodeData('', '', 'Mari', 'mari'),\n    'chn': ISOCodeData('', '', 'Chinook jargon', 'chinook, jargon'),\n    'cho': ISOCodeData('', '', 'Choctaw', 'choctaw'),\n    'chp': ISOCodeData('', '', 'Chipewyan; Dene Suline', 'chipewyan'),\n    'chr': ISOCodeData('', '', 'Cherokee', 'cherokee'),\n    'chu': ISOCodeData(\n        '',\n        'cu',\n        (\n            'Church Slavic; Old Slavonic; Church Slavonic;'\n            ' Old Bulgarian; Old Church Slavonic'\n        ),\n        \"slavon d'église; vieux slave; slavon liturgique; vieux bulgare\",\n    ),\n    'chv': ISOCodeData('', 'cv', 'Chuvash', 'tchouvache'),\n    'chy': ISOCodeData('', '', 'Cheyenne', 'cheyenne'),\n    'cmc': ISOCodeData('', '', 'Chamic languages', 'chames, langues'),\n    'cnr': ISOCodeData('', '', 'Montenegrin', 'monténégrin'),\n    'cop': ISOCodeData('', '', 'Coptic', 'copte'),\n    'cor': ISOCodeData('', 'kw', 'Cornish', 'cornique'),\n    'cos': ISOCodeData('', 'co', 'Corsican', 'corse'),\n    'cpe': ISOCodeData(\n        '',\n        '',\n        'Creoles and pidgins, English based',\n        \"créoles et pidgins basés sur l'anglais\",\n    ),\n    'cpf': ISOCodeData(\n        '',\n        '',\n        'Creoles and pidgins, French-based',\n        'créoles et pidgins basés sur le français',\n    ),\n    'cpp': ISOCodeData(\n        '',\n        '',\n        'Creoles and pidgins, Portuguese-based',\n        'créoles et pidgins basés sur le portugais',\n    ),\n    'cre': ISOCodeData('', 'cr', 'Cree', 'cree'),\n    'crh': ISOCodeData(\n        '',\n        '',\n        'Crimean Tatar; Crimean Turkish',\n        'tatar de Crimé',\n    ),\n    'crp': ISOCodeData('', '', 'Creoles and pidgins', 'créoles et pidgins'),\n    'csb': ISOCodeData('', '', 'Kashubian', 'kachoube'),\n    'cus': ISOCodeData('', '', 'Cushitic 
languages', 'couchitiques, langues'),\n    'cze': ISOCodeData('ces', 'cs', 'Czech', 'tchèque'),\n    'dak': ISOCodeData('', '', 'Dakota', 'dakota'),\n    'dan': ISOCodeData('', 'da', 'Danish', 'danois'),\n    'dar': ISOCodeData('', '', 'Dargwa', 'dargwa'),\n    'day': ISOCodeData('', '', 'Land Dayak languages', 'dayak, langues'),\n    'del': ISOCodeData('', '', 'Delaware', 'delaware'),\n    'den': ISOCodeData('', '', 'Slave (Athapascan)', 'esclave (athapascan)'),\n    'dgr': ISOCodeData('', '', 'Dogrib', 'dogrib'),\n    'din': ISOCodeData('', '', 'Dinka', 'dinka'),\n    'div': ISOCodeData('', 'dv', 'Divehi; Dhivehi; Maldivian', 'maldivien'),\n    'doi': ISOCodeData('', '', 'Dogri', 'dogri'),\n    'dra': ISOCodeData(\n        '',\n        '',\n        'Dravidian languages',\n        'dravidiennes, langues',\n    ),\n    'dsb': ISOCodeData('', '', 'Lower Sorbian', 'bas-sorabe'),\n    'dua': ISOCodeData('', '', 'Duala', 'douala'),\n    'dum': ISOCodeData(\n        '',\n        '',\n        'Dutch, Middle (ca.1050-1350)',\n        'néerlandais moyen (ca. 
1050-1350)',\n    ),\n    'dut': ISOCodeData('nld', 'nl', 'Dutch; Flemish', 'néerlandais; flamand'),\n    'dyu': ISOCodeData('', '', 'Dyula', 'dioula'),\n    'dzo': ISOCodeData('', 'dz', 'Dzongkha', 'dzongkha'),\n    'efi': ISOCodeData('', '', 'Efik', 'efik'),\n    'egy': ISOCodeData('', '', 'Egyptian (Ancient)', 'égyptien'),\n    'eka': ISOCodeData('', '', 'Ekajuk', 'ekajuk'),\n    'elx': ISOCodeData('', '', 'Elamite', 'élamite'),\n    'eng': ISOCodeData('', 'en', 'English', 'anglais'),\n    'enm': ISOCodeData(\n        '',\n        '',\n        'English, Middle (1100-1500)',\n        'anglais moyen (1100-1500)',\n    ),\n    'epo': ISOCodeData('', 'eo', 'Esperanto', 'espéranto'),\n    'est': ISOCodeData('', 'et', 'Estonian', 'estonien'),\n    'ewe': ISOCodeData('', 'ee', 'Ewe', 'éwé'),\n    'ewo': ISOCodeData('', '', 'Ewondo', 'éwondo'),\n    'fan': ISOCodeData('', '', 'Fang', 'fang'),\n    'fao': ISOCodeData('', 'fo', 'Faroese', 'féroïen'),\n    'fat': ISOCodeData('', '', 'Fanti', 'fanti'),\n    'fij': ISOCodeData('', 'fj', 'Fijian', 'fidjien'),\n    'fil': ISOCodeData('', '', 'Filipino; Pilipino', 'filipino; pilipino'),\n    'fin': ISOCodeData('', 'fi', 'Finnish', 'finnois'),\n    'fiu': ISOCodeData(\n        '',\n        '',\n        'Finno-Ugrian languages',\n        'finno-ougriennes, langues',\n    ),\n    'fon': ISOCodeData('', '', 'Fon', 'fon'),\n    'fre': ISOCodeData('fra', 'fr', 'French', 'français'),\n    'frm': ISOCodeData(\n        '',\n        '',\n        'French, Middle (ca.1400-1600)',\n        'français moyen (1400-1600)',\n    ),\n    'fro': ISOCodeData(\n        '',\n        '',\n        'French, Old (842-ca.1400)',\n        'français ancien (842-ca.1400)',\n    ),\n    'frr': ISOCodeData('', '', 'Northern Frisian', 'frison septentrional'),\n    'frs': ISOCodeData('', '', 'Eastern Frisian', 'frison oriental'),\n    'fry': ISOCodeData('', 'fy', 'Western Frisian', 'frison occidental'),\n    'ful': ISOCodeData('', 'ff', 'Fulah', 'peul'),\n    
'fur': ISOCodeData('', '', 'Friulian', 'frioulan'),\n    'gaa': ISOCodeData('', '', 'Ga', 'ga'),\n    'gay': ISOCodeData('', '', 'Gayo', 'gayo'),\n    'gba': ISOCodeData('', '', 'Gbaya', 'gbaya'),\n    'gem': ISOCodeData('', '', 'Germanic languages', 'germaniques, langues'),\n    'geo': ISOCodeData('kat', 'ka', 'Georgian', 'géorgien'),\n    'ger': ISOCodeData('deu', 'de', 'German', 'allemand'),\n    'gez': ISOCodeData('', '', 'Geez', 'guèze'),\n    'gil': ISOCodeData('', '', 'Gilbertese', 'kiribati'),\n    'gla': ISOCodeData(\n        '',\n        'gd',\n        'Gaelic; Scottish Gaelic',\n        'gaélique; gaélique écossais',\n    ),\n    'gle': ISOCodeData('', 'ga', 'Irish', 'irlandais'),\n    'glg': ISOCodeData('', 'gl', 'Galician', 'galicien'),\n    'glv': ISOCodeData('', 'gv', 'Manx', 'manx; mannois'),\n    'gmh': ISOCodeData(\n        '',\n        '',\n        'German, Middle High (ca.1050-1500)',\n        'allemand, moyen haut (ca. 1050-1500)',\n    ),\n    'goh': ISOCodeData(\n        '',\n        '',\n        'German, Old High (ca.750-1050)',\n        'allemand, vieux haut (ca. 
750-1050)',\n    ),\n    'gon': ISOCodeData('', '', 'Gondi', 'gond'),\n    'gor': ISOCodeData('', '', 'Gorontalo', 'gorontalo'),\n    'got': ISOCodeData('', '', 'Gothic', 'gothique'),\n    'grb': ISOCodeData('', '', 'Grebo', 'grebo'),\n    'grc': ISOCodeData(\n        '',\n        '',\n        'Greek, Ancient (to 1453)',\n        \"grec ancien (jusqu'à 1453)\",\n    ),\n    'gre': ISOCodeData(\n        'ell',\n        'el',\n        'Greek, Modern (1453-)',\n        'grec moderne (après 1453)',\n    ),\n    'grn': ISOCodeData('', 'gn', 'Guarani', 'guarani'),\n    'gsw': ISOCodeData(\n        '',\n        '',\n        'Swiss German; Alemannic; Alsatian',\n        'suisse alémanique; alémanique; alsacien',\n    ),\n    'guj': ISOCodeData('', 'gu', 'Gujarati', 'goudjrati'),\n    'gwi': ISOCodeData('', '', \"Gwich'in\", \"gwich'in\"),\n    'hai': ISOCodeData('', '', 'Haida', 'haida'),\n    'hat': ISOCodeData(\n        '',\n        'ht',\n        'Haitian; Haitian Creole',\n        'haïtien; créole haïtien',\n    ),\n    'hau': ISOCodeData('', 'ha', 'Hausa', 'haoussa'),\n    'haw': ISOCodeData('', '', 'Hawaiian', 'hawaïen'),\n    'heb': ISOCodeData('', 'he', 'Hebrew', 'hébreu'),\n    'her': ISOCodeData('', 'hz', 'Herero', 'herero'),\n    'hil': ISOCodeData('', '', 'Hiligaynon', 'hiligaynon'),\n    'him': ISOCodeData(\n        '',\n        '',\n        'Himachali languages; Western Pahari languages',\n        'langues himachalis; langues paharis occidentales',\n    ),\n    'hin': ISOCodeData('', 'hi', 'Hindi', 'hindi'),\n    'hit': ISOCodeData('', '', 'Hittite', 'hittite'),\n    'hmn': ISOCodeData('', '', 'Hmong; Mong', 'hmong'),\n    'hmo': ISOCodeData('', 'ho', 'Hiri Motu', 'hiri motu'),\n    'hrv': ISOCodeData('', 'hr', 'Croatian', 'croate'),\n    'hsb': ISOCodeData('', '', 'Upper Sorbian', 'haut-sorabe'),\n    'hun': ISOCodeData('', 'hu', 'Hungarian', 'hongrois'),\n    'hup': ISOCodeData('', '', 'Hupa', 'hupa'),\n    'iba': ISOCodeData('', '', 'Iban', 'iban'),\n    
'ibo': ISOCodeData('', 'ig', 'Igbo', 'igbo'),\n    'ice': ISOCodeData('isl', 'is', 'Icelandic', 'islandais'),\n    'ido': ISOCodeData('', 'io', 'Ido', 'ido'),\n    'iii': ISOCodeData('', 'ii', 'Sichuan Yi; Nuosu', 'yi de Sichuan'),\n    'ijo': ISOCodeData('', '', 'Ijo languages', 'ijo, langues'),\n    'iku': ISOCodeData('', 'iu', 'Inuktitut', 'inuktitut'),\n    'ile': ISOCodeData('', 'ie', 'Interlingue; Occidental', 'interlingue'),\n    'ilo': ISOCodeData('', '', 'Iloko', 'ilocano'),\n    'ina': ISOCodeData(\n        '',\n        'ia',\n        'Interlingua (International Auxiliary Language Association)',\n        'interlingua (langue auxiliaire internationale)',\n    ),\n    'inc': ISOCodeData('', '', 'Indic languages', 'indo-aryennes, langues'),\n    'ind': ISOCodeData('', 'id', 'Indonesian', 'indonésien'),\n    'ine': ISOCodeData(\n        '',\n        '',\n        'Indo-European languages',\n        'indo-européennes, langues',\n    ),\n    'inh': ISOCodeData('', '', 'Ingush', 'ingouche'),\n    'ipk': ISOCodeData('', 'ik', 'Inupiaq', 'inupiaq'),\n    'ira': ISOCodeData('', '', 'Iranian languages', 'iraniennes, langues'),\n    'iro': ISOCodeData('', '', 'Iroquoian languages', 'iroquoises, langues'),\n    'ita': ISOCodeData('', 'it', 'Italian', 'italien'),\n    'jav': ISOCodeData('', 'jv', 'Javanese', 'javanais'),\n    'jbo': ISOCodeData('', '', 'Lojban', 'lojban'),\n    'jpn': ISOCodeData('', 'ja', 'Japanese', 'japonais'),\n    'jpr': ISOCodeData('', '', 'Judeo-Persian', 'judéo-persan'),\n    'jrb': ISOCodeData('', '', 'Judeo-Arabic', 'judéo-arabe'),\n    'kaa': ISOCodeData('', '', 'Kara-Kalpak', 'karakalpak'),\n    'kab': ISOCodeData('', '', 'Kabyle', 'kabyle'),\n    'kac': ISOCodeData('', '', 'Kachin; Jingpho', 'kachin; jingpho'),\n    'kal': ISOCodeData('', 'kl', 'Kalaallisut; Greenlandic', 'groenlandais'),\n    'kam': ISOCodeData('', '', 'Kamba', 'kamba'),\n    'kan': ISOCodeData('', 'kn', 'Kannada', 'kannada'),\n    'kar': ISOCodeData('', '', 'Karen 
languages', 'karen, langues'),\n    'kas': ISOCodeData('', 'ks', 'Kashmiri', 'kashmiri'),\n    'kau': ISOCodeData('', 'kr', 'Kanuri', 'kanouri'),\n    'kaw': ISOCodeData('', '', 'Kawi', 'kawi'),\n    'kaz': ISOCodeData('', 'kk', 'Kazakh', 'kazakh'),\n    'kbd': ISOCodeData('', '', 'Kabardian', 'kabardien'),\n    'kha': ISOCodeData('', '', 'Khasi', 'khasi'),\n    'khi': ISOCodeData('', '', 'Khoisan languages', 'khoïsan, langues'),\n    'khm': ISOCodeData('', 'km', 'Central Khmer', 'khmer central'),\n    'kho': ISOCodeData('', '', 'Khotanese; Sakan', 'khotanais; sakan'),\n    'kik': ISOCodeData('', 'ki', 'Kikuyu; Gikuyu', 'kikuyu'),\n    'kin': ISOCodeData('', 'rw', 'Kinyarwanda', 'rwanda'),\n    'kir': ISOCodeData('', 'ky', 'Kirghiz; Kyrgyz', 'kirghiz'),\n    'kmb': ISOCodeData('', '', 'Kimbundu', 'kimbundu'),\n    'kok': ISOCodeData('', '', 'Konkani', 'konkani'),\n    'kom': ISOCodeData('', 'kv', 'Komi', 'kom'),\n    'kon': ISOCodeData('', 'kg', 'Kongo', 'kongo'),\n    'kor': ISOCodeData('', 'ko', 'Korean', 'coréen'),\n    'kos': ISOCodeData('', '', 'Kosraean', 'kosrae'),\n    'kpe': ISOCodeData('', '', 'Kpelle', 'kpellé'),\n    'krc': ISOCodeData('', '', 'Karachay-Balkar', 'karatchai balkar'),\n    'krl': ISOCodeData('', '', 'Karelian', 'carélien'),\n    'kro': ISOCodeData('', '', 'Kru languages', 'krou, langues'),\n    'kru': ISOCodeData('', '', 'Kurukh', 'kurukh'),\n    'kua': ISOCodeData('', 'kj', 'Kuanyama; Kwanyama', 'kuanyama; kwanyama'),\n    'kum': ISOCodeData('', '', 'Kumyk', 'koumyk'),\n    'kur': ISOCodeData('', 'ku', 'Kurdish', 'kurde'),\n    'kut': ISOCodeData('', '', 'Kutenai', 'kutenai'),\n    'lad': ISOCodeData('', '', 'Ladino', 'judéo-espagnol'),\n    'lah': ISOCodeData('', '', 'Lahnda', 'lahnda'),\n    'lam': ISOCodeData('', '', 'Lamba', 'lamba'),\n    'lao': ISOCodeData('', 'lo', 'Lao', 'lao'),\n    'lat': ISOCodeData('', 'la', 'Latin', 'latin'),\n    'lav': ISOCodeData('', 'lv', 'Latvian', 'letton'),\n    'lez': ISOCodeData('', '', 'Lezghian', 
'lezghien'),\n    'lim': ISOCodeData(\n        '',\n        'li',\n        'Limburgan; Limburger; Limburgish',\n        'limbourgeois',\n    ),\n    'lin': ISOCodeData('', 'ln', 'Lingala', 'lingala'),\n    'lit': ISOCodeData('', 'lt', 'Lithuanian', 'lituanien'),\n    'lol': ISOCodeData('', '', 'Mongo', 'mongo'),\n    'loz': ISOCodeData('', '', 'Lozi', 'lozi'),\n    'ltz': ISOCodeData(\n        '',\n        'lb',\n        'Luxembourgish; Letzeburgesch',\n        'luxembourgeois',\n    ),\n    'lua': ISOCodeData('', '', 'Luba-Lulua', 'luba-lulua'),\n    'lub': ISOCodeData('', 'lu', 'Luba-Katanga', 'luba-katanga'),\n    'lug': ISOCodeData('', 'lg', 'Ganda', 'ganda'),\n    'lui': ISOCodeData('', '', 'Luiseno', 'luiseno'),\n    'lun': ISOCodeData('', '', 'Lunda', 'lunda'),\n    'luo': ISOCodeData(\n        '',\n        '',\n        'Luo (Kenya and Tanzania)',\n        'luo (Kenya et Tanzanie)',\n    ),\n    'lus': ISOCodeData('', '', 'Lushai', 'lushai'),\n    'mac': ISOCodeData('mkd', 'mk', 'Macedonian', 'macédonien'),\n    'mad': ISOCodeData('', '', 'Madurese', 'madourais'),\n    'mag': ISOCodeData('', '', 'Magahi', 'magahi'),\n    'mah': ISOCodeData('', 'mh', 'Marshallese', 'marshall'),\n    'mai': ISOCodeData('', '', 'Maithili', 'maithili'),\n    'mak': ISOCodeData('', '', 'Makasar', 'makassar'),\n    'mal': ISOCodeData('', 'ml', 'Malayalam', 'malayalam'),\n    'man': ISOCodeData('', '', 'Mandingo', 'mandingue'),\n    'mao': ISOCodeData('mri', 'mi', 'Maori', 'maori'),\n    'map': ISOCodeData(\n        '',\n        '',\n        'Austronesian languages',\n        'austronésiennes, langues',\n    ),\n    'mar': ISOCodeData('', 'mr', 'Marathi', 'marathe'),\n    'mas': ISOCodeData('', '', 'Masai', 'massaï'),\n    'may': ISOCodeData('msa', 'ms', 'Malay', 'malais'),\n    'mdf': ISOCodeData('', '', 'Moksha', 'moksa'),\n    'mdr': ISOCodeData('', '', 'Mandar', 'mandar'),\n    'men': ISOCodeData('', '', 'Mende', 'mendé'),\n    'mga': ISOCodeData(\n        '',\n        '',\n    
    'Irish, Middle (900-1200)',\n        'irlandais moyen (900-1200)',\n    ),\n    'mic': ISOCodeData('', '', \"Mi'kmaq; Micmac\", \"mi'kmaq; micmac\"),\n    'min': ISOCodeData('', '', 'Minangkabau', 'minangkabau'),\n    'mis': ISOCodeData('', '', 'Uncoded languages', 'langues non codées'),\n    'mkh': ISOCodeData('', '', 'Mon-Khmer languages', 'môn-khmer, langues'),\n    'mlg': ISOCodeData('', 'mg', 'Malagasy', 'malgache'),\n    'mlt': ISOCodeData('', 'mt', 'Maltese', 'maltais'),\n    'mnc': ISOCodeData('', '', 'Manchu', 'mandchou'),\n    'mni': ISOCodeData('', '', 'Manipuri', 'manipuri'),\n    'mno': ISOCodeData('', '', 'Manobo languages', 'manobo, langues'),\n    'moh': ISOCodeData('', '', 'Mohawk', 'mohawk'),\n    'mon': ISOCodeData('', 'mn', 'Mongolian', 'mongol'),\n    'mos': ISOCodeData('', '', 'Mossi', 'moré'),\n    'mul': ISOCodeData('', '', 'Multiple languages', 'multilingue'),\n    'mun': ISOCodeData('', '', 'Munda languages', 'mounda, langues'),\n    'mus': ISOCodeData('', '', 'Creek', 'muskogee'),\n    'mwl': ISOCodeData('', '', 'Mirandese', 'mirandais'),\n    'mwr': ISOCodeData('', '', 'Marwari', 'marvari'),\n    'myn': ISOCodeData('', '', 'Mayan languages', 'maya, langues'),\n    'myv': ISOCodeData('', '', 'Erzya', 'erza'),\n    'nah': ISOCodeData('', '', 'Nahuatl languages', 'nahuatl, langues'),\n    'nai': ISOCodeData(\n        '',\n        '',\n        'North American Indian languages',\n        'nord-amérindiennes, langues',\n    ),\n    'nap': ISOCodeData('', '', 'Neapolitan', 'napolitain'),\n    'nau': ISOCodeData('', 'na', 'Nauru', 'nauruan'),\n    'nav': ISOCodeData('', 'nv', 'Navajo; Navaho', 'navaho'),\n    'nbl': ISOCodeData(\n        '',\n        'nr',\n        'Ndebele, South; South Ndebele',\n        'ndébélé du Sud',\n    ),\n    'nde': ISOCodeData(\n        '',\n        'nd',\n        'Ndebele, North; North Ndebele',\n        'ndébélé du Nord',\n    ),\n    'ndo': ISOCodeData('', 'ng', 'Ndonga', 'ndonga'),\n    'nds': ISOCodeData(\n  
      '',\n        '',\n        'Low German; Low Saxon; German, Low; Saxon, Low',\n        'bas allemand; bas saxon; allemand, bas; saxon, bas',\n    ),\n    'nep': ISOCodeData('', 'ne', 'Nepali', 'népalais'),\n    'new': ISOCodeData('', '', 'Nepal Bhasa; Newari', 'nepal bhasa; newari'),\n    'nia': ISOCodeData('', '', 'Nias', 'nias'),\n    'nic': ISOCodeData(\n        '',\n        '',\n        'Niger-Kordofanian languages',\n        'nigéro-kordofaniennes, langues',\n    ),\n    'niu': ISOCodeData('', '', 'Niuean', 'niué'),\n    'nno': ISOCodeData(\n        '',\n        'nn',\n        'Norwegian Nynorsk; Nynorsk, Norwegian',\n        'norvégien nynorsk; nynorsk, norvégien',\n    ),\n    'nob': ISOCodeData(\n        '',\n        'nb',\n        'Bokmål, Norwegian; Norwegian Bokmål',\n        'norvégien bokmål',\n    ),\n    'nog': ISOCodeData('', '', 'Nogai', 'nogaï; nogay'),\n    'non': ISOCodeData('', '', 'Norse, Old', 'norrois, vieux'),\n    'nor': ISOCodeData('', 'no', 'Norwegian', 'norvégien'),\n    'nqo': ISOCodeData('', '', \"N'Ko\", \"n'ko\"),\n    'nso': ISOCodeData(\n        '',\n        '',\n        'Pedi; Sepedi; Northern Sotho',\n        'pedi; sepedi; sotho du Nord',\n    ),\n    'nub': ISOCodeData('', '', 'Nubian languages', 'nubiennes, langues'),\n    'nwc': ISOCodeData(\n        '',\n        '',\n        'Classical Newari; Old Newari; Classical Nepal Bhasa',\n        'newari classique',\n    ),\n    'nya': ISOCodeData(\n        '',\n        'ny',\n        'Chichewa; Chewa; Nyanja',\n        'chichewa; chewa; nyanja',\n    ),\n    'nym': ISOCodeData('', '', 'Nyamwezi', 'nyamwezi'),\n    'nyn': ISOCodeData('', '', 'Nyankole', 'nyankolé'),\n    'nyo': ISOCodeData('', '', 'Nyoro', 'nyoro'),\n    'nzi': ISOCodeData('', '', 'Nzima', 'nzema'),\n    'oci': ISOCodeData(\n        '',\n        'oc',\n        'Occitan (post 1500)',\n        'occitan (après 1500)',\n    ),\n    'oji': ISOCodeData('', 'oj', 'Ojibwa', 'ojibwa'),\n    'ori': ISOCodeData('', 'or', 
'Oriya', 'oriya'),\n    'orm': ISOCodeData('', 'om', 'Oromo', 'galla'),\n    'osa': ISOCodeData('', '', 'Osage', 'osage'),\n    'oss': ISOCodeData('', 'os', 'Ossetian; Ossetic', 'ossète'),\n    'ota': ISOCodeData(\n        '',\n        '',\n        'Turkish, Ottoman (1500-1928)',\n        'turc ottoman (1500-1928)',\n    ),\n    'oto': ISOCodeData('', '', 'Otomian languages', 'otomi, langues'),\n    'paa': ISOCodeData('', '', 'Papuan languages', 'papoues, langues'),\n    'pag': ISOCodeData('', '', 'Pangasinan', 'pangasinan'),\n    'pal': ISOCodeData('', '', 'Pahlavi', 'pahlavi'),\n    'pam': ISOCodeData('', '', 'Pampanga; Kapampangan', 'pampangan'),\n    'pan': ISOCodeData('', 'pa', 'Panjabi; Punjabi', 'pendjabi'),\n    'pap': ISOCodeData('', '', 'Papiamento', 'papiamento'),\n    'pau': ISOCodeData('', '', 'Palauan', 'palau'),\n    'peo': ISOCodeData(\n        '',\n        '',\n        'Persian, Old (ca.600-400 B.C.)',\n        'perse, vieux (ca. 600-400 av. J.-C.)',\n    ),\n    'per': ISOCodeData('fas', 'fa', 'Persian', 'persan'),\n    'phi': ISOCodeData(\n        '',\n        '',\n        'Philippine languages',\n        'philippines, langues',\n    ),\n    'phn': ISOCodeData('', '', 'Phoenician', 'phénicien'),\n    'pli': ISOCodeData('', 'pi', 'Pali', 'pali'),\n    'pol': ISOCodeData('', 'pl', 'Polish', 'polonais'),\n    'pon': ISOCodeData('', '', 'Pohnpeian', 'pohnpei'),\n    'por': ISOCodeData('', 'pt', 'Portuguese', 'portugais'),\n    'pra': ISOCodeData('', '', 'Prakrit languages', 'prâkrit, langues'),\n    'pro': ISOCodeData(\n        '',\n        '',\n        'Provençal, Old (to 1500); Occitan, Old (to 1500)',\n        \"provençal ancien (jusqu'à 1500); occitan ancien (jusqu'à 1500)\",\n    ),\n    'pus': ISOCodeData('', 'ps', 'Pushto; Pashto', 'pachto'),\n    'qaa': ISOCodeData(\n        '',\n        '',\n        'Reserved for local use',\n        \"réservée à l'usage local\",\n    ),\n    'que': ISOCodeData('', 'qu', 'Quechua', 'quechua'),\n    'raj': 
ISOCodeData('', '', 'Rajasthani', 'rajasthani'),\n    'rap': ISOCodeData('', '', 'Rapanui', 'rapanui'),\n    'rar': ISOCodeData(\n        '',\n        '',\n        'Rarotongan; Cook Islands Maori',\n        'rarotonga; maori des îles Cook',\n    ),\n    'roa': ISOCodeData('', '', 'Romance languages', 'romanes, langues'),\n    'roh': ISOCodeData('', 'rm', 'Romansh', 'romanche'),\n    'rom': ISOCodeData('', '', 'Romany', 'tsigane'),\n    'rum': ISOCodeData(\n        'ron',\n        'ro',\n        'Romanian; Moldavian; Moldovan',\n        'roumain; moldave',\n    ),\n    'run': ISOCodeData('', 'rn', 'Rundi', 'rundi'),\n    'rup': ISOCodeData(\n        '',\n        '',\n        'Aromanian; Arumanian; Macedo-Romanian',\n        'aroumain; macédo-roumain',\n    ),\n    'rus': ISOCodeData('', 'ru', 'Russian', 'russe'),\n    'sad': ISOCodeData('', '', 'Sandawe', 'sandawe'),\n    'sag': ISOCodeData('', 'sg', 'Sango', 'sango'),\n    'sah': ISOCodeData('', '', 'Yakut', 'iakoute'),\n    'sai': ISOCodeData(\n        '',\n        '',\n        'South American Indian languages',\n        'sud-amérindiennes, langues',\n    ),\n    'sal': ISOCodeData('', '', 'Salishan languages', 'salishennes, langues'),\n    'sam': ISOCodeData('', '', 'Samaritan Aramaic', 'samaritain'),\n    'san': ISOCodeData('', 'sa', 'Sanskrit', 'sanskrit'),\n    'sas': ISOCodeData('', '', 'Sasak', 'sasak'),\n    'sat': ISOCodeData('', '', 'Santali', 'santal'),\n    'scn': ISOCodeData('', '', 'Sicilian', 'sicilien'),\n    'sco': ISOCodeData('', '', 'Scots', 'écossais'),\n    'sel': ISOCodeData('', '', 'Selkup', 'selkoupe'),\n    'sem': ISOCodeData('', '', 'Semitic languages', 'sémitiques, langues'),\n    'sga': ISOCodeData(\n        '',\n        '',\n        'Irish, Old (to 900)',\n        \"irlandais ancien (jusqu'à 900)\",\n    ),\n    'sgn': ISOCodeData('', '', 'Sign Languages', 'langues des signes'),\n    'shn': ISOCodeData('', '', 'Shan', 'chan'),\n    'sid': ISOCodeData('', '', 'Sidamo', 'sidamo'),\n    
'sin': ISOCodeData('', 'si', 'Sinhala; Sinhalese', 'singhalais'),\n    'sio': ISOCodeData('', '', 'Siouan languages', 'sioux, langues'),\n    'sit': ISOCodeData(\n        '',\n        '',\n        'Sino-Tibetan languages',\n        'sino-tibétaines, langues',\n    ),\n    'sla': ISOCodeData('', '', 'Slavic languages', 'slaves, langues'),\n    'slo': ISOCodeData('slk', 'sk', 'Slovak', 'slovaque'),\n    'slv': ISOCodeData('', 'sl', 'Slovenian', 'slovène'),\n    'sma': ISOCodeData('', '', 'Southern Sami', 'sami du Sud'),\n    'sme': ISOCodeData('', 'se', 'Northern Sami', 'sami du Nord'),\n    'smi': ISOCodeData('', '', 'Sami languages', 'sames, langues'),\n    'smj': ISOCodeData('', '', 'Lule Sami', 'sami de Lule'),\n    'smn': ISOCodeData('', '', 'Inari Sami', \"sami d'Inari\"),\n    'smo': ISOCodeData('', 'sm', 'Samoan', 'samoan'),\n    'sms': ISOCodeData('', '', 'Skolt Sami', 'sami skolt'),\n    'sna': ISOCodeData('', 'sn', 'Shona', 'shona'),\n    'snd': ISOCodeData('', 'sd', 'Sindhi', 'sindhi'),\n    'snk': ISOCodeData('', '', 'Soninke', 'soninké'),\n    'sog': ISOCodeData('', '', 'Sogdian', 'sogdien'),\n    'som': ISOCodeData('', 'so', 'Somali', 'somali'),\n    'son': ISOCodeData('', '', 'Songhai languages', 'songhai, langues'),\n    'sot': ISOCodeData('', 'st', 'Sotho, Southern', 'sotho du Sud'),\n    'spa': ISOCodeData('', 'es', 'Spanish; Castilian', 'espagnol; castillan'),\n    'srd': ISOCodeData('', 'sc', 'Sardinian', 'sarde'),\n    'srn': ISOCodeData('', '', 'Sranan Tongo', 'sranan tongo'),\n    'srp': ISOCodeData('', 'sr', 'Serbian', 'serbe'),\n    'srr': ISOCodeData('', '', 'Serer', 'sérère'),\n    'ssa': ISOCodeData(\n        '',\n        '',\n        'Nilo-Saharan languages',\n        'nilo-sahariennes, langues',\n    ),\n    'ssw': ISOCodeData('', 'ss', 'Swati', 'swati'),\n    'suk': ISOCodeData('', '', 'Sukuma', 'sukuma'),\n    'sun': ISOCodeData('', 'su', 'Sundanese', 'soundanais'),\n    'sus': ISOCodeData('', '', 'Susu', 'soussou'),\n    'sux': 
ISOCodeData('', '', 'Sumerian', 'sumérien'),\n    'swa': ISOCodeData('', 'sw', 'Swahili', 'swahili'),\n    'swe': ISOCodeData('', 'sv', 'Swedish', 'suédois'),\n    'syc': ISOCodeData('', '', 'Classical Syriac', 'syriaque classique'),\n    'syr': ISOCodeData('', '', 'Syriac', 'syriaque'),\n    'tah': ISOCodeData('', 'ty', 'Tahitian', 'tahitien'),\n    'tai': ISOCodeData('', '', 'Tai languages', 'tai, langues'),\n    'tam': ISOCodeData('', 'ta', 'Tamil', 'tamoul'),\n    'tat': ISOCodeData('', 'tt', 'Tatar', 'tatar'),\n    'tel': ISOCodeData('', 'te', 'Telugu', 'télougou'),\n    'tem': ISOCodeData('', '', 'Timne', 'temne'),\n    'ter': ISOCodeData('', '', 'Tereno', 'tereno'),\n    'tet': ISOCodeData('', '', 'Tetum', 'tetum'),\n    'tgk': ISOCodeData('', 'tg', 'Tajik', 'tadjik'),\n    'tgl': ISOCodeData('', 'tl', 'Tagalog', 'tagalog'),\n    'tha': ISOCodeData('', 'th', 'Thai', 'thaï'),\n    'tib': ISOCodeData('bod', 'bo', 'Tibetan', 'tibétain'),\n    'tig': ISOCodeData('', '', 'Tigre', 'tigré'),\n    'tir': ISOCodeData('', 'ti', 'Tigrinya', 'tigrigna'),\n    'tiv': ISOCodeData('', '', 'Tiv', 'tiv'),\n    'tkl': ISOCodeData('', '', 'Tokelau', 'tokelau'),\n    'tlh': ISOCodeData('', '', 'Klingon; tlhIngan-Hol', 'klingon'),\n    'tli': ISOCodeData('', '', 'Tlingit', 'tlingit'),\n    'tmh': ISOCodeData('', '', 'Tamashek', 'tamacheq'),\n    'tog': ISOCodeData('', '', 'Tonga (Nyasa)', 'tonga (Nyasa)'),\n    'ton': ISOCodeData(\n        '',\n        'to',\n        'Tonga (Tonga Islands)',\n        'tongan (Îles Tonga)',\n    ),\n    'tpi': ISOCodeData('', '', 'Tok Pisin', 'tok pisin'),\n    'tsi': ISOCodeData('', '', 'Tsimshian', 'tsimshian'),\n    'tsn': ISOCodeData('', 'tn', 'Tswana', 'tswana'),\n    'tso': ISOCodeData('', 'ts', 'Tsonga', 'tsonga'),\n    'tuk': ISOCodeData('', 'tk', 'Turkmen', 'turkmène'),\n    'tum': ISOCodeData('', '', 'Tumbuka', 'tumbuka'),\n    'tup': ISOCodeData('', '', 'Tupi languages', 'tupi, langues'),\n    'tur': ISOCodeData('', 'tr', 'Turkish', 
'turc'),\n    'tut': ISOCodeData('', '', 'Altaic languages', 'altaïques, langues'),\n    'tvl': ISOCodeData('', '', 'Tuvalu', 'tuvalu'),\n    'twi': ISOCodeData('', 'tw', 'Twi', 'twi'),\n    'tyv': ISOCodeData('', '', 'Tuvinian', 'touva'),\n    'udm': ISOCodeData('', '', 'Udmurt', 'oudmourte'),\n    'uga': ISOCodeData('', '', 'Ugaritic', 'ougaritique'),\n    'uig': ISOCodeData('', 'ug', 'Uighur; Uyghur', 'ouïgour'),\n    'ukr': ISOCodeData('', 'uk', 'Ukrainian', 'ukrainien'),\n    'umb': ISOCodeData('', '', 'Umbundu', 'umbundu'),\n    'und': ISOCodeData('', '', 'Undetermined', 'indéterminée'),\n    'urd': ISOCodeData('', 'ur', 'Urdu', 'ourdou'),\n    'uzb': ISOCodeData('', 'uz', 'Uzbek', 'ouszbek'),\n    'vai': ISOCodeData('', '', 'Vai', 'vaï'),\n    'ven': ISOCodeData('', 've', 'Venda', 'venda'),\n    'vie': ISOCodeData('', 'vi', 'Vietnamese', 'vietnamien'),\n    'vol': ISOCodeData('', 'vo', 'Volapük', 'volapük'),\n    'vot': ISOCodeData('', '', 'Votic', 'vote'),\n    'wak': ISOCodeData('', '', 'Wakashan languages', 'wakashanes, langues'),\n    'wal': ISOCodeData('', '', 'Wolaitta; Wolaytta', 'wolaitta; wolaytta'),\n    'war': ISOCodeData('', '', 'Waray', 'waray'),\n    'was': ISOCodeData('', '', 'Washo', 'washo'),\n    'wel': ISOCodeData('cym', 'cy', 'Welsh', 'gallois'),\n    'wen': ISOCodeData('', '', 'Sorbian languages', 'sorabes, langues'),\n    'wln': ISOCodeData('', 'wa', 'Walloon', 'wallon'),\n    'wol': ISOCodeData('', 'wo', 'Wolof', 'wolof'),\n    'xal': ISOCodeData('', '', 'Kalmyk; Oirat', 'kalmouk; oïrat'),\n    'xho': ISOCodeData('', 'xh', 'Xhosa', 'xhosa'),\n    'yao': ISOCodeData('', '', 'Yao', 'yao'),\n    'yap': ISOCodeData('', '', 'Yapese', 'yapois'),\n    'yid': ISOCodeData('', 'yi', 'Yiddish', 'yiddish'),\n    'yor': ISOCodeData('', 'yo', 'Yoruba', 'yoruba'),\n    'ypk': ISOCodeData('', '', 'Yupik languages', 'yupik, langues'),\n    'zap': ISOCodeData('', '', 'Zapotec', 'zapotèque'),\n    'zbl': ISOCodeData(\n        '',\n        '',\n        
'Blissymbols; Blissymbolics; Bliss',\n        'symboles Bliss; Bliss',\n    ),\n    'zen': ISOCodeData('', '', 'Zenaga', 'zenaga'),\n    'zgh': ISOCodeData(\n        '',\n        '',\n        'Standard Moroccan Tamazight',\n        'amazighe standard marocain',\n    ),\n    'zha': ISOCodeData('', 'za', 'Zhuang; Chuang', 'zhuang; chuang'),\n    'znd': ISOCodeData('', '', 'Zande languages', 'zandé, langues'),\n    'zul': ISOCodeData('', 'zu', 'Zulu', 'zoulou'),\n    'zun': ISOCodeData('', '', 'Zuni', 'zuni'),\n    'zxx': ISOCodeData(\n        '',\n        '',\n        'No linguistic content; Not applicable',\n        'pas de contenu linguistique; non applicable',\n    ),\n    'zza': ISOCodeData(\n        '',\n        '',\n        'Zaza; Dimili; Dimli; Kirdki; Kirmanjki; Zazaki',\n        'zaza; dimili; dimli; kirdki; kirmanjki; zazaki',\n    ),\n}\n\n\ndef iso_639_2_from_3(iso3: str) -> str:\n    \"\"\"Convert ISO 639-3 code to ISO 639-2 code.\"\"\"\n    if iso3 in ISO_639_3:\n        return ISO_639_3[iso3].alpha_2\n    else:\n        return \"\"\n"
  },
  {
    "path": "src/ocrmypdf/models/__init__.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"OCRmyPDF models for plugin options and cross-cutting concerns.\"\"\"\n\nfrom __future__ import annotations\n"
  },
  {
    "path": "src/ocrmypdf/models/ocr_element.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"OCR element dataclasses for representing OCR output structure.\n\nThis module provides a generic, engine-agnostic representation of OCR output.\nThe OcrElement dataclass can represent structural units from any OCR source\n(hOCR, ALTO, custom engines, etc.) in a unified format suitable for rendering.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom dataclasses import dataclass, field\nfrom typing import Literal\n\n\n@dataclass\nclass BoundingBox:\n    \"\"\"An axis-aligned bounding box in pixel coordinates.\n\n    Coordinates use top-left origin (standard for images and hOCR).\n\n    Attributes:\n        left: Left edge x-coordinate\n        top: Top edge y-coordinate\n        right: Right edge x-coordinate\n        bottom: Bottom edge y-coordinate\n    \"\"\"\n\n    left: float\n    top: float\n    right: float\n    bottom: float\n\n    @property\n    def width(self) -> float:\n        \"\"\"Width of the bounding box.\"\"\"\n        return self.right - self.left\n\n    @property\n    def height(self) -> float:\n        \"\"\"Height of the bounding box.\"\"\"\n        return self.bottom - self.top\n\n    def __post_init__(self):\n        \"\"\"Validate bounding box coordinates.\"\"\"\n        if self.right < self.left:\n            raise ValueError(\n                f\"Invalid bounding box: right ({self.right}) < left ({self.left})\"\n            )\n        if self.bottom < self.top:\n            raise ValueError(\n                f\"Invalid bounding box: bottom ({self.bottom}) < top ({self.top})\"\n            )\n\n\n@dataclass\nclass Baseline:\n    \"\"\"Text baseline information.\n\n    The baseline is represented as a linear equation: y = slope * x + intercept.\n    This describes the line along which text characters sit, relative to the\n    bottom-left corner of the line's bounding box.\n\n    In hOCR, the baseline is specified relative to the 
bottom of the line's bbox,\n    with the intercept being the vertical offset from the bottom and the slope\n    representing rotation (positive = ascending left-to-right).\n\n    Attributes:\n        slope: Slope of the baseline (rise over run)\n        intercept: Y-intercept of the baseline (vertical offset from bbox bottom)\n    \"\"\"\n\n    slope: float = 0.0\n    intercept: float = 0.0\n\n\n@dataclass\nclass FontInfo:\n    \"\"\"Font information for text rendering.\n\n    Attributes:\n        name: Font family name (e.g., \"Times New Roman\")\n        size: Font size in points\n        bold: Whether the font is bold\n        italic: Whether the font is italic\n        monospace: Whether the font is monospace\n        serif: Whether the font is serif (vs sans-serif)\n        smallcaps: Whether the font uses small caps\n        underline: Whether the text is underlined\n    \"\"\"\n\n    name: str | None = None\n    size: float | None = None\n    bold: bool = False\n    italic: bool = False\n    monospace: bool = False\n    serif: bool = False\n    smallcaps: bool = False\n    underline: bool = False\n\n\n@dataclass\nclass OcrElement:\n    \"\"\"A generic OCR element representing any structural unit of OCR output.\n\n    OcrElements form a tree structure where pages contain paragraphs, paragraphs\n    contain lines, lines contain words, etc. The specific hierarchy depends on\n    the OCR engine, but this dataclass can represent any of these levels.\n\n    The ocr_class field uses hOCR naming conventions (ocr_page, ocr_par, ocr_line,\n    ocrx_word, etc.) 
as a common vocabulary, but elements from other sources can\n    map to these classes.\n\n    Common hOCR classes:\n        - ocr_page: The root element for a page\n        - ocr_carea: A content/column area\n        - ocr_par: A paragraph\n        - ocr_line: A line of text\n        - ocr_header: A header line\n        - ocr_footer: A footer line\n        - ocr_caption: A caption line\n        - ocr_textfloat: A floating text element\n        - ocrx_word: A single word\n\n    Attributes:\n        ocr_class: The element type (e.g., \"ocr_page\", \"ocr_line\", \"ocrx_word\")\n        bbox: Axis-aligned bounding box in source pixel coordinates (top-left origin)\n        poly: Polygon vertices for oriented/non-rectangular bounds\n        text: Text content (primarily for leaf nodes like words)\n        confidence: OCR confidence score (0.0-1.0)\n        children: Child elements (hierarchical structure)\n        direction: Text direction (\"ltr\" or \"rtl\")\n        language: Language code (e.g., \"eng\", \"deu\", \"chi_sim\")\n        baseline: Text baseline information (slope and intercept)\n        textangle: Text rotation angle in degrees (counter-clockwise from horizontal)\n        font: Font information (name, size, style)\n        dpi: Image resolution in dots per inch (typically for page-level)\n        page_number: Physical page number (0-indexed)\n        logical_page_number: Logical page number (as printed on the page)\n    \"\"\"\n\n    ocr_class: str\n\n    # Bounding boxes\n    bbox: BoundingBox | None = None\n    poly: list[tuple[float, float]] | None = None\n\n    # Text content\n    text: str = \"\"\n\n    # Confidence (0.0-1.0)\n    confidence: float | None = None\n\n    # Children (hierarchical structure)\n    children: list[OcrElement] = field(default_factory=list)\n\n    # Text direction and language\n    direction: Literal[\"ltr\", \"rtl\"] | None = None\n    language: str | None = None\n\n    # Baseline (for lines)\n    baseline: Baseline | None 
= None\n\n    # Rotation angle in degrees (counter-clockwise)\n    textangle: float | None = None\n\n    # Font information\n    font: FontInfo | None = None\n\n    # Page-level properties\n    dpi: float | None = None\n    page_number: int | None = None\n    logical_page_number: int | None = None\n\n    def iter_by_class(self, *ocr_classes: str) -> list[OcrElement]:\n        \"\"\"Iterate over all descendants matching the given class(es).\n\n        Args:\n            *ocr_classes: One or more ocr_class values to match\n\n        Returns:\n            List of all matching descendant elements (depth-first order)\n        \"\"\"\n        result = []\n        if self.ocr_class in ocr_classes:\n            result.append(self)\n        for child in self.children:\n            result.extend(child.iter_by_class(*ocr_classes))\n        return result\n\n    def find_by_class(self, *ocr_classes: str) -> OcrElement | None:\n        \"\"\"Find the first descendant matching the given class(es).\n\n        Args:\n            *ocr_classes: One or more ocr_class values to match\n\n        Returns:\n            The first matching element, or None if not found\n        \"\"\"\n        if self.ocr_class in ocr_classes:\n            return self\n        for child in self.children:\n            result = child.find_by_class(*ocr_classes)\n            if result is not None:\n                return result\n        return None\n\n    def get_text_recursive(self) -> str:\n        \"\"\"Get the combined text of this element and all descendants.\n\n        Returns:\n            Combined text content, with words separated by spaces\n        \"\"\"\n        if self.text:\n            return self.text\n        texts = [child.get_text_recursive() for child in self.children]\n        return \" \".join(t for t in texts if t)\n\n    @property\n    def words(self) -> list[OcrElement]:\n        \"\"\"Get all word elements (ocrx_word) in this element's subtree.\"\"\"\n        return 
self.iter_by_class(\"ocrx_word\")\n\n    @property\n    def lines(self) -> list[OcrElement]:\n        \"\"\"Get all line elements in this element's subtree.\"\"\"\n        return self.iter_by_class(\n            \"ocr_line\", \"ocr_header\", \"ocr_footer\", \"ocr_caption\", \"ocr_textfloat\"\n        )\n\n    @property\n    def paragraphs(self) -> list[OcrElement]:\n        \"\"\"Get all paragraph elements (ocr_par) in this element's subtree.\"\"\"\n        return self.iter_by_class(\"ocr_par\")\n\n\n# Type alias for text direction\nTextDirection = Literal[\"ltr\", \"rtl\"]\n\n\n# hOCR class constants for convenience\nclass OcrClass:\n    \"\"\"Constants for common OCR element classes.\"\"\"\n\n    # Page-level\n    PAGE = \"ocr_page\"\n    CAREA = \"ocr_carea\"\n\n    # Block-level\n    PARAGRAPH = \"ocr_par\"\n\n    # Line-level\n    LINE = \"ocr_line\"\n    HEADER = \"ocr_header\"\n    FOOTER = \"ocr_footer\"\n    CAPTION = \"ocr_caption\"\n    TEXTFLOAT = \"ocr_textfloat\"\n\n    # Word-level\n    WORD = \"ocrx_word\"\n\n    # Character-level\n    CHAR = \"ocrx_cinfo\"\n\n    # Line types (for convenience)\n    LINE_TYPES = frozenset({LINE, HEADER, FOOTER, CAPTION, TEXTFLOAT})\n"
  },
  {
    "path": "src/ocrmypdf/optimize.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Post-processing image optimization of OCR PDFs.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport sys\nimport tempfile\nimport threading\nfrom collections.abc import Callable, Iterator, MutableSet, Sequence\nfrom os import fspath\nfrom pathlib import Path\nfrom typing import Any, NamedTuple, NewType\nfrom zlib import compress\n\nimport img2pdf\nfrom packaging.version import Version\nfrom pikepdf import (\n    Array,\n    Dictionary,\n    Name,\n    Object,\n    ObjectStreamMode,\n    Pdf,\n    PdfError,\n    PdfImage,\n    Stream,\n    UnsupportedImageTypeError,\n)\nfrom pikepdf.models.image import HifiPrintImageNotTranscodableError\nfrom PIL import Image\n\nfrom ocrmypdf._concurrent import Executor, SerialExecutor\nfrom ocrmypdf._exec import ghostscript, jbig2enc, pngquant\nfrom ocrmypdf._jobcontext import PdfContext\nfrom ocrmypdf._progressbar import ProgressBar\nfrom ocrmypdf.exceptions import OutputFileAccessError\nfrom ocrmypdf.helpers import IMG2PDF_KWARGS, safe_symlink\n\nlog = logging.getLogger(__name__)\n\nDEFAULT_JPEG_QUALITY = 75\nDEFAULT_PNG_QUALITY = 70\nFLATE_JPEG_THRESHOLD = 10000\n\n\nXref = NewType('Xref', int)\n\n\nclass XrefExt(NamedTuple):\n    \"\"\"A PDF xref and image extension pair.\"\"\"\n\n    xref: Xref\n    ext: str\n\n\ndef img_name(root: Path, xref: Xref, ext: str) -> Path:\n    \"\"\"Return the name of an image file for a given xref and extension.\"\"\"\n    return root / f'{xref:08d}{ext}'\n\n\ndef png_name(root: Path, xref: Xref) -> Path:\n    \"\"\"Return the name of a PNG file for a given xref.\"\"\"\n    return img_name(root, xref, '.png')\n\n\ndef jpg_name(root: Path, xref: Xref) -> Path:\n    \"\"\"Return the name of a JPEG file for a given xref.\"\"\"\n    return img_name(root, xref, '.jpg')\n\n\ndef extract_image_filter(\n    image: Stream, xref: Xref\n) -> tuple[PdfImage, tuple[Name, Object]] | 
None:\n    \"\"\"Determine if an image is extractable.\"\"\"\n    if image.Subtype != Name.Image:\n        return None\n    if not isinstance(image.Length, int) or image.Length < 100:\n        log.debug(f\"xref {xref}: skipping image with small stream size\")\n        return None\n    if (\n        not isinstance(image.Width, int)\n        or not isinstance(image.Height, int)\n        or image.Width < 8\n        or image.Height < 8\n    ):  # Issue 732\n        log.debug(f\"xref {xref}: skipping image with unusually small dimensions\")\n        return None\n\n    pim = PdfImage(image)\n\n    if len(pim.filter_decodeparms) > 1:\n        first_filtdp = pim.filter_decodeparms[0]\n        second_filtdp = pim.filter_decodeparms[1]\n        if (\n            len(pim.filter_decodeparms) == 2\n            and first_filtdp[0] == Name.FlateDecode\n            and first_filtdp[1] is not None\n            and first_filtdp[1].get(Name.Predictor, 1) == 1\n            and second_filtdp[0] == Name.DCTDecode\n            and not second_filtdp[1]\n        ):\n            log.debug(\n                f\"xref {xref}: found image compressed as /FlateDecode /DCTDecode, \"\n                \"marked for JPEG optimization\"\n            )\n            filtdp = pim.filter_decodeparms[1]\n        else:\n            log.debug(f\"xref {xref}: skipping image with multiple compression filters\")\n            return None\n    else:\n        filtdp = pim.filter_decodeparms[0]\n\n    if pim.bits_per_component > 8:\n        log.debug(f\"xref {xref}: skipping wide gamut image\")\n        return None  # Don't mess with wide gamut images\n\n    if filtdp[0] == Name.JPXDecode:\n        log.debug(f\"xref {xref}: skipping JPEG2000 image\")\n        return None  # Don't do JPEG2000\n\n    if filtdp[0] == Name.CCITTFaxDecode and filtdp[1].get('/K', 0) >= 0:\n        log.debug(f\"xref {xref}: skipping CCITT Group 3 image\")\n        return None  # pikepdf doesn't support Group 3 yet\n\n    if Name.Decode in 
image:\n        log.debug(f\"xref {xref}: skipping image with Decode table\")\n        return None  # Don't mess with custom Decode tables\n    if image.get(Name.SMask, Dictionary()).get(Name.Matte, None) is not None:\n        # https://github.com/ocrmypdf/OCRmyPDF/issues/1536\n        # Do not attempt to optimize images that have a SMask with a Matte.\n        # That means alpha channel pre-blending is used, and we're not prepared\n        # to deal with the complexities of that.\n        log.debug(f\"xref {xref}: skipping image whose SMask has Matte\")\n        return None\n\n    return pim, filtdp\n\n\ndef extract_image_jbig2(\n    *, pdf: Pdf, root: Path, image: Stream, xref: Xref, options\n) -> XrefExt | None:\n    \"\"\"Extract an image, saving it as a JBIG2 file.\"\"\"\n    del options  # unused arg\n\n    result = extract_image_filter(image, xref)\n    if result is None:\n        return None\n    pim, filtdp = result\n\n    if (\n        pim.bits_per_component == 1\n        and filtdp[0] != Name.JBIG2Decode\n        and jbig2enc.available()\n    ):\n        # Save any colorspace associated with the image, so that we\n        # will export a pure 1-bit PNG with no palette or ICC profile.\n        # Showing the palette or ICC to jbig2enc will cause it to perform\n        # colorspace transform to 1bpp, which will conflict the palette or\n        # ICC if it exists.\n        colorspace = pim.obj.get(Name.ColorSpace, None)\n        if colorspace is not None or pim.image_mask:\n            try:\n                # Set to DeviceGray temporarily; we already in 1 bpc.\n                pim.obj.ColorSpace = Name.DeviceGray\n                imgname = root / f'{xref:08d}'\n                with imgname.open('wb') as f:\n                    ext = pim.extract_to(stream=f)\n                # Rename the file so it has .prejbig2.ext extension\n                # Making it unique avoids problems with Windows if the\n                # same image is extracted multiple times\n     
           imgname.rename(imgname.with_suffix(\".prejbig2\" + ext))\n            except NotImplementedError as e:\n                if '/Decode' in str(e):\n                    log.debug(\n                        f\"xref {xref}: skipping image with unsupported Decode table\"\n                    )\n                    return None\n                raise\n            except UnsupportedImageTypeError:\n                return None\n            finally:\n                # Restore image colorspace after temporarily setting it to DeviceGray\n                if colorspace is not None:\n                    pim.obj.ColorSpace = colorspace\n                else:\n                    del pim.obj.ColorSpace\n            return XrefExt(xref, \".prejbig2\" + ext)\n    return None\n\n\ndef _should_optimize_jpeg(options, filtdp):\n    if options.optimize >= 2:\n        return True\n    # Ghostscript 10.6.0+ introduced some sort of JPEG encoding issue.\n    # To resolve this, re-optimize the JPEG anyway.\n    return options.optimize < 2 and ghostscript.version() >= Version('10.6.0')\n\n\ndef extract_image_generic(\n    *, pdf: Pdf, root: Path, image: Stream, xref: Xref, options\n) -> XrefExt | None:\n    \"\"\"Generic image extraction.\"\"\"\n    result = extract_image_filter(image, xref)\n    if result is None:\n        return None\n    pim, filtdp = result\n\n    # Don't try to PNG-optimize 1bpp images, since JBIG2 does it better.\n    if pim.bits_per_component == 1:\n        return None\n\n    if filtdp[0] == Name.DCTDecode and _should_optimize_jpeg(options, filtdp):\n        try:\n            imgname = root / f'{xref:08d}'\n            with imgname.open('wb') as f:\n                ext = pim.extract_to(stream=f)\n            imgname.rename(imgname.with_suffix(ext))\n        except (UnsupportedImageTypeError, HifiPrintImageNotTranscodableError):\n            return None\n        return XrefExt(xref, ext)\n    elif (\n        pim.indexed\n        and pim.colorspace in 
pim.SIMPLE_COLORSPACES\n        and options.optimize >= 3\n    ):\n        # Try to improve on indexed images - these are far from low hanging\n        # fruit in most cases\n        pim.as_pil_image().save(png_name(root, xref))\n        return XrefExt(xref, '.png')\n    elif not pim.indexed and pim.colorspace in pim.SIMPLE_COLORSPACES:\n        # An optimization opportunity here, not currently taken, is directly\n        # generating a PNG from compressed data\n        try:\n            pim.as_pil_image().save(png_name(root, xref))\n        except NotImplementedError:\n            log.warning(\"PDF contains an atypical image that cannot be optimized.\")\n            return None\n        return XrefExt(xref, '.png')\n    elif (\n        not pim.indexed\n        and pim.colorspace == Name.ICCBased\n        and pim.bits_per_component == 1\n    ):\n        # We can losslessly optimize 1-bit images to CCITT or JBIG2 without\n        # paying any attention to the ICC profile\n        pim.as_pil_image().save(png_name(root, xref))\n        return XrefExt(xref, '.png')\n\n    return None\n\n\ndef _find_image_xrefs_container(\n    pdf: Pdf,\n    container: Object,\n    pageno: int,\n    include_xrefs: MutableSet[Xref],\n    exclude_xrefs: MutableSet[Xref],\n    pageno_for_xref: dict[Xref, int],\n    depth: int = 0,\n):\n    \"\"\"Find all image XRefs or Form XObject and add to the include/exclude sets.\"\"\"\n    if depth > 10:\n        log.warning(\"Recursion depth exceeded in _find_image_xrefs_page\")\n        return\n    try:\n        xobjs = container.Resources.XObject\n    except AttributeError:\n        return\n    for _imname, image in dict(xobjs).items():\n        if image.objgen[1] != 0:\n            continue  # Ignore images in an incremental PDF\n        xref = Xref(image.objgen[0])\n        if xref in include_xrefs or xref in exclude_xrefs:\n            continue  # Already processed\n        if Name.Subtype in image and image.Subtype == Name.Form:\n            # 
Recurse into Form XObjects\n            log.debug(f\"Recursing into Form XObject {_imname} in page {pageno}\")\n            _find_image_xrefs_container(\n                pdf,\n                image,\n                pageno,\n                include_xrefs,\n                exclude_xrefs,\n                pageno_for_xref,\n                depth + 1,\n            )\n            continue\n        if Name.SMask in image:\n            # Ignore soft masks\n            smask_xref = Xref(image.SMask.objgen[0])\n            exclude_xrefs.add(smask_xref)\n            log.debug(f\"xref {smask_xref}: skipping image because it is an SMask\")\n        include_xrefs.add(xref)\n        log.debug(f\"xref {xref}: treating as an optimization candidate\")\n        if xref not in pageno_for_xref:\n            pageno_for_xref[xref] = pageno\n\n\ndef _find_image_xrefs(pdf: Pdf):\n    include_xrefs: MutableSet[Xref] = set()\n    exclude_xrefs: MutableSet[Xref] = set()\n    pageno_for_xref: dict[Xref, int] = {}\n\n    for pageno, page in enumerate(pdf.pages):\n        _find_image_xrefs_container(\n            pdf, page.obj, pageno, include_xrefs, exclude_xrefs, pageno_for_xref\n        )\n\n    working_xrefs = include_xrefs - exclude_xrefs\n    return working_xrefs, pageno_for_xref\n\n\ndef extract_images(\n    pdf: Pdf,\n    root: Path,\n    options,\n    extract_fn: Callable[..., XrefExt | None],\n) -> Iterator[tuple[int, XrefExt]]:\n    \"\"\"Extract image using extract_fn.\n\n    Enumerate images on each page, lookup their xref/ID number in the PDF.\n    Exclude images that are soft masks (i.e. alpha transparency related).\n    Record the page number on which an image is first used, since images may be\n    used on multiple pages (or multiple times on the same page).\n\n    Form XObjects are searched recursively for images. Currently we do not\n    check other objects that may contain images, and we don't evaluate\n    alternate images or thumbnails.\n\n    extract_fn must decide if it wants to extract the image in this context. 
If\n    it does a tuple should be returned: (xref, ext) where .ext is the file\n    extension. extract_fn must also extract the file it finds interesting.\n    \"\"\"\n    errors = 0\n    working_xrefs, pageno_for_xref = _find_image_xrefs(pdf)\n    for xref in working_xrefs:\n        image = pdf.get_object((xref, 0))\n        try:\n            result = extract_fn(\n                pdf=pdf, root=root, image=image, xref=xref, options=options\n            )\n        except Exception:  # pylint: disable=broad-except\n            log.exception(\n                f\"xref {xref}: While extracting this image, an error occurred\"\n            )\n            errors += 1\n        else:\n            if result:\n                _, ext = result\n                yield pageno_for_xref[xref], XrefExt(xref, ext)\n\n\ndef extract_images_generic(\n    pdf: Pdf, root: Path, options\n) -> tuple[list[Xref], list[Xref]]:\n    \"\"\"Extract any >=2bpp image we think we can improve.\"\"\"\n    jpegs = []\n    pngs = []\n    for _, xref_ext in extract_images(pdf, root, options, extract_image_generic):\n        log.debug('%s', xref_ext)\n        if xref_ext.ext == '.png':\n            pngs.append(xref_ext.xref)\n        elif xref_ext.ext == '.jpg':\n            jpegs.append(xref_ext.xref)\n    log.debug(f\"Optimizable images: JPEGs: {len(jpegs)} PNGs: {len(pngs)}\")\n    return jpegs, pngs\n\n\ndef extract_images_jbig2(pdf: Pdf, root: Path, options) -> list[XrefExt]:\n    \"\"\"Extract any bitonal image that we think we can improve as JBIG2.\"\"\"\n    jbig2_images = []\n    for _pageno, xref_ext in extract_images(pdf, root, options, extract_image_jbig2):\n        jbig2_images.append(xref_ext)\n\n    log.debug(f\"Optimizable images: JBIG2: {len(jbig2_images)}\")\n    return jbig2_images\n\n\ndef _produce_jbig2_images(\n    jbig2_images: list[XrefExt], root: Path, options, executor: Executor\n) -> None:\n    \"\"\"Produce JBIG2 images using lossless single-image encoding.\"\"\"\n\n    def 
jbig2_args():\n        for xref_ext in jbig2_images:\n            xref, ext = xref_ext\n            yield (\n                fspath(root),\n                img_name(root, xref, ext),\n                root / f'{xref:08d}.jbig2',\n                options.jbig2_threshold,\n            )\n\n    executor(\n        use_threads=True,\n        max_workers=options.jobs,\n        progress_kwargs=dict(\n            total=len(jbig2_images),\n            desc=\"JBIG2\",\n            unit='image',\n            disable=not options.progress_bar,\n        ),\n        task=jbig2enc.convert_single,\n        task_arguments=jbig2_args(),\n    )\n\n\ndef convert_to_jbig2(\n    pdf: Pdf,\n    jbig2_images: list[XrefExt],\n    root: Path,\n    options,\n    executor: Executor,\n) -> None:\n    \"\"\"Convert images to JBIG2 and insert into PDF.\n\n    Each JBIG2 image is encoded independently using lossless compression.\n    No symbol dictionary (JBIG2Globals) is used.\n    \"\"\"\n    _produce_jbig2_images(jbig2_images, root, options, executor)\n\n    for xref_ext in jbig2_images:\n        xref, _ = xref_ext\n        jbig2_im_file = root / f'{xref:08d}.jbig2'\n        jbig2_im_data = jbig2_im_file.read_bytes()\n        im_obj = pdf.get_object(xref, 0)\n        im_obj.write(jbig2_im_data, filter=Name.JBIG2Decode, decode_parms=None)\n\n\ndef _optimize_jpeg(\n    xref: Xref, in_jpg: Path, opt_jpg: Path, jpg_quality: int\n) -> tuple[Xref, Path | None]:\n    with Image.open(in_jpg) as im:\n        save_kwargs: dict[str, Any] = {'optimize': True}\n        if isinstance(jpg_quality, int) and 0 < jpg_quality <= 100:\n            save_kwargs['quality'] = jpg_quality\n        im.save(opt_jpg, **save_kwargs)\n\n    if opt_jpg.stat().st_size > in_jpg.stat().st_size:\n        log.debug(f\"xref {xref}, jpeg, made larger - skip\")\n        opt_jpg.unlink()\n        return xref, None\n    return xref, opt_jpg\n\n\ndef transcode_jpegs(\n    pdf: Pdf, jpegs: Sequence[Xref], root: Path, options, executor: 
Executor\n) -> None:\n    \"\"\"Optimize JPEGs according to optimization settings.\"\"\"\n\n    def jpeg_args() -> Iterator[tuple[Xref, Path, Path, int]]:\n        for xref in jpegs:\n            in_jpg = jpg_name(root, xref)\n            opt_jpg = in_jpg.with_suffix('.opt.jpg')\n            yield xref, in_jpg, opt_jpg, options.jpg_quality\n\n    def finish_jpeg(result: tuple[Xref, Path | None], pbar: ProgressBar):\n        xref, opt_jpg = result\n        if opt_jpg:\n            compdata = opt_jpg.read_bytes()  # JPEG can be inserted into PDF as is\n            im_obj = pdf.get_object(xref, 0)\n            im_obj.write(compdata, filter=Name.DCTDecode)\n        pbar.update()\n\n    executor(\n        use_threads=True,  # Processes are significantly slower at this task\n        max_workers=options.jobs,\n        progress_kwargs=dict(\n            desc=\"Recompressing JPEGs\",\n            total=len(jpegs),\n            unit='image',\n            disable=not options.progress_bar,\n        ),\n        task=_optimize_jpeg,\n        task_arguments=jpeg_args(),\n        task_finished=finish_jpeg,\n    )\n\n\ndef _already_flate_encoded(image: Stream) -> bool:\n    \"\"\"Check if the image already has FlateDecode in its filter chain.\"\"\"\n    filt = image.get(Name.Filter)\n    if filt is None:\n        return False\n    if isinstance(filt, Array):\n        return Name.FlateDecode in list(filt)\n    return filt == Name.FlateDecode\n\n\ndef _find_deflatable_jpeg(\n    *, pdf: Pdf, root: Path, image: Stream, xref: Xref, options\n) -> XrefExt | None:\n    result = extract_image_filter(image, xref)\n    if result is None:\n        return None\n    _pim, filtdp = result\n\n    # Skip if already FlateDecode compressed - would double-compress\n    if _already_flate_encoded(image):\n        return None\n\n    if (\n        filtdp[0] == Name.DCTDecode\n        and not filtdp[1]\n        and (\n            (\n                # Don't flate very large images because it will slow down 
PDF viewers\n                1 <= options.optimize <= 2\n                and image.get(Name.Width, 0) < FLATE_JPEG_THRESHOLD\n                and image.get(Name.Height, 0) < FLATE_JPEG_THRESHOLD\n            )\n            or options.optimize == 3\n        )\n    ):\n        return XrefExt(xref, '.memory')\n\n    return None\n\n\ndef _deflate_jpeg(\n    pdf: Pdf, lock: threading.Lock, xref: Xref, complevel: int\n) -> tuple[Xref, bytes]:\n    with lock:\n        xobj = pdf.get_object(xref, 0)\n        try:\n            data = xobj.read_raw_bytes()\n        except PdfError:\n            return xref, b''\n    compdata = compress(data, complevel)\n    if len(compdata) >= len(data):\n        return xref, b''\n    return xref, compdata\n\n\ndef deflate_jpegs(pdf: Pdf, root: Path, options, executor: Executor) -> None:\n    \"\"\"Apply FlateDecode to JPEGs.\n\n    This is a lossless compression method that is supported by all PDF viewers,\n    and generally results in a smaller file size compared to straight DCTDecode\n    images.\n    \"\"\"\n    jpegs = []\n    for _pageno, xref_ext in extract_images(pdf, root, options, _find_deflatable_jpeg):\n        xref = xref_ext.xref\n        log.debug(f'xref {xref}: marking this JPEG as deflatable')\n        jpegs.append(xref)\n\n    complevel = 9 if options.optimize == 3 else 6\n\n    # Our calls to xobj.write() in finish() need coordination\n    lock = threading.Lock()\n\n    def deflate_args() -> Iterator:\n        for xref in jpegs:\n            yield pdf, lock, xref, complevel\n\n    def finish(result: tuple[Xref, bytes], pbar: ProgressBar):\n        xref, compdata = result\n        if len(compdata) > 0:\n            with lock:\n                xobj = pdf.get_object(xref, 0)\n                xobj.write(compdata, filter=[Name.FlateDecode, Name.DCTDecode])\n        pbar.update()\n\n    executor(\n        use_threads=True,  # We're sharing the pdf directly, must use threads\n        max_workers=options.jobs,\n        
progress_kwargs=dict(\n            desc=\"Deflating JPEGs\",\n            total=len(jpegs),\n            unit='image',\n            disable=not options.progress_bar,\n        ),\n        task=_deflate_jpeg,\n        task_arguments=deflate_args(),\n        task_finished=finish,\n    )\n\n\ndef _transcode_png(pdf: Pdf, filename: Path, xref: Xref) -> bool:\n    output = filename.with_suffix('.png.pdf')\n    with output.open('wb') as f:\n        img2pdf.convert(fspath(filename), outputstream=f, **IMG2PDF_KWARGS)\n\n    with Pdf.open(output) as pdf_image:\n        foreign_image = next(iter(pdf_image.pages[0].images.values()))\n        local_image = pdf.copy_foreign(foreign_image)\n\n        im_obj = pdf.get_object(xref, 0)\n        im_obj.write(\n            local_image.read_raw_bytes(),\n            filter=local_image.Filter,\n            decode_parms=local_image.DecodeParms,\n        )\n\n        # Don't copy keys from the new image...\n        del_keys = set(im_obj.keys()) - set(local_image.keys())\n        # ...except for the keep_fields, which are essential to displaying\n        # the image correctly and preserving its metadata. 
(/Decode arrays\n        # and /SMaskInData are implicitly discarded prior to this point.)\n        keep_fields = {\n            '/ID',\n            '/Intent',\n            '/Interpolate',\n            '/Mask',\n            '/Metadata',\n            '/OC',\n            '/OPI',\n            '/SMask',\n            '/StructParent',\n        }\n        del_keys -= keep_fields\n        for key in local_image.keys():\n            if key != Name.Length and str(key) not in keep_fields:\n                im_obj[key] = local_image[key]\n        for key in del_keys:\n            del im_obj[key]\n    return True\n\n\ndef transcode_pngs(\n    pdf: Pdf,\n    images: Sequence[Xref],\n    image_name_fn: Callable[[Path, Xref], Path],\n    root: Path,\n    options,\n    executor: Executor,\n) -> None:\n    \"\"\"Apply lossy transcoding to PNGs.\"\"\"\n    modified: MutableSet[Xref] = set()\n    if options.optimize >= 2:\n        png_quality = (\n            max(10, options.png_quality - 10),\n            min(100, options.png_quality + 10),\n        )\n\n        def pngquant_args():\n            for xref in images:\n                log.debug(image_name_fn(root, xref))\n                yield (\n                    image_name_fn(root, xref),\n                    png_name(root, xref),\n                    png_quality[0],\n                    png_quality[1],\n                )\n                modified.add(xref)\n\n        executor(\n            use_threads=True,\n            max_workers=options.jobs,\n            progress_kwargs=dict(\n                desc=\"PNGs\",\n                total=len(images),\n                unit='image',\n                disable=not options.progress_bar,\n            ),\n            task=pngquant.quantize,\n            task_arguments=pngquant_args(),\n        )\n\n    for xref in modified:\n        filename = png_name(root, xref)\n        _transcode_png(pdf, filename, xref)\n\n\nDEFAULT_EXECUTOR = SerialExecutor()\n\n\ndef optimize(\n    input_file: Path,\n    
output_file: Path,\n    context: PdfContext,\n    save_settings: dict[str, Any],\n    executor: Executor = DEFAULT_EXECUTOR,\n) -> Path:\n    \"\"\"Optimize images in a PDF file.\"\"\"\n    options = context.options\n    if options.optimize == 0:\n        safe_symlink(input_file, output_file)\n        return output_file\n\n    if not options.jpg_quality:\n        options.jpg_quality = DEFAULT_JPEG_QUALITY if options.optimize < 3 else 40\n    if not options.png_quality:\n        options.png_quality = DEFAULT_PNG_QUALITY if options.optimize < 3 else 30\n\n    with Pdf.open(input_file) as pdf:\n        root = output_file.parent / 'images'\n        root.mkdir(exist_ok=True)\n\n        jpegs, pngs = extract_images_generic(pdf, root, options)\n        transcode_jpegs(pdf, jpegs, root, options, executor)\n        deflate_jpegs(pdf, root, options, executor)\n        # if options.optimize >= 2:\n        # Try pngifying the jpegs\n        #    transcode_pngs(pdf, jpegs, jpg_name, root, options)\n        transcode_pngs(pdf, pngs, png_name, root, options, executor)\n\n        jbig2_images = extract_images_jbig2(pdf, root, options)\n        convert_to_jbig2(pdf, jbig2_images, root, options, executor)\n\n        target_file = output_file.with_suffix('.opt.pdf')\n        pdf.remove_unreferenced_resources()\n        pdf.save(target_file, **save_settings)\n\n    input_size = input_file.stat().st_size\n    output_size = target_file.stat().st_size\n    if output_size == 0:\n        raise OutputFileAccessError(\n            f\"Output file not created after optimizing. 
We probably ran \"\n            f\"out of disk space in the temporary folder: {tempfile.gettempdir()}.\"\n        )\n    savings = 1 - output_size / input_size\n\n    if savings < 0:\n        log.info(\n            \"Image optimization did not improve the file - \"\n            \"optimizations will not be used\"\n        )\n        # We still need to save the file\n        with Pdf.open(input_file) as pdf:\n            pdf.remove_unreferenced_resources()\n            pdf.save(output_file, **save_settings)\n    else:\n        safe_symlink(target_file, output_file)\n\n    return output_file\n\n\ndef main(infile, outfile, level, jobs=1):\n    \"\"\"Entry point for direct optimization of a file.\"\"\"\n    from shutil import copy  # pylint: disable=import-outside-toplevel\n    from tempfile import TemporaryDirectory  # pylint: disable=import-outside-toplevel\n\n    from ocrmypdf._options import OcrOptions  # pylint: disable=import-outside-toplevel\n\n    infile = Path(infile)\n\n    # Create OcrOptions with optimization-specific settings\n    options = OcrOptions(\n        input_file=infile,\n        output_file=outfile,  # Required field\n        jobs=jobs,\n        optimize=int(level),\n        jpg_quality=0,  # Use default\n        png_quality=0,\n        jbig2_threshold=0.85,\n        quiet=True,\n        progress_bar=False,\n    )\n\n    with TemporaryDirectory() as tmpdir:\n        context = PdfContext(options, Path(tmpdir), infile, None, None)\n        tmpout = Path(tmpdir) / 'out.pdf'\n        optimize(\n            infile,\n            tmpout,\n            context,\n            dict(\n                compress_streams=True,\n                preserve_pdfa=True,\n                object_stream_mode=ObjectStreamMode.generate,\n            ),\n        )\n        copy(fspath(tmpout), fspath(outfile))\n\n\nif __name__ == '__main__':\n    main(sys.argv[1], sys.argv[2], sys.argv[3])\n"
  },
  {
    "path": "src/ocrmypdf/pdfa.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Utilities for PDF/A production and confirmation with Ghostscript.\"\"\"\n\nfrom __future__ import annotations\n\nimport base64\nimport logging\nfrom collections.abc import Iterator\nfrom importlib.resources import files as package_files\nfrom pathlib import Path\n\nimport pikepdf\nfrom pikepdf import Array, Dictionary, Name, Pdf, Stream\n\nlog = logging.getLogger(__name__)\n\nSRGB_ICC_PROFILE_NAME = 'sRGB.icc'\n\n\ndef _postscript_objdef(\n    alias: str,\n    dictionary: dict[str, str],\n    *,\n    stream_name: str | None = None,\n    stream_data: bytes | None = None,\n) -> Iterator[str]:\n    assert (stream_name is None) == (stream_data is None)\n\n    objtype = '/stream' if stream_name else '/dict'\n\n    if stream_name:\n        assert stream_data is not None\n        a85_data = base64.a85encode(stream_data, adobe=True).decode('ascii')\n        yield f'{stream_name} ' + a85_data\n        yield 'def'\n\n    if alias != '{Catalog}':  # Catalog needs no definition\n        yield f'[/_objdef {alias} /type {objtype} /OBJ pdfmark'\n\n    yield f'[{alias} <<'\n    for key, val in dictionary.items():\n        yield f'  {key} {val}'\n    yield '>> /PUT pdfmark'\n\n    if stream_name:\n        yield f'[{alias} {stream_name[1:]} /PUT pdfmark'\n\n\ndef _make_postscript(icc_name: str, icc_data: bytes, colors: int) -> Iterator[str]:\n    yield '%!'\n    yield from _postscript_objdef(\n        '{icc_PDFA}',  # Not an f-string\n        {'/N': str(colors)},\n        stream_name='/ICCProfile',\n        stream_data=icc_data,\n    )\n    yield ''\n    yield from _postscript_objdef(\n        '{OutputIntent_PDFA}',\n        {\n            '/Type': '/OutputIntent',\n            '/S': '/GTS_PDFA1',\n            '/DestOutputProfile': '{icc_PDFA}',\n            '/OutputConditionIdentifier': f'({icc_name})',  # Only f-string\n        },\n    )\n    yield ''\n    yield from 
_postscript_objdef(\n        '{Catalog}', {'/OutputIntents': '[ {OutputIntent_PDFA} ]'}\n    )\n\n\ndef generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):\n    \"\"\"Create a Postscript PDFMARK file for Ghostscript PDF/A conversion.\n\n    pdfmark is an extension to the Postscript language that describes some PDF\n    features like bookmarks and annotations. It was originally specified for Adobe\n    Distiller, for Postscript to PDF conversion.\n\n    Ghostscript uses pdfmark for PDF to PDF/A conversion as well. To use Ghostscript\n    to create a PDF/A, we need to create a pdfmark file with the necessary metadata.\n\n    This function takes care of the many version-specific bugs and peculiarities in\n    Ghostscript's handling of pdfmark.\n\n    The only information we put in specifies that we want the file to be a\n    PDF/A, and we want Ghostscript to convert objects to the sRGB colorspace\n    if it runs into any object that it decides must be converted.\n\n    Arguments:\n        target_filename: filename to save\n        icc: ICC identifier such as 'sRGB'\n    References:\n        Adobe PDFMARK Reference:\n        https://opensource.adobe.com/dc-acrobat-sdk-docs/library/pdfmark/\n    \"\"\"\n    if icc != 'sRGB':\n        raise NotImplementedError(\"Only supporting sRGB\")\n\n    bytes_icc_profile = (\n        package_files('ocrmypdf.data') / SRGB_ICC_PROFILE_NAME\n    ).read_bytes()\n    postscript = '\\n'.join(_make_postscript(icc, bytes_icc_profile, 3))\n\n    # We should have encoded everything to pure ASCII by this point, and\n    # to be safe, only allow ASCII in PostScript\n    Path(target_filename).write_text(postscript, encoding='ascii')\n    return target_filename\n\n\ndef file_claims_pdfa(filename: Path):\n    \"\"\"Determines if the file claims to be PDF/A compliant.\n\n    This only checks if the XMP metadata contains a PDF/A marker. 
It does not\n    do full PDF/A validation.\n    \"\"\"\n    with pikepdf.open(filename) as pdf:\n        pdfmeta = pdf.open_metadata()\n        if not pdfmeta.pdfa_status:\n            return {\n                'pass': False,\n                'output': 'pdf',\n                'conformance': 'No PDF/A metadata in XMP',\n            }\n        valid_part_conforms = {'1a', '1b', '2a', '2b', '2u', '3a', '3b', '3u'}\n        # Raw value in XMP metadata returned by pikepdf is uppercase, but ISO\n        # uses lower case for conformance levels.\n        pdfa_status_iso = pdfmeta.pdfa_status.lower()\n        conformance = f'PDF/A-{pdfa_status_iso}'\n        pdfa_dict: dict[str, str | bool] = {}\n        if pdfa_status_iso in valid_part_conforms:\n            pdfa_dict['pass'] = True\n            pdfa_dict['output'] = 'pdfa'\n        pdfa_dict['conformance'] = conformance\n    return pdfa_dict\n\n\ndef _load_srgb_icc_profile() -> bytes:\n    \"\"\"Load the sRGB ICC profile from package data.\"\"\"\n    return (package_files('ocrmypdf.data') / SRGB_ICC_PROFILE_NAME).read_bytes()\n\n\ndef _pdfa_part_conformance(output_type: str) -> tuple[str, str]:\n    \"\"\"Extract PDF/A part and conformance from output_type.\n\n    Args:\n        output_type: One of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'\n\n    Returns:\n        Tuple of (part, conformance) e.g., ('2', 'B')\n    \"\"\"\n    mapping = {\n        'pdfa': ('2', 'B'),\n        'pdfa-1': ('1', 'B'),\n        'pdfa-2': ('2', 'B'),\n        'pdfa-3': ('3', 'B'),\n    }\n    return mapping.get(output_type, ('2', 'B'))\n\n\ndef add_pdfa_metadata(pdf: Pdf, part: str, conformance: str) -> None:\n    \"\"\"Add PDF/A XMP metadata declaration to a PDF.\n\n    Args:\n        pdf: An open pikepdf.Pdf object\n        part: PDF/A part number ('1', '2', or '3')\n        conformance: Conformance level ('A', 'B', or 'U')\n    \"\"\"\n    with pdf.open_metadata() as meta:\n        meta['pdfaid:part'] = part\n        meta['pdfaid:conformance'] = 
conformance\n\n\ndef add_srgb_output_intent(pdf: Pdf) -> None:\n    \"\"\"Add sRGB ICC profile as OutputIntent to PDF catalog.\n\n    This creates the required PDF/A OutputIntent structure with:\n    - An ICC profile stream containing sRGB profile\n    - An OutputIntent dictionary pointing to that profile\n    - Updates the Catalog's OutputIntents array\n\n    Args:\n        pdf: An open pikepdf.Pdf object\n    \"\"\"\n    icc_data = _load_srgb_icc_profile()\n\n    # Create ICC profile stream\n    icc_stream = Stream(pdf, icc_data)\n    icc_stream[Name.N] = 3  # RGB has 3 components\n\n    # Create OutputIntent dictionary\n    output_intent = Dictionary({\n        '/Type': Name.OutputIntent,\n        '/S': Name('/GTS_PDFA1'),\n        '/OutputConditionIdentifier': 'sRGB',\n        '/DestOutputProfile': icc_stream,\n    })\n\n    # Add to catalog's OutputIntents array\n    if Name.OutputIntents not in pdf.Root:\n        pdf.Root[Name.OutputIntents] = Array([])\n\n    # Check if sRGB OutputIntent already exists\n    for intent in pdf.Root.OutputIntents:  # type: ignore[attr-defined]\n        if str(intent.get(Name.OutputConditionIdentifier)) == 'sRGB':\n            log.debug('sRGB OutputIntent already exists, skipping')\n            return\n\n    pdf.Root.OutputIntents.append(output_intent)\n\n\ndef speculative_pdfa_conversion(\n    input_file: Path,\n    output_file: Path,\n    output_type: str,\n) -> Path:\n    \"\"\"Attempt to convert a PDF to PDF/A by adding required structures.\n\n    This function creates a copy of the input PDF and adds:\n    1. sRGB ICC profile as OutputIntent\n    2. XMP metadata declaring PDF/A conformance\n\n    This approach works for PDFs that are already mostly PDF/A compliant\n    but lack the formal declarations. 
It does NOT perform color conversion,\n    font embedding, or other transformations that Ghostscript does.\n\n    Args:\n        input_file: Path to input PDF\n        output_file: Path where output PDF should be written\n        output_type: One of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'\n\n    Returns:\n        Path to the output file\n\n    Raises:\n        pikepdf.PdfError: If the PDF cannot be opened or modified\n    \"\"\"\n    part, conformance = _pdfa_part_conformance(output_type)\n\n    with Pdf.open(input_file) as pdf:\n        add_srgb_output_intent(pdf)\n        add_pdfa_metadata(pdf, part, conformance)\n\n        pdf.save(output_file)\n\n    log.debug('Speculative PDF/A conversion complete: %s', output_file)\n    return output_file\n"
  },
  {
    "path": "src/ocrmypdf/pdfinfo/__init__.py",
    "content": "#!/usr/bin/env python3\n\n# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"For extracting information about PDFs prior to OCR.\"\"\"\n\nfrom __future__ import annotations\n\nfrom ocrmypdf.pdfinfo._types import Colorspace, Encoding, FloatRect\nfrom ocrmypdf.pdfinfo.info import PageInfo, PdfInfo\n\n__all__ = [\"Colorspace\", \"Encoding\", \"FloatRect\", \"PageInfo\", \"PdfInfo\"]\n"
  },
  {
    "path": "src/ocrmypdf/pdfinfo/_contentstream.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"PDF content stream interpretation.\"\"\"\n\nfrom __future__ import annotations\n\nimport re\nfrom collections import defaultdict\nfrom collections.abc import Mapping\nfrom math import hypot, inf, isclose\nfrom typing import NamedTuple\nfrom warnings import warn\n\nfrom pikepdf import Matrix, Object, PdfInlineImage, parse_content_stream\n\nfrom ocrmypdf.exceptions import InputFileError\nfrom ocrmypdf.helpers import Resolution\nfrom ocrmypdf.pdfinfo._types import UNIT_SQUARE\n\n\nclass XobjectSettings(NamedTuple):\n    \"\"\"Info about an XObject found in a PDF.\"\"\"\n\n    name: str\n    shorthand: tuple[float, float, float, float, float, float]\n    stack_depth: int\n\n\nclass InlineSettings(NamedTuple):\n    \"\"\"Info about an inline image found in a PDF.\"\"\"\n\n    iimage: PdfInlineImage\n    shorthand: tuple[float, float, float, float, float, float]\n    stack_depth: int\n\n\nclass ContentsInfo(NamedTuple):\n    \"\"\"Info about various objects found in a PDF.\"\"\"\n\n    xobject_settings: list[XobjectSettings]\n    inline_images: list[InlineSettings]\n    found_vector: bool\n    found_text: bool\n    name_index: Mapping[str, list[XobjectSettings]]\n\n\nclass TextboxInfo(NamedTuple):\n    \"\"\"Info about a text box found in a PDF.\"\"\"\n\n    bbox: tuple[float, float, float, float]\n    is_visible: bool\n    is_corrupt: bool\n\n\nclass VectorMarker:\n    \"\"\"Sentinel indicating vector drawing operations were found on a page.\"\"\"\n\n\nclass TextMarker:\n    \"\"\"Sentinel indicating text drawing operations were found on a page.\"\"\"\n\n\ndef _is_unit_square(shorthand):\n    \"\"\"Check if the shorthand represents a unit square transformation.\"\"\"\n    values = map(float, shorthand)\n    pairwise = zip(values, UNIT_SQUARE, strict=False)\n    return all(isclose(a, b, rel_tol=1e-3) for a, b in pairwise)\n\n\ndef _normalize_stack(graphobjs):\n    
\"\"\"Convert runs of qQ's in the stack into single graphobjs.\"\"\"\n    for operands, operator in graphobjs:\n        operator = str(operator)\n        if re.match(r'Q*q+$', operator):  # Zero or more Q, one or more q\n            for char in operator:  # Split into individual\n                yield ([], char)  # Yield individual\n        else:\n            yield (operands, operator)\n\n\ndef _interpret_contents(contentstream: Object, initial_shorthand=UNIT_SQUARE):\n    \"\"\"Interpret the PDF content stream.\n\n    The stack represents the state of the PDF graphics stack.  We are only\n    interested in the current transformation matrix (CTM) so we only track\n    this object; a full implementation would need to track many other items.\n\n    The CTM is initialized to the mapping from user space to device space.\n    PDF units are 1/72\".  In a PDF viewer or printer this matrix is initialized\n    to the transformation to device space.  For example if set to\n    (1/72, 0, 0, 1/72, 0, 0) then all units would be calculated in inches.\n\n    Images are always considered to be (0, 0) -> (1, 1).  Before drawing an\n    image there should be a 'cm' that sets up an image coordinate system\n    where drawing from (0, 0) -> (1, 1) will draw on the desired area of the\n    page.\n\n    PDF units suit our needs so we initialize ctm to the identity matrix.\n\n    According to the PDF specification, the maximum stack depth is 32. Other\n    viewers tolerate some amount beyond this.  We issue a warning if the\n    stack depth exceeds the spec limit and set a hard limit beyond this to\n    bound our memory requirements.  
If the stack underflows behavior is\n    undefined in the spec, but we just pretend nothing happened and leave the\n    CTM unchanged.\n    \"\"\"\n    stack = []\n    ctm = Matrix(initial_shorthand)\n    xobject_settings: list[XobjectSettings] = []\n    inline_images: list[InlineSettings] = []\n    name_index = defaultdict(lambda: [])\n    found_vector = False\n    found_text = False\n    vector_ops = set('S s f F f* B B* b b*'.split())\n    text_showing_ops = set(\"\"\"TJ Tj \" '\"\"\".split())\n    image_ops = set('BI ID EI q Q Do cm'.split())\n    operator_whitelist = ' '.join(vector_ops | text_showing_ops | image_ops)\n\n    for n, graphobj in enumerate(\n        _normalize_stack(parse_content_stream(contentstream, operator_whitelist))\n    ):\n        operands, operator = graphobj\n        if operator == 'q':\n            stack.append(ctm)\n            if len(stack) > 32:  # See docstring\n                if len(stack) > 128:\n                    raise RuntimeError(\n                        f\"PDF graphics stack overflowed hard limit at operator {n}\"\n                    )\n                warn(\"PDF graphics stack overflowed spec limit\")\n        elif operator == 'Q':\n            try:\n                ctm = stack.pop()\n            except IndexError:\n                # Keeping the ctm the same seems to be the only sensible thing\n                # to do. Just pretend nothing happened, keep calm and carry on.\n                warn(\"PDF graphics stack underflowed - PDF may be malformed\")\n        elif operator == 'cm':\n            try:\n                ctm = Matrix(operands) @ ctm\n            except ValueError as e:\n                raise InputFileError(\n                    \"PDF content stream is corrupt - this PDF is malformed. 
\"\n                    \"Use a PDF editor that is capable of visually inspecting the PDF.\"\n                ) from e\n        elif operator == 'Do':\n            image_name = operands[0]\n            settings = XobjectSettings(\n                name=image_name, shorthand=ctm.shorthand, stack_depth=len(stack)\n            )\n            xobject_settings.append(settings)\n            name_index[str(image_name)].append(settings)\n        elif operator == 'INLINE IMAGE':  # BI/ID/EI are grouped into this\n            iimage = operands[0]\n            inline = InlineSettings(\n                iimage=iimage, shorthand=ctm.shorthand, stack_depth=len(stack)\n            )\n            inline_images.append(inline)\n        elif operator in vector_ops:\n            found_vector = True\n        elif operator in text_showing_ops:\n            found_text = True\n\n    return ContentsInfo(\n        xobject_settings=xobject_settings,\n        inline_images=inline_images,\n        found_vector=found_vector,\n        found_text=found_text,\n        name_index=name_index,\n    )\n\n\ndef _get_dpi(ctm_shorthand, image_size) -> Resolution:\n    \"\"\"Given the transformation matrix and image size, find the image DPI.\n\n    PDFs do not include image resolution information within image data.\n    Instead, the PDF page content stream describes the location where the\n    image will be rasterized, and the effective resolution is the ratio of the\n    pixel size to raster target size.\n\n    Normally a scanned PDF has the paper size set appropriately but this is\n    not guaranteed. The most common case is a cropped image will change the\n    page size (/CropBox) without altering the page content stream. That means\n    it is not sufficient to assume that the image fills the page, even though\n    that is the most common case.\n\n    A PDF image may be scaled (always), cropped, translated, rotated in place\n    to an arbitrary angle (rarely) and skewed. 
Only equal area mappings can\n    be expressed, that is, it is not necessary to consider distortions where\n    the effective DPI varies with position.\n\n    To determine the image scale, transform an offset axis vector v0 (0, 0),\n    width-axis vector vw (1, 0), height-axis vector vh (0, 1) with the matrix,\n    which gives the dimensions of the image in PDF units. From there we can\n    compare to actual image dimensions. PDF uses\n    row vector * matrix_transposed unlike the traditional\n    matrix * column vector.\n\n    The offset, width and height vectors can be combined in a matrix and\n    multiplied by the transform matrix. Then we want to calculate\n        magnitude(width_vector - offset_vector)\n    and\n        magnitude(height_vector - offset_vector)\n\n    When the above is worked out algebraically, the effect of translation\n    cancels out, and the vector magnitudes become functions of the nonzero\n    transformation matrix indices. The results of the derivation are used\n    in this code.\n\n    pdfimages -list does calculate the DPI in some way that is not completely\n    naive, but it does not get the DPI of rotated images right, so cannot be\n    used anymore to validate this. Photoshop works, or using Acrobat to\n    rotate the image back to normal.\n\n    It does not matter if the image is partially cropped, or even out of the\n    /MediaBox.\n\n    \"\"\"\n    a, b, c, d, _, _ = ctm_shorthand  # pylint: disable=invalid-name\n\n    # Calculate the width and height of the image in PDF units\n    image_drawn = hypot(a, b), hypot(c, d)\n\n    def calc(drawn, pixels, inches_per_pt=72.0):\n        # The scale of the image is pixels per unit of default user space (1/72\")\n        scale = pixels / drawn if drawn != 0 else inf\n        dpi = scale * inches_per_pt\n        return dpi\n\n    dpi_w, dpi_h = (calc(image_drawn[n], image_size[n]) for n in range(2))\n    return Resolution(dpi_w, dpi_h)\n"
  },
  {
    "path": "src/ocrmypdf/pdfinfo/_image.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"PDF image analysis.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom collections.abc import Iterator\nfrom decimal import Decimal\n\nfrom pikepdf import (\n    Dictionary,\n    Matrix,\n    Name,\n    Object,\n    Pdf,\n    PdfImage,\n    PdfInlineImage,\n    Stream,\n    UnsupportedImageTypeError,\n)\n\nfrom ocrmypdf.helpers import Resolution\nfrom ocrmypdf.pdfinfo._contentstream import (\n    ContentsInfo,\n    TextMarker,\n    VectorMarker,\n    _get_dpi,\n    _interpret_contents,\n    _is_unit_square,\n)\nfrom ocrmypdf.pdfinfo._types import (\n    FRIENDLY_COLORSPACE,\n    FRIENDLY_COMP,\n    FRIENDLY_ENCODING,\n    UNIT_SQUARE,\n    Colorspace,\n    Encoding,\n)\n\nlogger = logging.getLogger()\n\n\nclass ImageInfo:\n    \"\"\"Information about an image found in a PDF.\n\n    This gathers information from pikepdf and pdfminer.six, and is pickle-able\n    so that it can be passed to a worker process, unlike objects from those\n    libraries.\n    \"\"\"\n\n    DPI_PREC = Decimal('1.000')\n\n    _comp: int | None\n    _name: str\n\n    def __init__(\n        self,\n        *,\n        name='',\n        pdfimage: Object | None = None,\n        inline: PdfInlineImage | None = None,\n        shorthand=None,\n    ):\n        \"\"\"Initialize an ImageInfo.\"\"\"\n        self._name = str(name)\n        self._shorthand = shorthand\n\n        pim: PdfInlineImage | PdfImage\n\n        if inline is not None:\n            self._origin = 'inline'\n            pim = inline\n        elif pdfimage is not None and isinstance(pdfimage, Stream):\n            self._origin = 'xobject'\n            pim = PdfImage(pdfimage)\n        else:\n            raise ValueError(\"Either pdfimage or inline must be set\")\n\n        self._width = pim.width\n        self._height = pim.height\n        if (smask := pim.obj.get(Name.SMask, None)) is not None and isinstance(\n  
          smask, Stream | Dictionary\n        ):\n            # SMask is pretty much an alpha channel, but in PDF it's possible\n            # for channel to have different dimensions than the image\n            # itself. Some PDF writers use this to create a grayscale stencil\n            # mask. For our purposes, the effective size is the size of the\n            # larger component (image or smask).\n            self._width = max(smask.get(Name.Width, 0), self._width)\n            self._height = max(smask.get(Name.Height, 0), self._height)\n        if (mask := pim.obj.get(Name.Mask, None)) is not None and isinstance(\n            mask, Stream | Dictionary\n        ):\n            # If the image has a /Mask entry, it has an explicit mask.\n            # /Mask can be a Stream or an Array. If it's a Stream,\n            # use its /Width and /Height if they are larger than the main\n            # image's.\n            self._width = max(mask.get(Name.Width, 0), self._width)\n            self._height = max(mask.get(Name.Height, 0), self._height)\n\n        # If /ImageMask is true, then this image is a stencil mask\n        # (Images that draw with this stencil mask will have a reference to\n        # it in their /Mask, but we don't actually need that information)\n        if pim.image_mask:\n            self._type = 'stencil'\n        else:\n            self._type = 'image'\n\n        self._bpc = int(pim.bits_per_component)\n        if (\n            len(pim.filters) == 2\n            and pim.filters[0] == '/FlateDecode'\n            and pim.filters[1] == '/DCTDecode'\n        ):\n            # Special case: FlateDecode followed by DCTDecode\n            self._enc = Encoding.flate_jpeg\n        else:\n            try:\n                self._enc = FRIENDLY_ENCODING.get(pim.filters[0])\n            except IndexError:\n                self._enc = None\n\n        try:\n            self._color = FRIENDLY_COLORSPACE.get(pim.colorspace or '')\n        except 
NotImplementedError:\n            self._color = None\n        if self._enc == Encoding.jpeg2000:\n            self._color = Colorspace.jpeg2000\n\n        self._comp = None\n        if self._color == Colorspace.icc and isinstance(pim, PdfImage):\n            self._comp = self._init_icc(pim)\n        else:\n            if isinstance(self._color, Colorspace):\n                self._comp = FRIENDLY_COMP.get(self._color)\n            # Bit of a hack... infer grayscale if component count is uncertain\n            # but encoding only supports monochrome.\n            if self._comp is None and self._enc in (Encoding.ccitt, Encoding.jbig2):\n                self._comp = FRIENDLY_COMP[Colorspace.gray]\n\n    def _init_icc(self, pim: PdfImage):\n        try:\n            icc = pim.icc\n        except UnsupportedImageTypeError as e:\n            logger.warning(\n                f\"An image with a corrupt or unreadable ICC profile was found. \"\n                f\"Output PDF may not match the input PDF visually: {e}. {self}\"\n            )\n            return None\n        # Check the ICC profile to determine actual colorspace\n        if icc is None or not hasattr(icc, 'profile'):\n            logger.warning(\n                f\"An image with an ICC profile but no ICC profile data was found. \"\n                f\"The output PDF may not match the input PDF visually. 
{self}\"\n            )\n            return None\n        try:\n            if icc.profile.xcolor_space == 'GRAY':\n                return 1\n            elif icc.profile.xcolor_space == 'CMYK':\n                return 4\n            else:\n                return 3\n        except AttributeError:\n            return None\n\n    @property\n    def name(self):\n        \"\"\"Name of the image as it appears in the PDF.\"\"\"\n        return self._name\n\n    @property\n    def type_(self):\n        \"\"\"Type of image, either 'image' or 'stencil'.\"\"\"\n        return self._type\n\n    @property\n    def width(self) -> int:\n        \"\"\"Width of the image in pixels.\"\"\"\n        return self._width\n\n    @property\n    def height(self) -> int:\n        \"\"\"Height of the image in pixels.\"\"\"\n        return self._height\n\n    @property\n    def bpc(self):\n        \"\"\"Bits per component.\"\"\"\n        return self._bpc\n\n    @property\n    def color(self):\n        \"\"\"Colorspace of the image.\"\"\"\n        return self._color if self._color is not None else '?'\n\n    @property\n    def comp(self):\n        \"\"\"Number of components/channels in the image.\"\"\"\n        return self._comp if self._comp is not None else '?'\n\n    @property\n    def enc(self):\n        \"\"\"Encoding of the image.\"\"\"\n        return self._enc if self._enc is not None else 'image'\n\n    @property\n    def renderable(self) -> bool:\n        \"\"\"Whether the image is renderable.\n\n        Some PDFs in the wild have invalid images that are not renderable,\n        due to unusual dimensions.\n\n        Stencil masks are not also not renderable, since they are not\n        drawn, but rather they control how rendering happens.\n        \"\"\"\n        return (\n            self.dpi.is_finite\n            and self.width >= 0\n            and self.height >= 0\n            and self.type_ != 'stencil'\n        )\n\n    @property\n    def dpi(self) -> Resolution:\n        
\"\"\"Dots per inch of the image.\n\n        Calculated based on where and how the image is drawn in the PDF.\n        \"\"\"\n        return _get_dpi(self._shorthand, (self._width, self._height))\n\n    @property\n    def printed_area(self) -> float:\n        \"\"\"Physical area of the image in square inches.\"\"\"\n        if not self.renderable:\n            return 0.0\n        return float((self.width / self.dpi.x) * (self.height / self.dpi.y))\n\n    def __repr__(self):\n        \"\"\"Return a string representation of the image.\"\"\"\n        return (\n            f\"<ImageInfo '{self.name}' {self.type_} {self.width}×{self.height} \"\n            f\"{self.color} {self.comp} {self.bpc} {self.enc} {self.dpi}>\"\n        )\n\n\ndef _find_inline_images(contentsinfo: ContentsInfo) -> Iterator[ImageInfo]:\n    \"\"\"Find inline images in the contentstream.\"\"\"\n    for n, inline in enumerate(contentsinfo.inline_images):\n        yield ImageInfo(\n            name=f'inline-{n:02d}', shorthand=inline.shorthand, inline=inline.iimage\n        )\n\n\ndef _image_xobjects(container) -> Iterator[tuple[Object, str]]:\n    \"\"\"Search for all XObject-based images in the container.\n\n    Usually the container is a page, but it could also be a Form XObject\n    that contains images. 
Filter out the Form XObjects which are dealt with\n    elsewhere.\n\n    Generate a sequence of tuples (image, xobj), where image is the object\n    itself and xobj is the name of the object, since the object does not\n    know its own name.\n\n    \"\"\"\n    if Name.Resources not in container:\n        return\n    resources = container[Name.Resources]\n    if Name.XObject not in resources:\n        return\n    for key, candidate in resources[Name.XObject].items():\n        if candidate is None or Name.Subtype not in candidate:\n            continue\n        if candidate[Name.Subtype] == Name.Image:\n            pdfimage = candidate\n            yield (pdfimage, key)\n\n\ndef _find_regular_images(\n    container: Object, contentsinfo: ContentsInfo\n) -> Iterator[ImageInfo]:\n    \"\"\"Find images stored in the container's /Resources /XObject.\n\n    Usually the container is a page, but it could also be a Form XObject\n    that contains images.\n\n    Generates images with their DPI at time of drawing.\n    \"\"\"\n    for pdfimage, xobj in _image_xobjects(container):\n        if xobj not in contentsinfo.name_index:\n            continue\n        for draw in contentsinfo.name_index[xobj]:\n            if draw.stack_depth == 0 and _is_unit_square(draw.shorthand):\n                # At least one PDF in the wild (and test suite) draws an image\n                # when the graphics stack depth is 0, meaning that the image\n                # gets drawn into a square of 1x1 PDF units (or 1/72\",\n                # or 0.35 mm).  The equivalent DPI will be >100,000.  
Exclude\n                # these from our DPI calculation for the page.\n                continue\n\n            yield ImageInfo(name=draw.name, pdfimage=pdfimage, shorthand=draw.shorthand)\n\n\ndef _find_form_xobject_images(pdf: Pdf, container: Object, contentsinfo: ContentsInfo):\n    \"\"\"Find any images that are in Form XObjects in the container.\n\n    The container may be a page, or a parent Form XObject.\n\n    \"\"\"\n    if Name.Resources not in container:\n        return\n    resources = container[Name.Resources]\n    if Name.XObject not in resources:\n        return\n    xobjs = resources[Name.XObject].as_dict()\n    for xobj in xobjs:\n        candidate = xobjs[xobj]\n        if candidate is None or candidate.get(Name.Subtype) != Name.Form:\n            continue\n\n        form_xobject = candidate\n        for settings in contentsinfo.xobject_settings:\n            if settings.name != xobj:\n                continue\n\n            # Find images once for each time this Form XObject is drawn.\n            # This could be optimized to cache the multiple drawing events\n            # but in practice both Form XObjects and multiple drawing of the\n            # same object are both very rare.\n            ctm_shorthand = settings.shorthand\n            yield from _process_content_streams(\n                pdf=pdf, container=form_xobject, shorthand=ctm_shorthand\n            )\n\n\ndef _process_content_streams(\n    *, pdf: Pdf, container: Object, shorthand=None\n) -> Iterator[VectorMarker | TextMarker | ImageInfo]:\n    \"\"\"Find all individual instances of images drawn in the container.\n\n    Usually the container is a page, but it may also be a Form XObject.\n\n    On a typical page images are stored inline or as regular images\n    in an XObject.\n\n    Form XObjects may include inline images, XObject images,\n    and recursively, other Form XObjects; and also vector graphic objects.\n\n    Every instance of an image being drawn somewhere is flattened 
and\n    treated as a unique image, since if the same image is drawn multiple times\n    on one page it may be drawn at differing resolutions, and our objective\n    is to find the resolution at which the page can be rastered without\n    downsampling.\n\n    \"\"\"\n    if container.get(Name.Type) == Name.Page and Name.Contents in container:\n        initial_shorthand = shorthand or UNIT_SQUARE\n    elif (\n        container.get(Name.Type) == Name.XObject\n        and container[Name.Subtype] == Name.Form\n    ):\n        # Set the CTM to the state it was when the \"Do\" operator was\n        # encountered that is drawing this instance of the Form XObject\n        ctm = Matrix(shorthand) if shorthand else Matrix()\n\n        # A Form XObject may provide its own matrix to map form space into\n        # user space. Get this if one exists\n        form_shorthand = container.get(Name.Matrix, Matrix())\n        form_matrix = Matrix(form_shorthand)\n\n        # Concatenate form matrix with CTM to ensure CTM is correct for\n        # drawing this instance of the XObject\n        ctm = form_matrix @ ctm\n        initial_shorthand = ctm.shorthand\n    else:\n        return\n\n    contentsinfo = _interpret_contents(container, initial_shorthand)\n\n    if contentsinfo.found_vector:\n        yield VectorMarker()\n    if contentsinfo.found_text:\n        yield TextMarker()\n    yield from _find_inline_images(contentsinfo)\n    yield from _find_regular_images(container, contentsinfo)\n    yield from _find_form_xobject_images(pdf, container, contentsinfo)\n"
  },
  {
    "path": "src/ocrmypdf/pdfinfo/_types.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"PDF type definitions and constants.\"\"\"\n\nfrom __future__ import annotations\n\nfrom enum import Enum, auto\n\n\nclass Colorspace(Enum):\n    \"\"\"Description of common image colorspaces in a PDF.\"\"\"\n\n    # pylint: disable=invalid-name\n    gray = auto()\n    rgb = auto()\n    cmyk = auto()\n    lab = auto()\n    icc = auto()\n    index = auto()\n    sep = auto()\n    devn = auto()\n    pattern = auto()\n    jpeg2000 = auto()\n\n\nclass Encoding(Enum):\n    \"\"\"Description of common image encodings in a PDF.\"\"\"\n\n    # pylint: disable=invalid-name\n    ccitt = auto()\n    jpeg = auto()\n    jpeg2000 = auto()\n    jbig2 = auto()\n    asciihex = auto()\n    ascii85 = auto()\n    lzw = auto()\n    flate = auto()\n    runlength = auto()\n    flate_jpeg = auto()\n\n\nFloatRect = tuple[float, float, float, float]\n\nFRIENDLY_COLORSPACE: dict[str, Colorspace] = {\n    '/DeviceGray': Colorspace.gray,\n    '/CalGray': Colorspace.gray,\n    '/DeviceRGB': Colorspace.rgb,\n    '/CalRGB': Colorspace.rgb,\n    '/DeviceCMYK': Colorspace.cmyk,\n    '/Lab': Colorspace.lab,\n    '/ICCBased': Colorspace.icc,\n    '/Indexed': Colorspace.index,\n    '/Separation': Colorspace.sep,\n    '/DeviceN': Colorspace.devn,\n    '/Pattern': Colorspace.pattern,\n    '/G': Colorspace.gray,  # Abbreviations permitted in inline images\n    '/RGB': Colorspace.rgb,\n    '/CMYK': Colorspace.cmyk,\n    '/I': Colorspace.index,\n}\n\nFRIENDLY_ENCODING: dict[str, Encoding] = {\n    '/CCITTFaxDecode': Encoding.ccitt,\n    '/DCTDecode': Encoding.jpeg,\n    '/JPXDecode': Encoding.jpeg2000,\n    '/JBIG2Decode': Encoding.jbig2,\n    '/CCF': Encoding.ccitt,  # Abbreviations permitted in inline images\n    '/DCT': Encoding.jpeg,\n    '/AHx': Encoding.asciihex,\n    '/A85': Encoding.ascii85,\n    '/LZW': Encoding.lzw,\n    '/Fl': Encoding.flate,\n    '/RL': 
Encoding.runlength,\n}\n\nFRIENDLY_COMP: dict[Colorspace, int] = {\n    Colorspace.gray: 1,\n    Colorspace.rgb: 3,\n    Colorspace.cmyk: 4,\n    Colorspace.lab: 3,\n    Colorspace.index: 1,\n}\n\nUNIT_SQUARE = (1.0, 0.0, 0.0, 1.0, 0.0, 0.0)\n"
  },
  {
    "path": "src/ocrmypdf/pdfinfo/_worker.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"PDF page info worker process handling.\"\"\"\n\nfrom __future__ import annotations\n\nimport atexit\nimport logging\nfrom collections.abc import Container, Sequence\nfrom contextlib import contextmanager\nfrom functools import partial\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING\n\nfrom pikepdf import Pdf\n\nfrom ocrmypdf._concurrent import Executor\nfrom ocrmypdf._progressbar import ProgressBar\nfrom ocrmypdf.exceptions import InputFileError\nfrom ocrmypdf.helpers import available_cpu_count, pikepdf_enable_mmap\n\nif TYPE_CHECKING:\n    from ocrmypdf.pdfinfo.info import PageInfo\n    from ocrmypdf.pdfinfo.layout import PdfMinerState\n\nlogger = logging.getLogger()\n\nworker_pdf = None  # pylint: disable=invalid-name\n\n\ndef _pdf_pageinfo_sync_init(pdf: Pdf, infile: Path, pdfminer_loglevel):\n    global worker_pdf  # pylint: disable=global-statement,invalid-name\n    pikepdf_enable_mmap()\n\n    logging.getLogger('pdfminer').setLevel(pdfminer_loglevel)\n\n    # If the pdf is not opened, open a copy for our worker process to use\n    if pdf is None:\n        worker_pdf = Pdf.open(infile)\n\n        def on_process_close():\n            worker_pdf.close()\n\n        # Close when this process exits\n        atexit.register(on_process_close)\n\n\n@contextmanager\ndef _pdf_pageinfo_sync_pdf(thread_pdf: Pdf | None, infile: Path):\n    if thread_pdf is not None:\n        yield thread_pdf\n    elif worker_pdf is not None:\n        yield worker_pdf\n    else:\n        with Pdf.open(infile) as pdf:\n            yield pdf\n\n\ndef _pdf_pageinfo_sync(\n    pageno: int,\n    thread_pdf: Pdf | None,\n    infile: Path,\n    check_pages: Container[int],\n    detailed_analysis: bool,\n    miner_state: PdfMinerState | None,\n) -> PageInfo:\n    # Import here to avoid circular import - info.py imports this module,\n    # but PageInfo is defined in info.py\n    from 
ocrmypdf.pdfinfo.info import PageInfo\n\n    with _pdf_pageinfo_sync_pdf(thread_pdf, infile) as pdf:\n        return PageInfo(\n            pdf, pageno, infile, check_pages, detailed_analysis, miner_state\n        )\n\n\ndef _pdf_pageinfo_concurrent(\n    pdf,\n    executor: Executor,\n    max_workers: int,\n    use_threads: bool,\n    infile,\n    progbar,\n    check_pages,\n    detailed_analysis: bool = False,\n    miner_state: PdfMinerState | None = None,\n) -> Sequence[PageInfo | None]:\n    pages: list[PageInfo | None] = [None] * len(pdf.pages)\n\n    def update_pageinfo(page: PageInfo, pbar: ProgressBar):\n        if not page:\n            raise InputFileError(\"Could read a page in the PDF\")\n        pages[page.pageno] = page\n        pbar.update()\n\n    if max_workers is None:\n        max_workers = available_cpu_count()\n\n    total = len(pdf.pages)\n\n    n_workers = min(1 + len(pages) // 4, max_workers)\n    if n_workers == 1:\n        # If we decided on only one worker, there is no point in using\n        # a separate process.\n        use_threads = True\n\n    if use_threads and n_workers > 1:\n        # If we are using threads, there is no point in using more than one\n        # worker thread - they will just fight over the GIL.\n        n_workers = 1\n\n    # If we use a thread, we can pass the already-open Pdf for them to use\n    # If we use processes, we pass a None which tells the init function to open its\n    # own\n    initial_pdf = pdf if use_threads else None\n\n    contexts = (\n        (n, initial_pdf, infile, check_pages, detailed_analysis, miner_state)\n        for n in range(total)\n    )\n    assert n_workers == 1 if use_threads else n_workers >= 1, \"Not multithreadable\"\n    logger.debug(\n        f\"Gathering info with {n_workers} \"\n        + ('thread' if use_threads else 'process')\n        + \" workers\"\n    )\n    executor(\n        use_threads=use_threads,\n        max_workers=n_workers,\n        progress_kwargs=dict(\n    
        total=total, desc=\"Scanning contents\", unit='page', disable=not progbar\n        ),\n        worker_initializer=partial(\n            _pdf_pageinfo_sync_init,\n            initial_pdf,\n            infile,\n            logging.getLogger('pdfminer').level,\n        ),\n        task=_pdf_pageinfo_sync,\n        task_arguments=contexts,\n        task_finished=update_pageinfo,\n    )\n    return pages\n"
  },
  {
    "path": "src/ocrmypdf/pdfinfo/info.py",
    "content": "#!/usr/bin/env python3\n\n# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"Extract information about the content of a PDF.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport statistics\nfrom collections.abc import Callable, Container, Iterable, Iterator\nfrom contextlib import nullcontext\nfrom decimal import Decimal\nfrom os import PathLike\nfrom pathlib import Path\nfrom typing import NamedTuple\n\nfrom pdfminer.layout import LTPage, LTTextBox\nfrom pikepdf import Name, Page, Pdf\n\nfrom ocrmypdf._concurrent import Executor, SerialExecutor\nfrom ocrmypdf.exceptions import EncryptedPdfError\nfrom ocrmypdf.helpers import Resolution\nfrom ocrmypdf.pdfinfo._contentstream import TextboxInfo, TextMarker, VectorMarker\nfrom ocrmypdf.pdfinfo._image import ImageInfo, _process_content_streams\nfrom ocrmypdf.pdfinfo._types import FloatRect\nfrom ocrmypdf.pdfinfo._worker import _pdf_pageinfo_concurrent\nfrom ocrmypdf.pdfinfo.layout import (\n    LTStateAwareChar,\n    PdfMinerState,\n    get_text_boxes,\n)\n\nlogger = logging.getLogger()\n\n\ndef _page_has_text(text_blocks: Iterable[FloatRect], page_width, page_height) -> bool:\n    \"\"\"Smarter text detection that ignores text in margins.\"\"\"\n    pw, ph = float(page_width), float(page_height)  # pylint: disable=invalid-name\n\n    margin_ratio = 0.125\n    interior_bbox = (\n        margin_ratio * pw,  # left\n        (1 - margin_ratio) * ph,  # top\n        (1 - margin_ratio) * pw,  # right\n        margin_ratio * ph,  # bottom  (first quadrant: bottom < top)\n    )\n\n    def rects_intersect(a: FloatRect, b: FloatRect) -> bool:\n        \"\"\"Check if two 4-tuple rects intersect.\n\n        Where (a,b) are 4-tuple rects (left-0, top-1, right-2, bottom-3)\n        https://stackoverflow.com/questions/306316/determine-if-two-rectangles-overlap-each-other\n        Formula assumes all boxes are in first quadrant.\n        \"\"\"\n        return 
a[0] < b[2] and a[2] > b[0] and a[1] > b[3] and a[3] < b[1]\n\n    has_text = False\n    for bbox in text_blocks:\n        if rects_intersect(bbox, interior_bbox):\n            has_text = True\n            break\n    return has_text\n\n\ndef simplify_textboxes(\n    miner_page: LTPage, textbox_getter: Callable[[LTPage], Iterator[LTTextBox]]\n) -> Iterator[TextboxInfo]:\n    \"\"\"Extract only limited content from text boxes.\n\n    We do this to save memory and ensure that our objects are pickleable.\n    \"\"\"\n    for box in textbox_getter(miner_page):\n        first_line = box._objs[0]  # pylint: disable=protected-access\n        first_char = first_line._objs[0]  # pylint: disable=protected-access\n        if not isinstance(first_char, LTStateAwareChar):\n            continue\n        visible = first_char.rendermode != 3\n        corrupt = first_char.get_text() == '\\ufffd'\n        yield TextboxInfo(box.bbox, visible, corrupt)\n\n\nclass PageResolutionProfile(NamedTuple):\n    \"\"\"Information about the resolutions of a page.\"\"\"\n\n    weighted_dpi: float\n    \"\"\"The weighted average DPI of the page, weighted by the area of each image.\"\"\"\n\n    max_dpi: float\n    \"\"\"The maximum DPI of an image on the page.\"\"\"\n\n    average_to_max_dpi_ratio: float\n    \"\"\"The average DPI of the page divided by the maximum DPI of the page.\n\n    This indicates the intensity of the resolution variation on the page.\n\n    If the average is 1.0 or close to 1.0, the page has all of its content at a\n    uniform resolution. 
If the average is much lower than 1.0, some content is at a\n    higher resolution than the rest of the page.\n    \"\"\"\n\n    area_ratio: float\n    \"\"\"The maximum-DPI area of the page divided by the total drawn area.\n\n    This indicates the prevalence of high-resolution content on the page.\n    \"\"\"\n\n\nclass PageInfo:\n    \"\"\"Information about type of contents on each page in a PDF.\"\"\"\n\n    _has_text: bool | None\n    _has_vector: bool | None\n    _images: list[ImageInfo] = []\n\n    def __init__(\n        self,\n        pdf: Pdf,\n        pageno: int,\n        infile: PathLike,\n        check_pages: Container[int],\n        detailed_analysis: bool = False,\n        miner_state: PdfMinerState | None = None,\n    ):\n        \"\"\"Initialize a PageInfo object.\"\"\"\n        self._pageno = pageno\n        self._infile = infile\n        self._detailed_analysis = detailed_analysis\n        self._gather_pageinfo(\n            pdf, pageno, infile, check_pages, detailed_analysis, miner_state\n        )\n\n    def _gather_pageinfo(\n        self,\n        pdf: Pdf,\n        pageno: int,\n        infile: PathLike,\n        check_pages: Container[int],\n        detailed_analysis: bool,\n        miner_state: PdfMinerState | None,\n    ):\n        page: Page = pdf.pages[pageno]\n        mediabox = [Decimal(d) for d in page.mediabox.as_list()]\n        width_pt = mediabox[2] - mediabox[0]\n        height_pt = mediabox[3] - mediabox[1]\n\n        self._artbox = [float(d) for d in page.artbox.as_list()]\n        self._bleedbox = [float(d) for d in page.bleedbox.as_list()]\n        self._cropbox = [float(d) for d in page.cropbox.as_list()]\n        self._mediabox = [float(d) for d in page.mediabox.as_list()]\n        self._trimbox = [float(d) for d in page.trimbox.as_list()]\n\n        check_this_page = pageno in check_pages\n\n        if check_this_page and detailed_analysis:\n            page_analysis = miner_state.get_page_analysis(pageno)\n            if 
page_analysis is not None:\n                self._textboxes = list(\n                    simplify_textboxes(page_analysis, get_text_boxes)\n                )\n            else:\n                self._textboxes = []\n            bboxes = (box.bbox for box in self._textboxes)\n\n            self._has_text = _page_has_text(bboxes, width_pt, height_pt)\n        else:\n            self._textboxes = []\n            self._has_text = None  # i.e. \"no information\"\n\n        userunit = page.get(Name.UserUnit, Decimal(1.0))\n        if not isinstance(userunit, Decimal):\n            userunit = Decimal(userunit)\n        self._userunit = userunit\n        self._width_inches = width_pt * userunit / Decimal(72.0)\n        self._height_inches = height_pt * userunit / Decimal(72.0)\n        self._rotate = int(getattr(page.obj, 'Rotate', 0))\n\n        userunit_shorthand = (userunit, 0, 0, userunit, 0, 0)\n\n        if check_this_page:\n            self._has_vector = False\n            self._has_text = False\n            self._images = []\n            for info in _process_content_streams(\n                pdf=pdf, container=page, shorthand=userunit_shorthand\n            ):\n                if isinstance(info, VectorMarker):\n                    self._has_vector = True\n                elif isinstance(info, TextMarker):\n                    self._has_text = True\n                elif isinstance(info, ImageInfo):\n                    self._images.append(info)\n                else:\n                    raise NotImplementedError()\n        else:\n            self._has_vector = None  # i.e. 
\"no information\"\n            self._has_text = None\n            self._images = []\n\n        self._dpi = None\n        if self._images:\n            dpi = Resolution(0.0, 0.0).take_max(\n                image.dpi for image in self._images if image.renderable\n            )\n            self._dpi = dpi\n            self._width_pixels = int(round(dpi.x * float(self._width_inches)))\n            self._height_pixels = int(round(dpi.y * float(self._height_inches)))\n\n    @property\n    def pageno(self) -> int:\n        \"\"\"Return page number (0-based).\"\"\"\n        return self._pageno\n\n    @property\n    def has_text(self) -> bool:\n        \"\"\"Return True if page has text, False if not or unknown.\"\"\"\n        return bool(self._has_text)\n\n    @property\n    def has_corrupt_text(self) -> bool:\n        \"\"\"Return True if page has corrupt text, False if not or unknown.\"\"\"\n        if not self._detailed_analysis:\n            raise NotImplementedError('Did not do detailed analysis')\n        return any(tbox.is_corrupt for tbox in self._textboxes)\n\n    @property\n    def has_vector(self) -> bool:\n        \"\"\"Return True if page has vector graphics, False if not or unknown.\n\n        Vector graphics are sometimes used to draw fonts, so it may not be\n        obvious on visual inspection whether a page has text or not.\n        \"\"\"\n        return bool(self._has_vector)\n\n    @property\n    def width_inches(self) -> Decimal:\n        \"\"\"Return width of page in inches.\"\"\"\n        return self._width_inches\n\n    @property\n    def height_inches(self) -> Decimal:\n        \"\"\"Return height of page in inches.\"\"\"\n        return self._height_inches\n\n    @property\n    def width_pixels(self) -> int:\n        \"\"\"Return width of page in pixels.\"\"\"\n        return int(round(float(self.width_inches) * self.dpi.x))\n\n    @property\n    def height_pixels(self) -> int:\n        \"\"\"Return height of page in pixels.\"\"\"\n        
return int(round(float(self.height_inches) * self.dpi.y))\n\n    @property\n    def rotation(self) -> int:\n        \"\"\"Return rotation of page in degrees.\n\n        Will only be a multiple of 90.\n        \"\"\"\n        return self._rotate\n\n    @rotation.setter\n    def rotation(self, value):\n        if value in (0, 90, 180, 270, 360, -90, -180, -270):\n            self._rotate = value\n        else:\n            raise ValueError(\"rotation must be a cardinal angle\")\n\n    @property\n    def cropbox(self) -> FloatRect:\n        \"\"\"Return cropbox of page in PDF coordinates.\"\"\"\n        return self._cropbox\n\n    @property\n    def mediabox(self) -> FloatRect:\n        \"\"\"Return mediabox of page in PDF coordinates.\"\"\"\n        return self._mediabox\n\n    @property\n    def trimbox(self) -> FloatRect:\n        \"\"\"Return trimbox of page in PDF coordinates.\"\"\"\n        return self._trimbox\n\n    @property\n    def artbox(self) -> FloatRect:\n        \"\"\"Return artbox of page in PDF coordinates.\"\"\"\n        return self._artbox\n\n    @property\n    def bleedbox(self) -> FloatRect:\n        \"\"\"Return bleedbox of page in PDF coordinates.\"\"\"\n        return self._bleedbox\n\n    @property\n    def images(self) -> list[ImageInfo]:\n        \"\"\"Return images.\"\"\"\n        return self._images\n\n    def get_textareas(self, visible: bool | None = None, corrupt: bool | None = None):\n        \"\"\"Return textareas bounding boxes in PDF coordinates on the page.\"\"\"\n\n        def predicate(\n            obj: TextboxInfo, want_visible: bool | None, want_corrupt: bool | None\n        ) -> bool:\n            result = True\n            if want_visible is not None and obj.is_visible != want_visible:\n                result = False\n            if want_corrupt is not None and obj.is_corrupt != want_corrupt:\n                result = False\n            return result\n\n        if not self._textboxes:\n            if visible is not None and 
corrupt is not None:\n                raise NotImplementedError('Incomplete information on textboxes')\n            return self._textboxes\n\n        return (obj.bbox for obj in self._textboxes if predicate(obj, visible, corrupt))\n\n    @property\n    def dpi(self) -> Resolution:\n        \"\"\"Return DPI needed to render all images on the page.\"\"\"\n        if self._dpi is None:\n            return Resolution(0.0, 0.0)\n        return self._dpi\n\n    @property\n    def userunit(self) -> Decimal:\n        \"\"\"Return user unit of page.\"\"\"\n        return self._userunit\n\n    @property\n    def min_version(self) -> str:\n        \"\"\"Return minimum PDF version needed to render this page.\"\"\"\n        if self.userunit is not None:\n            return '1.6'\n        else:\n            return '1.5'\n\n    def page_dpi_profile(self) -> PageResolutionProfile | None:\n        \"\"\"Return information about the DPIs of the page.\n\n        This is useful to detect pages with a small proportion of high-resolution\n        content that is forcing us to use a high DPI for the whole page. The ratio\n        is weighted by the area of each image. 
If images overlap, the overlapped\n        area counts.\n\n        Vector graphics and text are ignored.\n\n        Returns None if there is no meaningful DPI for the page.\n        \"\"\"\n        image_dpis = []\n        image_areas = []\n        for image in self._images:\n            if not image.renderable:\n                continue\n            image_dpis.append(image.dpi.to_scalar())\n            image_areas.append(image.printed_area)\n\n        total_drawn_area = sum(image_areas)\n        if total_drawn_area == 0:\n            return None\n\n        weights = [area / total_drawn_area for area in image_areas]\n        # Calculate harmonic mean of DPIs weighted by area\n        weighted_dpi = statistics.harmonic_mean(image_dpis, weights)\n        max_dpi = max(image_dpis)\n        dpi_average_max_ratio = weighted_dpi / max_dpi\n\n        arg_max_dpi = image_dpis.index(max_dpi)\n        max_area_ratio = image_areas[arg_max_dpi] / total_drawn_area\n        return PageResolutionProfile(\n            weighted_dpi,\n            max_dpi,\n            dpi_average_max_ratio,\n            max_area_ratio,\n        )\n\n    def __repr__(self):\n        \"\"\"Return string representation.\"\"\"\n        return (\n            f'<PageInfo '\n            f'pageno={self.pageno} {self.width_inches}\"x{self.height_inches}\" '\n            f'rotation={self.rotation} dpi={self.dpi} has_text={self.has_text}>'\n        )\n\n\nDEFAULT_EXECUTOR = SerialExecutor()\n\n\nclass PdfInfo:\n    \"\"\"Extract summary information about a PDF without retaining the PDF itself.\n\n    Crucially this lets us get the information in a pure Python format so that\n    it can be pickled and passed to a worker process.\n    \"\"\"\n\n    _has_acroform: bool = False\n    _has_signature: bool = False\n    _needs_rendering: bool = False\n\n    def __init__(\n        self,\n        infile: Path,\n        *,\n        detailed_analysis: bool = False,\n        progbar: bool = False,\n        max_workers: int 
| None = None,\n        use_threads: bool = True,\n        check_pages=None,\n        executor: Executor = DEFAULT_EXECUTOR,\n    ):\n        \"\"\"Initialize.\"\"\"\n        self._infile = infile\n        if check_pages is None:\n            check_pages = range(0, 1_000_000_000)\n\n        with Pdf.open(infile) as pdf:\n            if pdf.is_encrypted:\n                raise EncryptedPdfError()  # Triggered by encryption with empty passwd\n            pscript5_mode = str(pdf.docinfo.get(Name.Creator, \"\")).startswith(\n                'PScript5'\n            )\n            self._miner_state = (\n                PdfMinerState(infile, pscript5_mode)\n                if detailed_analysis\n                else nullcontext()\n            )\n            with self._miner_state as miner_state:\n                self._pages = _pdf_pageinfo_concurrent(\n                    pdf,\n                    executor,\n                    max_workers,\n                    use_threads,\n                    infile,\n                    progbar,\n                    check_pages=check_pages,\n                    detailed_analysis=detailed_analysis,\n                    miner_state=miner_state,\n                )\n            self._needs_rendering = pdf.Root.get(Name.NeedsRendering, False)\n            if Name.AcroForm in pdf.Root:\n                if (\n                    len(pdf.Root.AcroForm.get(Name.Fields, [])) > 0\n                    or Name.XFA in pdf.Root.AcroForm\n                ):\n                    self._has_acroform = True\n                self._has_signature = bool(pdf.Root.AcroForm.get(Name.SigFlags, 0) & 1)\n            self._is_tagged = bool(\n                pdf.Root.get(Name.MarkInfo, {}).get(Name.Marked, False)\n            )\n\n    @property\n    def pages(self) -> list[PageInfo | None]:\n        \"\"\"Return list of PageInfo objects, one per page in the PDF.\"\"\"\n        return self._pages\n\n    @property\n    def min_version(self) -> str:\n        
\"\"\"Return minimum PDF version needed to render this PDF.\"\"\"\n        # The minimum PDF is the maximum version that any particular page needs\n        return max(page.min_version for page in self.pages if page)\n\n    @property\n    def has_userunit(self) -> bool:\n        \"\"\"Return True if any page has a user unit.\"\"\"\n        return any(page.userunit != 1.0 for page in self.pages if page)\n\n    @property\n    def has_acroform(self) -> bool:\n        \"\"\"Return True if the document catalog has an AcroForm.\"\"\"\n        return self._has_acroform\n\n    @property\n    def has_signature(self) -> bool:\n        \"\"\"Return True if the document annotations has a digital signature.\"\"\"\n        return self._has_signature\n\n    @property\n    def is_tagged(self) -> bool:\n        \"\"\"Return True if the document catalog indicates this is a Tagged PDF.\"\"\"\n        return self._is_tagged\n\n    @property\n    def filename(self) -> str | Path:\n        \"\"\"Return filename of PDF.\"\"\"\n        if not isinstance(self._infile, str | Path):\n            raise NotImplementedError(\"can't get filename from stream\")\n        return self._infile\n\n    @property\n    def needs_rendering(self) -> bool:\n        \"\"\"Return True if PDF contains XFA forms.\n\n        XFA forms are not supported by most standard PDF renderers, so we\n        need to detect and suppress them.\n        \"\"\"\n        return self._needs_rendering\n\n    def __getitem__(self, item) -> PageInfo:\n        \"\"\"Return PageInfo object for page number `item`.\"\"\"\n        return self._pages[item]\n\n    def __len__(self):\n        \"\"\"Return number of pages in PDF.\"\"\"\n        return len(self._pages)\n\n    def __repr__(self):\n        \"\"\"Return string representation.\"\"\"\n        return f\"<PdfInfo('...'), page count={len(self)}>\"\n\n\ndef main():  # pragma: no cover\n    \"\"\"Run as a script.\"\"\"\n    import argparse  # pylint: disable=import-outside-toplevel\n  
  from pprint import pprint  # pylint: disable=import-outside-toplevel\n\n    parser = argparse.ArgumentParser()\n    parser.add_argument('infile')\n    args = parser.parse_args()\n    pdfinfo = PdfInfo(args.infile)\n\n    pprint(pdfinfo)\n    for page in pdfinfo.pages:\n        pprint(page)\n        for im in page.images:\n            pprint(im)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "src/ocrmypdf/pdfinfo/layout.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"Detailed text position and layout analysis, building on pdfminer.six.\"\"\"\n\nfrom __future__ import annotations\n\nimport re\nfrom collections.abc import Iterator, Mapping\nfrom contextlib import contextmanager\nfrom math import copysign\nfrom os import PathLike\nfrom pathlib import Path\nfrom typing import Any\nfrom unittest.mock import patch\n\nimport pdfminer\nimport pdfminer.encodingdb\nimport pdfminer.pdfdevice\nimport pdfminer.pdfinterp\nimport pdfminer.psparser\nfrom deprecation import deprecated\nfrom pdfminer.converter import PDFLayoutAnalyzer\nfrom pdfminer.layout import LAParams, LTChar, LTPage, LTTextBox\nfrom pdfminer.pdfcolor import PDFColorSpace\nfrom pdfminer.pdfdevice import PDFTextSeq\nfrom pdfminer.pdfdocument import PDFTextExtractionNotAllowed\nfrom pdfminer.pdffont import FontWidthDict, PDFFont, PDFSimpleFont, PDFUnicodeNotDefined\nfrom pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager, PDFTextState\nfrom pdfminer.pdfpage import PDFPage\nfrom pdfminer.utils import Matrix, bbox2str, matrix2str\n\nfrom ocrmypdf.exceptions import EncryptedPdfError, InputFileError\n\nSTRIP_NAME = re.compile(r'[0-9]+')\n\n\noriginal_pdfsimplefont_init = PDFSimpleFont.__init__\n\n\ndef pdfsimplefont__init__(\n    self,\n    descriptor: Mapping[str, Any],\n    widths: FontWidthDict,\n    spec: Mapping[str, Any],\n) -> None:\n    \"\"\"Monkeypatch pdfminer.six PDFSimpleFont.__init__.\n\n    If there is no ToUnicode and no Encoding, pdfminer.six assumes that Unicode\n    conversion is possible. This is incorrect, according to PDF Reference Manual\n    9.10.2. 
This patch fixes that.\n    \"\"\"\n    # Font encoding is specified either by a name of\n    # built-in encoding or a dictionary that describes\n    # the differences.\n    original_pdfsimplefont_init(self, descriptor, widths, spec)\n    if not self.unicode_map and 'Encoding' not in spec:\n        self.cid2unicode = {}\n    return\n\n\nPDFSimpleFont.__init__ = pdfsimplefont__init__\n\n# Patch pdfminer.six buffer size\n# The parser doesn't properly handle keyword tokens that are split across the end\n# of the buffer, so increase the buffer size to something far larger than will ever\n# be seen.\npdfminer.psparser.PSBaseParser.BUFSIZ = 256 * 1024 * 1024\n\n\ndef pdftype3font__pscript5_get_height(self):\n    \"\"\"Monkeypatch for PScript5.dll PDFs.\n\n    The height of Type3 fonts is known to be incorrect in PScript5.dll\n    generated PDFs. This patch attempts to correct the height by\n    using the bbox height if it is available, otherwise using the\n    ascent and descent.\n    \"\"\"\n    h = self.bbox[3] - self.bbox[1]\n    if h == 0:\n        h = self.ascent - self.descent\n    return h * copysign(1.0, self.vscale)\n\n\ndef pdftype3font__pscript5_get_descent(self):\n    \"\"\"Monkeypatch for PScript5.dll PDFs.\n\n    The descent of Type3 fonts is known to be incorrect in PScript5.dll\n    generated PDFs. This patch attempts to correct the descent by\n    using the vscale.\n    \"\"\"\n    return self.descent * copysign(1.0, self.vscale)\n\n\ndef pdftype3font__pscript5_get_ascent(self):\n    \"\"\"Monkeypatch for PScript5.dll PDFs.\n\n    The ascent of Type3 fonts is known to be incorrect in PScript5.dll\n    generated PDFs. 
This patch attempts to correct the ascent by\n    using the vscale.\n    \"\"\"\n    return self.ascent * copysign(1.0, self.vscale)\n\n\ndef _is_undefined_char(s: str) -> bool:\n    \"\"\"Check if a string is an undefined character.\"\"\"\n    return s.startswith('(cid:') and s.endswith(')')\n\n\nclass LTStateAwareChar(LTChar):\n    \"\"\"A subclass of LTChar that tracks text render mode at time of drawing.\"\"\"\n\n    __slots__ = (\n        'rendermode',\n        '_text',\n        'matrix',\n        'fontname',\n        'adv',\n        'upright',\n        'size',\n        'width',\n        'height',\n        'bbox',\n        'x0',\n        'x1',\n        'y0',\n        'y1',\n    )\n\n    def __init__(\n        self,\n        matrix: Matrix,\n        font: PDFFont,\n        fontsize: float,\n        scaling: float,\n        rise: float,\n        text: str,\n        textwidth: float,\n        textdisp: float | tuple[float | None, float],\n        ncs: PDFColorSpace,\n        graphicstate: PDFGraphicState,\n        textstate: PDFTextState,\n    ) -> None:\n        \"\"\"Initialize.\"\"\"\n        super().__init__(\n            matrix,\n            font,\n            fontsize,\n            scaling,\n            rise,\n            text,\n            textwidth,\n            textdisp,\n            ncs,\n            graphicstate,\n        )\n        self.rendermode = textstate.render\n\n    def is_compatible(self, obj: object) -> bool:\n        \"\"\"Check if characters can be combined into a textline.\n\n        We consider characters compatible if:\n            - the Unicode mapping is known, and both have the same render mode\n            - the Unicode mapping is unknown but both are part of the same font\n        \"\"\"\n        # pylint: disable=protected-access\n        if not isinstance(obj, LTStateAwareChar):\n            return False\n        both_unicode_mapped = not _is_undefined_char(\n            self._text\n        ) and not 
_is_undefined_char(obj._text)\n        if both_unicode_mapped:\n            return self.rendermode == obj.rendermode\n        return self.fontname == obj.fontname and self.rendermode == obj.rendermode\n\n    def get_text(self) -> str:\n        \"\"\"Get text from this character.\"\"\"\n        if _is_undefined_char(self._text):\n            return '\\ufffd'  # standard 'Unknown symbol'\n        return self._text\n\n    def __repr__(self) -> str:\n        \"\"\"Return a string representation of this object.\"\"\"\n        return (\n            f\"<{self.__class__.__name__} \"\n            f\"{bbox2str(self.bbox)} \"\n            f\"matrix={matrix2str(self.matrix)} \"\n            f\"rendermode={self.rendermode!r} \"\n            f\"font={self.fontname!r} \"\n            f\"adv={self.adv} \"\n            f\"text={self.get_text()!r}>\"\n        )\n\n\nclass TextPositionTracker(PDFLayoutAnalyzer):\n    \"\"\"A page layout analyzer that pays attention to text visibility.\"\"\"\n\n    textstate: PDFTextState\n\n    def __init__(\n        self,\n        rsrcmgr: PDFResourceManager,\n        pageno: int = 1,\n        laparams: LAParams | None = None,\n    ):\n        \"\"\"Initialize the layout analyzer.\"\"\"\n        super().__init__(rsrcmgr, pageno, laparams)\n        self.result: LTPage | None = None\n\n    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:\n        \"\"\"Begin processing of a page.\"\"\"\n        super().begin_page(page, ctm)\n        self.cur_item = LTPage(self.pageno, page.mediabox)\n\n    def end_page(self, page: PDFPage) -> None:\n        \"\"\"End processing of a page.\"\"\"\n        assert not self._stack, str(len(self._stack))\n        assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))\n        if self.laparams is not None:\n            self.cur_item.analyze(self.laparams)\n        self.pageno += 1\n        self.receive_layout(self.cur_item)\n\n    def render_string(\n        self,\n        textstate: PDFTextState,\n      
  seq: PDFTextSeq,\n        ncs: PDFColorSpace,\n        graphicstate: PDFGraphicState,\n    ) -> None:\n        \"\"\"Respond to render string event by updating text state.\"\"\"\n        self.textstate = textstate.copy()\n        super().render_string(self.textstate, seq, ncs, graphicstate)\n\n    def render_char(\n        self,\n        matrix: Matrix,\n        font: PDFFont,\n        fontsize: float,\n        scaling: float,\n        rise: float,\n        cid: int,\n        ncs: PDFColorSpace,\n        graphicstate: PDFGraphicState,\n    ) -> float:\n        \"\"\"Respond to render char event by updating text state.\"\"\"\n        try:\n            text = font.to_unichr(cid)\n            assert isinstance(text, str), str(type(text))\n        except PDFUnicodeNotDefined:\n            text = self.handle_undefined_char(font, cid)\n        textwidth = font.char_width(cid)\n        textdisp = font.char_disp(cid)\n        item = LTStateAwareChar(\n            matrix,\n            font,\n            fontsize,\n            scaling,\n            rise,\n            text,\n            textwidth,\n            textdisp,\n            ncs,\n            graphicstate,\n            self.textstate,\n        )\n        self.cur_item.add(item)\n        return item.adv\n\n    def receive_layout(self, ltpage: LTPage) -> None:\n        \"\"\"Receive layout handler.\"\"\"\n        self.result = ltpage\n\n    def get_result(self) -> LTPage | None:\n        \"\"\"Get the result of the analysis.\"\"\"\n        return self.result\n\n\n@contextmanager\ndef patch_pdfminer(pscript5_mode: bool):\n    \"\"\"Patch pdfminer.six to work around bugs in PDFs created by PScript5.\"\"\"\n    if pscript5_mode:\n        with patch.multiple(\n            'pdfminer.pdffont.PDFType3Font',\n            spec=True,\n            get_ascent=pdftype3font__pscript5_get_ascent,\n            get_descent=pdftype3font__pscript5_get_descent,\n            get_height=pdftype3font__pscript5_get_height,\n        ):\n      
      yield\n    else:\n        yield\n\n\n@deprecated(deprecated_in='16.6.0', details='Use PdfMinerState instead.')\ndef get_page_analysis(\n    infile: PathLike, pageno: int, pscript5_mode: bool\n) -> LTPage | None:\n    \"\"\"Get the page analysis for a given page.\"\"\"\n    rman = pdfminer.pdfinterp.PDFResourceManager(caching=True)\n    disable_boxes_flow = None\n    dev = TextPositionTracker(\n        rman,\n        laparams=LAParams(\n            all_texts=True, detect_vertical=True, boxes_flow=disable_boxes_flow\n        ),\n    )\n    interp = pdfminer.pdfinterp.PDFPageInterpreter(rman, dev)\n\n    with patch_pdfminer(pscript5_mode):\n        try:\n            with Path(infile).open('rb') as f:\n                page_iter = PDFPage.get_pages(f, pagenos=[pageno], maxpages=0)\n                page = next(page_iter, None)\n                if page is None:\n                    raise InputFileError(\n                        f\"pdfminer could not process page {pageno} (counting from 0).\"\n                    )\n                interp.process_page(page)\n        except PDFTextExtractionNotAllowed as e:\n            raise EncryptedPdfError() from e\n\n    return dev.get_result()\n\n\nclass PdfMinerState:\n    \"\"\"Provide a context manager for using pdfminer.six.\n\n    This ensures that the file is closed. 
It also provides a cache of pages\n    from the PDF so that they can be reused if needed, to improve performance.\n    \"\"\"\n\n    def __init__(self, infile: Path, pscript5_mode: bool) -> None:\n        \"\"\"Initialize the context manager.\n\n        Args:\n            infile: The path to the PDF file to be analyzed.\n            pscript5_mode: Whether the PDF was generated by PScript5.dll.\n        \"\"\"\n        self.infile = infile\n        self.rman = pdfminer.pdfinterp.PDFResourceManager(caching=True)\n        self.disable_boxes_flow = None\n        self.page_iter = None\n        self.page_cache: list[PDFPage] = []\n        self.pscript5_mode = pscript5_mode\n        self.file = None\n\n    def __enter__(self):\n        \"\"\"Enter the context manager.\"\"\"\n        self.file = Path(self.infile).open('rb')\n        self.page_iter = PDFPage.get_pages(self.file)\n        return self\n\n    def __exit__(self, exc_type, exc_value, traceback):\n        \"\"\"Exit the context manager.\"\"\"\n        if self.file:\n            self.file.close()\n        return True\n\n    def get_page_analysis(self, pageno: int):\n        \"\"\"Get the page analysis for a given page.\"\"\"\n        while len(self.page_cache) <= pageno:\n            try:\n                self.page_cache.append(next(self.page_iter))\n            except StopIteration:\n                raise InputFileError(\n                    f\"pdfminer did not find page {pageno} in the input file.\"\n                ) from None\n        page = self.page_cache[pageno]\n        if not page:\n            raise InputFileError(\n                f\"pdfminer could not process page {pageno} (counting from 0).\"\n            )\n        dev = TextPositionTracker(\n            self.rman,\n            laparams=LAParams(\n                all_texts=True, detect_vertical=True, boxes_flow=self.disable_boxes_flow\n            ),\n        )\n        interp = pdfminer.pdfinterp.PDFPageInterpreter(self.rman, dev)\n\n        with 
patch_pdfminer(self.pscript5_mode):\n            interp.process_page(page)\n\n        return dev.get_result()\n\n\ndef get_text_boxes(obj) -> Iterator[LTTextBox]:\n    \"\"\"Get the text boxes attached to the current node.\"\"\"\n    for child in obj:\n        if isinstance(child, (LTTextBox)):\n            yield child\n        else:\n            try:\n                yield from get_text_boxes(child)\n            except TypeError:\n                continue\n"
  },
  {
    "path": "src/ocrmypdf/pluginspec.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"OCRmyPDF pluggy plugin specification.\"\"\"\n\nfrom __future__ import annotations\n\nfrom abc import ABC, abstractmethod\nfrom argparse import ArgumentParser\nfrom collections.abc import Sequence, Set\nfrom enum import StrEnum\nfrom logging import Handler\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, NamedTuple\n\nimport pluggy\nfrom pydantic import BaseModel\n\nfrom ocrmypdf import Executor, PdfContext\nfrom ocrmypdf._options import OcrOptions\nfrom ocrmypdf._progressbar import ProgressBar\nfrom ocrmypdf.helpers import Resolution\n\nif TYPE_CHECKING:\n    from PIL import Image\n\n    # pylint: disable=ungrouped-imports\n    from ocrmypdf._jobcontext import PageContext\n    from ocrmypdf.hocrtransform import OcrElement\n    from ocrmypdf.pdfinfo import PdfInfo\n\n    # pylint: enable=ungrouped-imports\n\n\nclass GhostscriptRasterDevice(StrEnum):\n    \"\"\"Possible raster devices for Ghostscript.\"\"\"\n\n    JPEGGRAY = 'jpeggray'\n    JPEGCOLOR = 'jpeg'\n    PNGMONO = 'pngmono'\n    PNGGRAY = 'pnggray'\n    PNG256 = 'png256'\n    PNG16M = 'png16m'\n\n\nhookspec = pluggy.HookspecMarker('ocrmypdf')\n\n# pylint: disable=unused-argument\n# mypy: disable-error-code=empty-body\n\n\n@hookspec(firstresult=True)\ndef get_logging_console() -> Handler:  # type: ignore[return-value]\n    \"\"\"Returns a custom logging handler.\n\n    Generally this is necessary when both logging output and a progress bar are both\n    outputting to ``sys.stderr``.\n\n    Note:\n        This is a :ref:`firstresult hook<firstresult>`.\n    \"\"\"\n\n\n@hookspec\ndef initialize(plugin_manager: pluggy.PluginManager) -> None:\n    \"\"\"Called when this plugin is first loaded into OCRmyPDF.\n\n    The primary intended use of this is for plugins to check compatibility with other\n    plugins and possibly block other blocks, a plugin that wishes to block ocrmypdf's\n    built-in 
optimize plugin could do:\n\n    .. code-block::\n\n        plugin_manager.set_blocked('ocrmypdf.builtin_plugins.optimize')\n\n    It would also be reasonable for a plugin implementation to check if it is unable\n    to proceed, for example, because a required dependency is missing. (If the plugin's\n    ability to proceed depends on options and arguments, use ``validate`` instead.)\n\n    Raises:\n        ocrmypdf.exceptions.ExitCodeException: If options are not acceptable\n            and the application should terminate gracefully with an informative\n            message and error code.\n\n    Note:\n        This hook will be called from the main process, and may modify global state\n        before child worker processes are forked.\n    \"\"\"\n\n\n@hookspec\ndef add_options(parser: ArgumentParser) -> None:\n    \"\"\"Allows the plugin to add its own command line and API arguments.\n\n    OCRmyPDF converts command line arguments to API arguments, so adding\n    arguments here will cause new arguments to be processed for API calls\n    to ``ocrmypdf.ocr``, or when invoked on the command line.\n\n    Note:\n        This hook will be called from the main process, and may modify global state\n        before child worker processes are forked.\n    \"\"\"\n\n\n@hookspec\ndef register_options() -> dict[str, type[BaseModel]]:\n    \"\"\"Return plugin's option models keyed by namespace.\n\n    This hook allows plugins to register their option models with the\n    plugin option registry. 
The returned dictionary should map namespace\n    strings to Pydantic model classes.\n\n    Returns:\n        Dictionary mapping namespace strings to BaseModel classes\n\n    Example:\n        @hookimpl\n        def register_options():\n            return {'tesseract': TesseractOptions}\n\n    Note:\n        This hook will be called from the main process during plugin\n        infrastructure setup, before child worker processes are forked.\n    \"\"\"\n\n\n@hookspec\ndef check_options(options: OcrOptions) -> None:\n    \"\"\"Called to ask the plugin to check all of the options.\n\n    The plugin may check if options that it added are valid.\n\n    Warnings or other messages may be passed to the user by creating a logger\n    object using ``log = logging.getLogger(__name__)`` and logging to this.\n\n    The plugin may also modify the *options*. All objects that are in options\n    must be picklable so they can be marshalled to child worker processes.\n\n    Raises:\n        ocrmypdf.exceptions.ExitCodeException: If options are not acceptable\n            and the application should terminate gracefully with an informative\n            message and error code.\n\n    Note:\n        This hook will be called from the main process, and may modify global state\n        before child worker processes are forked.\n    \"\"\"\n\n\n@hookspec(firstresult=True)\ndef get_executor(progressbar_class: type[ProgressBar]) -> Executor:  # type: ignore[return-value]\n    \"\"\"Called to obtain an object that manages parallel execution.\n\n    This may be used to replace OCRmyPDF's default parallel execution system\n    with a third party alternative. For example, you could make OCRmyPDF run in a\n    distributed environment.\n\n    OCRmyPDF's executors are analogous to the standard Python executors in\n    ``concurrent.futures``, but they do not work the same way. 
Executors may\n    be reused for different, unrelated batch operations, since all of the context\n    for a given job are passed to :meth:`Executor.__call__`.\n\n    Should be of type :class:`Executor` or otherwise conforming to the protocol\n    of that call.\n\n    Arguments:\n        progressbar_class: A progress bar class, which will be created when\n\n    Note:\n        This hook will be called from the main process, and may modify global state\n        before child worker processes are forked.\n\n    Note:\n        This is a :ref:`firstresult hook<firstresult>`.\n    \"\"\"\n\n\n@hookspec(firstresult=True)\ndef get_progressbar_class() -> type[ProgressBar]:  # type: ignore[return-value]\n    \"\"\"Called to obtain a class that can be used to monitor progress.\n\n    OCRmyPDF will call this function when it wants to display a progress bar.\n    The class returned by this function must be compatible with the\n    :class:`ProgressBar` protocol.\n\n    Example:\n        Here is how OCRmyPDF will use the progress bar:\n\n        .. code-block:: python\n\n            pbar_class = pm.hook.get_progressbar_class()\n            with pbar_class(**progress_kwargs) as pbar:\n                ... # do some work\n                pbar.update(1)\n    \"\"\"\n\n\n@hookspec\ndef validate(pdfinfo: PdfInfo, options: OcrOptions) -> None:\n    \"\"\"Called to give a plugin an opportunity to review *options* and *pdfinfo*.\n\n    *options* contains the \"work order\" to process a particular file. *pdfinfo*\n    contains information about the input file obtained after loading and\n    parsing. The plugin may modify the *options*. 
For example, you could decide\n    that a certain type of file should be treated with ``options.force_ocr = True``\n    based on information in its *pdfinfo*.\n\n    Raises:\n        ocrmypdf.exceptions.ExitCodeException: If options or pdfinfo are not acceptable\n            and the application should terminate gracefully with an informative\n            message and error code.\n\n    Note:\n        This hook will be called from the main process, and may modify global state\n        before child worker processes are forked.\n    \"\"\"\n\n\n@hookspec(firstresult=True)\ndef rasterize_pdf_page(\n    input_file: Path,\n    output_file: Path,\n    raster_device: GhostscriptRasterDevice,\n    raster_dpi: Resolution,\n    pageno: int,\n    page_dpi: Resolution | None,\n    rotation: int | None,\n    filter_vector: bool,\n    stop_on_soft_error: bool,\n    options: OcrOptions | None,\n    use_cropbox: bool,\n) -> Path:  # type: ignore[return-value]\n    \"\"\"Rasterize one page of a PDF at resolution raster_dpi in canvas units.\n\n    The image is sized to match the integer pixel dimensions implied by\n    raster_dpi even if those numbers are noninteger. The image's DPI will\n    be overridden with the values in page_dpi.\n\n    Args:\n        input_file: The PDF to rasterize.\n        output_file: The desired name of the rasterized image.\n        raster_device: Type of image to produce at output_file.\n        raster_dpi: Resolution in dots per inch at which to rasterize page.\n        pageno: Page number to rasterize (beginning at page 1).\n        page_dpi: Resolution, overriding output image DPI.\n        rotation: Cardinal angle, clockwise, to rotate page.\n        filter_vector: If True, remove vector graphics objects.\n        stop_on_soft_error: If there is a \"soft error\" such that PDF page image\n            generation can proceed, but may visually differ from the original,\n            the implementer of this hook should raise a detailed exception. 
If\n            ``False``, continue processing and report by logging it. If the hook\n            cannot proceed, it should always raise an exception, regardless of\n            this setting. One \"soft error\" would be a missing font that is\n            required to properly rasterize the PDF.\n        options: OCRmyPDF options. Plugins may use this to check settings like\n            ``options.rasterizer`` to determine whether they should handle the\n            request or defer to another plugin. Introduced in version 17.0.\n        use_cropbox: If True, rasterize the page's CropBox instead of the\n            MediaBox. Default is False (use MediaBox) for consistency with\n            Ghostscript's default behavior.\n\n    Returns:\n        Path: output_file if successful\n\n    Note:\n        This hook will be called from child processes. Modifying global state\n        will not affect the main process or other child processes.\n\n    Note:\n        This is a :ref:`firstresult hook<firstresult>`.\n    \"\"\"\n\n\n@hookspec(firstresult=True)\ndef filter_ocr_image(page: PageContext, image: Image.Image) -> Image.Image:  # type: ignore[return-value]\n    \"\"\"Called to filter the image before it is sent to OCR.\n\n    This is the image that OCR sees, not what the user sees when they view the\n    PDF. In certain modes such as ``--redo-ocr``, portions of the image may be\n    masked out to hide them from OCR.\n\n    The main uses of this hook are expected to be hiding content from OCR,\n    conditioning images to OCR better with filters, and adjusting images to\n    match any constraints imposed by the OCR engine.\n\n    The input image may be color, grayscale, or monochrome, and the\n    output image may differ. 
For example, if you know that a custom OCR engine\n    does not care about the color of the text, you could convert the image\n    to grayscale or monochrome.\n\n    Generally speaking, the output image should be a faithful representation of\n    the input image. You *may* change the pixel width and height of the\n    input image, but you must not change the aspect ratio, and you must\n    calculate the DPI of the output image based on the new pixel width and\n    height or the OCR text layer will be misaligned with the visual position.\n\n    The built-in Tesseract OCR engine uses this hook itself to downsample\n    very large images to fit its constraints.\n\n    Note:\n        This hook will be called from child processes. Modifying global state\n        will not affect the main process or other child processes.\n\n    Note:\n        This is a :ref:`firstresult hook<firstresult>`.\n    \"\"\"\n\n\n@hookspec(firstresult=True)\ndef filter_page_image(page: PageContext, image_filename: Path) -> Path:  # type: ignore[return-value]\n    \"\"\"Called to filter the whole page before it is inserted into the PDF.\n\n    A whole page image is only produced when preprocessing command line arguments\n    are issued or when ``--force-ocr`` is issued. If no whole page image is\n    produced for a given page, this function will not be called. This is not\n    the image that will be shown to OCR.\n\n    If the function does not want to modify the image, it should return\n    ``image_filename``. The hook may overwrite ``image_filename`` with a new file.\n\n    The output image should preserve the same physical unit dimensions, that is\n    ``(width / dpi_x, height / dpi_y)``. That is, if the image is resized, the DPI\n    must be scaled by the same factor. If this is not preserved, the PDF page\n    will be resized and the OCR layer misaligned. 
OCRmyPDF does nothing\n    to enforce these constraints; it is up to the plugin to do sensible things.\n\n    OCRmyPDF will create the PDF page based on the image format used (unless the\n    hook is overridden). If you convert the image to a JPEG, the output page will\n    be created as a JPEG, etc. If you change the colorspace, that change will be\n    kept. Note that the OCRmyPDF image optimization stage, if enabled, may\n    ultimately choose a different format.\n\n    If the return value is a file that does not exist, ``FileNotFoundError``\n    will occur. The return value should be a path to a file in the same folder\n    as ``image_filename``.\n\n    Note:\n        This hook will be called from child processes. Modifying global state\n        will not affect the main process or other child processes.\n\n    Note:\n        This is a :ref:`firstresult hook<firstresult>`.\n    \"\"\"\n\n\n@hookspec(firstresult=True)\ndef filter_pdf_page(page: PageContext, image_filename: Path, output_pdf: Path) -> Path:  # type: ignore[return-value]\n    \"\"\"Called to convert a filtered whole page image into a PDF.\n\n    A whole page image is only produced when preprocessing command line arguments\n    are issued or when ``--force-ocr`` is issued. If no whole page image is\n    produced for a given page, this function will not be called. This is not\n    the image that will be shown to OCR. 
The whole page image is filtered in\n    the hook above, ``filter_page_image``, then this function is called for\n    PDF conversion.\n\n    This function will only be called when OCRmyPDF runs in a mode such as\n    \"force OCR\" mode where rasterizing of all content is performed.\n\n    Clever things could be done at this stage such as segmenting the page image into\n    color regions or vector equivalents.\n\n    The provider of the hook implementation is responsible for ensuring that the\n    OCR text layer is aligned with the PDF produced here, or text misalignment\n    will result.\n\n    Currently this function must produce a single page PDF or the pipeline will\n    fail.  If the intent is to remove the PDF, then create a single page empty\n    PDF.\n\n    Args:\n        page: Context for this page.\n        image_filename: Filename of the input image used to create output_pdf,\n            for \"reference\" if recreating the output_pdf entirely.\n        output_pdf: The previously created output_pdf.\n\n    Returns:\n        output_pdf\n\n    Note:\n        This hook will be called from child processes. Modifying global state\n        will not affect the main process or other child processes.\n\n    Note:\n        This is a :ref:`firstresult hook<firstresult>`.\n    \"\"\"\n\n\nclass OrientationConfidence(NamedTuple):\n    \"\"\"Expresses an OCR engine's confidence in page rotation.\n\n    Attributes:\n        angle: The clockwise angle (0, 90, 180, 270) that the page should be\n            rotated. 0 means no rotation.\n        confidence: How confident the OCR engine is that this is the correct\n            rotation. 0 is not confident, 15 is very confident. 
Arbitrary units.\n    \"\"\"\n\n    angle: int\n    confidence: float\n\n\nclass OcrEngine(ABC):\n    \"\"\"A class representing an OCR engine with capabilities similar to Tesseract OCR.\n\n    This could be used to create a plugin for another OCR engine instead of\n    Tesseract OCR.\n    \"\"\"\n\n    @staticmethod\n    @abstractmethod\n    def version() -> str:\n        \"\"\"Returns the version of the OCR engine.\"\"\"\n\n    @staticmethod\n    @abstractmethod\n    def creator_tag(options: OcrOptions) -> str:\n        \"\"\"Returns the creator tag to identify this software's role in creating the PDF.\n\n        This tag will be inserted in the XMP metadata and DocumentInfo dictionary\n        as appropriate. Ideally you should include the name of the OCR engine and its\n        version. The text should not contain line breaks. This is to help developers\n        like yourself identify the software that produced this file.\n\n        OCRmyPDF will always prepend its name to this value.\n        \"\"\"\n\n    @abstractmethod\n    def __str__(self) -> str:\n        \"\"\"Returns name of OCR engine and version.\n\n        This is used when OCRmyPDF wants to mention the name of the OCR engine\n        to the user, usually in an error message.\n        \"\"\"\n\n    @staticmethod\n    @abstractmethod\n    def languages(options: OcrOptions) -> Set[str]:\n        \"\"\"Returns the set of all languages that are supported by the engine.\n\n        Languages are typically given in 3-letter ISO 639-2 codes, but actually\n        can be any value understood by the OCR engine.\n        \"\"\"\n\n    @staticmethod\n    @abstractmethod\n    def get_orientation(input_file: Path, options: OcrOptions) -> OrientationConfidence:\n        \"\"\"Returns the orientation of the image.\"\"\"\n\n    @staticmethod\n    def get_deskew(input_file: Path, options: OcrOptions) -> float:\n        \"\"\"Returns the deskew angle of the image, in degrees.\"\"\"\n        return 0.0\n\n    
@staticmethod\n    @abstractmethod\n    def generate_hocr(\n        input_file: Path, output_hocr: Path, output_text: Path, options: OcrOptions\n    ) -> None:\n        \"\"\"Called to produce a hOCR file from a page image and sidecar text file.\n\n        A hOCR file is an HTML-like file that describes the position of text on a\n        page. OCRmyPDF can create a text only PDF from the hOCR file and graft it\n        onto the output PDF.\n\n        This function executes in a worker thread or worker process. OCRmyPDF\n        automatically parallelizes OCR over pages. The OCR engine should not\n        introduce more parallelism.\n\n        Args:\n            input_file: A page image on which to perform OCR.\n            output_hocr: The expected name of the output hOCR file.\n            output_text: The expected name of a text file containing the\n                recognized text.\n            options: The command line options.\n        \"\"\"\n\n    @staticmethod\n    @abstractmethod\n    def generate_pdf(\n        input_file: Path, output_pdf: Path, output_text: Path, options: OcrOptions\n    ) -> None:\n        \"\"\"Called to produce a text only PDF from a page image.\n\n        A text only PDF should contain no visible material of any kind, as it\n        will be grafted onto the input PDF page. It must be sized to the\n        exact dimensions of the input image.\n\n        This function executes in a worker thread or worker process. OCRmyPDF\n        automatically parallelizes OCR over pages. 
The OCR engine should not\n        introduce more parallelism.\n\n        Args:\n            input_file: A page image on which to perform OCR.\n            output_pdf: The expected name of the output PDF.\n            output_text: The expected name of a text file containing the\n                recognized text.\n            options: The command line options.\n        \"\"\"\n\n    @staticmethod\n    def supports_generate_ocr() -> bool:\n        \"\"\"Return True if this engine supports the generate_ocr() API.\n\n        The pipeline uses this to determine whether to call generate_ocr()\n        or fall back to generate_hocr().\n\n        Returns:\n            False by default. Engines implementing generate_ocr() should\n            override this to return True.\n        \"\"\"\n        return False\n\n    @staticmethod\n    def generate_ocr(\n        input_file: Path,\n        options: OcrOptions,\n        page_number: int = 0,\n    ) -> tuple[OcrElement, str]:\n        \"\"\"Generate OCR results as an OcrElement tree.\n\n        This is the modern API for OCR engines. Engines implementing this method\n        can return structured OCR results directly without intermediate file formats.\n\n        This function executes in a worker thread or worker process. OCRmyPDF\n        automatically parallelizes OCR over pages. The OCR engine should not\n        introduce more parallelism.\n\n        Args:\n            input_file: A page image on which to perform OCR.\n            options: The command line options.\n            page_number: Zero-indexed page number (for multi-page context).\n\n        Returns:\n            A tuple of (OcrElement tree for the page, plain text content).\n            The OcrElement should have ocr_class=OcrClass.PAGE as its root.\n\n        Note:\n            This method is optional. 
Engines that don't implement it should\n            leave the default implementation, and the pipeline will fall back to\n            generate_hocr() or generate_pdf().\n        \"\"\"\n        raise NotImplementedError(\"This OcrEngine does not implement generate_ocr()\")\n\n\n@hookspec(firstresult=True)\ndef get_ocr_engine(options: OcrOptions | None) -> OcrEngine:  # type: ignore[return-value]\n    \"\"\"Returns an OcrEngine to use for processing this file.\n\n    The OcrEngine may be instantiated multiple times, by both the main process\n    and child process.\n\n    When multiple OCR engine plugins are installed, plugins should check\n    ``options.ocr_engine`` and return ``None`` if they are not the selected\n    engine. The hook caller will then try the next plugin.\n\n    Args:\n        options: The current OcrOptions, used to determine which engine\n            to select. May be None for backward compatibility with external\n            plugins.\n\n    Note:\n        This is a :ref:`firstresult hook<firstresult>`.\n    \"\"\"\n\n\n@hookspec(firstresult=True)\ndef generate_pdfa(\n    pdf_pages: list[Path],\n    pdfmark: Path,\n    output_file: Path,\n    context: PdfContext,\n    pdf_version: str,\n    pdfa_part: str,\n    progressbar_class: type[ProgressBar] | None,\n    stop_on_soft_error: bool,\n) -> Path:  # type: ignore[return-value]\n    \"\"\"Generate a PDF/A.\n\n    This API strongly assumes a PDF/A generator with Ghostscript's semantics.\n\n    OCRmyPDF will modify the metadata and possibly linearize the PDF/A after it\n    is generated.\n\n    Arguments:\n        pdf_pages: A list of one or more filenames, will be merged into output_file.\n        pdfmark: A PostScript file intended for Ghostscript with details on\n            how to perform the PDF/A conversion.\n        output_file: The name of the desired output file.\n        context: The current context.\n        pdf_version: The minimum PDF version that the output file should be.\n            
At its own discretion, the PDF/A generator may raise the version,\n            but should not lower it.\n        pdfa_part: The desired PDF/A compliance level, such as ``'2b'``.\n        progressbar_class: The class of a progress bar, which must implement\n            the ProgressBar protocol. If None, no progress is reported.\n        stop_on_soft_error: If there is a \"soft error\" such that PDF/A generation\n            can proceed and produce a valid PDF/A, but output may be invalid or\n            may not visually resemble the original, the implementer of this hook\n            should raise a detailed exception. If ``False``, continue processing\n            and report by logging it. If the hook cannot proceed, it should always\n            raise an exception, regardless of this setting.\n\n    Returns:\n        Path: If successful, the hook should return ``output_file``.\n\n    Note:\n        This is a :ref:`firstresult hook<firstresult>`.\n\n    Note:\n        Before version 15.0.0, the ``context`` was not provided and ``compression``\n        was provided instead. 
Plugins should now read the context object to determine\n        if compression is requested.\n    \"\"\"\n\n\n@hookspec(firstresult=True)\ndef optimize_pdf(\n    input_pdf: Path,\n    output_pdf: Path,\n    context: PdfContext,\n    executor: Executor,\n    linearize: bool,\n) -> tuple[Path, Sequence[str]]:  # type: ignore[return-value]\n    \"\"\"Optimize a PDF after image, OCR and metadata processing.\n\n    If the input_pdf is a PDF/A, the plugin should modify input_pdf in a way\n    that preserves the PDF/A status, or report to the user when this is not possible.\n\n    If the implementation fails to produce a smaller file than the input file, it\n    should return input_pdf instead.\n\n    A plugin that implements a new optimizer may need to suppress the built-in\n    optimizer by implementing an ``initialize`` hook.\n\n    Arguments:\n        input_pdf: The input PDF, which has OCR added.\n        output_pdf: The requested filename of the output PDF which should be created\n            by this optimization hook.\n        context: The current context.\n        executor: An initialized executor which may be used during optimization,\n            to distribute optimization tasks.\n        linearize: If True, OCRmyPDF requires ``optimize_pdf`` to return a linearized,\n            also known as fast web view PDF.\n\n    Returns:\n        Path: If optimization is successful, the hook should return ``output_pdf``.\n            If optimization does not produce a smaller file, the hook should return\n            ``input_pdf``.\n        Sequence[str]: Any comments that the plugin wishes to report to the user,\n            especially reasons it was not able to further optimize the file. 
For\n            example, the plugin could report that a required third party was not\n            installed, so a specific optimization was not attempted.\n\n    Note:\n        This is a :ref:`firstresult hook<firstresult>`.\n    \"\"\"\n\n\n@hookspec(firstresult=True)\ndef is_optimization_enabled(context: PdfContext) -> bool:  # type: ignore[return-value]\n    \"\"\"For a given PdfContext, OCRmyPDF asks the plugin if optimization is enabled.\n\n    An optimization plugin might be installed and active but could be disabled by\n    user settings.\n\n    If this returns False, OCRmyPDF will take certain actions to finalize the PDF.\n\n    Returns:\n        True if the plugin's optimization is enabled.\n\n    Note:\n        This is a :ref:`firstresult hook<firstresult>`.\n    \"\"\"\n"
  },
  {
    "path": "src/ocrmypdf/py.typed",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n# ocrmypdf is typed\n"
  },
  {
    "path": "src/ocrmypdf/quality.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Utilities to measure OCR quality.\"\"\"\n\nfrom __future__ import annotations\n\nimport re\nfrom collections.abc import Iterable\n\n\nclass OcrQualityDictionary:\n    \"\"\"Manages a dictionary for simple OCR quality checks.\"\"\"\n\n    def __init__(self, *, wordlist: Iterable[str]):\n        \"\"\"Construct a dictionary from a list of words.\n\n        Words for which capitalization is important should be capitalized in the\n        dictionary. Words that contain spaces or other punctuation will never match.\n        \"\"\"\n        self.dictionary = set(wordlist)\n\n    def measure_words_matched(self, ocr_text: str) -> float:\n        \"\"\"Check how many unique words in the OCR text match a dictionary.\n\n        Words with mixed capitalization are only considered a match if the test word\n        matches that capitalization.\n\n        Returns:\n            number of words that match / number of unique words of 3 or more\n            characters, or 0.0 if no words match\n        \"\"\"\n        text = re.sub(r\"[0-9_]+\", ' ', ocr_text)\n        text = re.sub(r'\\W+', ' ', text)\n        text_words_list = re.split(r'\\s+', text)\n        text_words = {w for w in text_words_list if len(w) >= 3}\n\n        matches = 0\n        for w in text_words:\n            if w in self.dictionary or (\n                w != w.lower() and w.lower() in self.dictionary\n            ):\n                matches += 1\n        hit_ratio = matches / len(text_words) if matches > 0 else 0.0\n        return hit_ratio\n"
  },
  {
    "path": "src/ocrmypdf/subprocess/__init__.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"Wrappers to manage subprocess calls.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport re\nimport sys\nfrom collections.abc import Callable, Mapping, Sequence\nfrom contextlib import suppress\nfrom pathlib import Path\nfrom subprocess import PIPE, STDOUT, CalledProcessError, CompletedProcess, Popen\nfrom subprocess import run as subprocess_run\n\nfrom packaging.version import Version\n\nfrom ocrmypdf.exceptions import MissingDependencyError\n\n# pylint: disable=logging-format-interpolation\n\nlog = logging.getLogger(__name__)\n\nArgs = Sequence[Path | str]\nEnviron = Mapping[str, str] | os._Environ  # pylint: disable=protected-access\n\n\ndef run(\n    args: Args,\n    *,\n    env: Environ | None = None,\n    logs_errors_to_stdout: bool = False,\n    check: bool = False,\n    **kwargs,\n) -> CompletedProcess:\n    \"\"\"Wrapper around :py:func:`subprocess.run`.\n\n    The main purpose of this wrapper is to log subprocess output in an orderly\n    fashion that identifies the responsible subprocess. An additional\n    task is that this function goes to greater lengths to find possible Windows\n    locations of our dependencies when they are not on the system PATH.\n\n    Arguments should be identical to ``subprocess.run``, except for following:\n\n    Args:\n        args: Positional arguments to pass to ``subprocess.run``.\n        env: A set of environment variables. If None, the OS environment is used.\n        logs_errors_to_stdout: If True, indicates that the process writes its error\n            messages to stdout rather than stderr, so stdout should be logged\n            if there is an error. If False, stderr is logged. Could be used with\n            stderr=STDOUT, stdout=PIPE for example.\n        check: If True, raise an exception if the process exits with a non-zero\n            status code. 
If False, the return value will indicate success or failure.\n        kwargs: Additional arguments to pass to ``subprocess.run``.\n    \"\"\"\n    args, env, process_log, _text = _fix_process_args(args, env, kwargs)\n\n    stderr = None\n    stderr_name = 'stderr' if not logs_errors_to_stdout else 'stdout'\n    try:\n        proc = subprocess_run(args, env=env, check=check, **kwargs)\n    except CalledProcessError as e:\n        stderr = getattr(e, stderr_name, None)\n        raise\n    else:\n        stderr = getattr(proc, stderr_name, None)\n    finally:\n        if process_log.isEnabledFor(logging.DEBUG) and stderr:\n            with suppress(AttributeError, UnicodeDecodeError):\n                stderr = stderr.decode('utf-8', 'replace')\n            if logs_errors_to_stdout:\n                process_log.debug(\"stdout/stderr = %s\", stderr)\n            else:\n                process_log.debug(\"stderr = %s\", stderr)\n    return proc\n\n\ndef run_polling_stderr(\n    args: Args,\n    *,\n    callback: Callable[[str], None],\n    check: bool = False,\n    env: Environ | None = None,\n    **kwargs,\n) -> CompletedProcess:\n    \"\"\"Run a process like ``ocrmypdf.subprocess.run``, and poll stderr.\n\n    Every line produced on stderr will be forwarded to the callback function.\n    The intended use is monitoring progress of subprocesses that output their\n    own progress indicators. In addition, each line will be logged if debug\n    logging is enabled.\n\n    Requires stderr to be opened in text mode for ease of handling errors. In\n    addition the expected encoding= and errors= arguments should be set. 
Note\n    that if stdout is already set up, it need not be binary.\n    \"\"\"\n    args, env, process_log, text = _fix_process_args(args, env, kwargs)\n    assert text, \"Must use text=True\"\n\n    with Popen(args, env=env, **kwargs) as proc:\n        lines = []\n        while proc.poll() is None:\n            if proc.stderr is None:\n                continue\n            for msg in iter(proc.stderr.readline, ''):\n                if process_log.isEnabledFor(logging.DEBUG):\n                    process_log.debug(msg.strip())\n                callback(msg)\n                lines.append(msg)\n        stderr = ''.join(lines)\n\n        if check and proc.returncode != 0:\n            raise CalledProcessError(proc.returncode, args, output=None, stderr=stderr)\n        return CompletedProcess(args, proc.returncode, None, stderr=stderr)\n\n\ndef _fix_process_args(\n    args: Args, env: Environ | None, kwargs\n) -> tuple[Args, Environ, logging.Logger, bool]:\n    if not env:\n        env = os.environ\n\n    # Search in spoof path if necessary\n    program = str(args[0])\n\n    if sys.platform == 'win32':\n        # pylint: disable=import-outside-toplevel\n        from ocrmypdf.subprocess._windows import fix_windows_args\n\n        args = fix_windows_args(program, args, env)\n\n    log.debug(\"Running: %s\", args)\n    process_log = log.getChild(os.path.basename(program))\n    text = bool(kwargs.get('text', False))\n\n    return args, env, process_log, text\n\n\ndef get_version(\n    program: str,\n    *,\n    version_arg: str = '--version',\n    regex=r'(\\d+(\\.\\d+)*)',\n    env: Environ | None = None,\n) -> str:\n    \"\"\"Get the version of the specified program.\n\n    Arguments:\n        program: The program to version check.\n        version_arg: The argument needed to ask for its version, e.g. 
``--version``.\n        regex: A regular expression to parse the program's output and obtain the\n            version.\n        env: Custom ``os.environ`` in which to run program.\n    \"\"\"\n    args_prog = [program, version_arg]\n    try:\n        proc = run(\n            args_prog,\n            close_fds=True,\n            text=True,\n            stdout=PIPE,\n            stderr=STDOUT,\n            check=True,\n            env=env,\n        )\n        output: str = proc.stdout\n    except FileNotFoundError as e:\n        raise MissingDependencyError(\n            f\"Could not find program '{program}' on the PATH\"\n        ) from e\n    except CalledProcessError as e:\n        if e.returncode != 0:\n            log.exception(e)\n            raise MissingDependencyError(\n                f\"Ran program '{program}' but it exited with an error:\\n{e.output}\"\n            ) from e\n        raise MissingDependencyError(\n            f\"Could not find program '{program}' on the PATH\"\n        ) from e\n\n    match = re.match(regex, output.strip())\n    if not match:\n        raise MissingDependencyError(\n            f\"The program '{program}' did not report its version. \"\n            f\"Message was:\\n{output}\"\n        )\n    version = match.group(1)\n\n    return version\n\n\nMISSING_PROGRAM = '''\nThe program '{program}' could not be executed or was not found on your\nsystem PATH.\n'''\n\nMISSING_OPTIONAL_PROGRAM = '''\nThe program '{program}' could not be executed or was not found on your\nsystem PATH.  This program is required when you use the\n{required_for} arguments.  You could try omitting these arguments, or install\nthe package.\n'''\n\nMISSING_RECOMMEND_PROGRAM = '''\nThe program '{program}' could not be executed or was not found on your\nsystem PATH.  This program is recommended when using the {required_for} arguments,\nbut not required, so we will proceed.  
For best results, install the program.\n'''\n\nOLD_VERSION = '''\nOCRmyPDF requires '{program}' {need_version} or higher.  Your system appears\nto have {found_version}.  Please update this program.\n'''\n\nOLD_VERSION_REQUIRED_FOR = '''\nOCRmyPDF requires '{program}' {need_version} or higher when run with the\n{required_for} arguments.  {program} {found_version} is installed.\n\nIf you omit these arguments, OCRmyPDF may be able to\nproceed.  For best results, update the program.\n'''\n\nOSX_INSTALL_ADVICE = '''\nIf you have homebrew installed, try these command to install the missing\npackage:\n    brew install {package}\n'''\n\nLINUX_INSTALL_ADVICE = '''\nOn systems with the aptitude package manager (Debian, Ubuntu), try these\ncommands:\n    sudo apt update\n    sudo apt install {package}\n\nOn RPM-based systems (Red Hat, Fedora), try this command:\n    sudo dnf install {package}\n'''\n\nWINDOWS_INSTALL_ADVICE = '''\nIf not already installed, install the Chocolatey package manager. Then use\na command prompt to install the missing package:\n    choco install {package}\n'''\n\n\ndef _get_platform() -> str:\n    if sys.platform.startswith('freebsd'):\n        return 'freebsd'\n    elif sys.platform.startswith('linux'):\n        return 'linux'\n    elif sys.platform.startswith('win'):\n        return 'windows'\n    return sys.platform\n\n\ndef _error_trailer(program: str, package: str | Mapping[str, str], **kwargs) -> None:\n    del kwargs\n    if isinstance(package, Mapping):\n        package = package.get(_get_platform(), program)\n\n    if _get_platform() == 'darwin':\n        log.info(OSX_INSTALL_ADVICE.format(**locals()))\n    elif _get_platform() == 'linux':\n        log.info(LINUX_INSTALL_ADVICE.format(**locals()))\n    elif _get_platform() == 'windows':\n        log.info(WINDOWS_INSTALL_ADVICE.format(**locals()))\n\n\ndef _error_missing_program(\n    program: str, package: str, required_for: str | None, recommended: bool\n) -> None:\n    # pylint: 
disable=unused-argument\n    if recommended:\n        log.warning(MISSING_RECOMMEND_PROGRAM.format(**locals()))\n    elif required_for:\n        log.error(MISSING_OPTIONAL_PROGRAM.format(**locals()))\n    else:\n        log.error(MISSING_PROGRAM.format(**locals()))\n    _error_trailer(**locals())\n\n\ndef _error_old_version(\n    program: str,\n    package: str,\n    need_version: str,\n    found_version: str,\n    required_for: str | None,\n) -> None:\n    # pylint: disable=unused-argument\n    if required_for:\n        log.error(OLD_VERSION_REQUIRED_FOR.format(**locals()))\n    else:\n        log.error(OLD_VERSION.format(**locals()))\n    _error_trailer(**locals())\n\n\ndef check_external_program(\n    *,\n    program: str,\n    package: str,\n    version_checker: Callable[[], Version],\n    need_version: str | Version,\n    required_for: str | None = None,\n    recommended: bool = False,\n    version_parser: type[Version] = Version,\n) -> None:\n    \"\"\"Check for required version of external program and raise exception if not.\n\n    Args:\n        program: The name of the program to test.\n        package: The name of a software package that typically supplies this program.\n            Usually the same as program.\n        version_checker: A callable without arguments that retrieves the installed\n            version of program.\n        need_version: The minimum required version.\n        required_for: The name of an argument of feature that requires this program.\n        recommended: If this external program is recommended, instead of raising\n            an exception, log a warning and allow execution to continue.\n        version_parser: A class that should be used to parse and compare version\n            numbers. 
Used when version numbers do not follow standard conventions.\n    \"\"\"\n    if not isinstance(need_version, Version):\n        need_version = version_parser(need_version)\n    try:\n        found_version = version_checker()\n    except (CalledProcessError, FileNotFoundError) as e:\n        _error_missing_program(program, package, required_for, recommended)\n        if not recommended:\n            raise MissingDependencyError(program) from e\n        return\n    except MissingDependencyError:\n        _error_missing_program(program, package, required_for, recommended)\n        if not recommended:\n            raise\n        return\n\n    if found_version and found_version < need_version:\n        _error_old_version(\n            program, package, str(need_version), str(found_version), required_for\n        )\n        if not recommended:\n            raise MissingDependencyError(program)\n\n    log.debug('Found %s %s', program, found_version)\n"
  },
  {
    "path": "src/ocrmypdf/subprocess/_windows.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\"\"\"Find Tesseract and Ghostscript binaries on Windows using the registry.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport re\nimport shutil\nimport sys\nfrom collections.abc import Callable, Iterable, Iterator\nfrom itertools import chain\nfrom pathlib import Path\nfrom typing import Any, TypeAlias, TypeVar\n\nfrom packaging.version import InvalidVersion, Version\n\nif sys.platform == 'win32':\n    # mypy understands 'if sys.platform' better than try/except ModuleNotFoundError\n    import winreg  # pylint: disable=import-error\n\n    HKEYType: TypeAlias = winreg.HKEYType\nelse:\n    from unittest.mock import Mock\n\n    winreg = Mock(\n        spec=['HKEYType', 'EnumKey', 'EnumValue', 'HKEY_LOCAL_MACHINE', 'OpenKey']\n    )\n    # mypy does not understand winreg.HKeyType where winreg is a Mock (fair enough!)\n    HKEYType: TypeAlias = Any  # type: ignore\n\n\nlog = logging.getLogger(__name__)\n\nT = TypeVar('T')\nTkey = TypeVar('Tkey')\n\n\ndef ghostscript_version_key(s: str) -> tuple[int, int, int]:\n    \"\"\"Compare Ghostscript version numbers.\"\"\"\n    try:\n        release = [int(elem) for elem in s.split('.', maxsplit=3)]\n        while len(release) < 3:\n            release.append(0)\n        return (release[0], release[1], release[2])\n    except ValueError:\n        return (0, 0, 0)\n\n\ndef registry_enum(key: HKEYType, enum_fn: Callable[[HKEYType, int], T]) -> Iterator[T]:\n    limit = 999\n    n = 0\n    while n < limit:\n        try:\n            yield enum_fn(key, n)\n            n += 1\n        except OSError:\n            break\n    if n == limit:\n        raise ValueError(f\"Too many registry keys under {key}\")\n\n\ndef registry_subkeys(key: HKEYType) -> Iterator[str]:\n    return registry_enum(key, winreg.EnumKey)\n\n\ndef registry_values(key: HKEYType) -> Iterator[tuple[str, Any, int]]:\n    return 
registry_enum(key, winreg.EnumValue)\n\n\ndef registry_path_ghostscript(env=None) -> Iterator[Path]:\n    del env  # unused (but needed for protocol)\n    try:\n        with winreg.OpenKey(\n            winreg.HKEY_LOCAL_MACHINE, r\"SOFTWARE\\Artifex\\GPL Ghostscript\"\n        ) as k:\n            latest_gs = max(\n                registry_subkeys(k), key=ghostscript_version_key, default=(0, 0, 0)\n            )\n        with winreg.OpenKey(\n            winreg.HKEY_LOCAL_MACHINE, rf\"SOFTWARE\\Artifex\\GPL Ghostscript\\{latest_gs}\"\n        ) as k:\n            for _, gs_path, _ in registry_values(k):\n                yield Path(gs_path) / 'bin'\n    except OSError as e:\n        log.warning(e)\n\n\ndef registry_path_tesseract(env=None) -> Iterator[Path]:\n    del env  # unused (but needed for protocol)\n    try:\n        with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r\"SOFTWARE\\Tesseract-OCR\") as k:\n            for subkey, val, _valtype in registry_values(k):\n                if subkey == 'InstallDir':\n                    tesseract_path = Path(val)\n                    yield tesseract_path\n    except OSError as e:\n        log.warning(e)\n\n\ndef _gs_version_in_path_key(path: Path) -> tuple[str, Version | None]:\n    \"\"\"Key function for comparing Ghostscript and Tesseract paths.\n\n    Ghostscript installs on Windows:\n        %PROGRAMFILES%/gs/gs9.56.1/bin -> ('gs', Version('9.56.1'))\n        %PROGRAMFILES%/gs/9.24/bin -> ('gs', Version('9.24'))\n\n    Tesseract looks like:\n        %PROGRAMFILES%/Tesseract-OCR -> ('Tesseract-OCR', None)\n\n    Thus ensuring the resulting tuple will order the alternatives correctly,\n    e.g. 
gs10.0 > gs9.99.\n    \"\"\"\n    match = re.search(r'gs[/\\\\]?([0-9.]+)[/\\\\]bin', str(path))\n    if match:\n        try:\n            version_str = match.group(1)\n            version = Version(version_str)\n            return 'gs', version\n        except InvalidVersion:\n            pass\n    return path.name, None\n\n\ndef program_files_paths(env=None) -> Iterator[Path]:\n    if not env:\n        env = os.environ\n    program_files = env.get('PROGRAMFILES', '')\n\n    def path_walker() -> Iterator[Path]:\n        for path in Path(program_files).iterdir():\n            if not path.is_dir():\n                continue\n            if path.name.lower() == 'tesseract-ocr':\n                yield path\n            elif path.name.lower() == 'gs':\n                yield from (p for p in path.glob('**/bin') if p.is_dir())\n\n    return iter(\n        sorted(\n            (p for p in path_walker()),\n            key=_gs_version_in_path_key,\n            reverse=True,\n        )\n    )\n\n\ndef paths_from_env(env=None) -> Iterator[Path]:\n    return (Path(p) for p in os.get_exec_path(env) if p)\n\n\ndef shim_path(new_paths: Callable[[Any], Iterator[Path]], env=None) -> str:\n    if not env:\n        env = os.environ\n    return os.pathsep.join(str(p) for p in new_paths(env) if p)\n\n\nSHIMS = [\n    paths_from_env,\n    registry_path_ghostscript,\n    registry_path_tesseract,\n    program_files_paths,\n]\n\n\ndef fix_windows_args(program: str, args, env):\n    \"\"\"Adjust our desired program and command line arguments for use on Windows.\"\"\"\n    # If we are running a .py on Windows, ensure we call it with this Python\n    # (to support test suite shims)\n    if program.lower().endswith('.py'):\n        args = [sys.executable] + args\n\n    # If the program we want is not on the PATH, check elsewhere\n    for shim in SHIMS:\n        shimmed_path = shim_path(shim, env)\n        new_args0 = shutil.which(args[0], path=shimmed_path)\n        if new_args0:\n            
args[0] = new_args0\n            break\n\n    return args\n\n\ndef unique_everseen(iterable: Iterable[T], key: Callable[[T], Tkey]) -> Iterator[T]:\n    \"\"\"List unique elements, preserving order.\"\"\"\n    # unique_everseen('AAAABBBCCDAABBB') --> A B C D\n    # unique_everseen('ABBCcAD', str.lower) --> A B C D\n    seen: set[Tkey] = set()\n    seen_add = seen.add\n    for element in iterable:\n        k = key(element)\n        if k not in seen:\n            seen_add(k)\n            yield element\n\n\ndef _casefold_path(path: Path) -> str:\n    return str.casefold(str(path))\n\n\ndef shim_env_path(env=None):\n    if env is None:\n        env = os.environ\n\n    shim_paths = chain.from_iterable(shim(env) for shim in SHIMS)\n    return os.pathsep.join(\n        str(p) for p in unique_everseen(shim_paths, key=_casefold_path)\n    )\n"
  },
  {
    "path": "tests/__init__.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Tests.\"\"\"\n\nfrom __future__ import annotations\n"
  },
  {
    "path": "tests/cache/manifest.jsonl",
    "content": "{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/trivial.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__--psm__2__000003_rasterize.png__stdout\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"--psm\", \"2\", \"$TMPDIR/000003_rasterize.png\", \"stdout\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__--psm__2__000004_rasterize.png__stdout\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"--psm\", \"2\", \"$TMPDIR/000004_rasterize.png\", \"stdout\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__--psm__2__000005_rasterize.png__stdout\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"--psm\", \"2\", \"$TMPDIR/000005_rasterize.png\", \"stdout\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__--psm__2__000001_rasterize.png__stdout\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"--psm\", \"2\", \"$TMPDIR/000001_rasterize.png\", \"stdout\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/2400dpi.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/3small.pdf\", 
\"args\": [\"-l\", \"eng\", \"$TMPDIR/000002_ocr.png\", \"$TMPDIR/000002_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__--psm__2__000006_rasterize.png__stdout\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"--psm\", \"2\", \"$TMPDIR/000006_rasterize.png\", \"stdout\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/3small.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000003_ocr.png\", \"$TMPDIR/000003_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000004_ocr.png\", \"$TMPDIR/000004_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__thresholding_method=1__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/trivial.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"thresholding_method=1\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/ccitt.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", 
\"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/jbig2.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/3small.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000003_ocr.png\", \"$TMPDIR/000003_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/skew.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000005_ocr.png__000005_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000005_ocr.png\", \"$TMPDIR/000005_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__thresholding_method=2__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/trivial.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"thresholding_method=2\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": 
\"3.11.14\", \"argv_slug\": \"__-l__eng__--psm__7__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/skew.pdf\", \"args\": [\"-l\", \"eng\", \"--psm\", \"7\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000006_ocr.png__000006_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000006_ocr.png\", \"$TMPDIR/000006_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/graph_ocred.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/skew.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__--psm__7__000001_ocr.png__000001_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/skew.pdf\", \"args\": [\"-l\", \"eng\", \"--psm\", \"7\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000003_ocr.png\", \"$TMPDIR/000003_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": 
\"__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000004_ocr.png\", \"$TMPDIR/000004_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__--oem__1__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/trivial.pdf\", \"args\": [\"-l\", \"eng\", \"--oem\", \"1\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000005_ocr.png__000005_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000005_ocr.png\", \"$TMPDIR/000005_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__--psm__2__000001_rasterize.png__stdout\", \"sourcefile\": \"resources/ccitt.pdf\", \"args\": [\"-l\", \"eng\", \"--psm\", \"2\", \"$TMPDIR/000001_rasterize.png\", \"stdout\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__--psm__2__000001_rasterize.png__stdout\", \"sourcefile\": \"resources/jbig2.pdf\", \"args\": [\"-l\", \"eng\", \"--psm\", \"2\", \"$TMPDIR/000001_rasterize.png\", \"stdout\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000006_ocr.png__000006_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000006_ocr.png\", \"$TMPDIR/000006_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__--psm__2__000001_rasterize.png__stdout\", \"sourcefile\": 
\"resources/lichtenstein.pdf\", \"args\": [\"-l\", \"eng\", \"--psm\", \"2\", \"$TMPDIR/000001_rasterize.png\", \"stdout\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/lichtenstein.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__--psm__2__000001_rasterize.png__stdout\", \"sourcefile\": \"resources/palette.pdf\", \"args\": [\"-l\", \"eng\", \"--psm\", \"2\", \"$TMPDIR/000001_rasterize.png\", \"stdout\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/aspect.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/palette.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/palette.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000001_ocr.png\", 
\"$TMPDIR/000001_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000005_ocr.png__000005_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000005_ocr.png\", \"$TMPDIR/000005_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000006_ocr.png__000006_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000006_ocr.png\", \"$TMPDIR/000006_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000001_ocr.png\", \"/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000003_ocr.png\", \"/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000003_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/ccitt.pdf\", \"args\": [\"-l\", \"eng\", \"/tmp/pytest-of-jb/pytest-73/popen-gw4/test_hocr_to_pdf_api0/000001_ocr.png\", \"/tmp/pytest-of-jb/pytest-73/popen-gw4/test_hocr_to_pdf_api0/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", 
\"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/ccitt.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000002_ocr.png\", \"/tmp/pytest-of-jb/pytest-73/popen-gw3/test_hocr_api_multipage0/000002_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/trivial.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/multipage.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000004_ocr.png\", \"$TMPDIR/000004_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/3small.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000002_ocr.png\", \"$TMPDIR/000002_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/2400dpi.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": 
\"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/3small.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/jbig2.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/3small.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000003_ocr.png\", \"$TMPDIR/000003_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__thresholding_method=1__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/trivial.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"thresholding_method=1\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/graph_ocred.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__--psm__7__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/skew.pdf\", \"args\": [\"-l\", \"eng\", \"--psm\", \"7\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": 
\"__-l__eng__thresholding_method=2__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/trivial.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"thresholding_method=2\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/skew.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__--oem__1__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/trivial.pdf\", \"args\": [\"-l\", \"eng\", \"--oem\", \"1\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/palette.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/jbig2.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/ccitt.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": 
\"__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/lichtenstein.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/lichtenstein.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__osd__--psm__0__000001_rasterize_preview.jpg__stdout\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", \"osd\", \"--psm\", \"0\", \"$TMPDIR/000001_rasterize_preview.jpg\", \"stdout\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__osd__--psm__0__000002_rasterize_preview.jpg__stdout\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", \"osd\", \"--psm\", \"0\", \"$TMPDIR/000002_rasterize_preview.jpg\", \"stdout\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__osd__--psm__0__000003_rasterize_preview.jpg__stdout\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", \"osd\", \"--psm\", \"0\", \"$TMPDIR/000003_rasterize_preview.jpg\", \"stdout\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/aspect.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__osd__--psm__0__000004_rasterize_preview.jpg__stdout\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", 
\"osd\", \"--psm\", \"0\", \"$TMPDIR/000004_rasterize_preview.jpg\", \"stdout\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000003_ocr.png\", \"$TMPDIR/000003_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000002_ocr.png\", \"$TMPDIR/000002_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000004_ocr.png\", \"$TMPDIR/000004_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000002_ocr.png\", \"$TMPDIR/000002_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000004_ocr.png\", \"$TMPDIR/000004_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": 
\"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/poster.pdf\", \"args\": [\"-l\", \"eng\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__deu__000001_ocr.png__000001_ocr_hocr__hocr__txt\", \"sourcefile\": \"resources/francais.pdf\", \"args\": [\"-l\", \"deu\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_hocr\", \"hocr\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000001_ocr.png\", \"$TMPDIR/000001_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000002_ocr.png\", \"$TMPDIR/000002_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000003_ocr.png\", \"$TMPDIR/000003_ocr_tess\", \"pdf\", \"txt\"]}\n{\"tesseract_version\": \"5.5.1\", \"system\": \"Linux\", \"python\": \"3.11.14\", \"argv_slug\": \"__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt\", \"sourcefile\": \"resources/cardinal.pdf\", \"args\": [\"-l\", \"eng\", \"-c\", \"textonly_pdf=1\", \"$TMPDIR/000004_ocr.png\", \"$TMPDIR/000004_ocr_tess\", \"pdf\", \"txt\"]}\n"
  },
  {
    "path": "tests/conftest.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport logging\nimport platform\nimport sys\nfrom pathlib import Path\nfrom subprocess import CompletedProcess, run\n\nimport pytest\n\nfrom ocrmypdf import api, pdfinfo\nfrom ocrmypdf._exec import unpaper\nfrom ocrmypdf.api import setup_plugin_infrastructure\nfrom ocrmypdf.cli import get_options_and_plugins\nfrom ocrmypdf.exceptions import ExitCode\n\n\nclass Gs106WarningFilter(logging.Filter):\n    \"\"\"Filter out expected Ghostscript 10.6.x warning from test logs.\"\"\"\n\n    def filter(self, record: logging.LogRecord) -> bool:\n        # Allow all records except the expected Ghostscript 10.6.x warning\n        return (\n            \"Ghostscript 10.6.x contains JPEG encoding errors\"\n            not in record.getMessage()\n        )\n\n\n@pytest.fixture(autouse=True)\ndef suppress_gs106_warning():\n    \"\"\"Suppress the expected Ghostscript 10.6.x JPEG encoding warning in tests.\"\"\"\n    # Add filter to root logger to suppress expected warnings\n    root_logger = logging.getLogger()\n    warning_filter = Gs106WarningFilter()\n    root_logger.addFilter(warning_filter)\n    yield\n    root_logger.removeFilter(warning_filter)\n\n\ndef is_linux():\n    return platform.system() == 'Linux'\n\n\ndef is_macos():\n    return platform.system() == 'Darwin'\n\n\ndef have_unpaper():\n    try:\n        unpaper.version()\n    except Exception:  # pylint: disable=broad-except\n        return False\n    return True\n\n\nTESTS_ROOT = Path(__file__).parent.resolve()\nPROJECT_ROOT = TESTS_ROOT\n\n\n@pytest.fixture(scope=\"session\")\ndef resources() -> Path:\n    return Path(TESTS_ROOT) / 'resources'\n\n\n@pytest.fixture\ndef ocrmypdf_exec() -> list[str]:\n    return [sys.executable, '-m', 'ocrmypdf']\n\n\n@pytest.fixture(scope=\"function\")\ndef outdir(tmp_path) -> Path:\n    return tmp_path\n\n\n@pytest.fixture(scope=\"function\")\ndef 
outpdf(tmp_path) -> Path:\n    return tmp_path / 'out.pdf'\n\n\n@pytest.fixture(scope=\"function\")\ndef outtxt(tmp_path) -> Path:\n    return tmp_path / 'out.txt'\n\n\n@pytest.fixture(scope=\"function\")\ndef no_outpdf(tmp_path) -> Path:\n    \"\"\"Document fact that a test is not expected to produce output.\n\n    This just documents the fact that a test is not expected to produce\n    output. Unfortunately an assertion failure inside a test fixture produces\n    an error rather than a test failure, so no testing is done. It's up to\n    the test to confirm that no output file was created.\n    \"\"\"\n    return tmp_path / 'no_output.pdf'\n\n\n@pytest.fixture(scope=\"session\")\ndef multipage(resources):\n    return resources / 'multipage.pdf'\n\n\ndef check_ocrmypdf(input_file: Path, output_file: Path, *args) -> Path:\n    \"\"\"Run ocrmypdf and confirm that a valid plausible PDF was created.\"\"\"\n    api_args = [str(input_file), str(output_file)] + [\n        str(arg) for arg in args if arg is not None\n    ]\n\n    options, plugin_manager = get_options_and_plugins(args=api_args)\n    api.check_options(options, plugin_manager)\n    result = api.run_pipeline(options, plugin_manager=plugin_manager)\n\n    assert result == 0\n    assert output_file.exists(), \"Output file not created\"\n    assert output_file.stat().st_size > 100, \"PDF too small or empty\"\n\n    return output_file\n\n\ndef run_ocrmypdf_api(input_file: Path, output_file: Path, *args) -> ExitCode:\n    \"\"\"Run ocrmypdf via its API in-process, but return CLI-style ExitCode.\n\n    This simulates calling the command line interface in a subprocess and allows us\n    to check that the command line interface is working correctly, but since it is\n    in-process it is easier to trace with a debugger or coverage tool.\n\n    Any exception raised will be trapped and converted to an exit code.\n    The return code must be checked by the caller to determine if the test passed.\n    \"\"\"\n    api_args 
= [str(input_file), str(output_file)] + [\n        str(arg) for arg in args if arg is not None\n    ]\n    options, plugin_manager = get_options_and_plugins(args=api_args)\n\n    api.check_options(options, plugin_manager)\n    return api.run_pipeline_cli(options, plugin_manager=plugin_manager)\n\n\ndef run_ocrmypdf(\n    input_file: Path, output_file: Path, *args, text: bool = True\n) -> CompletedProcess:\n    \"\"\"Run ocrmypdf in a subprocess and let test deal with results.\n\n    If an exception is thrown this fact will be returned as part of the result\n    text and return code rather than exception objects.\n    \"\"\"\n    p_args = (\n        [sys.executable, '-m', 'ocrmypdf']\n        + [str(arg) for arg in args if arg is not None]\n        + [str(input_file), str(output_file)]\n    )\n\n    p = run(\n        p_args,\n        capture_output=True,\n        text=text,\n        check=False,\n    )\n    # print(p.stderr)\n    return p\n\n\ndef first_page_dimensions(pdf: Path):\n    info = pdfinfo.PdfInfo(pdf)\n    page0 = info[0]\n    return (page0.width_inches, page0.height_inches)\n\n\ndef pytest_addoption(parser):\n    parser.addoption(\n        \"--runslow\",\n        action=\"store_true\",\n        default=False,\n        help=(\n            \"run slow tests only useful for development (unlikely to be \"\n            \"useful for downstream packagers)\"\n        ),\n    )\n\n\ndef pytest_collection_modifyitems(config, items):\n    if config.getoption(\"--runslow\"):\n        # --runslow given in cli: do not skip slow tests\n        return\n    skip_slow = pytest.mark.skip(reason=\"need --runslow option to run\")\n    for item in items:\n        if \"slow\" in item.keywords:\n            item.add_marker(skip_slow)\n\n\ndef get_test_plugin_manager(plugins=None):\n    \"\"\"Get a properly initialized plugin manager for testing.\"\"\"\n    return setup_plugin_infrastructure(plugins=plugins or [])\n"
  },
  {
    "path": "tests/plugins/gs_feature_elision.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\n\nfrom __future__ import annotations\n\nfrom unittest.mock import patch\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf.builtin_plugins import ghostscript\nfrom ocrmypdf.subprocess import run_polling_stderr\n\nELISION_WARNING = \"\"\"GPL Ghostscript 9.50: Setting Overprint Mode to 1\nnot permitted in PDF/A-2, overprint mode not set\"\"\"\n\n\ndef run_append_stderr(*args, **kwargs):\n    proc = run_polling_stderr(*args, **kwargs)\n    proc.stderr += '\\n' + ELISION_WARNING + '\\n'\n    return proc\n\n\n@hookimpl\ndef generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):\n    with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock:\n        mock.side_effect = run_append_stderr\n        ghostscript.generate_pdfa(\n            pdf_pages=pdf_pages,\n            pdfmark=pdfmark,\n            output_file=output_file,\n            context=context,\n            pdf_version=pdf_version,\n            pdfa_part=pdfa_part,\n            progressbar_class=None,\n            stop_on_soft_error=True,\n        )\n        mock.assert_called_once()\n    return output_file\n"
  },
  {
    "path": "tests/plugins/gs_pdfa_failure.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\n\nfrom __future__ import annotations\n\nfrom unittest.mock import patch\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf.builtin_plugins import ghostscript\nfrom ocrmypdf.subprocess import run_polling_stderr\n\n\ndef run_rig_args(args, **kwargs):\n    # Remove the arguments that tell Ghostscript to create a PDF/A: the\n    # -dPDFA* options and any argument ending in '.ps' (presumably the\n    # PostScript definition file), so that PDF/A creation fails\n    new_args = [\n        arg for arg in args if not arg.startswith('-dPDFA') and not arg.endswith('.ps')\n    ]\n    proc = run_polling_stderr(new_args, **kwargs)\n    return proc\n\n\n@hookimpl\ndef generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):\n    with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock:\n        mock.side_effect = run_rig_args\n        ghostscript.generate_pdfa(\n            pdf_pages=pdf_pages,\n            pdfmark=pdfmark,\n            output_file=output_file,\n            context=context,\n            pdf_version=pdf_version,\n            pdfa_part=pdfa_part,\n            progressbar_class=None,\n            stop_on_soft_error=True,\n        )\n        mock.assert_called()\n        return output_file\n"
  },
  {
    "path": "tests/plugins/gs_raster_failure.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\n\nfrom __future__ import annotations\n\nfrom pathlib import Path\nfrom subprocess import CalledProcessError\nfrom unittest.mock import patch\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf.builtin_plugins import ghostscript\n\n\ndef raise_gs_fail(*args, **kwargs):\n    raise CalledProcessError(\n        1, 'gs', output=b\"\", stderr=b\"TEST ERROR: gs_raster_failure.py\"\n    )\n\n\n@hookimpl\ndef rasterize_pdf_page(\n    input_file,\n    output_file,\n    raster_device,\n    raster_dpi,\n    pageno,\n    page_dpi,\n    rotation,\n    filter_vector,\n    stop_on_soft_error,\n    options,\n    use_cropbox,\n) -> Path:\n    with patch('ocrmypdf._exec.ghostscript.run') as mock:\n        mock.side_effect = raise_gs_fail\n        ghostscript.rasterize_pdf_page(\n            input_file=input_file,\n            output_file=output_file,\n            raster_device=raster_device,\n            raster_dpi=raster_dpi,\n            pageno=pageno,\n            page_dpi=page_dpi,\n            rotation=rotation,\n            filter_vector=filter_vector,\n            stop_on_soft_error=stop_on_soft_error,\n            options=options,\n            use_cropbox=use_cropbox,\n        )\n        mock.assert_called()\n        return output_file\n"
  },
  {
    "path": "tests/plugins/gs_raster_soft_error.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\n\nfrom __future__ import annotations\n\nfrom pathlib import Path\nfrom subprocess import CalledProcessError\nfrom unittest.mock import patch\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf.builtin_plugins import ghostscript\nfrom ocrmypdf.subprocess import run\n\n\ndef fail_if_stoponerror(args, **kwargs):\n    if '-dPDFSTOPONERROR' in args:\n        raise CalledProcessError(1, 'gs', output=b\"\", stderr=b\"PDF STOP ON ERROR\")\n    return run(args, **kwargs)\n\n\n@hookimpl\ndef rasterize_pdf_page(\n    input_file,\n    output_file,\n    raster_device,\n    raster_dpi,\n    pageno,\n    page_dpi,\n    rotation,\n    filter_vector,\n    stop_on_soft_error,\n    options,\n    use_cropbox,\n) -> Path:\n    with patch('ocrmypdf._exec.ghostscript.run') as mock:\n        mock.side_effect = fail_if_stoponerror\n        ghostscript.rasterize_pdf_page(\n            input_file=input_file,\n            output_file=output_file,\n            raster_device=raster_device,\n            raster_dpi=raster_dpi,\n            pageno=pageno,\n            page_dpi=page_dpi,\n            rotation=rotation,\n            filter_vector=filter_vector,\n            stop_on_soft_error=stop_on_soft_error,\n            options=options,\n            use_cropbox=use_cropbox,\n        )\n        mock.assert_called()\n        return output_file\n"
  },
  {
    "path": "tests/plugins/gs_render_failure.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\n\nfrom __future__ import annotations\n\nfrom subprocess import CalledProcessError\nfrom unittest.mock import patch\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf.builtin_plugins import ghostscript\n\n\ndef raise_gs_fail(*args, **kwargs):\n    raise CalledProcessError(\n        1, 'gs', output=b\"\", stderr=b\"TEST ERROR: gs_render_failure.py\"\n    )\n\n\n@hookimpl\ndef generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):\n    with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock:\n        mock.side_effect = raise_gs_fail\n        ghostscript.generate_pdfa(\n            pdf_pages=pdf_pages,\n            pdfmark=pdfmark,\n            output_file=output_file,\n            context=context,\n            pdf_version=pdf_version,\n            pdfa_part=pdfa_part,\n            progressbar_class=None,\n            stop_on_soft_error=True,\n        )\n        mock.assert_called()\n        return output_file\n"
  },
  {
    "path": "tests/plugins/gs_render_soft_error.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\n\nfrom __future__ import annotations\n\nfrom subprocess import CalledProcessError\nfrom unittest.mock import patch\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf.builtin_plugins import ghostscript\nfrom ocrmypdf.subprocess import run_polling_stderr\n\n\ndef fail_if_stoponerror(args, **kwargs):\n    if '-dPDFSTOPONERROR' in args:\n        raise CalledProcessError(1, 'gs', output=b\"\", stderr=b\"PDF STOP ON ERROR\")\n    return run_polling_stderr(args, **kwargs)\n\n\n@hookimpl\ndef generate_pdfa(\n    pdf_pages,\n    pdfmark,\n    output_file,\n    context,\n    pdf_version,\n    pdfa_part,\n    stop_on_soft_error,\n):\n    with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock:\n        mock.side_effect = fail_if_stoponerror\n        ghostscript.generate_pdfa(\n            pdf_pages=pdf_pages,\n            pdfmark=pdfmark,\n            output_file=output_file,\n            context=context,\n            pdf_version=pdf_version,\n            pdfa_part=pdfa_part,\n            progressbar_class=None,\n            stop_on_soft_error=stop_on_soft_error,\n        )\n        mock.assert_called()\n        return output_file\n"
  },
  {
    "path": "tests/plugins/tesseract_badutf8.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\n\n\"\"\"Tesseract bad utf8.\n\nIn some cases, some versions of Tesseract can output binary gibberish or data\nthat is not UTF-8 compatible, so we are forced to check that we can convert it\nand present it to the user.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom contextlib import contextmanager\nfrom subprocess import CalledProcessError\nfrom unittest.mock import patch\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine\n\n\ndef bad_utf8(*args, **kwargs):\n    raise CalledProcessError(\n        1,\n        'tesseract',\n        output=b'\\x96\\xb3\\x8c\\xf8\\x82\\xc8UTF-8\\x0a',  # \"Invalid UTF-8\" in Shift JIS\n        stderr=b\"\",\n    )\n\n\n@contextmanager\ndef patch_tesseract_run():\n    with patch('ocrmypdf._exec.tesseract.run') as mock:\n        mock.side_effect = bad_utf8\n        yield\n        mock.assert_called()\n\n\nclass BadUtf8OcrEngine(TesseractOcrEngine):\n    @staticmethod\n    def generate_hocr(input_file, output_hocr, output_text, options):\n        with patch_tesseract_run():\n            TesseractOcrEngine.generate_hocr(\n                input_file, output_hocr, output_text, options\n            )\n\n    @staticmethod\n    def generate_pdf(input_file, output_pdf, output_text, options):\n        with patch_tesseract_run():\n            TesseractOcrEngine.generate_pdf(\n                input_file, output_pdf, output_text, options\n            )\n\n\n@hookimpl\ndef get_ocr_engine():\n    return BadUtf8OcrEngine()\n"
  },
  {
    "path": "tests/plugins/tesseract_big_image_error.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\nfrom __future__ import annotations\n\nfrom contextlib import contextmanager\nfrom subprocess import CalledProcessError\nfrom unittest.mock import patch\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine\n\n\ndef raise_size_exception(*args, **kwargs):\n    raise CalledProcessError(\n        1,\n        'tesseract',\n        output=b\"Image too large: (33830, 14959)\\nError during processing.\",\n        stderr=b\"\",\n    )\n\n\n@contextmanager\ndef patch_tesseract_run():\n    with patch('ocrmypdf._exec.tesseract.run') as mock:\n        mock.side_effect = raise_size_exception\n        yield\n        mock.assert_called()\n\n\nclass BigImageErrorOcrEngine(TesseractOcrEngine):\n    @staticmethod\n    def get_orientation(input_file, options):\n        with patch_tesseract_run():\n            return TesseractOcrEngine.get_orientation(input_file, options)\n\n    @staticmethod\n    def generate_hocr(input_file, output_hocr, output_text, options):\n        with patch_tesseract_run():\n            TesseractOcrEngine.generate_hocr(\n                input_file, output_hocr, output_text, options\n            )\n\n    @staticmethod\n    def generate_pdf(input_file, output_pdf, output_text, options):\n        with patch_tesseract_run():\n            TesseractOcrEngine.generate_pdf(\n                input_file, output_pdf, output_text, options\n            )\n\n\n@hookimpl\ndef get_ocr_engine():\n    return BigImageErrorOcrEngine()\n"
  },
  {
    "path": "tests/plugins/tesseract_cache.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\n\"\"\"Cache output of tesseract to speed up test suite.\n\nThe cache is keyed by by the input test file The input arguments are slugged\ninto a hideous filename that more or less represents them literally.  Joined\ntogether, this becomes the name of the cache folder.  A few name files like\nstdout, stderr, hocr, pdf, describe the output to reproduce.\n\nChanges to tests/resources/ or image processing algorithms don't trigger a\ncache miss.  By design, an input image that varies according to platform\ndifferences (e.g. JPEG decoders are allowed to produce differing outputs,\nand in practice they do) will still be a cache hit.  By design, an\ninvocation of tesseract with the same parameters from a different test case\nwill be a hit.  It's fragile.\n\nThe tests/cache/manifest.jsonl is a JSON lines file that contains\ninformation about the system that produced the results used when cache was\ngenerated.  
This mainly a log to answer questions about how the files\nwere produced.\n\nCertain operations are not cached and routed to Tesseract OCR directly.\n\nAssumes Tesseract 4+.\n\n\"\"\"\n\nfrom __future__ import annotations\n\nimport argparse\nimport json\nimport logging\nimport platform\nimport re\nimport shutil\nimport threading\nfrom functools import partial\nfrom pathlib import Path\nfrom subprocess import PIPE, CalledProcessError, CompletedProcess\nfrom unittest.mock import patch\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine\nfrom ocrmypdf.subprocess import run\n\nlog = logging.getLogger(__name__)\n\nTESTS_ROOT = Path(__file__).resolve().parent.parent\nCACHE_ROOT = TESTS_ROOT / 'cache'\n\n\nparser = argparse.ArgumentParser(\n    prog='tesseract-cache', description='cache output of tesseract'\n)\nparser.add_argument('-l', '--language', action='append')\nparser.add_argument('imagename')\nparser.add_argument('outputbase')\nparser.add_argument('configfiles', nargs='*')\nparser.add_argument('--user-words', type=str)\nparser.add_argument('--user-patterns', type=str)\nparser.add_argument('-c', action='append')\nparser.add_argument('--psm', type=int)\nparser.add_argument('--oem', type=int)\n\n\ndef get_cache_folder(source_pdf, run_args, parsed_args):\n    def slugs():\n        yield ''  # so we don't start with a '-' which makes rm difficult\n        for arg in run_args[1:]:\n            if arg == parsed_args.imagename:\n                yield Path(parsed_args.imagename).name\n            elif arg == parsed_args.outputbase:\n                yield Path(parsed_args.outputbase).name\n            elif arg == '-c' or arg.startswith('textonly'):\n                pass\n            else:\n                yield arg\n\n    argv_slug = '__'.join(slugs())\n    argv_slug = argv_slug.replace('/', '___')\n\n    return Path(CACHE_ROOT) / Path(source_pdf).stem / argv_slug\n\n\ndef cached_run(options, run_args, **run_kwargs):\n    
run_args = [str(arg) for arg in run_args]  # flatten PosixPaths\n    args = parser.parse_args(run_args[1:])\n\n    if args.imagename in ('stdin', '-'):\n        return run(run_args, **run_kwargs)\n\n    source_file = options.input_file\n    cache_folder = get_cache_folder(source_file, run_args, args)\n    cache_folder.mkdir(parents=True, exist_ok=True)\n\n    log.debug(f\"Using Tesseract cache {cache_folder}\")\n\n    # Determine what configfiles we need\n    configfiles = args.configfiles if args.configfiles else ['txt']\n\n    # Check if cache has all required files\n    def cache_complete():\n        if not (cache_folder / 'stderr.bin').exists():\n            return False\n        if not (cache_folder / 'stdout.bin').exists():\n            return False\n        if args.outputbase != 'stdout':\n            for configfile in configfiles:\n                if not (cache_folder / f'{configfile}.bin').exists():\n                    return False\n        return True\n\n    if cache_complete():\n        log.debug(\"Cache HIT\")\n\n        # Replicate stdout/err\n        if args.outputbase != 'stdout':\n            for configfile in configfiles:\n                # cp cache -> output\n                tessfile = args.outputbase + '.' 
+ configfile\n                shutil.copy(str(cache_folder / configfile) + '.bin', tessfile)\n        return CompletedProcess(\n            args=run_args,\n            returncode=0,\n            stdout=(cache_folder / 'stdout.bin').read_bytes(),\n            stderr=(cache_folder / 'stderr.bin').read_bytes(),\n        )\n\n    log.debug(\"Cache MISS\")\n\n    cache_kwargs = {\n        k: v for k, v in run_kwargs.items() if k not in ('stdout', 'stderr')\n    }\n    # Don't pass timeout=0 to the actual run call - it would timeout immediately\n    # A timeout of 0 means \"use default/no timeout\" in the caching context\n    if cache_kwargs.get('timeout') == 0.0:\n        cache_kwargs['timeout'] = None\n    if 'check' not in cache_kwargs:\n        cache_kwargs['check'] = True\n    try:\n        p = run(run_args, stdout=PIPE, stderr=PIPE, **cache_kwargs)\n    except CalledProcessError as e:\n        log.exception(e)\n        raise  # Pass exception onward\n\n    # Update cache\n    (cache_folder / 'stdout.bin').write_bytes(p.stdout)\n    (cache_folder / 'stderr.bin').write_bytes(p.stderr)\n\n    if args.outputbase != 'stdout':\n        for configfile in configfiles:\n            if configfile not in ('fpdf2', 'hocr', 'pdf', 'txt'):\n                continue\n            # cp pwd/{outputbase}.{configfile} -> {cache}/{configfile}\n            tessfile = args.outputbase + '.' 
+ configfile\n            shutil.copy(tessfile, str(cache_folder / configfile) + '.bin')\n\n    def clean_sys_argv():\n        for arg in run_args[1:]:\n            yield re.sub(r'.*/ocrmypdf[.]io[.][^/]+[/](.*)', r'$TMPDIR/\\1', arg)\n\n    manifest = {\n        'tesseract_version': TesseractOcrEngine.version().replace('\\n', ' '),\n        'system': platform.system(),\n        'python': platform.python_version(),\n        'argv_slug': cache_folder.name,\n        'sourcefile': str(Path(source_file).relative_to(TESTS_ROOT)),\n        'args': list(clean_sys_argv()),\n    }\n\n    with (Path(CACHE_ROOT) / 'manifest.jsonl').open('a') as f:\n        json.dump(manifest, f)\n        f.write('\\n')\n        f.flush()\n    return p\n\n\nclass CacheOcrEngine(TesseractOcrEngine):\n    # Concurrent threads (with --use-threads) might try to use different parts\n    # of the OcrEngine, so we need a lock to protect the state of patched\n    # module whenever it's patched. Should refactor ocrmypdf._exec.tesseract so that\n    # it does not to be patched at all for testing.\n    lock = threading.Lock()\n\n    @staticmethod\n    def get_orientation(input_file, options):\n        with (\n            CacheOcrEngine.lock,\n            patch('ocrmypdf._exec.tesseract.run', new=partial(cached_run, options)),\n        ):\n            return TesseractOcrEngine.get_orientation(input_file, options)\n\n    @staticmethod\n    def get_deskew(input_file, options) -> float:\n        with (\n            CacheOcrEngine.lock,\n            patch('ocrmypdf._exec.tesseract.run', new=partial(cached_run, options)),\n        ):\n            return TesseractOcrEngine.get_deskew(input_file, options)\n\n    @staticmethod\n    def generate_hocr(input_file, output_hocr, output_text, options):\n        with (\n            CacheOcrEngine.lock,\n            patch('ocrmypdf._exec.tesseract.run', new=partial(cached_run, options)),\n        ):\n            TesseractOcrEngine.generate_hocr(\n                
input_file, output_hocr, output_text, options\n            )\n\n    @staticmethod\n    def generate_pdf(input_file, output_pdf, output_text, options):\n        with (\n            CacheOcrEngine.lock,\n            patch('ocrmypdf._exec.tesseract.run', new=partial(cached_run, options)),\n        ):\n            TesseractOcrEngine.generate_pdf(\n                input_file, output_pdf, output_text, options\n            )\n\n\n@hookimpl\ndef get_ocr_engine():\n    return CacheOcrEngine()\n"
  },
  {
    "path": "tests/plugins/tesseract_crash.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\nfrom __future__ import annotations\n\nimport signal\nfrom contextlib import contextmanager\nfrom subprocess import CalledProcessError\nfrom unittest.mock import patch\n\nfrom ocrmypdf import hookimpl\nfrom ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine\n\n\ndef raise_crash(*args, **kwargs):\n    raise CalledProcessError(\n        128 + signal.SIGABRT,\n        'tesseract',\n        output=b\"\",\n        stderr=b\"libc++abi.dylib: terminating with uncaught exception of type \"\n        + b\"std::bad_alloc: std::bad_alloc\",\n    )\n\n\n@contextmanager\ndef patch_tesseract_run():\n    with patch('ocrmypdf._exec.tesseract.run') as mock:\n        mock.side_effect = raise_crash\n        yield\n        mock.assert_called()\n\n\nclass CrashOcrEngine(TesseractOcrEngine):\n    @staticmethod\n    def get_orientation(input_file, options):\n        with patch_tesseract_run():\n            return TesseractOcrEngine.get_orientation(input_file, options)\n\n    @staticmethod\n    def generate_hocr(input_file, output_hocr, output_text, options):\n        with patch_tesseract_run():\n            TesseractOcrEngine.generate_hocr(\n                input_file, output_hocr, output_text, options\n            )\n\n    @staticmethod\n    def generate_pdf(input_file, output_pdf, output_text, options):\n        with patch_tesseract_run():\n            TesseractOcrEngine.generate_pdf(\n                input_file, output_pdf, output_text, options\n            )\n\n\n@hookimpl\ndef get_ocr_engine():\n    return CrashOcrEngine()\n"
  },
  {
    "path": "tests/plugins/tesseract_debug_rotate.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\n\"\"\"Tesseract no-op/fixed rotate plugin.\n\nTo quickly run tests where getting OCR output is not necessary and we want to test\nthe rotation pipeline.\n\nIn generate_hocr mode, create a .hocr file that specifies no text found.\n\nIn 'pdf' mode, convert the image to PDF using another program.\n\nIn orientation check mode, report 0, 90, 180, 270... based on page number.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport pikepdf\nfrom PIL import Image\n\nfrom ocrmypdf import OcrEngine, OrientationConfidence, hookimpl\nfrom ocrmypdf.helpers import page_number\n\nHOCR_TEMPLATE = '''<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n <head>\n  <title></title>\n  <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n  <meta name='ocr-system' content='tesseract 4.1.1' />\n  <meta name='ocr-capabilities'\n    content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>\n </head>\n <body>\n  <div class='ocr_page' id='page_1' title='image \"x.tif\"; bbox 0 0 {0} {1}; ppageno 0'>\n   <div class='ocr_carea' id='block_1_1' title=\"bbox 0 1 {0} {1}\">\n    <p class='ocr_par' dir='ltr' id='par_1' title=\"bbox 0 1 {0} {1}\">\n     <span class='ocr_line' id='line_1' title=\"bbox 0 1 {0} {1}\">\n       <span class='ocrx_word' id='word_1' title=\"bbox 0 1 {0} {1}\"> </span>\n     </span>\n    </p>\n   </div>\n  </div>\n </body>\n</html>'''\n\n\nclass FixedRotateNoopOcrEngine(OcrEngine):\n    @staticmethod\n    def version():\n        return '4.1.1'\n\n    @staticmethod\n    def creator_tag(options):\n        tag = '-PDF' if options.pdf_renderer == 'sandwich' else '-hOCR'\n        return f\"NO-OP {tag} {FixedRotateNoopOcrEngine.version()}\"\n\n    def __str__(self):\n    
    return f\"NO-OP {FixedRotateNoopOcrEngine.version()}\"\n\n    @staticmethod\n    def languages(options):\n        return {'eng'}\n\n    @staticmethod\n    def get_orientation(input_file, options):\n        page = page_number(input_file)\n\n        angle = ((page - 1) * 90) % 360\n\n        return OrientationConfidence(angle=angle, confidence=99.9)\n\n    @staticmethod\n    def generate_hocr(input_file, output_hocr, output_text, options):\n        with (\n            Image.open(input_file) as im,\n            open(output_hocr, 'w', encoding='utf-8') as f,\n        ):\n            w, h = im.size\n            f.write(HOCR_TEMPLATE.format(str(w), str(h)))\n        with open(output_text, 'w') as f:\n            f.write('')\n\n    @staticmethod\n    def generate_pdf(input_file, output_pdf, output_text, options):\n        with Image.open(input_file) as im:\n            dpi = im.info['dpi']\n            pagesize = im.size[0] / dpi[0], im.size[1] / dpi[1]\n        ptsize = pagesize[0] * 72, pagesize[1] * 72\n        pdf = pikepdf.new()\n        pdf.add_blank_page(page_size=ptsize)\n        pdf.save(output_pdf, static_id=True)\n        output_text.write_text('')\n\n\n@hookimpl\ndef get_ocr_engine():\n    return FixedRotateNoopOcrEngine()\n"
  },
  {
    "path": "tests/plugins/tesseract_noop.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\n\"\"\"Tesseract no-op plugin.\n\nTo quickly run tests where getting OCR output is not necessary.\n\nIn generate_hocr mode, create a .hocr file that specifies no text found.\n\nIn 'pdf' mode, convert the image to PDF using another program.\n\nIn orientation check mode, report the orientation is upright.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport pikepdf\nfrom PIL import Image\n\nfrom ocrmypdf import OcrEngine, OrientationConfidence, hookimpl\n\nHOCR_TEMPLATE = '''<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n <head>\n  <title></title>\n  <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n  <meta name='ocr-system' content='tesseract 4.1.1' />\n  <meta name='ocr-capabilities'\n    content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>\n </head>\n <body>\n  <div class='ocr_page' id='page_1' title='image \"x.tif\"; bbox 0 0 {0} {1}; ppageno 0'>\n   <div class='ocr_carea' id='block_1_1' title=\"bbox 0 1 {0} {1}\">\n    <p class='ocr_par' dir='ltr' id='par_1' title=\"bbox 0 1 {0} {1}\">\n     <span class='ocr_line' id='line_1' title=\"bbox 0 1 {0} {1}\">\n       <span class='ocrx_word' id='word_1' title=\"bbox 0 1 {0} {1}\"> </span>\n     </span>\n    </p>\n   </div>\n  </div>\n </body>\n</html>'''\n\n\nclass NoopOcrEngine(OcrEngine):\n    @staticmethod\n    def version():\n        return '4.1.1'\n\n    @staticmethod\n    def creator_tag(options):\n        tag = '-PDF' if options.pdf_renderer == 'sandwich' else '-hOCR'\n        return f\"NO-OP {tag} {NoopOcrEngine.version()}\"\n\n    def __str__(self):\n        return f\"NO-OP {NoopOcrEngine.version()}\"\n\n    @staticmethod\n    def languages(options):\n        return {'eng'}\n\n    
@staticmethod\n    def get_orientation(input_file, options):\n        return OrientationConfidence(angle=0, confidence=0.0)\n\n    @staticmethod\n    def get_deskew(input_file, options):\n        return 0.0\n\n    @staticmethod\n    def generate_hocr(input_file, output_hocr, output_text, options):\n        with (\n            Image.open(input_file) as im,\n            open(output_hocr, 'w', encoding='utf-8') as f,\n        ):\n            w, h = im.size\n            f.write(HOCR_TEMPLATE.format(str(w), str(h)))\n        with open(output_text, 'w') as f:\n            f.write('')\n\n    @staticmethod\n    def generate_pdf(input_file, output_pdf, output_text, options):\n        with Image.open(input_file) as im:\n            dpi = im.info['dpi']\n            pagesize = im.size[0] / dpi[0], im.size[1] / dpi[1]\n        ptsize = pagesize[0] * 72, pagesize[1] * 72\n        pdf = pikepdf.new()\n        pdf.add_blank_page(page_size=ptsize)\n        pdf.save(output_pdf, static_id=True)\n        output_text.write_text('')\n\n\n@hookimpl\ndef get_ocr_engine():\n    return NoopOcrEngine()\n"
  },
  {
    "path": "tests/plugins/tesseract_simulate_oom_killer.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MIT\n\"\"\"Tesseract no-op plugin that simulates the OOM killer on page 4.\n\nOCRmyPDF can use a lot of memory, even that it might trigger the\nOOM killer on Linux or similar features on other platforms. We want to\nensure we fail with an error rather than deadlock in such cases.\n\nPage 4 was chosen because of this number's association with bad luck\nin many East Asian cultures.\n\"\"\"\n\n# type: ignore\nfrom __future__ import annotations\n\nimport os\nimport signal\nfrom pathlib import Path\n\nfrom ocrmypdf import hookimpl\n\n# type: ignore\n\n\n# Ugly hack that let us use the NoopOcrEngine without setting up packaging for our\n# tests.\n# This hack also requires us to set type: ignore\nparent_file = Path(__file__).with_name('tesseract_noop.py')\nparent = compile(parent_file.read_text(), parent_file, mode='exec')\nexec(parent)\nNoopOcrEngine = locals()['NoopOcrEngine']\n\n\nclass Page4Engine(NoopOcrEngine):  # type: ignore\n    def __str__(self):\n        return f\"NO-OP Page 4 {NoopOcrEngine.version()}\"\n\n    @staticmethod\n    def generate_hocr(input_file: Path, output_hocr, output_text, options):\n        if input_file.stem.startswith('000004'):\n            # Suicide\n            os.kill(os.getpid(), signal.SIGKILL)\n        else:\n            return NoopOcrEngine.generate_hocr(\n                input_file, output_hocr, output_text, options\n            )\n\n    @staticmethod\n    def generate_pdf(input_file, output_pdf, output_text, options):\n        if input_file.stem.startswith('000004'):\n            # Suicide\n            os.kill(os.getpid(), signal.SIGKILL)\n        else:\n            return NoopOcrEngine.generate_pdf(\n                input_file, output_pdf, output_text, options\n            )\n\n\n@hookimpl\ndef check_options(options):\n    if options.use_threads:\n        raise ValueError(\"I'm not compatible with use_threads\")\n\n\n@hookimpl\ndef 
get_ocr_engine():\n    return Page4Engine()\n"
  },
  {
    "path": "tests/resources/README.rst",
    "content": ".. SPDX-FileCopyrightText: 2022 James R. Barlow\n.. SPDX-License-Identifier: CC-BY-SA-4.0\n\nThese test files are used in OCRmyPDF's test suite. They do not necessarily produce OCR results\nat all and are not necessarily meant as examples of OCR output. Some are even invalid PDFs that might\ncrash certain PDF viewers.\n\nSome of these images were obtained from the public domain. Others are copyrighted and may have\nlicenses associated. Refer to ``.reuse/dep5`` file in OCRmyPDF's Git repository for information on\nthe copyright holder(s) and license(s) applicable to these resources.\n\n.. list-table::\n    :widths: 15 35 50\n    :header-rows: 1\n\n    *   - File\n        - Source\n        - Purpose\n    *   - c02-22.pdf\n        - `Project Gutenberg`_, Adventures of Huckleberry Finn, page 22\n        - difficult OCR image (obscure fonts and illustrations)\n    *   - graph.pdf\n        - `Wikimedia:Simple_line_graph_of_ACE_2012_results_by_candidate_sj01.png`_\n        - image with slanted text\n    *   - lichtenstein.pdf\n        - `Wikimedia: JPEG2000 Lichtenstein`_\n        - JPEG2000 image\n    *   - linn.png, linn.pdf, linn.txt\n        - `Wikimedia: LinnSequencer`_\n        - image with two columns\n    *   - typewriter.png, 2400dpi.pdf\n        - `Wikimedia: Triumph typewrtier text Linzensoep`_\n        - simple text\n    *   - baiona.png\n        - `Wikimedia: Baionako udalerri mugakideak`_\n        - multilingual text and images\n    *   - aspect.pdf\n        - synthetic\n        - test image with 200 x 100 DPI resolution\n    *   - blank.pdf\n        - synthetic\n        - blank PDF generated by Adobe Illustrator CC 17, containing a lot of application-specific metadata/bloat\n    *   - cmyk.pdf\n        - synthetic\n        - a CMYK image created in Photoshop\n    *   - crom.png\n        - synthetic\n        - test for non-dictionary words\n    *   - enormous.pdf\n        - synthetic\n        - very large PDF page\n    *   - epson.pdf\n       
 - synthetic\n        - a linearized PDF containing some unusual indirect objects, created by an Epson printer; printout of a Wikipedia article (CC-BY-SA)\n    *   - formxobject.pdf\n        - synthetic\n        - hand-crafted PDF containing an image inside a Form XObject\n    *   - francais.pdf\n        - synthetic\n        - a page containing French accents (diacritics)\n    *   - hugemono.pdf\n        - synthetic\n        - large monochrome 35000x35000 image in JBIG2 encoding\n    *   - invalid.pdf\n        - synthetic\n        - a PDF file header followed by EOF marker\n    *   - kcs.pdf\n        - synthetic\n        - PDF file generated by Kodak Capture Desktop Software 1.2; has invalid table of contents\n    *   - livecycle.pdf\n        - synthetic\n        - a minimal PDF that claims to use dynamic XFA forms\n    *   - masks.pdf\n        - synthetic\n        - file containing explicit masks and a stencil mask drawn without a proper transformation matrix; printout of a German Wikipedia article (CC-BY-SA)\n    *   - missing_docinfo.pdf\n        - synthetic\n        - PDF file with no /DocumentInfo section\n    *   - overlay.pdf\n        - synthetic\n        - PDF file generated by PDFPen pro that triggered content stream parse errors\n    *   - negzero.pdf\n        - synthetic\n        - copy of formxobject.pdf with token that qpdf doesn't like\n    *   - no_contents.pdf\n        - synthetic\n        - synthetic PDF with a blank page that has no /Contents entry\n    *   - truetype_font_nomapping.pdf\n        - synthetic\n        - example of a PDF with an embedded subsetted TrueType font with no Unicode mapping\n    *   - trivial.pdf\n        - synthetic\n        - smallest possible valid PDF-1.3 with all required fields\n    *   - type3_font_nomapping.pdf\n        - synthetic\n        - example of a PDF with an embedded subsetted TrueType font with no Unicode mapping\n    *   - vector.pdf\n        - synthetic\n        - a PDF with vector art and text rendered 
as curves with no fonts\n\nAssemblies\n==========\n\nThese test resources are assemblies or derivatives from other previously mentioned files, released under the same license terms as their input files.\n\n- baiona_gray.png (from baiona.png, grayscale version)\n- baiona_colormapped.png (from baiona.png, palette version)\n- baiona_alpha.png (from baiona.png, RGB+A version)\n- cardinal.pdf (four cardinal directions, baked-in rotated copies of linn.png)\n- ccitt.pdf (linn.png, converted to CCITT encoding)\n- graph_ocred.pdf (from graph.pdf)\n- jbig2.pdf (from linn.png)\n- multipage.pdf (from several other files)\n- palette.pdf (from baiona_colormapped.png)\n- poster.pdf (from linn.png)\n- rotated_skew.pdf (a /Rotate'd and skewed document from linn.png)\n- skew.pdf (from linn.png, skew simulated by adjusting the transformation matrix)\n- toc.pdf (from formxobject.pdf, trivial.pdf)\n\n\n.. _`Wikimedia: LinnSequencer`: https://upload.wikimedia.org/wikipedia/en/b/b7/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg\n\n.. _`Project Gutenberg`: https://www.gutenberg.org/files/76/76-h/76-h.htm#c2\n\n.. _`Wikimedia: Simple_line_graph_of_ACE_2012_results_by_candidate_sj01.png`: https://en.wikipedia.org/wiki/File:Simple_line_graph_of_ACE_2012_results_by_candidate_sj01.png\n\n.. _`Wikimedia: JPEG2000 Lichtenstein`: https://en.wikipedia.org/wiki/JPEG_2000#/media/File:Jpeg2000_2-level_wavelet_transform-lichtenstein.png\n\n.. _`Linux (Wikipedia Article)`: https://de.wikipedia.org/wiki/Linux\n\n.. _`Wikimedia: Triumph typewrtier text Linzensoep`: https://commons.wikimedia.org/wiki/File:Triumph.typewriter_text_Linzensoep.gif\n\n.. _`Wikimedia: Baionako udalerri mugakideak`: https://commons.wikimedia.org/wiki/File:Baionako_udalerri_mugakideak.png\n"
  },
  {
    "path": "tests/resources/arabic.hocr",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"ar\" lang=\"ar\">\n<head>\n<title></title>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\" />\n<meta name='ocr-system' content='tesseract 5.0.0' />\n<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>\n</head>\n<body>\n<div class='ocr_page' id='page_1' title='image \"test.png\"; bbox 0 0 2550 3300; ppageno 0; scan_res 300 300'>\n<div class='ocr_carea' id='carea_1_1' title=\"bbox 200 200 2350 1200\">\n<p class='ocr_par' id='par_1_1' lang='ara' dir='rtl' title=\"bbox 200 200 2350 400\">\n<span class='ocr_line' id='line_1_1' title=\"bbox 200 200 2350 400; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_1' title='bbox 200 200 600 400; x_wconf 95'>مرحبا</span>\n<span class='ocrx_word' id='word_1_2' title='bbox 650 200 1050 400; x_wconf 95'>بالعالم</span>\n</span>\n</p>\n<p class='ocr_par' id='par_1_2' lang='ara' dir='rtl' title=\"bbox 200 500 2350 700\">\n<span class='ocr_line' id='line_1_2' title=\"bbox 200 500 2350 700; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_3' title='bbox 200 500 600 700; x_wconf 95'>هذا</span>\n<span class='ocrx_word' id='word_1_4' title='bbox 650 500 1050 700; x_wconf 95'>نص</span>\n<span class='ocrx_word' id='word_1_5' title='bbox 1100 500 1500 700; x_wconf 95'>عربي</span>\n</span>\n</p>\n<p class='ocr_par' id='par_1_3' lang='per' dir='rtl' title=\"bbox 200 800 2350 1000\">\n<span class='ocr_line' id='line_1_3' title=\"bbox 200 800 2350 1000; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_6' title='bbox 200 800 600 1000; x_wconf 95'>سلام</span>\n<span class='ocrx_word' id='word_1_7' 
title='bbox 650 800 1050 1000; x_wconf 95'>فارسی</span>\n</span>\n</p>\n</div>\n</div>\n</body>\n</html>\n"
  },
  {
    "path": "tests/resources/cjk.hocr",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"zh\" lang=\"zh\">\n<head>\n<title></title>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\" />\n<meta name='ocr-system' content='tesseract 5.0.0' />\n<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>\n</head>\n<body>\n<div class='ocr_page' id='page_1' title='image \"test.png\"; bbox 0 0 2550 3300; ppageno 0; scan_res 300 300'>\n<div class='ocr_carea' id='carea_1_1' title=\"bbox 200 200 2350 1500\">\n<p class='ocr_par' id='par_1_1' lang='chi_sim' title=\"bbox 200 200 2350 400\">\n<span class='ocr_line' id='line_1_1' title=\"bbox 200 200 2350 400; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_1' title='bbox 200 200 600 400; x_wconf 95'>你好</span>\n<span class='ocrx_word' id='word_1_2' title='bbox 650 200 1050 400; x_wconf 95'>世界</span>\n</span>\n</p>\n<p class='ocr_par' id='par_1_2' lang='chi_tra' title=\"bbox 200 500 2350 700\">\n<span class='ocr_line' id='line_1_2' title=\"bbox 200 500 2350 700; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_3' title='bbox 200 500 600 700; x_wconf 95'>繁體</span>\n<span class='ocrx_word' id='word_1_4' title='bbox 650 500 1050 700; x_wconf 95'>中文</span>\n</span>\n</p>\n<p class='ocr_par' id='par_1_3' lang='jpn' title=\"bbox 200 800 2350 1000\">\n<span class='ocr_line' id='line_1_3' title=\"bbox 200 800 2350 1000; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_5' title='bbox 200 800 600 1000; x_wconf 95'>こんにちは</span>\n<span class='ocrx_word' id='word_1_6' title='bbox 650 800 1050 1000; x_wconf 95'>世界</span>\n</span>\n</p>\n<p class='ocr_par' id='par_1_4' lang='kor' title=\"bbox 
200 1100 2350 1300\">\n<span class='ocr_line' id='line_1_4' title=\"bbox 200 1100 2350 1300; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_7' title='bbox 200 1100 600 1300; x_wconf 95'>안녕하세요</span>\n<span class='ocrx_word' id='word_1_8' title='bbox 650 1100 1050 1300; x_wconf 95'>세계</span>\n</span>\n</p>\n</div>\n</div>\n</body>\n</html>\n"
  },
  {
    "path": "tests/resources/devanagari.hocr",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"hi\" lang=\"hi\">\n<head>\n<title></title>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\" />\n<meta name='ocr-system' content='tesseract 5.0.0' />\n<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>\n</head>\n<body>\n<div class='ocr_page' id='page_1' title='image \"test.png\"; bbox 0 0 2550 3300; ppageno 0; scan_res 300 300'>\n<div class='ocr_carea' id='carea_1_1' title=\"bbox 200 200 2350 1200\">\n<p class='ocr_par' id='par_1_1' lang='hin' title=\"bbox 200 200 2350 400\">\n<span class='ocr_line' id='line_1_1' title=\"bbox 200 200 2350 400; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_1' title='bbox 200 200 600 400; x_wconf 95'>नमस्ते</span>\n<span class='ocrx_word' id='word_1_2' title='bbox 650 200 1050 400; x_wconf 95'>दुनिया</span>\n</span>\n</p>\n<p class='ocr_par' id='par_1_2' lang='hin' title=\"bbox 200 500 2350 700\">\n<span class='ocr_line' id='line_1_2' title=\"bbox 200 500 2350 700; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_3' title='bbox 200 500 600 700; x_wconf 95'>यह</span>\n<span class='ocrx_word' id='word_1_4' title='bbox 650 500 1050 700; x_wconf 95'>हिंदी</span>\n<span class='ocrx_word' id='word_1_5' title='bbox 1100 500 1500 700; x_wconf 95'>पाठ</span>\n<span class='ocrx_word' id='word_1_6' title='bbox 1550 500 1950 700; x_wconf 95'>है</span>\n</span>\n</p>\n<p class='ocr_par' id='par_1_3' lang='san' title=\"bbox 200 800 2350 1000\">\n<span class='ocr_line' id='line_1_3' title=\"bbox 200 800 2350 1000; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_7' title='bbox 200 800 700 1000; 
x_wconf 95'>संस्कृत</span>\n<span class='ocrx_word' id='word_1_8' title='bbox 750 800 1250 1000; x_wconf 95'>भाषा</span>\n</span>\n</p>\n</div>\n</div>\n</body>\n</html>\n"
  },
  {
    "path": "tests/resources/hello_world_scripts.hocr",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n<head>\n<title>Multilingual Hello World Script Test</title>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\" />\n<meta name='ocr-system' content='tesseract 5.0.0' />\n<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>\n</head>\n<body>\n<!-- Page: 8.5x11 inches at 300 DPI = 2550x3300 pixels -->\n<div class='ocr_page' id='page_1' title='image \"hello_scripts.png\"; bbox 0 0 2550 3300; ppageno 0; scan_res 300 300'>\n\n<!-- Row 1: English and Spanish (Latin script with accents/punctuation) -->\n<div class='ocr_carea' id='carea_1_1' title=\"bbox 150 150 1200 400\">\n<p class='ocr_par' id='par_1_1' lang='eng' title=\"bbox 150 150 600 350\">\n<span class='ocr_line' id='line_1_1' title=\"bbox 150 150 600 350; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' id='word_1_1' title='bbox 150 150 600 350; x_wconf 98'>Hello!</span>\n</span>\n</p>\n</div>\n\n<div class='ocr_carea' id='carea_1_2' title=\"bbox 1400 150 2400 400\">\n<p class='ocr_par' id='par_1_2' lang='spa' title=\"bbox 1400 150 2400 350\">\n<span class='ocr_line' id='line_1_2' title=\"bbox 1400 150 2000 350; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' id='word_1_2' title='bbox 1400 150 2000 350; x_wconf 97'>¡Hola!</span>\n</span>\n</p>\n</div>\n\n<!-- Row 2: French (accents) and German (umlauts, eszett) -->\n<div class='ocr_carea' id='carea_2_1' title=\"bbox 150 450 1200 700\">\n<p class='ocr_par' id='par_2_1' lang='fra' title=\"bbox 150 450 800 650\">\n<span class='ocr_line' id='line_2_1' title=\"bbox 150 450 800 650; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' 
id='word_2_1' title='bbox 150 450 800 650; x_wconf 96'>Bonjour!</span>\n</span>\n</p>\n</div>\n\n<div class='ocr_carea' id='carea_2_2' title=\"bbox 1400 450 2400 700\">\n<p class='ocr_par' id='par_2_2' lang='deu' title=\"bbox 1400 450 2100 650\">\n<span class='ocr_line' id='line_2_2' title=\"bbox 1400 450 2100 650; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' id='word_2_2' title='bbox 1400 450 2100 650; x_wconf 95'>Grüß Gott!</span>\n</span>\n</p>\n</div>\n\n<!-- Row 3: Russian (Cyrillic) and Greek -->\n<div class='ocr_carea' id='carea_3_1' title=\"bbox 150 750 1200 1000\">\n<p class='ocr_par' id='par_3_1' lang='rus' title=\"bbox 150 750 900 950\">\n<span class='ocr_line' id='line_3_1' title=\"bbox 150 750 900 950; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' id='word_3_1' title='bbox 150 750 900 950; x_wconf 94'>Привет!</span>\n</span>\n</p>\n</div>\n\n<div class='ocr_carea' id='carea_3_2' title=\"bbox 1400 750 2400 1000\">\n<p class='ocr_par' id='par_3_2' lang='ell' title=\"bbox 1400 750 2200 950\">\n<span class='ocr_line' id='line_3_2' title=\"bbox 1400 750 2200 950; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' id='word_3_2' title='bbox 1400 750 2200 950; x_wconf 93'>Γειά σου!</span>\n</span>\n</p>\n</div>\n\n<!-- Row 4: Chinese (Simplified) and Japanese -->\n<div class='ocr_carea' id='carea_4_1' title=\"bbox 150 1050 1200 1300\">\n<p class='ocr_par' id='par_4_1' lang='chi_sim' title=\"bbox 150 1050 700 1250\">\n<span class='ocr_line' id='line_4_1' title=\"bbox 150 1050 700 1250; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' id='word_4_1' title='bbox 150 1050 700 1250; x_wconf 92'>你好！</span>\n</span>\n</p>\n</div>\n\n<div class='ocr_carea' id='carea_4_2' title=\"bbox 1400 1050 2400 1300\">\n<p class='ocr_par' id='par_4_2' lang='jpn' title=\"bbox 1400 1050 2300 1250\">\n<span class='ocr_line' 
id='line_4_2' title=\"bbox 1400 1050 2300 1250; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' id='word_4_2' title='bbox 1400 1050 2300 1250; x_wconf 91'>こんにちは！</span>\n</span>\n</p>\n</div>\n\n<!-- Row 5: Korean and Turkish (Latin with special chars) -->\n<div class='ocr_carea' id='carea_5_1' title=\"bbox 150 1350 1200 1600\">\n<p class='ocr_par' id='par_5_1' lang='kor' title=\"bbox 150 1350 900 1550\">\n<span class='ocr_line' id='line_5_1' title=\"bbox 150 1350 900 1550; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' id='word_5_1' title='bbox 150 1350 900 1550; x_wconf 90'>안녕하세요!</span>\n</span>\n</p>\n</div>\n\n<div class='ocr_carea' id='carea_5_2' title=\"bbox 1400 1350 2400 1600\">\n<p class='ocr_par' id='par_5_2' lang='tur' title=\"bbox 1400 1350 2300 1550\">\n<span class='ocr_line' id='line_5_2' title=\"bbox 1400 1350 2300 1550; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' id='word_5_2' title='bbox 1400 1350 2300 1550; x_wconf 89'>Merhaba!</span>\n</span>\n</p>\n</div>\n\n<!-- Row 6: Hindi (Devanagari) and Arabic (RTL) -->\n<div class='ocr_carea' id='carea_6_1' title=\"bbox 150 1650 1200 1900\">\n<p class='ocr_par' id='par_6_1' lang='hin' title=\"bbox 150 1650 900 1850\">\n<span class='ocr_line' id='line_6_1' title=\"bbox 150 1650 900 1850; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' id='word_6_1' title='bbox 150 1650 900 1850; x_wconf 88'>नमस्ते!</span>\n</span>\n</p>\n</div>\n\n<div class='ocr_carea' id='carea_6_2' title=\"bbox 1400 1650 2400 1900\">\n<p class='ocr_par' id='par_6_2' lang='ara' dir='rtl' title=\"bbox 1400 1650 2300 1850\">\n<span class='ocr_line' id='line_6_2' title=\"bbox 1400 1650 2300 1850; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' id='word_6_2' title='bbox 1400 1650 2300 1850; x_wconf 
87'>!مرحبا</span>\n</span>\n</p>\n</div>\n\n<!-- Row 7: Hebrew (RTL) and Portuguese (accents) -->\n<div class='ocr_carea' id='carea_7_1' title=\"bbox 150 1950 1200 2200\">\n<p class='ocr_par' id='par_7_1' lang='heb' dir='rtl' title=\"bbox 150 1950 800 2150\">\n<span class='ocr_line' id='line_7_1' title=\"bbox 150 1950 800 2150; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' id='word_7_1' title='bbox 150 1950 800 2150; x_wconf 86'>שלום</span>\n</span>\n</p>\n</div>\n\n<div class='ocr_carea' id='carea_7_2' title=\"bbox 1400 1950 2000 2200\">\n<p class='ocr_par' id='par_7_2' lang='por' title=\"bbox 1400 1950 1900 2150\">\n<span class='ocr_line' id='line_7_2' title=\"bbox 1400 1950 1900 2150; baseline 0 -40; x_size 140; x_descenders 28; x_ascenders 35\">\n<span class='ocrx_word' id='word_7_2' title='bbox 1400 1950 1900 2150; x_wconf 85'>Olá!</span>\n</span>\n</p>\n</div>\n\n<!-- Rotated text section: Various scripts at angles -->\n<!-- Rotated baseline: 15 degrees clockwise (baseline slope ~0.27) -->\n<div class='ocr_carea' id='carea_8_1' title=\"bbox 200 2150 900 2700\">\n<p class='ocr_par' id='par_8_1' lang='ita' title=\"bbox 200 2150 900 2650\">\n<span class='ocr_line' id='line_8_1' title=\"bbox 200 2150 900 2450; baseline 0.27 -30; x_size 130; x_descenders 26; x_ascenders 32\">\n<span class='ocrx_word' id='word_8_1' title='bbox 200 2150 900 2450; x_wconf 84'>Ciao!</span>\n</span>\n</p>\n</div>\n\n<!-- Rotated baseline: -10 degrees (baseline slope ~-0.18) -->\n<div class='ocr_carea' id='carea_8_2' title=\"bbox 1000 2350 1700 2700\">\n<p class='ocr_par' id='par_8_2' lang='pol' title=\"bbox 1000 2400 1700 2650\">\n<span class='ocr_line' id='line_8_2' title=\"bbox 1000 2400 1700 2650; baseline -0.18 -25; x_size 130; x_descenders 26; x_ascenders 32\">\n<span class='ocrx_word' id='word_8_2' title='bbox 1000 2400 1700 2650; x_wconf 83'>Cześć!</span>\n</span>\n</p>\n</div>\n\n<!-- Rotated baseline: 8 degrees clockwise (baseline 
slope ~0.14) - Chinese -->\n<div class='ocr_carea' id='carea_8_3' title=\"bbox 1800 2350 2450 2700\">\n<p class='ocr_par' id='par_8_3' lang='chi_tra' title=\"bbox 1800 2400 2450 2650\">\n<span class='ocr_line' id='line_8_3' title=\"bbox 1800 2400 2450 2650; baseline 0.14 -35; x_size 130; x_descenders 26; x_ascenders 32\">\n<span class='ocrx_word' id='word_8_3' title='bbox 1800 2400 2450 2650; x_wconf 82'>您好！</span>\n</span>\n</p>\n</div>\n\n<!-- Bottom row: More rotated examples -->\n<!-- Rotated baseline: -20 degrees (baseline slope ~-0.36) - Russian -->\n<div class='ocr_carea' id='carea_9_1' title=\"bbox 200 2750 900 3100\">\n<p class='ocr_par' id='par_9_1' lang='rus' title=\"bbox 200 2800 900 3050\">\n<span class='ocr_line' id='line_9_1' title=\"bbox 200 2800 900 3050; baseline -0.36 -20; x_size 120; x_descenders 24; x_ascenders 30\">\n<span class='ocrx_word' id='word_9_1' title='bbox 200 2800 900 3050; x_wconf 81'>Здравствуй!</span>\n</span>\n</p>\n</div>\n\n<!-- Rotated baseline: 12 degrees clockwise (baseline slope ~0.21) - Greek -->\n<div class='ocr_carea' id='carea_9_2' title=\"bbox 1000 2750 1700 3100\">\n<p class='ocr_par' id='par_9_2' lang='ell' title=\"bbox 1000 2780 1700 3050\">\n<span class='ocr_line' id='line_9_2' title=\"bbox 1000 2780 1700 3050; baseline 0.21 -30; x_size 120; x_descenders 24; x_ascenders 30\">\n<span class='ocrx_word' id='word_9_2' title='bbox 1000 2780 1700 3050; x_wconf 80'>Χαίρετε!</span>\n</span>\n</p>\n</div>\n\n<!-- Rotated baseline: -5 degrees (baseline slope ~-0.09) - Arabic RTL rotated -->\n<div class='ocr_carea' id='carea_9_3' title=\"bbox 1800 2750 2450 3100\">\n<p class='ocr_par' id='par_9_3' lang='ara' dir='rtl' title=\"bbox 1800 2800 2450 3050\">\n<span class='ocr_line' id='line_9_3' title=\"bbox 1800 2800 2450 3050; baseline -0.09 -25; x_size 120; x_descenders 24; x_ascenders 30\">\n<span class='ocrx_word' id='word_9_3' title='bbox 1800 2800 2450 3050; x_wconf 
79'>!أهلاً</span>\n</span>\n</p>\n</div>\n\n</div>\n</body>\n</html>\n"
  },
  {
    "path": "tests/resources/latin.hocr",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n<head>\n<title></title>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\" />\n<meta name='ocr-system' content='tesseract 5.0.0' />\n<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>\n</head>\n<body>\n<div class='ocr_page' id='page_1' title='image \"test.png\"; bbox 0 0 2550 3300; ppageno 0; scan_res 300 300'>\n<div class='ocr_carea' id='carea_1_1' title=\"bbox 200 200 2350 1200\">\n<p class='ocr_par' id='par_1_1' lang='eng' title=\"bbox 200 200 2350 400\">\n<span class='ocr_line' id='line_1_1' title=\"bbox 200 200 2350 400; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_1' title='bbox 200 200 600 400; x_wconf 95'>The</span>\n<span class='ocrx_word' id='word_1_2' title='bbox 650 200 1050 400; x_wconf 95'>quick</span>\n<span class='ocrx_word' id='word_1_3' title='bbox 1100 200 1500 400; x_wconf 95'>brown</span>\n<span class='ocrx_word' id='word_1_4' title='bbox 1550 200 1850 400; x_wconf 95'>fox</span>\n<span class='ocrx_word' id='word_1_5' title='bbox 1900 200 2350 400; x_wconf 95'>jumps</span>\n</span>\n</p>\n<p class='ocr_par' id='par_1_2' lang='fra' title=\"bbox 200 500 2350 700\">\n<span class='ocr_line' id='line_1_2' title=\"bbox 200 500 2350 700; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_6' title='bbox 200 500 500 700; x_wconf 95'>Café</span>\n<span class='ocrx_word' id='word_1_7' title='bbox 550 500 950 700; x_wconf 95'>résumé</span>\n<span class='ocrx_word' id='word_1_8' title='bbox 1000 500 1400 700; x_wconf 95'>naïve</span>\n</span>\n</p>\n<p class='ocr_par' id='par_1_3' lang='deu' title=\"bbox 200 800 2350 1000\">\n<span 
class='ocr_line' id='line_1_3' title=\"bbox 200 800 2350 1000; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_9' title='bbox 200 800 700 1000; x_wconf 95'>Größe</span>\n<span class='ocrx_word' id='word_1_10' title='bbox 750 800 1250 1000; x_wconf 95'>Zürich</span>\n<span class='ocrx_word' id='word_1_11' title='bbox 1300 800 1800 1000; x_wconf 95'>Ärger</span>\n</span>\n</p>\n</div>\n</div>\n</body>\n</html>\n"
  },
  {
    "path": "tests/resources/linn.txt",
    "content": "The LinnSequencer\n32 Track MIDI Sequence Recorder\n\nThe LinnSequencer is a state—of—the-art composition and performance tool for the professional musician. It is\n\nextremely powerful, yet amazingly simple to learn and use. It’s many remarkable features include:\n\n0 Operation is similar to multi-track tape recorder with PLAY, STOP, RECORD, FAST\nFORWARD, REWIND, and LOCATE controls.\n\n0 Each of the 100 sequences contains 32 simultaneous, polyphonic tracks. Each track may\nbe assigned to one of 16 MIDI channels. Simultaneously plays up to 16 polyphonic\n\nsynthesizers !\n\n0 Ultra-fast 3 1/2 ” disk drive stores complex songs in seconds and holds over 110,000 notes\n\nper disk!\n\n0 One or all tracks may be TRANSPOSED at the touch of a key.\n0 Exclusive real—time ERASE function makes editing FAST.\n0 Exclusive REPEAT function automatically repeats any held notes at a pre-selected\n\nrhythmic value.\n\n0 TIMING CORRECTION works during playback and operates without ‘chopping’ notes.\n\n0 Optional SMPTE time code synchronization.\n\n0 Optional remote control.\n\nRecording a Sequence\n\nTo record a sequence, simply press RECORD and PLAY,\nthen play your MIDI keyboard in time to the Sequencer’s\nclick track. When the sequence loops back around to bar 1,\nyou’ll hear what you played—only all timing errors will be\n\ncorrected! (Timing correction may be adjusted 0r defeated).\n\nAny additional notes played will be added into the track\n—existing notes are not erased while recording!\n\nFAST FORWARD, REWIND, and LOCATE controls\nmay be used at any time to quickly access any location in\nyour sequence for spot-recording. To overdub a new part,\nselect a different track and start recording—while you\nrecord, the ﬁrst‘track will play in perfect sync (unless you\nMUTE it, or SOLO another track). In this way, up to 32\ntracks may be overdubbed! 
All MIDI effects are recorded\nincluding pitch bend, modulation, velocity, aftertouch,\nsustain pedal, and program changes!\n\nEditing\n\nTo erase a wrong note, simply hold ERASE and press\nthe note to be erased just before it plays in the sequence-—\nwhen played back, it will be gone. Notes may also be\n\nadded, erased, or changed using the SINGLE STEP func-\ntion. To overdub notes at specific points within a sequence,\n\nAdditional Features\n\nsimply use LOCATE, FAST FORWARD, or REWIND to\nfind the desired bar number, then start recording.\n\nThe INSERT/ COPY function allows you to move bars\nfrom one location to another—in the same sequence or a\ndifferent one. For example, you might insert a copy of the\nfirst verse between the second chorus and the bridge.\nDELETE BARS operates the same way to remove\nunwanted sections.\n\nCreating a Song\n\nOne way to create a song is to record each track all the\nway through (up to 999 bars). Another way is to record\neach basic section (verse, chorus, etc.) in individual\nsequences, then use the CREATE SONG function to “chain”\nthem together. CREATE SONG will then automatically\ncopy all the parts into a new sequence. If desired, you can\neven set the last few bars to repeat infinitely, for a fadeout.\n\nComposition Without Compromise\n\nThe technology you use should never be so complex that\nit interferes with the creative process. That’s precisely why\nthe LinnSequencer is designed to let you compose, record\nand edit while devoting your undivided attention to your\nmusic. See your Linn dealer today for a demonstration!\n\n0 Simple, easy to learn operation—the 32 character LCD display clearly guides you through all operations. 
If needed, the\n\nHELP button displays additional explanations.\n\n0 Non-destructive recording—existing notes are not erased while recording.\n0 Two FOOTSWIT CH INPUTS may be assigned to remotely control many of the commonly used functions, including\n\nERASE, REPEAT, PLAY/ STOP, or LOCATE.\n\n0 Two TRIGGER OUTPUTS may be programmed to output pulses at any selected note value.\n\n0 Will sync to standard LinnDrum or Linn 9000 sync tone.\n\n0 Utilizes ultra high—speed, 8 MHZ 80186 16 bit computer internally for FAST operation.\n0 TEMPO may be specified in BEATS-PER—MINUTE or FRAMES-PER—BEAT at 24, 25, or 30 frames per second,\n\n(even drop frame!)\n\n0 TEMPO may be entered numerically, adjustable in tenths of a Beat-Per-Minute increments, or by tapping quarter notes\n\non the TAP TEMPO button.\n\n0 TEMPO CHANGES may be programmed into a sequence, with smooth transitions if desired.\n0 Any TIME SIGNATURE may be used, and may be changed within a song.\n\nEDI]\nLinn Electronics, Inc.\n\n18720 Oxnard Street, Tarzana, CA 91356\n(818) 708-8131 TELEX #298949 LINN UR\n\n"
  },
  {
    "path": "tests/resources/multilingual.hocr",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n<head>\n<title></title>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\" />\n<meta name='ocr-system' content='tesseract 5.0.0' />\n<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>\n</head>\n<body>\n<div class='ocr_page' id='page_1' title='image \"test.png\"; bbox 0 0 2550 3300; ppageno 0'>\n<div class='ocr_carea' id='carea_1_1' title=\"bbox 200 200 2350 800\">\n<p class='ocr_par' id='par_1_1' lang='eng' title=\"bbox 200 200 2350 400\">\n<span class='ocr_line' id='line_1_1' title=\"bbox 200 200 2350 400; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_1' title='bbox 200 200 500 400; x_wconf 95'>English</span>\n<span class='ocrx_word' id='word_1_2' title='bbox 550 200 750 400; x_wconf 95'>Text</span>\n<span class='ocrx_word' id='word_1_3' title='bbox 800 200 1000 400; x_wconf 95'>Here</span>\n</span>\n</p>\n<p class='ocr_par' id='par_1_2' lang='ara' dir='rtl' title=\"bbox 200 500 2350 800\">\n<span class='ocr_line' id='line_1_2' title=\"bbox 200 500 2350 800; baseline 0 -50; x_size 150; x_descenders 30; x_ascenders 40\">\n<span class='ocrx_word' id='word_1_4' title='bbox 200 500 600 800; x_wconf 95'>مرحبا</span>\n<span class='ocrx_word' id='word_1_5' title='bbox 650 500 950 800; x_wconf 95'>بك</span>\n</span>\n</p>\n</div>\n</div>\n</body>\n</html>\n"
  },
  {
    "path": "tests/test_acroform.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport logging\n\nimport pikepdf\nimport pytest\n\nimport ocrmypdf\n\nfrom .conftest import check_ocrmypdf\n\n# pylint: disable=redefined-outer-name\n\n\n@pytest.fixture\ndef acroform(resources):\n    return resources / 'acroform.pdf'\n\n\ndef test_acroform_and_redo(acroform, no_outpdf):\n    with pytest.raises(\n        ocrmypdf.exceptions.InputFileError,\n        match=r'.*--redo-ocr.*is not currently possible.*',\n    ):\n        check_ocrmypdf(acroform, no_outpdf, '--redo-ocr')\n\n\ndef test_acroform_message(acroform, caplog, outpdf):\n    caplog.set_level(logging.INFO)\n    check_ocrmypdf(acroform, outpdf, '--plugin', 'tests/plugins/tesseract_noop.py')\n    assert 'fillable form' in caplog.text\n    assert '--force-ocr' in caplog.text\n\n\n@pytest.fixture\ndef digitally_signed(acroform, outdir):\n    out = outdir / 'acroform_signed.pdf'\n    with pikepdf.open(acroform) as pdf:\n        pdf.Root.AcroForm.SigFlags = 3\n        pdf.save(out)\n    yield out\n\n\ndef test_digital_signature(digitally_signed, no_outpdf):\n    with pytest.raises(ocrmypdf.exceptions.DigitalSignatureError):\n        check_ocrmypdf(digitally_signed, no_outpdf)\n\n\ndef test_digital_signature_invalidate(digitally_signed, no_outpdf):\n    check_ocrmypdf(\n        digitally_signed, no_outpdf, '--force-ocr', '--invalidate-digital-signatures'\n    )\n"
  },
  {
    "path": "tests/test_annots.py",
    "content": "# SPDX-FileCopyrightText: 2024 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nfrom pikepdf import Array, Dictionary, Name, NameTree, Pdf\n\nfrom ocrmypdf._annots import remove_broken_goto_annotations\n\n\ndef test_remove_broken_goto_annotations(resources):\n    with Pdf.open(resources / 'link.pdf') as pdf:\n        assert not remove_broken_goto_annotations(pdf), \"File should not be modified\"\n\n        # Construct Dests nametree\n        nt = NameTree.new(pdf)\n        names = pdf.Root[Name.Names] = pdf.make_indirect(Dictionary())\n        names[Name.Dests] = nt.obj\n        # Create a broken named destination\n        nt['Invalid'] = pdf.make_indirect(Dictionary())\n        # Create a valid named destination\n        nt['Valid'] = Array([pdf.pages[0].obj, Name.XYZ, 0, 0, 0])\n\n        pdf.pages[0].Annots[0].A.D = 'Missing'\n        pdf.pages[1].Annots[0].A.D = 'Valid'\n\n        assert remove_broken_goto_annotations(pdf), \"File should be modified\"\n\n        assert Name.D not in pdf.pages[0].Annots[0].A\n        assert Name.D in pdf.pages[1].Annots[0].A\n"
  },
  {
    "path": "tests/test_api.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport pickle\nfrom io import BytesIO\nfrom pathlib import Path\n\nimport pytest\nfrom pdfminer.high_level import extract_text\n\nimport ocrmypdf\nimport ocrmypdf._pipelines\nimport ocrmypdf.api\n\n\ndef test_language_list():\n    with pytest.raises(\n        (ocrmypdf.exceptions.InputFileError, ocrmypdf.exceptions.MissingDependencyError)\n    ):\n        ocrmypdf.ocr('doesnotexist.pdf', '_.pdf', language=['eng', 'deu'])\n\n\ndef test_language_parameter_mapped_to_languages():\n    \"\"\"Test that the API 'language' parameter is mapped to OcrOptions 'languages'.\n\n    Regression test for GitHub issue #1640: the Python API ignored the language\n    parameter, always defaulting to 'eng'.\n    \"\"\"\n    from ocrmypdf._options import OcrOptions\n    from ocrmypdf.api import create_options, setup_plugin_infrastructure\n    from ocrmypdf.cli import get_parser\n\n    setup_plugin_infrastructure()\n    parser = get_parser()\n\n    options = create_options(\n        input_file='test.pdf',\n        output_file='output.pdf',\n        parser=parser,\n        language=['tam'],\n    )\n    assert options.languages == ['tam']\n\n    # Test with a list of multiple languages\n    options = create_options(\n        input_file='test.pdf',\n        output_file='output.pdf',\n        parser=parser,\n        language=['fra', 'deu'],\n    )\n    assert options.languages == ['fra', 'deu']\n\n    # Test with a bare string (single language)\n    options = create_options(\n        input_file='test.pdf',\n        output_file='output.pdf',\n        parser=parser,\n        language='tam',\n    )\n    assert options.languages == ['tam']\n\n    # Test '+'-separated string is split like CLI --language\n    options = create_options(\n        input_file='test.pdf',\n        output_file='output.pdf',\n        parser=parser,\n        language='eng+spa',\n    )\n 
   assert options.languages == ['eng', 'spa']\n\n    # Test '+'-separated entry within a list is also split\n    options = create_options(\n        input_file='test.pdf',\n        output_file='output.pdf',\n        parser=parser,\n        language=['eng+spa'],\n    )\n    assert options.languages == ['eng', 'spa']\n\n\ndef test_stream_api(resources: Path):\n    in_ = (resources / 'graph.pdf').open('rb')\n    out = BytesIO()\n\n    ocrmypdf.ocr(in_, out, tesseract_timeout=0.0)\n    out.seek(0)\n    assert b'%PDF' in out.read(1024)\n\n\ndef test_sidecar_stringio(resources: Path, outdir: Path, outpdf: Path):\n    s = BytesIO()\n    ocrmypdf.ocr(\n        resources / 'ccitt.pdf',\n        outpdf,\n        plugins=['tests/plugins/tesseract_cache.py'],\n        sidecar=s,\n    )\n    s.seek(0)\n    assert b'the' in s.getvalue()\n\n\ndef test_hocr_api_multipage(resources: Path, outdir: Path, outpdf: Path):\n    ocrmypdf.api._pdf_to_hocr(\n        resources / 'multipage.pdf',\n        outdir,\n        language='eng',\n        skip_text=True,\n        plugins=['tests/plugins/tesseract_cache.py'],\n    )\n    assert (outdir / '000001_ocr_hocr.hocr').exists()\n    assert (outdir / '000006_ocr_hocr.hocr').exists()\n    assert not (outdir / '000004_ocr_hocr.hocr').exists()\n\n    ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf)\n    assert outpdf.exists()\n\n\ndef test_hocr_to_pdf_api(resources: Path, outdir: Path, outpdf: Path):\n    ocrmypdf.api._pdf_to_hocr(\n        resources / 'ccitt.pdf',\n        outdir,\n        language='eng',\n        skip_text=True,\n        plugins=['tests/plugins/tesseract_cache.py'],\n    )\n    assert (outdir / '000001_ocr_hocr.hocr').exists()\n    hocr = (outdir / '000001_ocr_hocr.hocr').read_text(encoding='utf-8')\n    mangled = hocr.replace('the', 'hocr')\n    (outdir / '000001_ocr_hocr.hocr').write_text(mangled, encoding='utf-8')\n\n    ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf, optimize=0)\n\n    text = extract_text(outpdf)\n    assert 
'hocr' in text and 'the' not in text\n\n\ndef test_hocr_result_json():\n    result = ocrmypdf._pipelines._common.HOCRResult(\n        pageno=1,\n        pdf_page_from_image=Path('a'),\n        hocr=Path('b'),\n        textpdf=Path('c'),\n        orientation_correction=180,\n    )\n    assert (\n        result.to_json()\n        == '{\"pageno\": 1, \"pdf_page_from_image\": {\"Path\": \"a\"}, \"hocr\": {\"Path\": \"b\"}, '\n        '\"textpdf\": {\"Path\": \"c\"}, \"orientation_correction\": 180, \"ocr_tree\": null}'\n    )\n    assert ocrmypdf._pipelines._common.HOCRResult.from_json(result.to_json()) == result\n\n\ndef test_hocr_result_pickle():\n    result = ocrmypdf._pipelines._common.HOCRResult(\n        pageno=1,\n        pdf_page_from_image=Path('a'),\n        hocr=Path('b'),\n        textpdf=Path('c'),\n        orientation_correction=180,\n    )\n    assert result == pickle.loads(pickle.dumps(result))\n\n\ndef test_nested_plugin_option_access():\n    \"\"\"Test that plugin options can be accessed via nested namespaces.\"\"\"\n    from ocrmypdf._options import OcrOptions\n    from ocrmypdf.api import setup_plugin_infrastructure\n\n    # Set up plugin infrastructure to register plugin models\n    setup_plugin_infrastructure()\n\n    # Create options with tesseract settings\n    options = OcrOptions(\n        input_file='test.pdf',\n        output_file='output.pdf',\n        tesseract_timeout=120.0,\n        tesseract_oem=1,\n        optimize=2,\n    )\n\n    # Test flat access still works\n    assert options.tesseract_timeout == 120.0\n    assert options.tesseract_oem == 1\n    assert options.optimize == 2\n\n    # Test nested access for tesseract\n    tesseract = options.tesseract\n    assert tesseract is not None\n    assert tesseract.timeout == 120.0\n    assert tesseract.oem == 1\n\n    # Test nested access for ghostscript\n    ghostscript = options.ghostscript\n    assert ghostscript is not None\n    assert ghostscript.color_conversion_strategy == 
\"LeaveColorUnchanged\"\n\n    # Test that cached instances are returned\n    assert options.tesseract is tesseract\n\n\ndef test_default_tesseract_timeout():\n    \"\"\"Test that OcrOptions without explicit tesseract_timeout uses plugin default.\n\n    Regression test for GitHub issue #1636: when using the Python API without\n    specifying tesseract_timeout, the default was 0.0 which caused Tesseract\n    to immediately time out and produce no OCR output.\n    \"\"\"\n    from ocrmypdf._options import OcrOptions\n    from ocrmypdf.api import setup_plugin_infrastructure\n\n    setup_plugin_infrastructure()\n\n    # Default OcrOptions should leave tesseract_timeout as None\n    options = OcrOptions(\n        input_file='test.pdf',\n        output_file='output.pdf',\n    )\n    assert options.tesseract_timeout is None\n\n    # The plugin default (180s) should be used when tesseract_timeout is None\n    assert options.tesseract.timeout == 180.0\n"
  },
  {
    "path": "tests/test_check_pdf.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nfrom ocrmypdf.helpers import check_pdf\n\n\ndef test_pdf_error(resources):\n    assert check_pdf(resources / 'blank.pdf')\n    assert not check_pdf(__file__)\n"
  },
  {
    "path": "tests/test_completion.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport os\nfrom subprocess import run\n\nimport pytest\n\nfrom ocrmypdf.helpers import running_in_docker\n\npytestmark = pytest.mark.skipif(\n    running_in_docker(),\n    reason=\"docker can't complete\",\n)\n\n\ndef test_fish():\n    try:\n        proc = run(\n            ['fish', '-n', 'misc/completion/ocrmypdf.fish'],\n            check=True,\n            encoding='utf-8',\n            capture_output=True,\n        )\n        assert proc.stderr == '', proc.stderr\n    except FileNotFoundError:\n        pytest.xfail('fish is not installed')\n\n\n@pytest.mark.skipif(\n    os.name == 'nt', reason=\"Windows CI workers have bash but are best left alone\"\n)\ndef test_bash():\n    try:\n        proc = run(\n            ['bash', '-n', 'misc/completion/ocrmypdf.bash'],\n            check=True,\n            encoding='utf-8',\n            capture_output=True,\n        )\n        assert proc.stderr == '', proc.stderr\n    except FileNotFoundError:\n        pytest.xfail('bash is not installed')\n"
  },
  {
    "path": "tests/test_concurrency.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport os\nimport platform\n\nimport pytest\n\nfrom ocrmypdf import ExitCode\n\nfrom .conftest import run_ocrmypdf_api\n\n\n@pytest.mark.skipif(os.name == 'nt', reason=\"Windows doesn't have SIGKILL\")\n@pytest.mark.skipif(\n    platform.python_version_tuple() >= ('3', '12'), reason=\"can deadlock due to fork\"\n)\ndef test_simulate_oom_killer(multipage, no_outpdf):\n    exitcode = run_ocrmypdf_api(\n        multipage,\n        no_outpdf,\n        '--force-ocr',\n        '--no-use-threads',\n        '--plugin',\n        'tests/plugins/tesseract_simulate_oom_killer.py',\n    )\n    assert exitcode == ExitCode.child_process_error\n"
  },
  {
    "path": "tests/test_fpdf_renderer.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Tests for fpdf2-based PDF renderer.\"\"\"\n\nfrom __future__ import annotations\n\nfrom pathlib import Path\n\nimport pytest\n\nfrom ocrmypdf.font import MultiFontManager\nfrom ocrmypdf.fpdf_renderer import (\n    DebugRenderOptions,\n    Fpdf2MultiPageRenderer,\n    Fpdf2PdfRenderer,\n)\nfrom ocrmypdf.hocrtransform.hocr_parser import HocrParser\nfrom ocrmypdf.models.ocr_element import OcrClass\n\n\n@pytest.fixture\ndef font_dir():\n    \"\"\"Return path to font directory.\"\"\"\n    return Path(__file__).parent.parent / \"src\" / \"ocrmypdf\" / \"data\"\n\n\n@pytest.fixture\ndef multi_font_manager(font_dir):\n    \"\"\"Create MultiFontManager instance for testing.\"\"\"\n    return MultiFontManager(font_dir)\n\n\n@pytest.fixture\ndef resources():\n    \"\"\"Return path to test resources directory.\"\"\"\n    return Path(__file__).parent / \"resources\"\n\n\nclass TestFpdf2RendererImports:\n    \"\"\"Test that all fpdf2 renderer modules can be imported.\"\"\"\n\n    def test_imports(self):\n        \"\"\"Test that all fpdf_renderer modules can be imported.\"\"\"\n        from ocrmypdf.fpdf_renderer import (\n            DebugRenderOptions,\n            Fpdf2MultiPageRenderer,\n            Fpdf2PdfRenderer,\n        )\n\n        assert DebugRenderOptions is not None\n        assert Fpdf2PdfRenderer is not None\n        assert Fpdf2MultiPageRenderer is not None\n\n\nclass TestDebugRenderOptions:\n    \"\"\"Test DebugRenderOptions dataclass.\"\"\"\n\n    def test_defaults(self):\n        \"\"\"Test default values.\"\"\"\n        opts = DebugRenderOptions()\n        assert opts.render_baseline is False\n        assert opts.render_line_bbox is False\n        assert opts.render_word_bbox is False\n\n    def test_custom_values(self):\n        \"\"\"Test custom values.\"\"\"\n        opts = DebugRenderOptions(\n            render_baseline=True,\n            
render_line_bbox=True,\n            render_word_bbox=True,\n        )\n        assert opts.render_baseline is True\n        assert opts.render_line_bbox is True\n        assert opts.render_word_bbox is True\n\n\nclass TestFpdf2PdfRenderer:\n    \"\"\"Test Fpdf2PdfRenderer.\"\"\"\n\n    def test_requires_page_element(self, multi_font_manager):\n        \"\"\"Test that renderer requires ocr_page element.\"\"\"\n        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement\n\n        # Create a non-page element\n        word = OcrElement(\n            ocr_class=OcrClass.WORD,\n            text=\"test\",\n            bbox=BoundingBox(left=0, top=0, right=100, bottom=20),\n        )\n\n        with pytest.raises(ValueError, match=\"Root element must be ocr_page\"):\n            Fpdf2PdfRenderer(\n                page=word,\n                dpi=300,\n                multi_font_manager=multi_font_manager,\n            )\n\n    def test_requires_bbox(self, multi_font_manager):\n        \"\"\"Test that renderer requires page with bounding box.\"\"\"\n        from ocrmypdf.models.ocr_element import OcrElement\n\n        page = OcrElement(ocr_class=OcrClass.PAGE)\n\n        with pytest.raises(ValueError, match=\"Page must have bounding box\"):\n            Fpdf2PdfRenderer(\n                page=page,\n                dpi=300,\n                multi_font_manager=multi_font_manager,\n            )\n\n    def test_render_simple_page(self, multi_font_manager, tmp_path):\n        \"\"\"Test rendering a simple page with one word.\"\"\"\n        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement\n\n        # Create a simple page with one word\n        word = OcrElement(\n            ocr_class=OcrClass.WORD,\n            text=\"Hello\",\n            bbox=BoundingBox(left=100, top=100, right=200, bottom=130),\n        )\n        line = OcrElement(\n            ocr_class=OcrClass.LINE,\n            bbox=BoundingBox(left=100, top=100, right=200, bottom=130),\n    
        children=[word],\n        )\n        page = OcrElement(\n            ocr_class=OcrClass.PAGE,\n            bbox=BoundingBox(left=0, top=0, right=612, bottom=792),\n            children=[line],\n        )\n\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=72,  # 1:1 mapping to PDF points\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n        )\n\n        output_path = tmp_path / \"test_simple.pdf\"\n        renderer.render(output_path)\n\n        assert output_path.exists()\n        assert output_path.stat().st_size > 0\n\n    def test_render_invisible_text(self, multi_font_manager, tmp_path):\n        \"\"\"Test rendering invisible text (OCR layer).\"\"\"\n        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement\n\n        word = OcrElement(\n            ocr_class=OcrClass.WORD,\n            text=\"Invisible\",\n            bbox=BoundingBox(left=100, top=100, right=250, bottom=130),\n        )\n        line = OcrElement(\n            ocr_class=OcrClass.LINE,\n            bbox=BoundingBox(left=100, top=100, right=250, bottom=130),\n            children=[word],\n        )\n        page = OcrElement(\n            ocr_class=OcrClass.PAGE,\n            bbox=BoundingBox(left=0, top=0, right=612, bottom=792),\n            children=[line],\n        )\n\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=72,\n            multi_font_manager=multi_font_manager,\n            invisible_text=True,  # This is the default\n        )\n\n        output_path = tmp_path / \"test_invisible.pdf\"\n        renderer.render(output_path)\n\n        assert output_path.exists()\n        assert output_path.stat().st_size > 0\n\n\nclass TestFpdf2MultiPageRenderer:\n    \"\"\"Test Fpdf2MultiPageRenderer.\"\"\"\n\n    def test_requires_pages(self, multi_font_manager):\n        \"\"\"Test that renderer requires at least one page.\"\"\"\n        with pytest.raises(ValueError, 
match=\"No pages to render\"):\n            renderer = Fpdf2MultiPageRenderer(\n                pages_data=[],\n                multi_font_manager=multi_font_manager,\n            )\n            renderer.render(Path(\"/tmp/test.pdf\"))\n\n    def test_render_multiple_pages(self, multi_font_manager, tmp_path):\n        \"\"\"Test rendering multiple pages.\"\"\"\n        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement\n\n        pages_data = []\n        for i in range(3):\n            word = OcrElement(\n                ocr_class=OcrClass.WORD,\n                text=f\"Page{i+1}\",\n                bbox=BoundingBox(left=100, top=100, right=200, bottom=130),\n            )\n            line = OcrElement(\n                ocr_class=OcrClass.LINE,\n                bbox=BoundingBox(left=100, top=100, right=200, bottom=130),\n                children=[word],\n            )\n            page = OcrElement(\n                ocr_class=OcrClass.PAGE,\n                bbox=BoundingBox(left=0, top=0, right=612, bottom=792),\n                children=[line],\n            )\n            pages_data.append((i + 1, page, 72))\n\n        renderer = Fpdf2MultiPageRenderer(\n            pages_data=pages_data,\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n        )\n\n        output_path = tmp_path / \"test_multipage.pdf\"\n        renderer.render(output_path)\n\n        assert output_path.exists()\n        assert output_path.stat().st_size > 0\n\n\nclass TestFpdf2RendererWithHocr:\n    \"\"\"Test fpdf2 renderer with actual hOCR files.\"\"\"\n\n    def test_render_latin_hocr(self, resources, multi_font_manager, tmp_path):\n        \"\"\"Test rendering Latin text from hOCR.\"\"\"\n        hocr_path = resources / \"latin.hocr\"\n        if not hocr_path.exists():\n            pytest.skip(\"latin.hocr not found\")\n\n        parser = HocrParser(hocr_path)\n        page = parser.parse()\n\n        # Ensure we got a page\n        
assert page.ocr_class == OcrClass.PAGE\n        assert page.bbox is not None\n\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=300,\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n        )\n\n        output_path = tmp_path / \"latin_fpdf2.pdf\"\n        renderer.render(output_path)\n\n        assert output_path.exists()\n        assert output_path.stat().st_size > 0\n\n    def test_render_cjk_hocr(self, resources, multi_font_manager, tmp_path):\n        \"\"\"Test rendering CJK text from hOCR.\"\"\"\n        hocr_path = resources / \"cjk.hocr\"\n        if not hocr_path.exists():\n            pytest.skip(\"cjk.hocr not found\")\n\n        parser = HocrParser(hocr_path)\n        page = parser.parse()\n\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=300,\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n        )\n\n        output_path = tmp_path / \"cjk_fpdf2.pdf\"\n        renderer.render(output_path)\n\n        assert output_path.exists()\n        assert output_path.stat().st_size > 0\n\n    def test_render_arabic_hocr(self, resources, multi_font_manager, tmp_path):\n        \"\"\"Test rendering Arabic text from hOCR.\"\"\"\n        hocr_path = resources / \"arabic.hocr\"\n        if not hocr_path.exists():\n            pytest.skip(\"arabic.hocr not found\")\n\n        parser = HocrParser(hocr_path)\n        page = parser.parse()\n\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=300,\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n        )\n\n        output_path = tmp_path / \"arabic_fpdf2.pdf\"\n        renderer.render(output_path)\n\n        assert output_path.exists()\n        assert output_path.stat().st_size > 0\n\n    def test_render_hello_world_scripts_hocr(\n        self, resources, multi_font_manager, tmp_path\n    ):\n        \"\"\"Test 
rendering comprehensive multilingual 'Hello!' hOCR file.\n\n        This tests all major scripts including:\n        - Latin (English, Spanish, French, German, Italian, Polish, Portuguese, Turkish)\n        - Cyrillic (Russian)\n        - Greek\n        - CJK (Chinese Simplified, Chinese Traditional, Japanese, Korean)\n        - Devanagari (Hindi)\n        - Arabic (RTL)\n        - Hebrew (RTL)\n\n        Also includes rotated baselines to exercise skew handling.\n        \"\"\"\n        hocr_path = resources / \"hello_world_scripts.hocr\"\n        if not hocr_path.exists():\n            pytest.skip(\"hello_world_scripts.hocr not found\")\n\n        parser = HocrParser(hocr_path)\n        page = parser.parse()\n\n        # Verify we parsed the page correctly\n        assert page.ocr_class == OcrClass.PAGE\n        assert page.bbox is not None\n        # Should have 2550x3300 at 300 DPI\n        assert page.bbox.right == 2550\n        assert page.bbox.bottom == 3300\n\n        # Test with visible text for visual inspection\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=300,\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n        )\n\n        output_path = tmp_path / \"hello_world_scripts_fpdf2.pdf\"\n        renderer.render(output_path)\n\n        assert output_path.exists()\n        assert output_path.stat().st_size > 0\n\n    def test_render_hello_world_scripts_multipage(\n        self, resources, multi_font_manager, tmp_path\n    ):\n        \"\"\"Test rendering hello_world_scripts.hocr using MultiPageRenderer.\n\n        Uses Fpdf2MultiPageRenderer to render the multilingual test file,\n        demonstrating font handling across all major writing systems.\n        \"\"\"\n        hocr_path = resources / \"hello_world_scripts.hocr\"\n        if not hocr_path.exists():\n            pytest.skip(\"hello_world_scripts.hocr not found\")\n\n        parser = HocrParser(hocr_path)\n        page = 
parser.parse()\n\n        # Build pages_data list as expected by MultiPageRenderer\n        pages_data = [(1, page, 300)]  # (page_number, page_element, dpi)\n\n        renderer = Fpdf2MultiPageRenderer(\n            pages_data=pages_data,\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n        )\n\n        output_path = tmp_path / \"hello_world_scripts_multipage.pdf\"\n        renderer.render(output_path)\n\n        assert output_path.exists()\n        assert output_path.stat().st_size > 0\n\n\nclass TestWordSegmentation:\n    \"\"\"Test that rendered PDFs have proper word segmentation for pdfminer.six.\"\"\"\n\n    def test_word_segmentation_with_pdfminer(self, multi_font_manager, tmp_path):\n        \"\"\"Test that pdfminer.six can extract words with proper spacing.\n\n        This test verifies that explicit space characters are inserted between\n        words so that pdfminer.six (and similar PDF readers) can properly\n        segment words during text extraction.\n        \"\"\"\n        from pdfminer.high_level import extract_text\n\n        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement\n\n        # Create a page with multiple words on one line\n        word1 = OcrElement(\n            ocr_class=OcrClass.WORD,\n            text=\"Hello\",\n            bbox=BoundingBox(left=100, top=100, right=200, bottom=130),\n        )\n        word2 = OcrElement(\n            ocr_class=OcrClass.WORD,\n            text=\"World\",\n            bbox=BoundingBox(left=220, top=100, right=320, bottom=130),\n        )\n        word3 = OcrElement(\n            ocr_class=OcrClass.WORD,\n            text=\"Test\",\n            bbox=BoundingBox(left=340, top=100, right=420, bottom=130),\n        )\n        line = OcrElement(\n            ocr_class=OcrClass.LINE,\n            bbox=BoundingBox(left=100, top=100, right=420, bottom=130),\n            children=[word1, word2, word3],\n        )\n        page = OcrElement(\n         
   ocr_class=OcrClass.PAGE,\n            bbox=BoundingBox(left=0, top=0, right=612, bottom=792),\n            children=[line],\n        )\n\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=72,  # 1:1 mapping to PDF points\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n        )\n\n        output_path = tmp_path / \"test_word_segmentation.pdf\"\n        renderer.render(output_path)\n\n        # Extract text using pdfminer.six\n        extracted_text = extract_text(str(output_path))\n\n        # Verify words are separated by spaces\n        assert \"Hello\" in extracted_text\n        assert \"World\" in extracted_text\n        assert \"Test\" in extracted_text\n\n        # The text should NOT be run together like \"HelloWorldTest\"\n        assert \"HelloWorld\" not in extracted_text\n        assert \"WorldTest\" not in extracted_text\n\n        # Verify proper word segmentation - words should be separated\n        # (allowing for whitespace variations)\n        words_found = extracted_text.split()\n        assert \"Hello\" in words_found\n        assert \"World\" in words_found\n        assert \"Test\" in words_found\n\n    def test_cjk_no_spurious_spaces(self, multi_font_manager, tmp_path):\n        \"\"\"Test that CJK text does not get spurious spaces inserted.\n\n        CJK scripts don't use spaces between characters/words, so we should\n        not insert spaces between adjacent CJK words.\n        \"\"\"\n        from pdfminer.high_level import extract_text\n\n        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement\n\n        # Create a page with CJK words (Chinese characters)\n        # 你好 = \"Hello\" in Chinese\n        # 世界 = \"World\" in Chinese\n        word1 = OcrElement(\n            ocr_class=OcrClass.WORD,\n            text=\"你好\",\n            bbox=BoundingBox(left=100, top=100, right=160, bottom=130),\n        )\n        word2 = OcrElement(\n            
ocr_class=OcrClass.WORD,\n            text=\"世界\",\n            bbox=BoundingBox(left=170, top=100, right=230, bottom=130),\n        )\n        line = OcrElement(\n            ocr_class=OcrClass.LINE,\n            bbox=BoundingBox(left=100, top=100, right=230, bottom=130),\n            children=[word1, word2],\n        )\n        page = OcrElement(\n            ocr_class=OcrClass.PAGE,\n            bbox=BoundingBox(left=0, top=0, right=612, bottom=792),\n            children=[line],\n        )\n\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=72,\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n        )\n\n        output_path = tmp_path / \"test_cjk_segmentation.pdf\"\n        renderer.render(output_path)\n\n        # Extract text using pdfminer.six\n        extracted_text = extract_text(str(output_path))\n\n        # CJK text should be present\n        assert \"你好\" in extracted_text\n        assert \"世界\" in extracted_text\n\n        # There should NOT be spaces between CJK characters\n        # (but pdfminer may add some whitespace, so we check the raw chars)\n        extracted_chars = extracted_text.replace(\" \", \"\").replace(\"\\n\", \"\")\n        assert \"你好世界\" in extracted_chars or (\n            \"你好\" in extracted_chars and \"世界\" in extracted_chars\n        )\n\n    def test_latin_hocr_word_segmentation(\n        self, resources, multi_font_manager, tmp_path\n    ):\n        \"\"\"Test word segmentation with real Latin hOCR file.\"\"\"\n        from pdfminer.high_level import extract_text\n\n        hocr_path = resources / \"latin.hocr\"\n        if not hocr_path.exists():\n            pytest.skip(\"latin.hocr not found\")\n\n        parser = HocrParser(hocr_path)\n        page = parser.parse()\n\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=300,\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n        )\n\n    
    output_path = tmp_path / \"latin_segmentation.pdf\"\n        renderer.render(output_path)\n\n        # Extract text using pdfminer.six\n        extracted_text = extract_text(str(output_path))\n\n        # The Latin text should have proper word segmentation\n        # Words should be separable\n        words = extracted_text.split()\n        assert len(words) > 0\n\n        # Check that common English words are properly segmented\n        # (not stuck together)\n        text_no_newlines = extracted_text.replace(\"\\n\", \" \")\n        # There should be spaces in the extracted text\n        assert \" \" in text_no_newlines\n"
  },
  {
    "path": "tests/test_ghostscript.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport logging\nimport secrets\nimport subprocess\nimport sys\nfrom decimal import Decimal\nfrom unittest.mock import patch\n\nimport pikepdf\nimport pytest\nfrom packaging.version import Version\nfrom PIL import Image, UnidentifiedImageError\n\nfrom ocrmypdf._exec import ghostscript\nfrom ocrmypdf._exec.ghostscript import DuplicateFilter, rasterize_pdf\nfrom ocrmypdf.builtin_plugins.ghostscript import _repair_gs106_jpeg_corruption\nfrom ocrmypdf.exceptions import ColorConversionNeededError, ExitCode, InputFileError\nfrom ocrmypdf.helpers import Resolution\nfrom ocrmypdf.pluginspec import GhostscriptRasterDevice\n\nfrom .conftest import check_ocrmypdf, run_ocrmypdf_api\n\n# pylint: disable=redefined-outer-name\n\n\n@pytest.fixture\ndef francais(resources):\n    path = resources / 'francais.pdf'\n    return path, pikepdf.open(path)\n\n\ndef test_rasterize_size(francais, outdir):\n    path, pdf = francais\n    page_size_pts = (pdf.pages[0].mediabox[2], pdf.pages[0].mediabox[3])\n    assert pdf.pages[0].mediabox[0] == pdf.pages[0].mediabox[1] == 0\n    page_size = (page_size_pts[0] / Decimal(72), page_size_pts[1] / Decimal(72))\n    target_size = Decimal('50.0'), Decimal('30.0')\n    forced_dpi = Resolution(42.0, 4242.0)\n\n    rasterize_pdf(\n        path,\n        outdir / 'out.png',\n        raster_device=GhostscriptRasterDevice.PNGMONO,\n        raster_dpi=Resolution(\n            target_size[0] / page_size[0], target_size[1] / page_size[1]\n        ),\n        page_dpi=forced_dpi,\n    )\n\n    with Image.open(outdir / 'out.png') as im:\n        assert im.size == target_size\n        assert im.info['dpi'] == forced_dpi\n\n\ndef test_rasterize_rotated(francais, outdir, caplog):\n    path, pdf = francais\n    page_size_pts = (pdf.pages[0].mediabox[2], pdf.pages[0].mediabox[3])\n    assert pdf.pages[0].mediabox[0] == 
pdf.pages[0].mediabox[1] == 0\n    page_size = (page_size_pts[0] / Decimal(72), page_size_pts[1] / Decimal(72))\n    target_size = Decimal('50.0'), Decimal('30.0')\n    forced_dpi = Resolution(42.0, 4242.0)\n\n    caplog.set_level(logging.DEBUG)\n    rasterize_pdf(\n        path,\n        outdir / 'out.png',\n        raster_device=GhostscriptRasterDevice.PNGMONO,\n        raster_dpi=Resolution(\n            target_size[0] / page_size[0], target_size[1] / page_size[1]\n        ),\n        page_dpi=forced_dpi,\n        rotation=90,\n    )\n\n    with Image.open(outdir / 'out.png') as im:\n        assert im.size == (target_size[1], target_size[0])\n        assert im.info['dpi'] == forced_dpi.flip_axis()\n\n\ndef test_rasterize_low_dpi(francais, outdir):\n    \"\"\"Test that very low DPI values (below 10) produce correctly sized output.\n\n    Ghostscript may fail with DPI values below 10. The workaround renders at\n    a minimum of 10 DPI and resizes the output to match the expected dimensions.\n    \"\"\"\n    path, pdf = francais\n    page_size_pts = (pdf.pages[0].mediabox[2], pdf.pages[0].mediabox[3])\n    assert pdf.pages[0].mediabox[0] == pdf.pages[0].mediabox[1] == 0\n    page_size = (float(page_size_pts[0]) / 72, float(page_size_pts[1]) / 72)\n\n    # Request a very small output (DPI below 10 on both axes)\n    target_size = (5, 3)\n    forced_dpi = Resolution(72.0, 72.0)\n\n    rasterize_pdf(\n        path,\n        outdir / 'out_low_dpi.png',\n        raster_device=GhostscriptRasterDevice.PNGMONO,\n        raster_dpi=Resolution(\n            target_size[0] / page_size[0], target_size[1] / page_size[1]\n        ),\n        page_dpi=forced_dpi,\n    )\n\n    with Image.open(outdir / 'out_low_dpi.png') as im:\n        assert im.size == target_size\n        assert im.info['dpi'] == forced_dpi\n\n\ndef test_rasterize_low_dpi_one_axis(francais, outdir):\n    \"\"\"Test low DPI on only one axis produces correctly sized output.\"\"\"\n    path, pdf = francais\n    
page_size_pts = (pdf.pages[0].mediabox[2], pdf.pages[0].mediabox[3])\n    assert pdf.pages[0].mediabox[0] == pdf.pages[0].mediabox[1] == 0\n    page_size = (float(page_size_pts[0]) / 72, float(page_size_pts[1]) / 72)\n\n    # Request low DPI on X axis only (below 10), normal on Y axis\n    target_size = (5, 50)\n    forced_dpi = Resolution(72.0, 72.0)\n\n    rasterize_pdf(\n        path,\n        outdir / 'out_low_dpi_x.png',\n        raster_device=GhostscriptRasterDevice.PNGMONO,\n        raster_dpi=Resolution(\n            target_size[0] / page_size[0], target_size[1] / page_size[1]\n        ),\n        page_dpi=forced_dpi,\n    )\n\n    with Image.open(outdir / 'out_low_dpi_x.png') as im:\n        assert im.size == target_size\n        assert im.info['dpi'] == forced_dpi\n\n\ndef test_gs_render_failure(resources, outpdf, caplog):\n    exitcode = run_ocrmypdf_api(\n        resources / 'blank.pdf',\n        outpdf,\n        '--output-type',\n        'pdfa',  # Required to trigger Ghostscript PDF/A generation\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n        '--plugin',\n        'tests/plugins/gs_render_failure.py',\n    )\n    assert 'TEST ERROR: gs_render_failure.py' in caplog.text\n    assert exitcode == ExitCode.child_process_error\n\n\ndef test_gs_raster_failure(resources, outpdf, caplog):\n    exitcode = run_ocrmypdf_api(\n        resources / 'francais.pdf',\n        outpdf,\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n        '--plugin',\n        'tests/plugins/gs_raster_failure.py',\n    )\n    assert 'TEST ERROR: gs_raster_failure.py' in caplog.text\n    assert exitcode == ExitCode.child_process_error\n\n\ndef test_ghostscript_pdfa_failure(resources, outpdf, caplog):\n    exitcode = run_ocrmypdf_api(\n        resources / 'francais.pdf',\n        outpdf,\n        '--output-type',\n        'pdfa',  # Required to trigger Ghostscript PDF/A generation\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n 
       '--plugin',\n        'tests/plugins/gs_pdfa_failure.py',\n    )\n    assert (\n        exitcode == ExitCode.pdfa_conversion_failed\n    ), \"Unexpected return when PDF/A fails\"\n\n\ndef test_ghostscript_feature_elision(resources, outpdf):\n    check_ocrmypdf(\n        resources / 'francais.pdf',\n        outpdf,\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n        '--plugin',\n        'tests/plugins/gs_feature_elision.py',\n    )\n\n\ndef test_ghostscript_mandatory_color_conversion(resources, outpdf):\n    with pytest.raises(ColorConversionNeededError):\n        check_ocrmypdf(\n            resources / 'jbig2_baddevicen.pdf',\n            outpdf,\n            '--output-type',\n            'pdfa',  # Required to trigger Ghostscript PDF/A generation\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        )\n\n\ndef test_rasterize_pdf_errors(resources, no_outpdf, caplog):\n    with patch('ocrmypdf._exec.ghostscript.run') as mock:\n        # ghostscript can produce empty files with return code 0\n        mock.return_value = subprocess.CompletedProcess(\n            ['fakegs'], returncode=0, stdout=b'', stderr=b'error this is an error'\n        )\n        with pytest.raises(UnidentifiedImageError):\n            rasterize_pdf(\n                resources / 'francais.pdf',\n                no_outpdf,\n                raster_device=GhostscriptRasterDevice.PNGMONO,\n                raster_dpi=Resolution(100, 100),\n            )\n        assert \"this is an error\" in caplog.text\n        assert \"invalid page image file\" in caplog.text\n\n\nclass TestDuplicateFilter:\n    @pytest.fixture(scope='function')\n    def duplicate_filter_logger(self):\n        # token_urlsafe: ensure the logger has a unique name so tests are isolated\n        logger = logging.getLogger(__name__ + secrets.token_urlsafe(8))\n        logger.setLevel(logging.DEBUG)\n        logger.addFilter(DuplicateFilter(logger))\n        return logger\n\n    
@pytest.mark.xfail(\n        (3, 13, 3) <= sys.version_info[:3] <= (3, 13, 5),\n        reason=\"https://github.com/python/cpython/pull/135858\",\n    )\n    def test_filter_duplicate_messages(self, duplicate_filter_logger, caplog):\n        log = duplicate_filter_logger\n        log.error(\"test error message\")\n        log.error(\"test error message\")\n        log.error(\"test error message\")\n        log.error(\"another error message\")\n        log.error(\"another error message\")\n        log.error(\"yet another error message\")\n\n        assert len(caplog.records) == 5\n        assert caplog.records[0].msg == \"test error message\"\n        assert caplog.records[1].msg == \"(suppressed 2 repeated lines)\"\n        assert caplog.records[2].msg == \"another error message\"\n        assert caplog.records[3].msg == \"(suppressed 1 repeated lines)\"\n        assert caplog.records[4].msg == \"yet another error message\"\n\n    def test_filter_does_not_affect_unique_messages(\n        self, duplicate_filter_logger, caplog\n    ):\n        log = duplicate_filter_logger\n        log.error(\"test error message\")\n        log.error(\"another error message\")\n        log.error(\"yet another error message\")\n\n        assert len(caplog.records) == 3\n        assert caplog.records[0].msg == \"test error message\"\n        assert caplog.records[1].msg == \"another error message\"\n        assert caplog.records[2].msg == \"yet another error message\"\n\n    @pytest.mark.xfail(\n        (3, 13, 3) <= sys.version_info[:3] <= (3, 13, 5),\n        reason=\"https://github.com/python/cpython/pull/135858\",\n    )\n    def test_filter_alt_messages(self, duplicate_filter_logger, caplog):\n        log = duplicate_filter_logger\n        log.error(\"test error message\")\n        log.error(\"another error message\")\n        log.error(\"test error message\")\n        log.error(\"another error message\")\n        log.error(\"test error message\")\n        log.error(\"test error 
message\")\n        log.error(\"another error message\")\n        log.error(\"yet another error message\")\n\n        assert len(caplog.records) == 4\n        assert caplog.records[0].msg == \"test error message\"\n        assert caplog.records[1].msg == \"another error message\"\n        assert caplog.records[2].msg == \"(suppressed 5 repeated lines)\"\n        assert caplog.records[3].msg == \"yet another error message\"\n\n\n@pytest.fixture\ndef pdf_with_invalid_image(outdir):\n    # issue 1451\n    Name = pikepdf.Name\n    pdf = pikepdf.new()\n    pdf.add_blank_page()\n    pdf.pages[0].Contents = pdf.make_stream(b'612 0 0 612 0 0 cm /Image Do')\n    # Create an invalid image object that has both ColorSpace and ImageMask set\n    pdf.pages[0].Resources = pikepdf.Dictionary(\n        XObject=pdf.make_indirect(\n            pikepdf.Dictionary(\n                Image=pdf.make_stream(\n                    b\"\\xf0\\x0f\" * 8,\n                    ColorSpace=Name.DeviceGray,\n                    BitsPerComponent=1,\n                    Width=8,\n                    Height=8,\n                    ImageMask=True,\n                    Subtype=Name.Image,\n                    Type=Name.XObject,\n                )\n            )\n        )\n    )\n    pdf.save(outdir / 'invalid_image.pdf')\n    pdf.save('invalid_image.pdf')\n    return outdir / 'invalid_image.pdf'\n\n\n@pytest.mark.xfail(\n    ghostscript.version() < Version('10.04.0'),\n    reason=\"Older Ghostscript behavior is different\",\n)\ndef test_recoverable_image_error(pdf_with_invalid_image, outdir, caplog):\n    # When stop_on_error is False, we expect Ghostscript to print an error\n    # but continue\n    rasterize_pdf(\n        outdir / 'invalid_image.pdf',\n        outdir / 'out.png',\n        raster_device=GhostscriptRasterDevice.PNGMONO,\n        raster_dpi=Resolution(10, 10),\n        stop_on_error=False,\n    )\n    assert 'Image has both ImageMask and ColorSpace' in 
caplog.text\n\n\n@pytest.mark.xfail(\n    ghostscript.version() < Version('10.04.0'),\n    reason=\"Older Ghostscript behavior is different\",\n)\ndef test_recoverable_image_error_with_stop(pdf_with_invalid_image, outdir, caplog):\n    # When stop_on_error is True, Ghostscript will print an error and exit\n    # but still produce a viable image. We intercept this case and raise\n    # InputFileError because it will contain an image of the whole page minus\n    # the image we are rendering.\n    with pytest.raises(\n        InputFileError, match=\"Try using --continue-on-soft-render-error\"\n    ):\n        rasterize_pdf(\n            outdir / 'invalid_image.pdf',\n            outdir / 'out.png',\n            raster_device=GhostscriptRasterDevice.PNGMONO,\n            raster_dpi=Resolution(100, 100),\n            stop_on_error=True,\n        )\n    # out2.png will not be created; if it were it would be blank.\n\n\nclass TestGs106JpegCorruptionRepair:\n    \"\"\"Test the Ghostscript 10.6 JPEG corruption repair function.\"\"\"\n\n    @pytest.fixture\n    def create_damaged_pdf(self, resources, outdir):\n        \"\"\"Create a damaged PDF by truncating JPEG data by 2 bytes.\"\"\"\n\n        def _create_damaged(source_pdf_name='francais.pdf', truncate_bytes=2):\n            source_path = resources / source_pdf_name\n            damaged_path = outdir / 'damaged.pdf'\n\n            with pikepdf.open(source_path) as pdf:\n                # Find and truncate DCTDecode images\n                Name = pikepdf.Name\n                damaged_count = 0\n                for page in pdf.pages:\n                    if Name.Resources not in page:\n                        continue\n                    resources_dict = page[Name.Resources]\n                    if Name.XObject not in resources_dict:\n                        continue\n                    for key in resources_dict[Name.XObject].keys():\n                        obj = resources_dict[Name.XObject][key]\n                       
 if obj.get(Name.Subtype) != Name.Image:\n                            continue\n                        if obj.get(Name.Filter) != Name.DCTDecode:\n                            continue\n                        # Truncate the JPEG data\n                        original_bytes = obj.read_raw_bytes()\n                        truncated_bytes = original_bytes[:-truncate_bytes]\n                        obj.write(truncated_bytes, filter=Name.DCTDecode)\n                        damaged_count += 1\n\n                pdf.save(damaged_path)\n                return source_path, damaged_path, damaged_count\n\n        return _create_damaged\n\n    def test_repair_truncated_jpeg(self, create_damaged_pdf, caplog):\n        \"\"\"Test that truncated JPEG images are repaired.\"\"\"\n        caplog.set_level(logging.DEBUG)\n        source_path, damaged_path, damaged_count = create_damaged_pdf()\n\n        assert damaged_count > 0, \"Test PDF should have DCTDecode images\"\n\n        # Get original image bytes for comparison\n        with pikepdf.open(source_path) as pdf:\n            Name = pikepdf.Name\n            original_bytes_list = []\n            for page in pdf.pages:\n                if Name.Resources not in page:\n                    continue\n                resources_dict = page[Name.Resources]\n                if Name.XObject not in resources_dict:\n                    continue\n                for key in resources_dict[Name.XObject].keys():\n                    obj = resources_dict[Name.XObject][key]\n                    if obj.get(Name.Subtype) != Name.Image:\n                        continue\n                    if obj.get(Name.Filter) != Name.DCTDecode:\n                        continue\n                    original_bytes_list.append(obj.read_raw_bytes())\n\n        # Run the repair function\n        repaired = _repair_gs106_jpeg_corruption(source_path, damaged_path)\n        assert repaired is True, \"Repair should have been performed\"\n\n        # Verify the 
repaired PDF has correct image bytes\n        with pikepdf.open(damaged_path) as pdf:\n            Name = pikepdf.Name\n            repaired_bytes_list = []\n            for page in pdf.pages:\n                if Name.Resources not in page:\n                    continue\n                resources_dict = page[Name.Resources]\n                if Name.XObject not in resources_dict:\n                    continue\n                for key in resources_dict[Name.XObject].keys():\n                    obj = resources_dict[Name.XObject][key]\n                    if obj.get(Name.Subtype) != Name.Image:\n                        continue\n                    if obj.get(Name.Filter) != Name.DCTDecode:\n                        continue\n                    repaired_bytes_list.append(obj.read_raw_bytes())\n\n        assert len(repaired_bytes_list) == len(original_bytes_list)\n        for orig, repaired_bytes in zip(original_bytes_list, repaired_bytes_list, strict=False):\n            assert orig == repaired_bytes, \"Repaired bytes should match original\"\n\n        # Check that error/warning was logged\n        assert \"JPEG corruption detected\" in caplog.text\n\n    def test_no_repair_when_not_truncated(self, resources, outdir, caplog):\n        \"\"\"Test that no repair is done when images are not truncated.\"\"\"\n        caplog.set_level(logging.DEBUG)\n        source_path = resources / 'francais.pdf'\n\n        # Copy source to output (no damage)\n        output_path = outdir / 'undamaged.pdf'\n        with pikepdf.open(source_path) as pdf:\n            pdf.save(output_path)\n\n        # Run the repair function - should not repair anything\n        repaired = _repair_gs106_jpeg_corruption(source_path, output_path)\n        assert repaired is False, \"No repair should have been performed\"\n        assert \"JPEG corruption detected\" not in caplog.text\n\n    def test_no_repair_when_truncation_too_large(self, create_damaged_pdf, caplog):\n        \"\"\"Test that images 
truncated by more than 15 bytes are not repaired.\"\"\"\n        caplog.set_level(logging.DEBUG)\n        source_path, damaged_path, _ = create_damaged_pdf(truncate_bytes=20)\n\n        repaired = _repair_gs106_jpeg_corruption(source_path, damaged_path)\n        assert repaired is False, \"Should not repair truncation > 15 bytes\"\n        assert \"JPEG corruption detected\" not in caplog.text\n"
  },
  {
    "path": "tests/test_graft.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nfrom unittest.mock import patch\n\nimport pikepdf\n\nimport ocrmypdf\n\n\ndef test_no_glyphless_graft(resources, outdir):\n    with (\n        pikepdf.open(resources / 'francais.pdf') as pdf,\n        pikepdf.open(resources / 'aspect.pdf') as pdf_aspect,\n        pikepdf.open(resources / 'cmyk.pdf') as pdf_cmyk,\n    ):\n        pdf.pages.extend(pdf_aspect.pages)\n        pdf.pages.extend(pdf_cmyk.pages)\n        pdf.save(outdir / 'test.pdf')\n\n    with patch('ocrmypdf._graft.MAX_REPLACE_PAGES', 2):\n        ocrmypdf.ocr(\n            outdir / 'test.pdf',\n            outdir / 'out.pdf',\n            deskew=True,\n            tesseract_timeout=0,\n            force_ocr=True,\n        )\n    # This test needs asserts\n\n\ndef test_links(resources, outpdf):\n    ocrmypdf.ocr(\n        resources / 'link.pdf', outpdf, redo_ocr=True, oversample=200, output_type='pdf'\n    )\n    with pikepdf.open(outpdf) as pdf:\n        p1 = pdf.pages[0]\n        p2 = pdf.pages[1]\n        assert p1.Annots[0].A.D[0].objgen == p2.objgen\n        assert p2.Annots[0].A.D[0].objgen == p1.objgen\n\n\ndef test_redo_ocr_with_offset_mediabox(resources, outdir):\n    \"\"\"Test that --redo-ocr handles non-zero mediabox origins correctly.\n\n    Regression test for issue #1630 where PDFs with mediabox origins like\n    [0, 100, width, height+100] (common in cropped/JSTOR-style PDFs)\n    would have OCR text shifted vertically because the text layer CTM\n    did not account for the page origin offset.\n    \"\"\"\n    # Create a PDF with a non-zero mediabox origin\n    input_pdf = outdir / 'offset_mediabox_input.pdf'\n    y_offset = 100\n\n    with pikepdf.open(resources / 'graph_ocred.pdf') as pdf:\n        page = pdf.pages[0]\n        original_mb = list(page.MediaBox)\n\n        # Shift mediabox Y origin to simulate cropped/JSTOR-style PDFs\n        
page.MediaBox = [\n            original_mb[0],\n            original_mb[1] + y_offset,\n            original_mb[2],\n            original_mb[3] + y_offset,\n        ]\n\n        pdf.save(input_pdf)\n\n    # Run --redo-ocr (this is where the bug occurred)\n    output_pdf = outdir / 'offset_redo_ocr.pdf'\n    ocrmypdf.ocr(input_pdf, output_pdf, redo_ocr=True)\n\n    # Verify the output\n    with pikepdf.open(output_pdf) as pdf:\n        page = pdf.pages[0]\n        mediabox = list(page.MediaBox)\n\n        # MediaBox origin should be preserved\n        assert (\n            float(mediabox[1]) == y_offset\n        ), f\"MediaBox Y origin should be preserved at {y_offset}, got {mediabox[1]}\"\n\n        # The content stream should include a CTM with the Y origin translation.\n        # Without the fix, the CTM was omitted for rotation==0, causing a shift.\n        content = page.Contents.read_bytes()\n        assert b'cm' in content, (\n            \"Content stream should include a CTM to translate by the page origin\"\n        )\n\n\ndef test_strip_invisble_text():\n    pdf = pikepdf.Pdf.new()\n    print(pikepdf.parse_content_stream(pikepdf.Stream(pdf, b'3 Tr')))\n    page = pdf.add_blank_page()\n    visible_text = [\n        pikepdf.ContentStreamInstruction((), pikepdf.Operator('BT')),\n        pikepdf.ContentStreamInstruction(\n            (pikepdf.Name('/F0'), 12), pikepdf.Operator('Tf')\n        ),\n        pikepdf.ContentStreamInstruction((288, 720), pikepdf.Operator('Td')),\n        pikepdf.ContentStreamInstruction(\n            (pikepdf.String('visible'),), pikepdf.Operator('Tj')\n        ),\n        pikepdf.ContentStreamInstruction((), pikepdf.Operator('ET')),\n    ]\n    invisible_text = [\n        pikepdf.ContentStreamInstruction((), pikepdf.Operator('BT')),\n        pikepdf.ContentStreamInstruction(\n            (pikepdf.Name('/F0'), 12), pikepdf.Operator('Tf')\n        ),\n        pikepdf.ContentStreamInstruction((288, 720), pikepdf.Operator('Td')),\n      
  pikepdf.ContentStreamInstruction(\n            (pikepdf.String('invisible'),), pikepdf.Operator('Tj')\n        ),\n        pikepdf.ContentStreamInstruction((), pikepdf.Operator('ET')),\n    ]\n    invisible_text_setting_tr = [\n        pikepdf.ContentStreamInstruction((), pikepdf.Operator('BT')),\n        pikepdf.ContentStreamInstruction([3], pikepdf.Operator('Tr')),\n        pikepdf.ContentStreamInstruction(\n            (pikepdf.Name('/F0'), 12), pikepdf.Operator('Tf')\n        ),\n        pikepdf.ContentStreamInstruction((288, 720), pikepdf.Operator('Td')),\n        pikepdf.ContentStreamInstruction(\n            (pikepdf.String('invisible'),), pikepdf.Operator('Tj')\n        ),\n        pikepdf.ContentStreamInstruction((), pikepdf.Operator('ET')),\n    ]\n    stream = [\n        pikepdf.ContentStreamInstruction([], pikepdf.Operator('q')),\n        pikepdf.ContentStreamInstruction([3], pikepdf.Operator('Tr')),\n        *invisible_text,\n        pikepdf.ContentStreamInstruction([], pikepdf.Operator('Q')),\n        *visible_text,\n        *invisible_text_setting_tr,\n        *invisible_text,\n    ]\n    content_stream = pikepdf.unparse_content_stream(stream)\n    page.Contents = pikepdf.Stream(pdf, content_stream)\n\n    def count(string, page):\n        return len(\n            [\n                True\n                for operands, operator in pikepdf.parse_content_stream(page)\n                if operator == pikepdf.Operator('Tj')\n                and operands[0] == pikepdf.String(string)\n            ]\n        )\n\n    nr_visible_pre = count('visible', page)\n    ocrmypdf._graft.strip_invisible_text(pdf, page)\n    nr_visible_post = count('visible', page)\n    assert (\n        nr_visible_pre == nr_visible_post\n    ), 'Number of visible text elements did not change'\n    assert count('invisible', page) == 0, 'No invisible elems left'\n"
  },
  {
    "path": "tests/test_helpers.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport logging\nimport multiprocessing\nimport os\nfrom pathlib import Path\nfrom unittest.mock import MagicMock\n\nimport pytest\nfrom packaging.version import Version\n\nfrom ocrmypdf import helpers\nfrom ocrmypdf.helpers import running_in_docker\n\nneeds_symlink = pytest.mark.skipif(os.name == 'nt', reason='needs posix symlink')\nwindows_only = pytest.mark.skipif(os.name != 'nt', reason=\"Windows test\")\n\n\nclass TestSafeSymlink:\n    def test_safe_symlink_link_self(self, tmp_path, caplog):\n        helpers.safe_symlink(tmp_path / 'self', tmp_path / 'self')\n        assert caplog.record_tuples[0][1] == logging.WARNING\n\n    def test_safe_symlink_overwrite(self, tmp_path):\n        (tmp_path / 'regular_file').touch()\n        with pytest.raises(FileExistsError):\n            helpers.safe_symlink(tmp_path / 'input', tmp_path / 'regular_file')\n\n    @needs_symlink\n    def test_safe_symlink_relink(self, tmp_path):\n        (tmp_path / 'regular_file_a').touch()\n        (tmp_path / 'regular_file_b').write_bytes(b'ABC')\n        (tmp_path / 'link').symlink_to(tmp_path / 'regular_file_a')\n        helpers.safe_symlink(tmp_path / 'regular_file_b', tmp_path / 'link')\n        assert (tmp_path / 'link').samefile(tmp_path / 'regular_file_b') or (\n            tmp_path / 'link'\n        ).read_bytes() == b'ABC'\n\n\ndef test_no_cpu_count(monkeypatch):\n    invoked = False\n\n    def cpu_count_raises():\n        nonlocal invoked\n        invoked = True\n        raise NotImplementedError()\n\n    monkeypatch.setattr(multiprocessing, 'cpu_count', cpu_count_raises)\n    with pytest.warns(expected_warning=UserWarning):\n        assert helpers.available_cpu_count() == 1\n    assert invoked, \"Patched function called during test\"\n\n\nskipif_docker = pytest.mark.skipif(running_in_docker(), reason=\"fails on Docker\")\n\n\nclass 
TestFileIsWritable:\n    @pytest.fixture\n    def non_existent(self, tmp_path):\n        return tmp_path / 'nofile'\n\n    @pytest.fixture\n    def basic_file(self, tmp_path):\n        basic = tmp_path / 'basic'\n        basic.touch()\n        return basic\n\n    def test_plain(self, non_existent):\n        assert helpers.is_file_writable(non_existent)\n\n    @needs_symlink\n    def test_symlink_loop(self, tmp_path):\n        loop = tmp_path / 'loop'\n        loop.symlink_to(loop)\n        assert not helpers.is_file_writable(loop)\n\n    @skipif_docker\n    def test_chmod(self, basic_file):\n        assert helpers.is_file_writable(basic_file)\n        basic_file.chmod(0o400)\n        assert not helpers.is_file_writable(basic_file)\n        basic_file.chmod(0o000)\n        assert not helpers.is_file_writable(basic_file)\n\n    def test_permission_error(self, basic_file):\n        pathmock = MagicMock(spec_set=basic_file)\n        pathmock.is_symlink.return_value = False\n        pathmock.exists.return_value = True\n        pathmock.is_file.side_effect = PermissionError\n        assert not helpers.is_file_writable(pathmock)\n\n\n@windows_only\ndef test_gs_install_locations():\n    # pylint: disable=import-outside-toplevel\n    from ocrmypdf.subprocess._windows import _gs_version_in_path_key\n\n    assert _gs_version_in_path_key(Path(\"C:\\\\Program Files\\\\gs\\\\gs9.52\\\\bin\")) == (\n        'gs',\n        Version('9.52'),\n    )\n\n\n@windows_only\ndef test_shim_paths(tmp_path):\n    # pylint: disable=import-outside-toplevel\n    from ocrmypdf.subprocess._windows import shim_env_path\n\n    progfiles = tmp_path / 'Program Files'\n    progfiles.mkdir()\n    (progfiles / 'tesseract-ocr').mkdir()\n    (progfiles / 'gs' / '9.51' / 'bin').mkdir(parents=True)\n    (progfiles / 'gs' / 'gs9.52.3' / 'bin').mkdir(parents=True)\n    syspath = tmp_path / 'bin'\n    env = {'PROGRAMFILES': str(progfiles), 'PATH': str(syspath)}\n\n    result_str = shim_env_path(env=env)\n    
results = result_str.split(os.pathsep)\n    assert results[0] == str(syspath), results\n    assert results[-3].endswith('tesseract-ocr'), results\n    assert results[-2].endswith(os.path.join('gs9.52.3', 'bin')), results\n    assert results[-1].endswith(os.path.join('gs', '9.51', 'bin')), results\n\n\ndef test_resolution():\n    Resolution = helpers.Resolution\n    dpi_100 = Resolution(100, 100)\n    dpi_200 = Resolution(200, 200)\n    assert dpi_100.is_square\n    assert not Resolution(100, 200).is_square\n    assert dpi_100 == Resolution(100, 100)\n    assert str(dpi_100) != str(dpi_200)\n    assert dpi_100.take_max([200, 300], [400]) == Resolution(300, 400)\n"
  },
  {
    "path": "tests/test_hocr_parser.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Unit tests for HocrParser class.\"\"\"\n\nfrom __future__ import annotations\n\nfrom pathlib import Path\nfrom textwrap import dedent\n\nimport pytest\n\nfrom ocrmypdf.hocrtransform import (\n    HocrParseError,\n    HocrParser,\n    OcrClass,\n)\n\n\n@pytest.fixture\ndef simple_hocr(tmp_path) -> Path:\n    \"\"\"Create a simple valid hOCR file.\"\"\"\n    content = dedent(\"\"\"\\\n        <?xml version=\"1.0\" encoding=\"UTF-8\"?>\n        <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n            \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n        <html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n        <head>\n            <title>Test</title>\n        </head>\n        <body>\n            <div class='ocr_page' title='bbox 0 0 1000 500; ppageno 0'>\n                <p class='ocr_par' lang='eng' dir='ltr'>\n                    <span class='ocr_line' title='bbox 100 100 900 150; baseline 0.01 -5'>\n                        <span class='ocrx_word' title='bbox 100 100 200 150; x_wconf 95'>Hello</span>\n                        <span class='ocrx_word' title='bbox 250 100 350 150; x_wconf 90'>World</span>\n                    </span>\n                </p>\n            </div>\n        </body>\n        </html>\n    \"\"\")\n    hocr_file = tmp_path / \"simple.hocr\"\n    hocr_file.write_text(content, encoding='utf-8')\n    return hocr_file\n\n\n@pytest.fixture\ndef multiline_hocr(tmp_path) -> Path:\n    \"\"\"Create an hOCR file with multiple lines and paragraphs.\"\"\"\n    content = dedent(\"\"\"\\\n        <?xml version=\"1.0\" encoding=\"UTF-8\"?>\n        <html>\n        <body>\n            <div class='ocr_page' title='bbox 0 0 1000 1000'>\n                <p class='ocr_par' lang='eng'>\n                    <span class='ocr_line' title='bbox 100 100 900 150'>\n                        <span 
class='ocrx_word' title='bbox 100 100 200 150'>Line</span>\n                        <span class='ocrx_word' title='bbox 210 100 280 150'>one</span>\n                    </span>\n                    <span class='ocr_line' title='bbox 100 200 900 250'>\n                        <span class='ocrx_word' title='bbox 100 200 200 250'>Line</span>\n                        <span class='ocrx_word' title='bbox 210 200 280 250'>two</span>\n                    </span>\n                </p>\n                <p class='ocr_par' lang='deu'>\n                    <span class='ocr_line' title='bbox 100 400 900 450'>\n                        <span class='ocrx_word' title='bbox 100 400 200 450'>German</span>\n                        <span class='ocrx_word' title='bbox 210 400 280 450'>text</span>\n                    </span>\n                </p>\n            </div>\n        </body>\n        </html>\n    \"\"\")\n    hocr_file = tmp_path / \"multiline.hocr\"\n    hocr_file.write_text(content, encoding='utf-8')\n    return hocr_file\n\n\n@pytest.fixture\ndef rtl_hocr(tmp_path) -> Path:\n    \"\"\"Create an hOCR file with RTL text.\"\"\"\n    content = dedent(\"\"\"\\\n        <?xml version=\"1.0\" encoding=\"UTF-8\"?>\n        <html>\n        <body>\n            <div class='ocr_page' title='bbox 0 0 1000 500'>\n                <p class='ocr_par' lang='ara' dir='rtl'>\n                    <span class='ocr_line' title='bbox 100 100 900 150'>\n                        <span class='ocrx_word' title='bbox 100 100 200 150'>مرحبا</span>\n                    </span>\n                </p>\n            </div>\n        </body>\n        </html>\n    \"\"\")\n    hocr_file = tmp_path / \"rtl.hocr\"\n    hocr_file.write_text(content, encoding='utf-8')\n    return hocr_file\n\n\n@pytest.fixture\ndef rotated_hocr(tmp_path) -> Path:\n    \"\"\"Create an hOCR file with rotated text (textangle).\"\"\"\n    content = dedent(\"\"\"\\\n        <?xml version=\"1.0\" encoding=\"UTF-8\"?>\n        <html>\n        
<body>\n            <div class='ocr_page' title='bbox 0 0 1000 500'>\n                <p class='ocr_par' lang='eng'>\n                    <span class='ocr_line' title='bbox 100 100 900 150; textangle 5.5'>\n                        <span class='ocrx_word' title='bbox 100 100 200 150'>Rotated</span>\n                    </span>\n                </p>\n            </div>\n        </body>\n        </html>\n    \"\"\")\n    hocr_file = tmp_path / \"rotated.hocr\"\n    hocr_file.write_text(content, encoding='utf-8')\n    return hocr_file\n\n\n@pytest.fixture\ndef header_hocr(tmp_path) -> Path:\n    \"\"\"Create an hOCR file with different line types.\"\"\"\n    content = dedent(\"\"\"\\\n        <?xml version=\"1.0\" encoding=\"UTF-8\"?>\n        <html>\n        <body>\n            <div class='ocr_page' title='bbox 0 0 1000 500'>\n                <p class='ocr_par' lang='eng'>\n                    <span class='ocr_header' title='bbox 100 50 900 100'>\n                        <span class='ocrx_word' title='bbox 100 50 300 100'>Chapter</span>\n                        <span class='ocrx_word' title='bbox 310 50 400 100'>One</span>\n                    </span>\n                    <span class='ocr_line' title='bbox 100 150 900 200'>\n                        <span class='ocrx_word' title='bbox 100 150 200 200'>Body</span>\n                        <span class='ocrx_word' title='bbox 210 150 280 200'>text</span>\n                    </span>\n                    <span class='ocr_caption' title='bbox 100 300 900 350'>\n                        <span class='ocrx_word' title='bbox 100 300 200 350'>Figure</span>\n                        <span class='ocrx_word' title='bbox 210 300 250 350'>1</span>\n                    </span>\n                </p>\n            </div>\n        </body>\n        </html>\n    \"\"\")\n    hocr_file = tmp_path / \"header.hocr\"\n    hocr_file.write_text(content, encoding='utf-8')\n    return hocr_file\n\n\n@pytest.fixture\ndef font_info_hocr(tmp_path) -> 
Path:\n    \"\"\"Create an hOCR file with font information.\"\"\"\n    content = dedent(\"\"\"\\\n        <?xml version=\"1.0\" encoding=\"UTF-8\"?>\n        <html>\n        <body>\n            <div class='ocr_page' title='bbox 0 0 1000 500'>\n                <p class='ocr_par' lang='eng'>\n                    <span class='ocr_line' title='bbox 100 100 900 150'>\n                        <span class='ocrx_word' title='bbox 100 100 200 150; x_font Arial; x_fsize 12.5'>Styled</span>\n                    </span>\n                </p>\n            </div>\n        </body>\n        </html>\n    \"\"\")\n    hocr_file = tmp_path / \"font_info.hocr\"\n    hocr_file.write_text(content, encoding='utf-8')\n    return hocr_file\n\n\nclass TestHocrParserBasic:\n    \"\"\"Basic HocrParser functionality tests.\"\"\"\n\n    def test_parse_simple_hocr(self, simple_hocr):\n        parser = HocrParser(simple_hocr)\n        page = parser.parse()\n\n        assert page.ocr_class == OcrClass.PAGE\n        assert page.bbox is not None\n        assert page.bbox.width == 1000\n        assert page.bbox.height == 500\n\n    def test_parse_page_number(self, simple_hocr):\n        parser = HocrParser(simple_hocr)\n        page = parser.parse()\n\n        assert page.page_number == 0\n\n    def test_parse_paragraphs(self, simple_hocr):\n        parser = HocrParser(simple_hocr)\n        page = parser.parse()\n\n        assert len(page.paragraphs) == 1\n        paragraph = page.paragraphs[0]\n        assert paragraph.ocr_class == OcrClass.PARAGRAPH\n        assert paragraph.language == \"eng\"\n        assert paragraph.direction == \"ltr\"\n\n    def test_parse_lines(self, simple_hocr):\n        parser = HocrParser(simple_hocr)\n        page = parser.parse()\n\n        lines = page.lines\n        assert len(lines) == 1\n        line = lines[0]\n        assert line.ocr_class == OcrClass.LINE\n        assert line.bbox is not None\n        assert line.baseline is not None\n        assert 
line.baseline.slope == pytest.approx(0.01)\n        assert line.baseline.intercept == -5\n\n    def test_parse_words(self, simple_hocr):\n        parser = HocrParser(simple_hocr)\n        page = parser.parse()\n\n        words = page.words\n        assert len(words) == 2\n        assert words[0].text == \"Hello\"\n        assert words[1].text == \"World\"\n\n    def test_parse_word_confidence(self, simple_hocr):\n        parser = HocrParser(simple_hocr)\n        page = parser.parse()\n\n        words = page.words\n        assert words[0].confidence == pytest.approx(0.95)\n        assert words[1].confidence == pytest.approx(0.90)\n\n    def test_parse_word_bbox(self, simple_hocr):\n        parser = HocrParser(simple_hocr)\n        page = parser.parse()\n\n        word = page.words[0]\n        assert word.bbox is not None\n        assert word.bbox.left == 100\n        assert word.bbox.top == 100\n        assert word.bbox.right == 200\n        assert word.bbox.bottom == 150\n\n\nclass TestHocrParserMultiline:\n    \"\"\"Test parsing of multi-line/multi-paragraph hOCR.\"\"\"\n\n    def test_multiple_lines(self, multiline_hocr):\n        parser = HocrParser(multiline_hocr)\n        page = parser.parse()\n\n        assert len(page.paragraphs) == 2\n        assert len(page.lines) == 3  # 2 in first par, 1 in second\n\n    def test_multiple_paragraphs_languages(self, multiline_hocr):\n        parser = HocrParser(multiline_hocr)\n        page = parser.parse()\n\n        paragraphs = page.paragraphs\n        assert paragraphs[0].language == \"eng\"\n        assert paragraphs[1].language == \"deu\"\n\n    def test_word_count(self, multiline_hocr):\n        parser = HocrParser(multiline_hocr)\n        page = parser.parse()\n\n        assert len(page.words) == 6  # 2 + 2 + 2\n\n\nclass TestHocrParserRTL:\n    \"\"\"Test parsing of RTL text.\"\"\"\n\n    def test_rtl_direction(self, rtl_hocr):\n        parser = HocrParser(rtl_hocr)\n        page = parser.parse()\n\n        
paragraph = page.paragraphs[0]\n        assert paragraph.direction == \"rtl\"\n        assert paragraph.language == \"ara\"\n\n    def test_rtl_line_inherits_direction(self, rtl_hocr):\n        parser = HocrParser(rtl_hocr)\n        page = parser.parse()\n\n        line = page.lines[0]\n        assert line.direction == \"rtl\"\n\n\nclass TestHocrParserRotation:\n    \"\"\"Test parsing of rotated text.\"\"\"\n\n    def test_textangle(self, rotated_hocr):\n        parser = HocrParser(rotated_hocr)\n        page = parser.parse()\n\n        line = page.lines[0]\n        assert line.textangle == pytest.approx(5.5)\n\n\nclass TestHocrParserLineTypes:\n    \"\"\"Test parsing of different line types.\"\"\"\n\n    def test_header_line(self, header_hocr):\n        parser = HocrParser(header_hocr)\n        page = parser.parse()\n\n        lines = page.lines\n        assert len(lines) == 3\n\n        # Check line types\n        line_classes = [line.ocr_class for line in lines]\n        assert OcrClass.HEADER in line_classes\n        assert OcrClass.LINE in line_classes\n        assert OcrClass.CAPTION in line_classes\n\n    def test_all_line_types_have_words(self, header_hocr):\n        parser = HocrParser(header_hocr)\n        page = parser.parse()\n\n        for line in page.lines:\n            assert len(line.children) > 0\n\n\nclass TestHocrParserFontInfo:\n    \"\"\"Test parsing of font information.\"\"\"\n\n    def test_font_name_and_size(self, font_info_hocr):\n        parser = HocrParser(font_info_hocr)\n        page = parser.parse()\n\n        word = page.words[0]\n        assert word.font is not None\n        assert word.font.name == \"Arial\"\n        assert word.font.size == pytest.approx(12.5)\n\n\nclass TestHocrParserErrors:\n    \"\"\"Test error handling in HocrParser.\"\"\"\n\n    def test_missing_file(self, tmp_path):\n        with pytest.raises(FileNotFoundError):\n            HocrParser(tmp_path / \"nonexistent.hocr\")\n\n    def test_invalid_xml(self, 
tmp_path):\n        hocr_file = tmp_path / \"invalid.hocr\"\n        hocr_file.write_text(\"<html><body>not closed\", encoding='utf-8')\n\n        with pytest.raises(HocrParseError):\n            HocrParser(hocr_file)\n\n    def test_missing_ocr_page(self, tmp_path):\n        hocr_file = tmp_path / \"no_page.hocr\"\n        hocr_file.write_text(\n            \"<html><body><p>No ocr_page</p></body></html>\", encoding='utf-8'\n        )\n\n        parser = HocrParser(hocr_file)\n        with pytest.raises(HocrParseError, match=\"No ocr_page\"):\n            parser.parse()\n\n    def test_missing_page_bbox(self, tmp_path):\n        hocr_file = tmp_path / \"no_bbox.hocr\"\n        hocr_file.write_text(\n            \"<html><body><div class='ocr_page'>No bbox</div></body></html>\",\n            encoding='utf-8',\n        )\n\n        parser = HocrParser(hocr_file)\n        with pytest.raises(HocrParseError, match=\"bbox\"):\n            parser.parse()\n\n\nclass TestHocrParserEdgeCases:\n    \"\"\"Test edge cases in HocrParser.\"\"\"\n\n    def test_empty_word_text(self, tmp_path):\n        \"\"\"Words with empty text should be skipped.\"\"\"\n        content = dedent(\"\"\"\\\n            <?xml version=\"1.0\" encoding=\"UTF-8\"?>\n            <html>\n            <body>\n                <div class='ocr_page' title='bbox 0 0 1000 500'>\n                    <p class='ocr_par'>\n                        <span class='ocr_line' title='bbox 100 100 900 150'>\n                            <span class='ocrx_word' title='bbox 100 100 200 150'></span>\n                            <span class='ocrx_word' title='bbox 210 100 300 150'>Valid</span>\n                        </span>\n                    </p>\n                </div>\n            </body>\n            </html>\n        \"\"\")\n        hocr_file = tmp_path / \"empty_word.hocr\"\n        hocr_file.write_text(content, encoding='utf-8')\n\n        parser = HocrParser(hocr_file)\n        page = parser.parse()\n\n        # Only 
the non-empty word should be parsed\n        assert len(page.words) == 1\n        assert page.words[0].text == \"Valid\"\n\n    def test_whitespace_only_word(self, tmp_path):\n        \"\"\"Words with only whitespace should be skipped.\"\"\"\n        content = dedent(\"\"\"\\\n            <?xml version=\"1.0\" encoding=\"UTF-8\"?>\n            <html>\n            <body>\n                <div class='ocr_page' title='bbox 0 0 1000 500'>\n                    <p class='ocr_par'>\n                        <span class='ocr_line' title='bbox 100 100 900 150'>\n                            <span class='ocrx_word' title='bbox 100 100 200 150'>   </span>\n                            <span class='ocrx_word' title='bbox 210 100 300 150'>Valid</span>\n                        </span>\n                    </p>\n                </div>\n            </body>\n            </html>\n        \"\"\")\n        hocr_file = tmp_path / \"whitespace_word.hocr\"\n        hocr_file.write_text(content, encoding='utf-8')\n\n        parser = HocrParser(hocr_file)\n        page = parser.parse()\n\n        assert len(page.words) == 1\n        assert page.words[0].text == \"Valid\"\n\n    def test_line_without_bbox(self, tmp_path):\n        \"\"\"Lines without bbox should be skipped.\"\"\"\n        content = dedent(\"\"\"\\\n            <?xml version=\"1.0\" encoding=\"UTF-8\"?>\n            <html>\n            <body>\n                <div class='ocr_page' title='bbox 0 0 1000 500'>\n                    <p class='ocr_par'>\n                        <span class='ocr_line'>\n                            <span class='ocrx_word' title='bbox 100 100 200 150'>Word</span>\n                        </span>\n                        <span class='ocr_line' title='bbox 100 200 900 250'>\n                            <span class='ocrx_word' title='bbox 100 200 200 250'>Valid</span>\n                        </span>\n                    </p>\n                </div>\n            </body>\n            </html>\n        
\"\"\")\n        hocr_file = tmp_path / \"no_line_bbox.hocr\"\n        hocr_file.write_text(content, encoding='utf-8')\n\n        parser = HocrParser(hocr_file)\n        page = parser.parse()\n\n        # Only line with bbox should be parsed\n        assert len(page.lines) == 1\n        assert page.words[0].text == \"Valid\"\n\n    def test_unicode_normalization(self, tmp_path):\n        \"\"\"Text should be NFKC normalized.\"\"\"\n        # Use a string with combining characters\n        content = dedent(\"\"\"\\\n            <?xml version=\"1.0\" encoding=\"UTF-8\"?>\n            <html>\n            <body>\n                <div class='ocr_page' title='bbox 0 0 1000 500'>\n                    <p class='ocr_par'>\n                        <span class='ocr_line' title='bbox 100 100 900 150'>\n                            <span class='ocrx_word' title='bbox 100 100 200 150'>ﬁ</span>\n                        </span>\n                    </p>\n                </div>\n            </body>\n            </html>\n        \"\"\")\n        hocr_file = tmp_path / \"unicode.hocr\"\n        hocr_file.write_text(content, encoding='utf-8')\n\n        parser = HocrParser(hocr_file)\n        page = parser.parse()\n\n        # fi ligature should be normalized to \"fi\"\n        assert page.words[0].text == \"fi\"\n\n    def test_words_directly_under_page(self, tmp_path):\n        \"\"\"Test fallback for words directly under page (no paragraph structure).\"\"\"\n        content = dedent(\"\"\"\\\n            <?xml version=\"1.0\" encoding=\"UTF-8\"?>\n            <html>\n            <body>\n                <div class='ocr_page' title='bbox 0 0 1000 500'>\n                    <span class='ocrx_word' title='bbox 100 100 200 150'>Direct</span>\n                    <span class='ocrx_word' title='bbox 210 100 300 150'>Word</span>\n                </div>\n            </body>\n            </html>\n        \"\"\")\n        hocr_file = tmp_path / \"direct_words.hocr\"\n        
hocr_file.write_text(content, encoding='utf-8')\n\n        parser = HocrParser(hocr_file)\n        page = parser.parse()\n\n        # Words should be parsed as direct children\n        assert len(page.children) == 2\n        assert page.children[0].text == \"Direct\"\n        assert page.children[1].text == \"Word\"\n\n    def test_no_namespace(self, tmp_path):\n        \"\"\"Test parsing hOCR without XHTML namespace.\"\"\"\n        content = dedent(\"\"\"\\\n            <html>\n            <body>\n                <div class='ocr_page' title='bbox 0 0 1000 500'>\n                    <p class='ocr_par'>\n                        <span class='ocr_line' title='bbox 100 100 900 150'>\n                            <span class='ocrx_word' title='bbox 100 100 200 150'>NoNS</span>\n                        </span>\n                    </p>\n                </div>\n            </body>\n            </html>\n        \"\"\")\n        hocr_file = tmp_path / \"no_namespace.hocr\"\n        hocr_file.write_text(content, encoding='utf-8')\n\n        parser = HocrParser(hocr_file)\n        page = parser.parse()\n\n        assert len(page.words) == 1\n        assert page.words[0].text == \"NoNS\"\n"
  },
  {
    "path": "tests/test_hocrtransform.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport re\nfrom io import StringIO\nfrom pathlib import Path\n\nimport pytest\nfrom pdfminer.converter import TextConverter\nfrom pdfminer.layout import LAParams\nfrom pdfminer.pdfdocument import PDFDocument\nfrom pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager\nfrom pdfminer.pdfpage import PDFPage\nfrom pdfminer.pdfparser import PDFParser\nfrom PIL import Image\n\nfrom ocrmypdf._exec.tesseract import generate_hocr\nfrom ocrmypdf.font import MultiFontManager\nfrom ocrmypdf.fpdf_renderer import Fpdf2PdfRenderer\nfrom ocrmypdf.helpers import check_pdf\nfrom ocrmypdf.hocrtransform import HocrParser\n\nfrom .conftest import check_ocrmypdf\n\n\ndef text_from_pdf(filename):\n    output_string = StringIO()\n    with open(filename, 'rb') as in_file:\n        parser = PDFParser(in_file)\n        doc = PDFDocument(parser)\n        rsrcmgr = PDFResourceManager()\n        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())\n        interpreter = PDFPageInterpreter(rsrcmgr, device)\n        for page in PDFPage.create_pages(doc):\n            interpreter.process_page(page)\n    return output_string.getvalue()\n\n\n# pylint: disable=redefined-outer-name\n\n\n@pytest.fixture\ndef font_dir():\n    \"\"\"Get the font directory.\"\"\"\n    return Path(__file__).parent.parent / \"src\" / \"ocrmypdf\" / \"data\"\n\n\n@pytest.fixture\ndef multi_font_manager(font_dir):\n    \"\"\"Create a MultiFontManager for tests.\"\"\"\n    return MultiFontManager(font_dir)\n\n\n@pytest.fixture\ndef blank_hocr(tmp_path):\n    im = Image.new('1', (8, 8), 0)\n    im.save(tmp_path / 'blank.tif', format='TIFF')\n    generate_hocr(\n        input_file=tmp_path / 'blank.tif',\n        output_hocr=tmp_path / 'blank.hocr',\n        output_text=tmp_path / 'blank.txt',\n        languages=['eng'],\n        engine_mode=1,\n        
tessconfig=[],\n        pagesegmode=3,\n        thresholding=0,\n        user_words=None,\n        user_patterns=None,\n        timeout=None,\n    )\n    return tmp_path / 'blank.hocr'\n\n\ndef test_mono_image(blank_hocr, outdir, multi_font_manager):\n    im = Image.new('1', (8, 8), 0)\n    for n in range(8):\n        im.putpixel((n, n), 1)\n    im.save(outdir / 'mono.tif', format='TIFF')\n\n    # Parse hOCR file\n    parser = HocrParser(str(blank_hocr))\n    ocr_page = parser.parse()\n\n    # Use DPI from hOCR or default\n    dpi = ocr_page.dpi or 8\n\n    # Render to PDF using fpdf2\n    renderer = Fpdf2PdfRenderer(\n        page=ocr_page,\n        dpi=dpi,\n        multi_font_manager=multi_font_manager,\n        invisible_text=True,\n    )\n    renderer.render(outdir / 'mono.pdf')\n\n    check_pdf(outdir / 'mono.pdf')\n\n\n@pytest.mark.slow\ndef test_fpdf2_matches_sandwich(resources, outdir):\n    \"\"\"Test that fpdf2 renderer produces similar output to sandwich renderer.\"\"\"\n    # Note: hocr renderer now redirects to fpdf2\n    check_ocrmypdf(\n        resources / 'ccitt.pdf', outdir / 'fpdf2.pdf', '--pdf-renderer=fpdf2'\n    )\n    check_ocrmypdf(\n        resources / 'ccitt.pdf', outdir / 'tess.pdf', '--pdf-renderer=sandwich'\n    )\n\n    # Slight differences in spacing and word order can appear, so at least ensure\n    # that we get all of the same words...\n    def clean(s):\n        s = re.sub(r'\\s+', ' ', s)\n        words = s.split(' ')\n        return set(words)\n\n    fpdf2_words = clean(text_from_pdf(outdir / 'fpdf2.pdf'))\n    tess_words = clean(text_from_pdf(outdir / 'tess.pdf'))\n\n    similarity = len(fpdf2_words & tess_words) / len(fpdf2_words | tess_words)\n\n    assert similarity > 0.99\n"
  },
  {
    "path": "tests/test_image_input.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nfrom unittest.mock import patch\n\nimport img2pdf\nimport pikepdf\nimport pytest\nfrom PIL import Image\n\nimport ocrmypdf\n\nfrom .conftest import check_ocrmypdf, run_ocrmypdf_api\n\n# pylint: disable=redefined-outer-name\n\n\n@pytest.fixture\ndef baiona(resources):\n    return Image.open(resources / 'baiona_gray.png')\n\n\ndef test_image_to_pdf(resources, outpdf):\n    check_ocrmypdf(\n        resources / 'crom.png',\n        outpdf,\n        '--image-dpi',\n        '200',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n\ndef test_no_dpi_info(caplog, baiona, outdir, no_outpdf):\n    im = baiona\n    assert 'dpi' not in im.info\n    input_image = outdir / 'baiona_no_dpi.png'\n    im.save(input_image)\n\n    rc = run_ocrmypdf_api(input_image, no_outpdf)\n    assert rc == ocrmypdf.ExitCode.input_file\n    assert \"--image-dpi\" in caplog.text\n\n\ndef test_dpi_not_credible(caplog, baiona, outdir, no_outpdf):\n    im = baiona\n    assert 'dpi' not in im.info\n    input_image = outdir / 'baiona_no_dpi.png'\n    im.save(input_image, dpi=(30, 30))\n\n    rc = run_ocrmypdf_api(input_image, no_outpdf)\n    assert rc == ocrmypdf.ExitCode.input_file\n    assert \"not credible\" in caplog.text\n\n\ndef test_cmyk_no_icc(caplog, resources, no_outpdf):\n    rc = run_ocrmypdf_api(resources / 'baiona_cmyk.jpg', no_outpdf)\n    assert rc == ocrmypdf.ExitCode.input_file\n    assert \"no ICC profile\" in caplog.text\n\n\ndef test_img2pdf_fails(resources, no_outpdf):\n    with patch(\n        'ocrmypdf._pipeline.img2pdf.convert', side_effect=img2pdf.ImageOpenError()\n    ) as mock:\n        rc = run_ocrmypdf_api(\n            resources / 'baiona_gray.png', no_outpdf, '--image-dpi', '200'\n        )\n        assert rc == ocrmypdf.ExitCode.input_file\n        mock.assert_called()\n\n\n@pytest.mark.xfail(reason=\"remove 
background disabled\")\ndef test_jpeg_in_jpeg_out(resources, outpdf):\n    check_ocrmypdf(\n        resources / 'baiona_color.jpg',\n        outpdf,\n        '--image-dpi',\n        '100',\n        '--output-type',\n        'pdf',  # specifically check pdf because Ghostscript may convert to JPEG\n        '--remove-background',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n    with pikepdf.open(outpdf) as pdf:\n        assert next(iter(pdf.pages[0].images.values())).Filter == pikepdf.Name.DCTDecode\n"
  },
  {
    "path": "tests/test_imageops.py",
    "content": "# SPDX-FileCopyrightText: 2023 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport hypothesis.strategies as st\nfrom hypothesis import given\nfrom PIL import Image\n\nfrom ocrmypdf.imageops import (\n    _calculate_downsample,\n    bytes_per_pixel,\n    calculate_downsample,\n    downsample_image,\n)\n\n\ndef test_bytes_per_pixel():\n    assert bytes_per_pixel('RGB') == 4\n    assert bytes_per_pixel('RGBA') == 4\n    assert bytes_per_pixel('LA') == 2\n    assert bytes_per_pixel('L') == 1\n\n\ndef test_calculate_downsample():\n    im = Image.new('RGB', (100, 100))\n    assert calculate_downsample(im, max_size=(50, 50)) == (50, 50)\n    assert calculate_downsample(im, max_pixels=2500) == (50, 50)\n    assert calculate_downsample(im, max_bytes=10000) == (50, 50)\n    assert calculate_downsample(im, max_bytes=100000) == (100, 100)\n\n\n@given(\n    st.one_of(st.just(\"RGB\"), st.just('L')),\n    st.integers(min_value=1, max_value=100000),\n    st.integers(min_value=1, max_value=100000),\n    st.integers(min_value=64, max_value=100000),\n    st.integers(min_value=64, max_value=100000),\n    st.integers(min_value=64 * 64, max_value=1000000),\n)\ndef test_calculate_downsample_hypothesis(mode, im_w, im_h, max_x, max_y, max_bytes):\n    result = _calculate_downsample(\n        (im_w, im_h),\n        bytes_per_pixel(mode),\n        max_size=(max_x, max_y),\n        max_bytes=max_bytes,\n    )\n    assert result[0] <= max_x\n    assert result[1] <= max_y\n    assert result[0] * result[1] * bytes_per_pixel(mode) <= max_bytes\n\n\ndef test_downsample_image():\n    im = Image.new('RGB', (100, 100))\n    im.info['dpi'] = (300, 300)\n    ds = downsample_image(im, (50, 50))\n    assert ds.size == (50, 50)\n    assert ds.info['dpi'] == (150, 150)\n"
  },
  {
    "path": "tests/test_json_serialization.py",
    "content": "\"\"\"Test JSON serialization of OcrOptions for multiprocessing compatibility.\"\"\"\nfrom __future__ import annotations\n\nimport multiprocessing\nfrom io import BytesIO\nfrom pathlib import Path, PurePath\n\nimport pytest\n\nfrom ocrmypdf._options import OcrOptions\nfrom ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOptions\n\n\n@pytest.fixture(autouse=True)\ndef register_plugin_models():\n    \"\"\"Register plugin models for tests.\"\"\"\n    OcrOptions.register_plugin_models({'tesseract': TesseractOptions})\n    yield\n    # Clean up after test (optional, but good practice)\n\n\ndef worker_function(options_json: str) -> str:\n    \"\"\"Worker function that deserializes OcrOptions from JSON and returns a result.\"\"\"\n    # Register plugin models in worker process\n    from ocrmypdf._options import OcrOptions\n    from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOptions\n\n    OcrOptions.register_plugin_models({'tesseract': TesseractOptions})\n\n    # Reconstruct OcrOptions from JSON in worker process\n    options = OcrOptions.model_validate_json_safe(options_json)\n\n    # Verify we can access various option types\n    # Count only user-added extra_attrs (exclude plugin cache keys starting with '_')\n    user_attrs_count = len(\n        [k for k in options.extra_attrs.keys() if not k.startswith('_')]\n    )\n    result = {\n        'input_file': str(options.input_file),\n        'output_file': str(options.output_file),\n        'languages': options.languages,\n        'optimize': options.optimize,\n        'tesseract_timeout': options.tesseract.timeout,\n        'fast_web_view': options.fast_web_view,\n        'extra_attrs_count': user_attrs_count,\n    }\n\n    # Return as JSON string\n    import json\n\n    return json.dumps(result)\n\n\ndef test_json_serialization_multiprocessing():\n    \"\"\"Test that OcrOptions can be JSON serialized and used in multiprocessing.\"\"\"\n    # Create OcrOptions with various field types\n 
   options = OcrOptions(\n        input_file=Path('/test/input.pdf'),\n        output_file=Path('/test/output.pdf'),\n        languages=['eng', 'deu'],\n        optimize=2,\n        tesseract_timeout=120.0,\n        fast_web_view=2.5,\n        deskew=True,\n        clean=False,\n    )\n\n    # Add some extra attributes\n    options.extra_attrs['custom_field'] = 'test_value'\n    options.extra_attrs['numeric_field'] = 42\n\n    # Serialize to JSON\n    options_json = options.model_dump_json_safe()\n\n    # Test that we can deserialize in the main process\n    reconstructed = OcrOptions.model_validate_json_safe(options_json)\n    assert reconstructed.input_file == options.input_file\n    assert reconstructed.output_file == options.output_file\n    assert reconstructed.languages == options.languages\n    assert reconstructed.optimize == options.optimize\n    assert reconstructed.tesseract_timeout == options.tesseract.timeout\n    assert reconstructed.fast_web_view == options.fast_web_view\n    assert reconstructed.deskew == options.deskew\n    assert reconstructed.clean == options.clean\n    # Compare user-added extra_attrs (excluding plugin cache keys)\n    user_attrs = {k: v for k, v in options.extra_attrs.items() if not k.startswith('_')}\n    reconstructed_attrs = {\n        k: v for k, v in reconstructed.extra_attrs.items() if not k.startswith('_')\n    }\n    assert reconstructed_attrs == user_attrs\n\n    # Test multiprocessing with JSON serialization\n    with multiprocessing.Pool(processes=2) as pool:\n        # Send the JSON string to worker processes\n        results = pool.map(worker_function, [options_json, options_json])\n\n    # Verify results from worker processes\n    import json\n\n    for result_json in results:\n        result = json.loads(result_json)\n        assert PurePath(result['input_file']) == PurePath('/test/input.pdf')\n        assert PurePath(result['output_file']) == PurePath('/test/output.pdf')\n        assert result['languages'] == 
['eng', 'deu']\n        assert result['optimize'] == 2\n        assert result['tesseract_timeout'] == 120.0\n        assert result['fast_web_view'] == 2.5\n        assert result['extra_attrs_count'] == 2  # custom_field and numeric_field\n\n\ndef test_json_serialization_with_streams():\n    \"\"\"Test JSON serialization with stream objects.\"\"\"\n    input_stream = BytesIO(b'fake pdf data')\n    output_stream = BytesIO()\n\n    options = OcrOptions(\n        input_file=input_stream,\n        output_file=output_stream,\n        languages=['eng'],\n        optimize=1,\n    )\n\n    # Serialize to JSON (streams should be converted to placeholders)\n    options_json = options.model_dump_json_safe()\n\n    # Deserialize (streams will be placeholder strings)\n    reconstructed = OcrOptions.model_validate_json_safe(options_json)\n\n    # Streams should be converted to placeholder strings\n    assert reconstructed.input_file == 'stream'\n    assert reconstructed.output_file == 'stream'\n    assert reconstructed.languages == ['eng']\n    assert reconstructed.optimize == 1\n\n\ndef test_json_serialization_with_none_values():\n    \"\"\"Test JSON serialization handles None values correctly.\"\"\"\n    options = OcrOptions(\n        input_file=Path('/test/input.pdf'),\n        output_file=Path('/test/output.pdf'),\n        languages=['eng'],\n        # Many fields will be None by default\n    )\n\n    # Serialize to JSON\n    options_json = options.model_dump_json_safe()\n\n    # Deserialize\n    reconstructed = OcrOptions.model_validate_json_safe(options_json)\n\n    # Verify None values are preserved (check actual defaults from model)\n    assert reconstructed.tesseract_timeout is None  # Default value\n    assert reconstructed.fast_web_view == 1.0  # Default value, not None\n    assert (\n        reconstructed.color_conversion_strategy == \"LeaveColorUnchanged\"\n    )  # Default value\n    assert reconstructed.pdfa_image_compression is None  # This one is actually 
None\n\n    # Verify non-None values are preserved\n    assert reconstructed.input_file == options.input_file\n    assert reconstructed.output_file == options.output_file\n    assert reconstructed.languages == options.languages\n"
  },
  {
    "path": "tests/test_logging.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport logging\n\nfrom ocrmypdf._pipelines._common import configure_debug_logging\n\n\ndef test_debug_logging(tmp_path):\n    # Just exercise the debug logger but don't validate it\n    # See https://github.com/pytest-dev/pytest/issues/5502 for pytest logging quirks\n    prefix = 'test_debug_logging'\n    log = logging.getLogger(prefix)\n    _handler, remover = configure_debug_logging(tmp_path / 'test.log', prefix)\n    log.info(\"test message\")\n    remover()\n"
  },
  {
    "path": "tests/test_main.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport os\nimport shutil\nimport sys\nfrom math import isclose\nfrom pathlib import Path\nfrom subprocess import run\nfrom unittest.mock import patch\n\nimport pikepdf\nimport pytest\nfrom PIL import Image\n\nimport ocrmypdf\nfrom ocrmypdf._exec import tesseract\nfrom ocrmypdf.exceptions import ExitCode, MissingDependencyError\nfrom ocrmypdf.helpers import running_in_docker\nfrom ocrmypdf.pdfa import file_claims_pdfa\nfrom ocrmypdf.pdfinfo import Colorspace, Encoding, PdfInfo\nfrom ocrmypdf.subprocess import get_version\n\nfrom .conftest import (\n    check_ocrmypdf,\n    first_page_dimensions,\n    have_unpaper,\n    is_macos,\n    run_ocrmypdf,\n    run_ocrmypdf_api,\n)\n\n# pylint: disable=redefined-outer-name\n\n\nRENDERERS = ['fpdf2', 'sandwich']\n\n\ndef test_quick(resources, outpdf):\n    check_ocrmypdf(\n        resources / 'ccitt.pdf', outpdf, '--plugin', 'tests/plugins/tesseract_cache.py'\n    )\n\n\n@pytest.mark.parametrize('renderer', RENDERERS)\ndef test_oversample(renderer, resources, outpdf):\n    oversampled_pdf = check_ocrmypdf(\n        resources / 'skew.pdf',\n        outpdf,\n        '--oversample',\n        '350',\n        '-f',\n        '--pdf-renderer',\n        renderer,\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n\n    pdfinfo = PdfInfo(oversampled_pdf)\n\n    print(pdfinfo[0].dpi.x)\n    assert abs(pdfinfo[0].dpi.x - 350) < 1\n\n\ndef test_repeat_ocr(resources, no_outpdf):\n    result = run_ocrmypdf_api(resources / 'graph_ocred.pdf', no_outpdf)\n    assert result == ExitCode.already_done_ocr\n\n\ndef test_force_ocr(resources, outpdf):\n    out = check_ocrmypdf(\n        resources / 'graph_ocred.pdf',\n        outpdf,\n        '-f',\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n    pdfinfo = PdfInfo(out)\n    assert pdfinfo[0].has_text\n\n\ndef 
test_skip_ocr(resources, outpdf):\n    out = check_ocrmypdf(\n        resources / 'graph_ocred.pdf',\n        outpdf,\n        '-s',\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n    pdfinfo = PdfInfo(out)\n    assert pdfinfo[0].has_text\n\n\ndef test_redo_ocr(resources, outpdf):\n    in_ = resources / 'graph_ocred.pdf'\n    before = PdfInfo(in_, detailed_analysis=True)\n    out = outpdf\n    out = check_ocrmypdf(in_, out, '--redo-ocr')\n    after = PdfInfo(out, detailed_analysis=True)\n    assert before[0].has_text and after[0].has_text\n    assert (\n        before[0].get_textareas() != after[0].get_textareas()\n    ), \"Expected text to be different after re-OCR\"\n\n\ndef test_argsfile(resources, outdir):\n    path_argsfile = outdir / 'test_argsfile.txt'\n    with open(str(path_argsfile), 'w') as argsfile:\n        print(\n            '--title',\n            'ArgsFile Test',\n            '--author',\n            'Test Cases',\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n            sep='\\n',\n            end='\\n',\n            file=argsfile,\n        )\n    check_ocrmypdf(\n        resources / 'graph.pdf', path_argsfile, '@' + str(outdir / 'test_argsfile.txt')\n    )\n\n\n@pytest.mark.parametrize('renderer', RENDERERS)\ndef test_ocr_timeout(renderer, resources, outpdf):\n    out = check_ocrmypdf(\n        resources / 'skew.pdf',\n        outpdf,\n        '--tesseract-timeout',\n        '0',\n        '--pdf-renderer',\n        renderer,\n    )\n    pdfinfo = PdfInfo(out)\n    assert not pdfinfo[0].has_text\n\n\ndef test_skip_big(resources, outpdf):\n    out = check_ocrmypdf(\n        resources / 'jbig2.pdf',\n        outpdf,\n        '--skip-big',\n        '1',\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n    pdfinfo = PdfInfo(out)\n    assert not pdfinfo[0].has_text\n\n\n@pytest.mark.parametrize('renderer', RENDERERS)\n@pytest.mark.parametrize('output_type', ['pdf', 
'pdfa'])\ndef test_maximum_options(renderer, output_type, multipage, outpdf):\n    check_ocrmypdf(\n        multipage,\n        outpdf,\n        '-d',\n        '-ci' if have_unpaper() else None,\n        '-f',\n        '-k',\n        '--oversample',\n        '300',\n        '--skip-big',\n        '10',\n        '--title',\n        'Too Many Weird Files',\n        '--author',\n        'py.test',\n        '--pdf-renderer',\n        renderer,\n        '--output-type',\n        output_type,\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n\n\n@pytest.mark.skipif(\n    tesseract.version() >= tesseract.TesseractVersion('5'),\n    reason=\"tess 5 tries harder to find its files\",\n)\ndef test_tesseract_missing_tessdata(monkeypatch, resources, no_outpdf, tmpdir):\n    monkeypatch.setenv(\"TESSDATA_PREFIX\", os.fspath(tmpdir))\n    with pytest.raises(MissingDependencyError):\n        run_ocrmypdf_api(resources / 'graph.pdf', no_outpdf, '-v', '1', '--skip-text')\n\n\ndef test_invalid_input_pdf(resources, no_outpdf):\n    result = run_ocrmypdf_api(resources / 'invalid.pdf', no_outpdf)\n    assert result == ExitCode.input_file\n\n\ndef test_blank_input_pdf(resources, outpdf):\n    result = run_ocrmypdf_api(resources / 'blank.pdf', outpdf)\n    assert result == ExitCode.ok\n\n\ndef test_force_ocr_on_pdf_with_no_images(resources, no_outpdf):\n    # As a correctness test, make sure that --force-ocr on a PDF with no\n    # content still triggers tesseract. 
If tesseract crashes, then it was\n    # called.\n    exitcode = run_ocrmypdf_api(\n        resources / 'blank.pdf',\n        no_outpdf,\n        '--force-ocr',\n        '--plugin',\n        'tests/plugins/tesseract_crash.py',\n    )\n    assert exitcode == ExitCode.child_process_error\n    assert not no_outpdf.exists()\n\n\n@pytest.mark.skipif(\n    is_macos(),\n    reason=\"takes too long to install language packs in macOS homebrew\",\n)\ndef test_german(resources, outdir):\n    # Produce a sidecar too - implicit test that system locale is set up\n    # properly. It is fine that we are testing -l deu on a French file because\n    # we are exercising the functionality not going for accuracy.\n    sidecar = outdir / 'francais.txt'\n    try:\n        check_ocrmypdf(\n            resources / 'francais.pdf',\n            outdir / 'francais.pdf',\n            '-l',\n            'deu',  # more commonly installed\n            '--sidecar',\n            sidecar,\n            '--plugin',\n            'tests/plugins/tesseract_cache.py',\n        )\n    except MissingDependencyError:\n        if 'deu' not in tesseract.get_languages():\n            pytest.xfail(reason=\"tesseract-deu language pack not installed\")\n        raise\n\n\ndef test_klingon(resources, outpdf):\n    with pytest.raises(MissingDependencyError):\n        run_ocrmypdf_api(resources / 'francais.pdf', outpdf, '-l', 'klz')\n\n\ndef test_missing_docinfo(resources, outpdf):\n    result = run_ocrmypdf_api(\n        resources / 'missing_docinfo.pdf',\n        outpdf,\n        '-l',\n        'eng',\n        '--skip-text',\n        '--plugin',\n        Path('tests/plugins/tesseract_noop.py'),\n    )\n    assert result == ExitCode.ok\n\n\ndef test_uppercase_extension(resources, outdir):\n    shutil.copy(str(resources / \"skew.pdf\"), str(outdir / \"UPPERCASE.PDF\"))\n\n    check_ocrmypdf(\n        outdir / \"UPPERCASE.PDF\",\n        outdir / \"UPPERCASE_OUT.PDF\",\n        '--plugin',\n        
'tests/plugins/tesseract_noop.py',\n    )\n\n\ndef test_input_file_not_found(caplog, no_outpdf):\n    input_file = \"does not exist.pdf\"\n    result = run_ocrmypdf_api(input_file, no_outpdf)\n    assert result == ExitCode.input_file\n    assert input_file in caplog.text\n\n\n@pytest.mark.skipif(os.name == 'nt' or running_in_docker(), reason=\"chmod\")\ndef test_input_file_not_readable(caplog, resources, outdir, no_outpdf):\n    input_file = outdir / 'trivial.pdf'\n    shutil.copy(resources / 'trivial.pdf', input_file)\n    input_file.chmod(0o000)\n    result = run_ocrmypdf_api(input_file, no_outpdf)\n    assert result == ExitCode.input_file\n    assert str(input_file) in caplog.text\n\n\ndef test_input_file_not_a_pdf(caplog, no_outpdf):\n    input_file = __file__  # Try to OCR this file\n    result = run_ocrmypdf_api(input_file, no_outpdf)\n    assert result == ExitCode.input_file\n    if os.name != 'nt':  # name will be mangled with \\\\'s on nt\n        assert input_file in caplog.text\n\n\n@pytest.mark.parametrize('renderer', RENDERERS)\ndef test_pagesegmode(renderer, resources, outpdf):\n    check_ocrmypdf(\n        resources / 'skew.pdf',\n        outpdf,\n        '--tesseract-pagesegmode',\n        '7',\n        '-v',\n        '1',\n        '--pdf-renderer',\n        renderer,\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n\n\ndef test_tesseract_oem(resources, outpdf):\n    check_ocrmypdf(\n        resources / 'trivial.pdf',\n        outpdf,\n        '--tesseract-oem',\n        '1',\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n\n\n@pytest.mark.parametrize('value', ['auto', 'otsu', 'adaptive-otsu', 'sauvola'])\ndef test_tesseract_thresholding(value, resources, outpdf):\n    check_ocrmypdf(\n        resources / 'trivial.pdf',\n        outpdf,\n        '--tesseract-thresholding',\n        value,\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n\n\n@pytest.mark.parametrize('value', 
['abcxyz'])\ndef test_tesseract_thresholding_invalid(value, resources, no_outpdf):\n    with pytest.raises(SystemExit, match='2'):\n        run_ocrmypdf_api(\n            resources / 'trivial.pdf',\n            no_outpdf,\n            '--tesseract-thresholding',\n            value,\n            '--plugin',\n            'tests/plugins/tesseract_cache.py',\n        )\n\n\n@pytest.mark.parametrize('renderer', RENDERERS)\ndef test_tesseract_crash(renderer, resources, no_outpdf, caplog):\n    exitcode = run_ocrmypdf_api(\n        resources / 'ccitt.pdf',\n        no_outpdf,\n        '-v',\n        '1',\n        '--pdf-renderer',\n        renderer,\n        '--plugin',\n        'tests/plugins/tesseract_crash.py',\n    )\n    assert exitcode == ExitCode.child_process_error\n    assert not no_outpdf.exists()\n    assert \"SubprocessOutputError\" in caplog.text\n\n\ndef test_tesseract_crash_autorotate(resources, no_outpdf, caplog):\n    exitcode = run_ocrmypdf_api(\n        resources / 'ccitt.pdf',\n        no_outpdf,\n        '-r',\n        '--plugin',\n        'tests/plugins/tesseract_crash.py',\n    )\n    assert exitcode == ExitCode.child_process_error\n    assert not no_outpdf.exists()\n    assert \"uncaught exception\" in caplog.text\n\n\n@pytest.mark.parametrize('renderer', RENDERERS)\n@pytest.mark.slow\ndef test_tesseract_image_too_big(renderer, resources, outpdf):\n    check_ocrmypdf(\n        resources / 'hugemono.pdf',\n        outpdf,\n        '-r',\n        '--pdf-renderer',\n        renderer,\n        '--max-image-mpixels',\n        '0',\n        '--plugin',\n        'tests/plugins/tesseract_big_image_error.py',\n    )\n\n\n@pytest.mark.parametrize('encryption_level', [2, 3, 4, 6])\ndef test_encrypted(resources, outpdf, encryption_level, caplog):\n    if os.name == 'darwin' and sys.version_info >= (3, 12) and encryption_level <= 4:\n        # Error is: RuntimeError: unable to load openssl legacy provider\n        # pikepdf obtains encryption from qpdf, which 
gets it from openssl among other\n        # providers.\n        # Error message itself comes from here:\n        # https://github.com/qpdf/qpdf/blob/da3eae39c8e5261196bbc1b460e5b556c6836dbf/libqpdf/QPDFCrypto_openssl.cc#L56\n        # Somehow pikepdf + Python 3.12 + macOS does not have this problem, despite\n        # using Homebrew's qpdf. Possibly the difference is that pikepdf's Python 3.12\n        # comes from cibuildwheel, and our macOS Python 3.12 comes from GitHub Actions\n        # setup-python. It may be necessary to build a custom qpdf for macOS.\n        # In any case, OCRmyPDF doesn't support loading encrypted files at all, it\n        # just complains about encryption, and it's using pikepdf to generate encrypted\n        # files for testing.\n        pytest.skip(\"GitHub Python 3.12 on macOS does not have openssl legacy support\")\n    encryption = pikepdf.models.encryption.Encryption(\n        owner='ocrmypdf',\n        user='ocrmypdf',\n        R=encryption_level,\n        aes=(encryption_level >= 4),\n        metadata=(encryption_level == 6),\n    )\n\n    with pikepdf.open(resources / 'jbig2.pdf') as pdf:\n        pdf.save(outpdf, encryption=encryption)\n\n    exitcode = run_ocrmypdf_api(\n        outpdf,\n        outpdf,\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n    assert exitcode == ExitCode.encrypted_pdf\n    assert 'encryption must be removed' in caplog.text\n\n\ndef test_jbig2_passthrough(resources, outpdf):\n    out = check_ocrmypdf(\n        resources / 'jbig2.pdf',\n        outpdf,\n        '--output-type',\n        'pdf',\n        '--pdf-renderer',\n        'fpdf2',\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n    out_pageinfo = PdfInfo(out)\n    assert out_pageinfo[0].images[0].enc == Encoding.jbig2\n\n\ndef test_masks(resources, outpdf):\n    assert (\n        ocrmypdf.ocr(\n            resources / 'masks.pdf', outpdf, plugins=['tests/plugins/tesseract_noop.py']\n        )\n  
      == ExitCode.ok\n    )\n\n\ndef test_linearized_pdf_and_indirect_object(resources, outpdf):\n    check_ocrmypdf(\n        resources / 'epson.pdf', outpdf, '--plugin', 'tests/plugins/tesseract_noop.py'\n    )\n\n\ndef test_very_high_dpi(resources, outpdf):\n    \"\"\"Checks for a Decimal quantize error with high DPI, etc.\"\"\"\n    check_ocrmypdf(\n        resources / '2400dpi.pdf',\n        outpdf,\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n    pdfinfo = PdfInfo(outpdf)\n\n    image = pdfinfo[0].images[0]\n    assert isclose(image.dpi.x, image.dpi.y)\n    assert isclose(image.dpi.x, 2400)\n\n\ndef test_overlay(resources, outpdf):\n    check_ocrmypdf(\n        resources / 'overlay.pdf',\n        outpdf,\n        '--skip-text',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n\n@pytest.fixture\ndef protected_file(outdir):\n    protected_file = outdir / 'protected.pdf'\n    protected_file.touch()\n    protected_file.chmod(0o400)  # Read-only\n    yield protected_file\n\n\n@pytest.mark.skipif(\n    os.name == 'nt' or os.geteuid() == 0, reason=\"root can write to anything\"\n)\ndef test_destination_not_writable(resources, protected_file):\n    exitcode = run_ocrmypdf_api(\n        resources / 'jbig2.pdf',\n        protected_file,\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n    assert exitcode == ExitCode.file_access_error\n\n\n@pytest.fixture\ndef valid_tess_config(outdir):\n    cfg_file = outdir / 'test.cfg'\n    with cfg_file.open('w') as f:\n        f.write(\n            '''\\\nload_system_dawg 0\nlanguage_model_penalty_non_dict_word 0\nlanguage_model_penalty_non_freq_dict_word 0\n'''\n        )\n    yield cfg_file\n\n\ndef test_tesseract_config_valid(resources, valid_tess_config, outpdf):\n    check_ocrmypdf(\n        resources / '3small.pdf',\n        outpdf,\n        '--tesseract-config',\n        valid_tess_config,\n        '--pages',\n        '1',\n    
)\n\n\n@pytest.fixture\ndef invalid_tess_config(outdir):\n    cfg_file = outdir / 'test.cfg'\n    with cfg_file.open('w') as f:\n        f.write(\n            '''\\\nTHIS FILE IS INVALID\n'''\n        )\n    yield cfg_file\n\n\n@pytest.mark.slow  # This test sometimes times out in CI\n@pytest.mark.parametrize('renderer', RENDERERS)\ndef test_tesseract_config_invalid(renderer, resources, invalid_tess_config, outpdf):\n    p = run_ocrmypdf(\n        resources / 'ccitt.pdf',\n        outpdf,\n        '--pdf-renderer',\n        renderer,\n        '--tesseract-config',\n        invalid_tess_config,\n    )\n    assert (\n        \"parameter not found\" in p.stderr.lower()\n        or \"error occurred while parsing\" in p.stderr.lower()\n    ), \"No error message\"\n    assert p.returncode == ExitCode.invalid_config\n\n\ndef test_user_words_ocr(resources, outdir):\n    # Does not actually test if --user-words causes output to differ\n    word_list = outdir / 'wordlist.txt'\n    sidecar_after = outdir / 'sidecar.txt'\n\n    with word_list.open('w') as f:\n        f.write('cromulent\\n')  # a perfectly cromulent word\n\n    check_ocrmypdf(\n        resources / 'crom.png',\n        outdir / 'out.pdf',\n        '--image-dpi',\n        150,\n        '--sidecar',\n        sidecar_after,\n        '--user-words',\n        word_list,\n    )\n\n\ndef test_form_xobject(resources, outpdf):\n    check_ocrmypdf(\n        resources / 'formxobject.pdf',\n        outpdf,\n        '--force-ocr',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n\n@pytest.mark.parametrize('renderer', RENDERERS)\ndef test_pagesize_consistency(renderer, resources, outpdf):\n    infile = resources / '3small.pdf'\n\n    before_dims = first_page_dimensions(infile)\n\n    check_ocrmypdf(\n        infile,\n        outpdf,\n        '--pdf-renderer',\n        renderer,\n        '--clean' if have_unpaper() else None,\n        '--deskew',\n        # '--remove-background',\n        
'--clean-final' if have_unpaper() else None,\n        '-k',\n        '--pages',\n        '1',\n    )\n\n    after_dims = first_page_dimensions(outpdf)\n\n    assert isclose(before_dims[0], after_dims[0], rel_tol=1e-4)\n    assert isclose(before_dims[1], after_dims[1], rel_tol=1e-4)\n\n\ndef test_skip_big_with_no_images(resources, outpdf):\n    check_ocrmypdf(\n        resources / 'blank.pdf',\n        outpdf,\n        '--skip-big',\n        '5',\n        '--force-ocr',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n\ndef test_no_contents(resources, outpdf):\n    check_ocrmypdf(\n        resources / 'no_contents.pdf',\n        outpdf,\n        '--force-ocr',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n\n@pytest.mark.parametrize(\n    'image', ['baiona.png', 'baiona_gray.png', 'baiona_alpha.png', 'baiona_color.jpg']\n)\ndef test_compression_preserved(ocrmypdf_exec, resources, image, outpdf):\n    input_file = str(resources / image)\n    output_file = str(outpdf)\n\n    im = Image.open(input_file)\n    # Runs: ocrmypdf - output.pdf < testfile\n    with open(input_file, 'rb') as input_stream:\n        p_args = ocrmypdf_exec + [\n            '--optimize',\n            '0',\n            '--image-dpi',\n            '150',\n            '--output-type',\n            'pdf',\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n            '-',\n            output_file,\n        ]\n        p = run(\n            p_args,\n            capture_output=True,\n            stdin=input_stream,\n            text=True,\n            check=False,\n        )\n\n        if im.mode in ('RGBA', 'LA'):\n            # If alpha image is input, expect an error\n            assert p.returncode != ExitCode.ok and 'alpha' in p.stderr\n            return\n\n        assert p.returncode == ExitCode.ok, p.stderr\n\n    pdfinfo = PdfInfo(output_file)\n\n    pdfimage = pdfinfo[0].images[0]\n\n    if 
input_file.endswith('.png'):\n        assert pdfimage.enc != Encoding.jpeg, \"Lossless compression changed to lossy!\"\n    elif input_file.endswith('.jpg'):\n        assert pdfimage.enc == Encoding.jpeg, \"Lossy compression changed to lossless!\"\n    if im.mode.startswith('RGB') or im.mode.startswith('BGR'):\n        assert pdfimage.color == Colorspace.rgb, \"Colorspace changed\"\n    elif im.mode.startswith('L'):\n        assert pdfimage.color == Colorspace.gray, \"Colorspace changed\"\n    im.close()\n\n\n@pytest.mark.parametrize(\n    'image,compression',\n    [\n        ('baiona.png', 'jpeg'),\n        ('baiona_gray.png', 'lossless'),\n        ('baiona_color.jpg', 'lossless'),\n    ],\n)\ndef test_compression_changed(ocrmypdf_exec, resources, image, compression, outpdf):\n    input_file = str(resources / image)\n    output_file = str(outpdf)\n\n    im = Image.open(input_file)\n\n    # Runs: ocrmypdf - output.pdf < testfile\n    with open(input_file, 'rb') as input_stream:\n        p_args = ocrmypdf_exec + [\n            '--image-dpi',\n            '150',\n            '--output-type',\n            'pdfa',\n            '--optimize',\n            '0',\n            '--pdfa-image-compression',\n            compression,\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n            '-',\n            output_file,\n        ]\n        p = run(\n            p_args,\n            capture_output=True,\n            stdin=input_stream,\n            text=True,\n            check=False,\n        )\n        assert p.returncode == ExitCode.ok, p.stderr\n\n    pdfinfo = PdfInfo(output_file)\n\n    pdfimage = pdfinfo[0].images[0]\n\n    if compression == \"jpeg\":\n        assert pdfimage.enc == Encoding.jpeg\n    else:\n        if image.endswith('jpg'):\n            # Ghostscript JPEG passthrough - no issue\n            assert pdfimage.enc == Encoding.jpeg\n        else:\n            assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000)\n\n    if 
im.mode.startswith('RGB') or im.mode.startswith('BGR'):\n        assert pdfimage.color == Colorspace.rgb, \"Colorspace changed\"\n    elif im.mode.startswith('L'):\n        assert pdfimage.color == Colorspace.gray, \"Colorspace changed\"\n    im.close()\n\n\ndef test_sidecar_pagecount(resources, outpdf):\n    sidecar = outpdf.with_suffix('.txt')\n    check_ocrmypdf(\n        resources / '3small.pdf',\n        outpdf,\n        '--skip-text',\n        '--sidecar',\n        sidecar,\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n\n    pdfinfo = PdfInfo(resources / '3small.pdf')\n    num_pages = len(pdfinfo)\n\n    with open(sidecar, encoding='utf-8') as f:\n        ocr_text = f.read()\n\n    # There should be a formfeed between each pair of pages, so the count of\n    # formfeeds is the page count less one\n    assert (\n        ocr_text.count('\\f') == num_pages - 1\n    ), \"Sidecar page count does not match PDF page count\"\n\n\ndef test_sidecar_nonempty(resources, outpdf):\n    sidecar = outpdf.with_suffix('.txt')\n    check_ocrmypdf(\n        resources / 'ccitt.pdf',\n        outpdf,\n        '--sidecar',\n        sidecar,\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n\n    with open(sidecar, encoding='utf-8') as f:\n        ocr_text = f.read()\n    assert 'the' in ocr_text\n\n\n@pytest.mark.parametrize('pdfa_level', ['1', '2', '3'])\ndef test_pdfa_n(pdfa_level, resources, outpdf):\n    check_ocrmypdf(\n        resources / 'ccitt.pdf',\n        outpdf,\n        '--output-type',\n        'pdfa-' + pdfa_level,\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n\n    pdfa_info = file_claims_pdfa(outpdf)\n    assert pdfa_info['conformance'] == f'PDF/A-{pdfa_level}b'\n\n\ndef test_decompression_bomb_error(resources, outpdf, caplog):\n    run_ocrmypdf_api(resources / 'hugemono.pdf', outpdf)\n    assert 'decompression bomb' in caplog.text\n    assert 'max-image-mpixels' in 
caplog.text\n\n\n@pytest.mark.slow\ndef test_decompression_bomb_succeeds(resources, outpdf):\n    exitcode = run_ocrmypdf_api(\n        resources / 'hugemono.pdf', outpdf, '--max-image-mpixels', '2000'\n    )\n    assert exitcode == 0\n\n\ndef test_text_curves(resources, outpdf):\n    with patch('ocrmypdf._pipeline.VECTOR_PAGE_DPI', 100):\n        check_ocrmypdf(\n            resources / 'vector.pdf',\n            outpdf,\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        )\n\n        info = PdfInfo(outpdf)\n        assert len(info.pages[0].images) == 0, \"added images to the vector PDF\"\n\n\ndef test_text_curves_force(resources, outpdf):\n    with patch('ocrmypdf._pipeline.VECTOR_PAGE_DPI', 100):\n        check_ocrmypdf(\n            resources / 'vector.pdf',\n            outpdf,\n            '--force-ocr',\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        )\n\n        info = PdfInfo(outpdf)\n        assert len(info.pages[0].images) != 0, \"force did not rasterize\"\n\n\ndef test_output_is_dir(resources, outdir, caplog):\n    exitcode = run_ocrmypdf_api(\n        resources / 'trivial.pdf',\n        outdir,\n        '--force-ocr',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n    assert exitcode == ExitCode.file_access_error\n    assert 'is not a writable file' in caplog.text\n\n\n@pytest.mark.skipif(os.name == 'nt', reason=\"symlink needs admin permissions\")\ndef test_output_is_symlink(resources, outdir):\n    sym = Path(outdir / 'this_is_a_symlink')\n    sym.symlink_to(outdir / 'out.pdf')\n    exitcode = run_ocrmypdf_api(\n        resources / 'trivial.pdf',\n        sym,\n        '--force-ocr',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n    assert exitcode == ExitCode.ok\n    assert (outdir / 'out.pdf').stat().st_size > 0, 'target file not created'\n\n\ndef test_livecycle(resources, no_outpdf, caplog):\n    exitcode = 
run_ocrmypdf_api(resources / 'livecycle.pdf', no_outpdf)\n\n    assert exitcode == ExitCode.input_file, caplog.text\n\n\ndef test_version_check():\n    with pytest.raises(MissingDependencyError):\n        get_version('NOT_FOUND_UNLIKELY_ON_PATH')\n\n    with pytest.raises(MissingDependencyError):\n        get_version('sh', version_arg='-c')\n\n    with pytest.raises(MissingDependencyError):\n        get_version('echo')\n\n\n@pytest.mark.parametrize(\n    'threshold, optimize, output_type, expected',\n    [\n        [1.0, 0, 'pdfa', False],\n        [1.0, 0, 'pdf', False],\n        [0.0, 0, 'pdfa', True],\n        [0.0, 0, 'pdf', True],\n        [1.0, 1, 'pdfa', False],\n        [1.0, 1, 'pdf', False],\n        [0.0, 1, 'pdfa', True],\n        [0.0, 1, 'pdf', True],\n    ],\n)\ndef test_fast_web_view(resources, outpdf, threshold, optimize, output_type, expected):\n    check_ocrmypdf(\n        resources / 'trivial.pdf',\n        outpdf,\n        '--fast-web-view',\n        threshold,\n        '--optimize',\n        optimize,\n        '--output-type',\n        output_type,\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n    with pikepdf.open(outpdf) as pdf:\n        assert pdf.is_linearized == expected\n\n\ndef test_image_dpi_not_image(caplog, resources, outpdf):\n    check_ocrmypdf(\n        resources / 'trivial.pdf',\n        outpdf,\n        '--image-dpi',\n        '100',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n    assert '--image-dpi is being ignored' in caplog.text\n\n\ndef test_outputtype_none_bad_setup(resources, outpdf):\n    p = run_ocrmypdf(\n        resources / 'trivial.pdf',\n        outpdf,\n        '--output-type=none',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n    assert p.returncode == ExitCode.bad_args\n    assert 'Set the output file to' in p.stderr\n\n\ndef test_outputtype_none(resources, outtxt):\n    exitcode = run_ocrmypdf_api(\n        resources / 
'trivial.pdf',\n        '-',\n        '--output-type=none',\n        '--sidecar',\n        outtxt,\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n    assert exitcode == ExitCode.ok\n    assert outtxt.exists()\n\n\n@pytest.fixture\ndef graph_bad_icc(resources, outdir):\n    synth_input_file = outdir / 'graph-bad-icc.pdf'\n    with pikepdf.open(resources / 'graph.pdf') as pdf:\n        icc = pdf.make_stream(\n            b'invalid icc profile', N=3, Alternate=pikepdf.Name.DeviceRGB\n        )\n        pdf.pages[0].Resources.XObject['/Im0'].ColorSpace = pikepdf.Array(\n            [pikepdf.Name.ICCBased, icc]\n        )\n        pdf.save(synth_input_file)\n        yield synth_input_file\n\n\ndef test_corrupt_icc(graph_bad_icc, outpdf, caplog):\n    result = run_ocrmypdf_api(graph_bad_icc, outpdf)\n    assert result == ExitCode.ok\n    assert any(\n        'corrupt or unreadable ICC profile' in rec.message for rec in caplog.records\n    )\n"
  },
  {
    "path": "tests/test_metadata.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport datetime as dt\nimport warnings\nfrom shutil import copyfile\n\nimport pikepdf\nimport pytest\nfrom pikepdf.models.metadata import decode_pdf_date\n\nfrom ocrmypdf._jobcontext import PdfContext\nfrom ocrmypdf._metadata import metadata_fixup\nfrom ocrmypdf._pipeline import convert_to_pdfa\nfrom ocrmypdf.api import setup_plugin_infrastructure\nfrom ocrmypdf.cli import get_options_and_plugins\nfrom ocrmypdf.exceptions import ExitCode\nfrom ocrmypdf.pdfa import file_claims_pdfa, generate_pdfa_ps\nfrom ocrmypdf.pdfinfo import PdfInfo\n\nfrom .conftest import check_ocrmypdf, run_ocrmypdf, run_ocrmypdf_api\n\n\n@pytest.mark.parametrize(\"output_type\", ['pdfa', 'pdf'])\ndef test_preserve_docinfo(output_type, resources, outpdf):\n    output = check_ocrmypdf(\n        resources / 'graph.pdf',\n        outpdf,\n        '--output-type',\n        output_type,\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n    with (\n        pikepdf.open(resources / 'graph.pdf') as pdf_before,\n        pikepdf.open(output) as pdf_after,\n    ):\n        for key in ('/Title', '/Author'):\n            assert pdf_before.docinfo[key] == pdf_after.docinfo[key]\n        pdfa_info = file_claims_pdfa(str(output))\n        assert pdfa_info['output'] == output_type\n\n\n@pytest.mark.parametrize(\"output_type\", ['pdfa', 'pdf'])\ndef test_override_metadata(output_type, resources, outpdf, caplog):\n    input_file = resources / 'c02-22.pdf'\n    german = 'Du siehst den Wald vor lauter Bäumen nicht.'\n    chinese = '孔子'\n\n    exitcode = run_ocrmypdf_api(\n        input_file,\n        outpdf,\n        '--title',\n        german,\n        '--author',\n        chinese,\n        '--output-type',\n        output_type,\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n    assert exitcode == ExitCode.ok, 
caplog.text\n\n    with pikepdf.open(input_file) as before, pikepdf.open(outpdf) as after:\n        assert after.docinfo.Title == german, after.docinfo\n        assert after.docinfo.Author == chinese, after.docinfo\n        assert after.docinfo.get('/Keywords', '') == ''\n\n        before_date = decode_pdf_date(str(before.docinfo.CreationDate))\n        after_date = decode_pdf_date(str(after.docinfo.CreationDate))\n        assert before_date == after_date\n\n        pdfa_info = file_claims_pdfa(outpdf)\n        assert pdfa_info['output'] == output_type\n\n\n@pytest.mark.parametrize('output_type', ['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'])\n@pytest.mark.parametrize('field', ['title', 'author', 'subject', 'keywords'])\ndef test_unset_metadata(output_type, field, resources, outpdf, caplog):\n    input_file = resources / 'meta.pdf'\n\n    # magic strings contained in the input pdf metadata\n    meta = {\n        'title': b'NFY5f7Ft2DWMkxLhXwxvFf7eWR2KeK3vEDcd',\n        'author': b'yXaryipxyRk9dVjWjSSaVaNCKeLRgEVzPRMp',\n        'subject': b't49vimctvnuH7ZeAjAkv52ACvWFjcnm5MPJr',\n        'keywords': b's9EeALwUg7urA7fnnhm5EtUyC54sW2WPUzqh',\n    }\n\n    exitcode = run_ocrmypdf_api(\n        input_file,\n        outpdf,\n        f'--{field}',\n        '',\n        '--output-type',\n        output_type,\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n    assert exitcode == ExitCode.ok, caplog.text\n\n    # We mainly want to ensure that when '' is passed, the corresponding\n    # metadata is unset in the output pdf. Since metadata is not compressed,\n    # the best way to guarantee the metadata of interest didn't carry\n    # forward is to just check to ensure the corresponding magic string\n    # isn't contained anywhere in the output pdf. 
We'll also check to ensure\n    # it's in the input pdf and that any values not unset are still in the\n    # output pdf.\n    with open(input_file, 'rb') as before, open(outpdf, 'rb') as after:\n        before_data = before.read()\n        after_data = after.read()\n\n    for k, v in meta.items():\n        assert v in before_data\n        if k == field:\n            assert v not in after_data\n        else:\n            assert v in after_data\n\n\ndef test_high_unicode(resources, no_outpdf):\n    # Ghostscript doesn't support high Unicode, so neither do we, to be\n    # safe\n    input_file = resources / 'c02-22.pdf'\n    high_unicode = 'U+1030C is: 𐌌'\n\n    p = run_ocrmypdf(\n        input_file,\n        no_outpdf,\n        '--subject',\n        high_unicode,\n        '--output-type',\n        'pdfa',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n    assert p.returncode == ExitCode.bad_args, p.stderr\n\n\n@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])\n@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])\ndef test_bookmarks_preserved(output_type, ocr_option, resources, outpdf):\n    fitz = pytest.importorskip('fitz')\n    input_file = resources / 'toc.pdf'\n    before_toc = fitz.Document(str(input_file)).get_toc()\n\n    check_ocrmypdf(\n        input_file,\n        outpdf,\n        ocr_option,\n        '--output-type',\n        output_type,\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n    after_toc = fitz.Document(str(outpdf)).get_toc()\n    print(before_toc)\n    print(after_toc)\n    assert before_toc == after_toc\n\n\ndef seconds_between_dates(date1, date2):\n    return (date2 - date1).total_seconds()\n\n\n@pytest.mark.parametrize('infile', ['trivial.pdf', 'jbig2.pdf'])\n@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])\ndef test_creation_date_preserved(output_type, resources, infile, outpdf):\n    input_file = resources / infile\n\n    check_ocrmypdf(\n        
input_file,\n        outpdf,\n        '--output-type',\n        output_type,\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n    with pikepdf.open(input_file) as pdf_before, pikepdf.open(outpdf) as pdf_after:\n        before = pdf_before.trailer.get('/Info', {})\n        after = pdf_after.trailer.get('/Info', {})\n\n        if not before:\n            assert after.get('/CreationDate', '') != ''\n        else:\n            # We expect that the creation date stayed the same\n            date_before = decode_pdf_date(str(before['/CreationDate']))\n            date_after = decode_pdf_date(str(after['/CreationDate']))\n            assert seconds_between_dates(date_before, date_after) < 1000\n\n        # We expect that the modified date is quite recent\n        date_after = decode_pdf_date(str(after['/ModDate']))\n        assert seconds_between_dates(date_after, dt.datetime.now(dt.UTC)) < 1000\n\n\n@pytest.fixture\ndef libxmp_file_to_dict():\n    try:\n        with warnings.catch_warnings():\n            # libxmp imports distutils.Version, which is deprecated\n            warnings.filterwarnings(\n                \"ignore\",\n                category=DeprecationWarning,\n                message=r\".*distutils Version classes are deprecated.*\",\n            )\n            from libxmp.utils import (\n                file_to_dict,  # pylint: disable=import-outside-toplevel\n            )\n    except Exception:  # pylint: disable=broad-except\n        pytest.skip(\"libxmp not available or libexempi3 not installed\")\n    return file_to_dict\n\n\n@pytest.mark.parametrize(\n    'test_file,output_type',\n    [\n        ('graph.pdf', 'pdf'),  # PDF with full metadata\n        ('graph.pdf', 'pdfa'),  # PDF/A with full metadata\n        ('overlay.pdf', 'pdfa'),  # /Title()\n        ('3small.pdf', 'pdfa'),\n    ],\n)\ndef test_xml_metadata_preserved(\n    libxmp_file_to_dict, test_file, output_type, resources, outpdf\n):\n    input_file = resources / 
test_file\n\n    before = libxmp_file_to_dict(str(input_file))\n\n    check_ocrmypdf(\n        input_file,\n        outpdf,\n        '--output-type',\n        output_type,\n        '--skip-text',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n    after = libxmp_file_to_dict(str(outpdf))\n\n    equal_properties = [\n        'dc:contributor',\n        'dc:coverage',\n        'dc:creator',\n        'dc:description',\n        'dc:format',\n        'dc:identifier',\n        'dc:language',\n        'dc:publisher',\n        'dc:relation',\n        'dc:rights',\n        'dc:source',\n        'dc:subject',\n        'dc:title',\n        'dc:type',\n        'pdf:keywords',\n    ]\n    acquired_properties = ['dc:format']\n\n    # Cleanup messy data structure\n    # Top level is key-value mapping of namespaces to keys under namespace,\n    # so we put everything in the same namespace\n    def unify_namespaces(xmpdict):\n        for entries in xmpdict.values():\n            yield from entries\n\n    # Now we have a list of (key, value, {infodict}). We don't care about\n    # infodict. Just flatten to keys and values\n    def keyval_from_tuple(list_of_tuples):\n        for k, v, *_ in list_of_tuples:\n            yield k, v\n\n    before = dict(keyval_from_tuple(unify_namespaces(before)))\n    after = dict(keyval_from_tuple(unify_namespaces(after)))\n\n    for prop in equal_properties:\n        if prop in before:\n            assert prop in after, f'{prop} dropped from xmp'\n            assert before[prop] == after[prop]\n\n        # libxmp presents multivalued entries (e.g. 
dc:title) as:\n        # 'dc:title': '' <- there's a title\n        # 'dc:title[1]': 'The Title' <- the actual title\n        # 'dc:title[1]/?xml:lang': 'x-default' <- language info\n        propidx = f'{prop}[1]'\n        if propidx in before:\n            assert (\n                after.get(propidx) == before[propidx]\n                or after.get(prop) == before[propidx]\n            )\n\n        if prop in after and prop not in before:\n            assert prop in acquired_properties, (\n                f\"acquired unexpected property {prop} with value \"\n                f\"{after.get(propidx) or after.get(prop)}\"\n            )\n\n\ndef test_kodak_toc(resources, outpdf):\n    check_ocrmypdf(\n        resources / 'kcs.pdf',\n        outpdf,\n        '--output-type',\n        'pdf',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n    with pikepdf.open(outpdf) as p:\n        if pikepdf.Name.First in p.Root.Outlines:\n            assert isinstance(p.Root.Outlines.First, pikepdf.Dictionary)\n\n\ndef test_metadata_fixup_warning(resources, outdir, caplog):\n    options, _pm = get_options_and_plugins(\n        ['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']\n    )\n\n    copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')\n\n    # Use the new setup function instead of get_plugin_manager directly\n    plugin_manager = setup_plugin_infrastructure([])\n    context = PdfContext(options, outdir, outdir / 'graph.pdf', None, plugin_manager)\n    metadata_fixup(\n        working_file=outdir / 'graph.pdf', context=context, pdf_save_settings={}\n    )\n    for record in caplog.records:\n        assert record.levelname != 'WARNING', \"Unexpected warning\"\n\n    # Now add some metadata that will not be copyable\n    with pikepdf.open(outdir / 'graph.pdf') as graph:\n        with graph.open_metadata() as meta:\n            meta['prism2:publicationName'] = 'OCRmyPDF Test'\n        graph.save(outdir / 'graph_mod.pdf')\n\n    context = 
PdfContext(\n        options, outdir, outdir / 'graph_mod.pdf', None, plugin_manager\n    )\n    metadata_fixup(\n        working_file=outdir / 'graph.pdf', context=context, pdf_save_settings={}\n    )\n    assert any(record.levelname == 'WARNING' for record in caplog.records)\n\n\nXMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'\n\n\ndef test_prevent_gs_invalid_xml(resources, outdir):\n    generate_pdfa_ps(outdir / 'pdfa.ps')\n\n    # Inject a string with a trailing nul character into the DocumentInfo\n    # dictionary of this PDF, as often occurs in practice.\n    with pikepdf.open(resources / 'trivial.pdf') as pdf:\n        pdf.Root.DocumentInfo = pikepdf.Dictionary(\n            Title=b'String with trailing nul\\x00'\n        )\n        pdf.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)\n\n    options, _ = get_options_and_plugins(\n        args=[\n            '-j',\n            '1',\n            '--output-type',\n            'pdfa-2',\n            'a.pdf',\n            'b.pdf',\n        ]\n    )\n    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')\n\n    # Use the new setup function\n    plugin_manager = setup_plugin_infrastructure([])\n    context = PdfContext(\n        options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, plugin_manager\n    )\n\n    convert_to_pdfa(\n        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context\n    )\n\n    contents = (outdir / 'pdfa.pdf').read_bytes()\n    # Since the XML may be invalid, we scan instead of actually feeding it\n    # to a parser.\n\n    xmp_start = contents.find(XMP_MAGIC)\n    xmp_end = contents.rfind(b'<?xpacket end', xmp_start)\n    assert 0 < xmp_start < xmp_end\n    # Ensure we did not carry the nul forward.\n    assert contents.find(b'&#0;', xmp_start, xmp_end) == -1, \"found escaped nul\"\n    assert contents.find(b'\\x00', xmp_start, xmp_end) == -1\n"
  },
  {
    "path": "tests/test_multi_font_manager.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Unit tests for MultiFontManager and FontProvider.\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nfrom pathlib import Path\n\nimport pytest\n\nfrom ocrmypdf.font import BuiltinFontProvider, FontManager, MultiFontManager\n\n\n@pytest.fixture\ndef font_dir():\n    \"\"\"Return path to font directory.\"\"\"\n    return Path(__file__).parent.parent / \"src\" / \"ocrmypdf\" / \"data\"\n\n\n@pytest.fixture\ndef multi_font_manager(font_dir):\n    \"\"\"Create MultiFontManager instance for testing.\"\"\"\n    return MultiFontManager(font_dir)\n\n\ndef has_cjk_font(manager: MultiFontManager) -> bool:\n    \"\"\"Check if CJK font is available (from system).\"\"\"\n    return 'NotoSansCJK-Regular' in manager.fonts\n\n\ndef has_arabic_font(manager: MultiFontManager) -> bool:\n    \"\"\"Check if Arabic font is available (from system).\"\"\"\n    return 'NotoSansArabic-Regular' in manager.fonts\n\n\ndef has_devanagari_font(manager: MultiFontManager) -> bool:\n    \"\"\"Check if Devanagari font is available (from system).\"\"\"\n    return 'NotoSansDevanagari-Regular' in manager.fonts\n\n\n# Marker for tests that require CJK fonts\nrequires_cjk = pytest.mark.skipif(\n    \"not has_cjk_font(MultiFontManager())\",\n    reason=\"CJK font not available (not installed on system)\"\n)\n\n\n# --- MultiFontManager Initialization Tests ---\n\n\ndef test_init_loads_builtin_fonts(multi_font_manager):\n    \"\"\"Test that initialization loads all expected builtin fonts.\"\"\"\n    # Only NotoSans-Regular and Occulta are bundled\n    assert 'NotoSans-Regular' in multi_font_manager.fonts\n    assert 'Occulta' in multi_font_manager.fonts\n\n    # At least 2 builtin fonts should be loaded\n    assert len(multi_font_manager.fonts) >= 2\n\n    # Arabic, Devanagari, CJK are optional (system fonts)\n\n\ndef test_missing_font_directory():\n    \"\"\"Test that missing font 
directory raises error for fallback font.\"\"\"\n    with pytest.raises(FileNotFoundError):\n        MultiFontManager(Path(\"/nonexistent/path\"))\n\n\n# --- Arabic Script Language Tests ---\n# These tests require Arabic fonts to be installed on the system\n\n\ndef test_select_font_for_arabic_language(multi_font_manager):\n    \"\"\"Test font selection with Arabic language hint.\"\"\"\n    if not has_arabic_font(multi_font_manager):\n        pytest.skip(\"Arabic font not available\")\n    font_manager = multi_font_manager.select_font_for_word(\"مرحبا\", \"ara\")\n    assert font_manager == multi_font_manager.fonts['NotoSansArabic-Regular']\n\n\ndef test_select_font_for_persian_language(multi_font_manager):\n    \"\"\"Test font selection with Persian language hint.\"\"\"\n    if not has_arabic_font(multi_font_manager):\n        pytest.skip(\"Arabic font not available\")\n    font_manager = multi_font_manager.select_font_for_word(\"سلام\", \"per\")\n    assert font_manager == multi_font_manager.fonts['NotoSansArabic-Regular']\n\n\ndef test_select_font_for_urdu_language(multi_font_manager):\n    \"\"\"Test font selection with Urdu language hint.\"\"\"\n    if not has_arabic_font(multi_font_manager):\n        pytest.skip(\"Arabic font not available\")\n    font_manager = multi_font_manager.select_font_for_word(\"ہیلو\", \"urd\")\n    assert font_manager == multi_font_manager.fonts['NotoSansArabic-Regular']\n\n\ndef test_farsi_language_code(multi_font_manager):\n    \"\"\"Test that 'fas' (Farsi alternative code) maps to Arabic font.\"\"\"\n    if not has_arabic_font(multi_font_manager):\n        pytest.skip(\"Arabic font not available\")\n    font_manager = multi_font_manager.select_font_for_word(\"سلام\", \"fas\")\n    assert font_manager == multi_font_manager.fonts['NotoSansArabic-Regular']\n\n\n# --- Devanagari Script Language Tests ---\n# These tests require Devanagari fonts to be installed on the system\n\n\ndef 
test_select_font_for_hindi_language(multi_font_manager):\n    \"\"\"Test font selection with Hindi language hint.\"\"\"\n    if not has_devanagari_font(multi_font_manager):\n        pytest.skip(\"Devanagari font not available\")\n    font_manager = multi_font_manager.select_font_for_word(\"नमस्ते\", \"hin\")\n    assert font_manager == multi_font_manager.fonts['NotoSansDevanagari-Regular']\n\n\ndef test_select_font_for_sanskrit_language(multi_font_manager):\n    \"\"\"Test font selection with Sanskrit language hint.\"\"\"\n    if not has_devanagari_font(multi_font_manager):\n        pytest.skip(\"Devanagari font not available\")\n    font_manager = multi_font_manager.select_font_for_word(\"संस्कृतम्\", \"san\")\n    assert font_manager == multi_font_manager.fonts['NotoSansDevanagari-Regular']\n\n\ndef test_select_font_for_marathi_language(multi_font_manager):\n    \"\"\"Test font selection with Marathi language hint.\"\"\"\n    if not has_devanagari_font(multi_font_manager):\n        pytest.skip(\"Devanagari font not available\")\n    font_manager = multi_font_manager.select_font_for_word(\"मराठी\", \"mar\")\n    assert font_manager == multi_font_manager.fonts['NotoSansDevanagari-Regular']\n\n\ndef test_select_font_for_nepali_language(multi_font_manager):\n    \"\"\"Test font selection with Nepali language hint.\"\"\"\n    if not has_devanagari_font(multi_font_manager):\n        pytest.skip(\"Devanagari font not available\")\n    font_manager = multi_font_manager.select_font_for_word(\"नेपाली\", \"nep\")\n    assert font_manager == multi_font_manager.fonts['NotoSansDevanagari-Regular']\n\n\n# --- CJK Language Tests ---\n# These tests require CJK fonts to be installed on the system\n\n\ndef test_select_font_for_chinese_language(multi_font_manager):\n    \"\"\"Test font selection with Chinese language hint (ISO 639-3).\"\"\"\n    if not has_cjk_font(multi_font_manager):\n        pytest.skip(\"CJK font not available\")\n    font_manager = 
multi_font_manager.select_font_for_word(\"你好\", \"zho\")\n    assert font_manager == multi_font_manager.fonts['NotoSansCJK-Regular']\n\n\ndef test_select_font_for_chinese_generic(multi_font_manager):\n    \"\"\"Test font selection with generic Chinese language code.\"\"\"\n    if not has_cjk_font(multi_font_manager):\n        pytest.skip(\"CJK font not available\")\n    font_manager = multi_font_manager.select_font_for_word(\"中文\", \"chi\")\n    assert font_manager == multi_font_manager.fonts['NotoSansCJK-Regular']\n\n\ndef test_select_font_for_chinese_simplified(multi_font_manager):\n    \"\"\"Test font selection with Tesseract's chi_sim language code.\"\"\"\n    if not has_cjk_font(multi_font_manager):\n        pytest.skip(\"CJK font not available\")\n    font_manager = multi_font_manager.select_font_for_word(\"简体字\", \"chi_sim\")\n    assert font_manager == multi_font_manager.fonts['NotoSansCJK-Regular']\n\n\ndef test_select_font_for_chinese_traditional(multi_font_manager):\n    \"\"\"Test font selection with Tesseract's chi_tra language code.\"\"\"\n    if not has_cjk_font(multi_font_manager):\n        pytest.skip(\"CJK font not available\")\n    font_manager = multi_font_manager.select_font_for_word(\"漢字\", \"chi_tra\")\n    assert font_manager == multi_font_manager.fonts['NotoSansCJK-Regular']\n\n\ndef test_select_font_for_japanese_language(multi_font_manager):\n    \"\"\"Test font selection with Japanese language hint.\"\"\"\n    if not has_cjk_font(multi_font_manager):\n        pytest.skip(\"CJK font not available\")\n    font_manager = multi_font_manager.select_font_for_word(\"こんにちは\", \"jpn\")\n    assert font_manager == multi_font_manager.fonts['NotoSansCJK-Regular']\n\n\ndef test_select_font_for_korean_language(multi_font_manager):\n    \"\"\"Test font selection with Korean language hint.\"\"\"\n    if not has_cjk_font(multi_font_manager):\n        pytest.skip(\"CJK font not available\")\n    font_manager = 
multi_font_manager.select_font_for_word(\"안녕하세요\", \"kor\")\n    assert font_manager == multi_font_manager.fonts['NotoSansCJK-Regular']\n\n\n# --- Latin/English Tests ---\n\n\ndef test_select_font_for_english_text(multi_font_manager):\n    \"\"\"Test font selection for English text.\"\"\"\n    font_manager = multi_font_manager.select_font_for_word(\"Hello World\", \"eng\")\n    assert font_manager == multi_font_manager.fonts['NotoSans-Regular']\n\n\ndef test_select_font_without_language_hint(multi_font_manager):\n    \"\"\"Test font selection without language hint falls back to glyph checking.\"\"\"\n    font_manager = multi_font_manager.select_font_for_word(\"Hello\", None)\n    assert font_manager == multi_font_manager.fonts['NotoSans-Regular']\n\n\n# --- Fallback Behavior Tests ---\n\n\ndef test_select_font_arabic_text_without_language_hint(multi_font_manager):\n    \"\"\"Test that Arabic text is handled via fallback without language hint.\"\"\"\n    if not has_arabic_font(multi_font_manager):\n        pytest.skip(\"Arabic font not available\")\n    font_manager = multi_font_manager.select_font_for_word(\"مرحبا\", None)\n    # Should get NotoSansArabic-Regular via fallback chain glyph checking\n    assert font_manager == multi_font_manager.fonts['NotoSansArabic-Regular']\n\n\ndef test_devanagari_text_without_language_hint(multi_font_manager):\n    \"\"\"Test that Devanagari text is handled via fallback without language hint.\"\"\"\n    # NotoSans-Regular includes Devanagari glyphs, so it's selected first in fallback\n    font_manager = multi_font_manager.select_font_for_word(\"नमस्ते\", None)\n    # Could be NotoSans-Regular or NotoSansDevanagari-Regular depending on availability\n    assert font_manager is not None\n\n\ndef test_cjk_text_without_language_hint(multi_font_manager):\n    \"\"\"Test that CJK text is handled via fallback without language hint.\"\"\"\n    if not has_cjk_font(multi_font_manager):\n        pytest.skip(\"CJK font not available\")\n    
font_manager = multi_font_manager.select_font_for_word(\"你好\", None)\n    assert font_manager == multi_font_manager.fonts['NotoSansCJK-Regular']\n\n\ndef test_fallback_to_occulta_font(multi_font_manager):\n    \"\"\"Test that unsupported characters fall back to Occulta.ttf.\"\"\"\n    # Use a character unlikely to be in any standard font\n    font_manager = multi_font_manager.select_font_for_word(\"test\", \"xyz\")\n    # Should return some valid font\n    assert font_manager in multi_font_manager.fonts.values()\n\n\ndef test_fallback_fonts_constant(multi_font_manager):\n    \"\"\"Test that FALLBACK_FONTS contains expected fonts.\"\"\"\n    # Check that core fonts are in fallback list\n    assert 'NotoSans-Regular' in MultiFontManager.FALLBACK_FONTS\n    assert 'NotoSansArabic-Regular' in MultiFontManager.FALLBACK_FONTS\n    assert 'NotoSansDevanagari-Regular' in MultiFontManager.FALLBACK_FONTS\n    assert 'NotoSansCJK-Regular' in MultiFontManager.FALLBACK_FONTS\n\n    # Only NotoSans-Regular is bundled; other scripts are system fonts\n    assert 'NotoSans-Regular' in multi_font_manager.fonts\n\n\n# --- Glyph Coverage Tests ---\n\n\ndef test_has_all_glyphs_for_english(multi_font_manager):\n    \"\"\"Test glyph coverage checking for English text.\"\"\"\n    assert multi_font_manager.has_all_glyphs('NotoSans-Regular', \"Hello World\")\n    assert multi_font_manager.has_all_glyphs('NotoSans-Regular', \"café\")\n\n\ndef test_has_all_glyphs_for_arabic(multi_font_manager):\n    \"\"\"Test glyph coverage checking for Arabic text.\"\"\"\n    if not has_arabic_font(multi_font_manager):\n        pytest.skip(\"Arabic font not available\")\n    assert multi_font_manager.has_all_glyphs('NotoSansArabic-Regular', \"مرحبا\")\n\n\ndef test_has_all_glyphs_for_devanagari(multi_font_manager):\n    \"\"\"Test glyph coverage checking for Devanagari text.\"\"\"\n    if not has_devanagari_font(multi_font_manager):\n        pytest.skip(\"Devanagari font not available\")\n    assert 
multi_font_manager.has_all_glyphs('NotoSansDevanagari-Regular', \"नमस्ते\")\n\n\ndef test_has_all_glyphs_for_cjk(multi_font_manager):\n    \"\"\"Test glyph coverage checking for CJK text.\"\"\"\n    if not has_cjk_font(multi_font_manager):\n        pytest.skip(\"CJK font not available\")\n    assert multi_font_manager.has_all_glyphs('NotoSansCJK-Regular', \"你好\")\n\n\ndef test_empty_text_has_all_glyphs(multi_font_manager):\n    \"\"\"Test that empty text returns True for glyph coverage.\"\"\"\n    assert multi_font_manager.has_all_glyphs('NotoSans-Regular', \"\")\n\n\ndef test_has_all_glyphs_missing_font(multi_font_manager):\n    \"\"\"Test that has_all_glyphs returns False for non-existent font.\"\"\"\n    assert not multi_font_manager.has_all_glyphs('NonExistentFont', \"test\")\n\n\n# --- Caching Tests ---\n\n\ndef test_font_selection_caching(multi_font_manager):\n    \"\"\"Test that font selection results are cached.\"\"\"\n    font1 = multi_font_manager.select_font_for_word(\"Hello\", \"eng\")\n\n    cache_key = (\"Hello\", \"eng\")\n    assert cache_key in multi_font_manager._selection_cache\n\n    font2 = multi_font_manager.select_font_for_word(\"Hello\", \"eng\")\n    assert font1 == font2\n\n\n# --- Language Font Map Tests ---\n\n\ndef test_language_font_map_coverage():\n    \"\"\"Test that LANGUAGE_FONT_MAP has valid structure.\"\"\"\n    # Only NotoSans-Regular is bundled now\n    # This test just verifies the structure is valid\n    for font_name in MultiFontManager.LANGUAGE_FONT_MAP.values():\n        # All font names should be valid strings\n        assert isinstance(font_name, str)\n        assert font_name.startswith('NotoSans')\n\n\n# --- get_all_fonts Tests ---\n\n\ndef test_get_all_fonts(multi_font_manager):\n    \"\"\"Test get_all_fonts returns all loaded fonts.\"\"\"\n    all_fonts = multi_font_manager.get_all_fonts()\n    assert isinstance(all_fonts, dict)\n    # At least 2 builtin fonts should be loaded (NotoSans-Regular and Occulta)\n    
assert len(all_fonts) >= 2\n    assert 'NotoSans-Regular' in all_fonts\n    assert 'Occulta' in all_fonts\n    # Arabic, Devanagari, CJK are optional (system fonts)\n\n\n# --- FontProvider Tests ---\n\n\nclass MockFontProvider:\n    \"\"\"Mock FontProvider for testing missing fonts.\"\"\"\n\n    def __init__(\n        self, available_fonts: dict[str, FontManager], fallback: FontManager\n    ):\n        \"\"\"Initialize mock font provider with given fonts.\"\"\"\n        self._fonts = available_fonts\n        self._fallback = fallback\n\n    def get_font(self, font_name: str) -> FontManager | None:\n        return self._fonts.get(font_name)\n\n    def get_available_fonts(self) -> list[str]:\n        return list(self._fonts.keys())\n\n    def get_fallback_font(self) -> FontManager:\n        return self._fallback\n\n\ndef test_custom_font_provider(font_dir):\n    \"\"\"Test that custom FontProvider can be injected.\"\"\"\n    fonts = {\n        'NotoSans-Regular': FontManager(font_dir / 'NotoSans-Regular.ttf'),\n        'Occulta': FontManager(font_dir / 'Occulta.ttf'),\n    }\n    provider = MockFontProvider(fonts, fonts['Occulta'])\n\n    manager = MultiFontManager(font_provider=provider)\n\n    # Should only have the fonts we provided\n    assert len(manager.fonts) == 2\n    assert 'NotoSans-Regular' in manager.fonts\n    assert 'Occulta' in manager.fonts\n\n\ndef test_missing_font_uses_fallback(font_dir):\n    \"\"\"Test that missing fonts gracefully fall back.\"\"\"\n    fonts = {\n        'NotoSans-Regular': FontManager(font_dir / 'NotoSans-Regular.ttf'),\n        'Occulta': FontManager(font_dir / 'Occulta.ttf'),\n    }\n    provider = MockFontProvider(fonts, fonts['Occulta'])\n\n    manager = MultiFontManager(font_provider=provider)\n\n    # Arabic text should fall back to Occulta since NotoSansArabic is missing\n    font = manager.select_font_for_word(\"مرحبا\", \"ara\")\n    assert font == fonts['Occulta']\n\n\ndef 
test_builtin_font_provider_loads_expected_fonts(font_dir):\n    \"\"\"Test BuiltinFontProvider loads all expected builtin fonts.\"\"\"\n    provider = BuiltinFontProvider(font_dir)\n\n    available = provider.get_available_fonts()\n    assert 'NotoSans-Regular' in available\n    assert 'Occulta' in available\n    # Only Latin (NotoSans) and glyphless fallback (Occulta) are bundled.\n    # All other scripts (Arabic, Devanagari, CJK, etc.) are discovered\n    # from system fonts by SystemFontProvider to reduce package size.\n    assert len(available) == 2\n\n\ndef test_builtin_font_provider_get_font(font_dir):\n    \"\"\"Test BuiltinFontProvider.get_font returns correct fonts.\"\"\"\n    provider = BuiltinFontProvider(font_dir)\n\n    font = provider.get_font('NotoSans-Regular')\n    assert font is not None\n    assert isinstance(font, FontManager)\n\n    missing = provider.get_font('NonExistent')\n    assert missing is None\n\n\ndef test_builtin_font_provider_get_fallback(font_dir):\n    \"\"\"Test BuiltinFontProvider.get_fallback_font returns Occulta font.\"\"\"\n    provider = BuiltinFontProvider(font_dir)\n\n    fallback = provider.get_fallback_font()\n    assert fallback is not None\n    assert fallback == provider.get_font('Occulta')\n\n\ndef test_builtin_font_provider_missing_font_logs_warning(tmp_path, font_dir, caplog):\n    \"\"\"Test that missing expected fonts log a warning.\"\"\"\n    # Create minimal font directory with only Occulta.ttf\n    (tmp_path / 'Occulta.ttf').write_bytes((font_dir / 'Occulta.ttf').read_bytes())\n\n    with caplog.at_level(logging.WARNING):\n        provider = BuiltinFontProvider(tmp_path)\n\n    # Should have logged warnings for missing fonts\n    assert 'NotoSans-Regular' in caplog.text\n    assert 'not found' in caplog.text\n\n    # But Occulta should be loaded\n    assert provider.get_fallback_font() is not None\n\n\ndef test_builtin_font_provider_missing_occulta_raises(tmp_path):\n    \"\"\"Test that missing Occulta.ttf 
raises FileNotFoundError.\"\"\"\n    with pytest.raises(FileNotFoundError, match=\"Required fallback font\"):\n        BuiltinFontProvider(tmp_path)\n"
  },
  {
    "path": "tests/test_multilingual_direct.py",
    "content": "#!/usr/bin/env python3\n# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Direct tests for multilingual text rendering with fpdf2 renderer.\n\nThis tests the fpdf2 renderer with various language groups:\n- Latin (English, French, German with diacritics)\n- Arabic (Arabic, Persian - RTL scripts)\n- CJK (Chinese Simplified/Traditional, Japanese, Korean)\n- Devanagari (Hindi, Sanskrit)\n\"\"\"\nfrom __future__ import annotations\n\nimport shutil\nimport subprocess\nfrom pathlib import Path\n\nimport pytest\n\nfrom ocrmypdf.font import MultiFontManager\nfrom ocrmypdf.fpdf_renderer import DebugRenderOptions, Fpdf2PdfRenderer\nfrom ocrmypdf.hocrtransform.hocr_parser import HocrParser\n\nRESOURCES = Path(__file__).parent / \"resources\"\n\n\n@pytest.fixture\ndef pdftotext():\n    \"\"\"Return a function to extract text from PDF using pdftotext.\n\n    Skips the test if pdftotext is not available.\n    \"\"\"\n    pdftotext_path = shutil.which('pdftotext')\n    if pdftotext_path is None:\n        pytest.skip(\"pdftotext not available\")\n\n    def extract_text(pdf_path: Path) -> str:\n        return subprocess.check_output(\n            ['pdftotext', '-enc', 'UTF-8', str(pdf_path), '-'],\n            text=True,\n            encoding='utf-8',\n        )\n\n    return extract_text\n\n\n@pytest.fixture\ndef font_dir():\n    \"\"\"Return path to font directory.\"\"\"\n    return Path(__file__).parent.parent / \"src\" / \"ocrmypdf\" / \"data\"\n\n\n@pytest.fixture\ndef multi_font_manager(font_dir):\n    \"\"\"Create MultiFontManager instance for testing.\"\"\"\n    return MultiFontManager(font_dir)\n\n\n@pytest.fixture\ndef multi_font_manager_arabic(font_dir):\n    \"\"\"Create MultiFontManager instance for testing, with Arabic.\"\"\"\n    mfm = MultiFontManager(font_dir)\n    if not mfm.has_font(\"NotoSansArabic-Regular\"):\n        pytest.skip(\"NotoSansArabic font not available\")\n    return mfm\n\n\n# 
=============================================================================\n# Latin Script Tests\n# =============================================================================\n\n\nclass TestLatinScript:\n    \"\"\"Tests for Latin script (English, French, German, etc.).\"\"\"\n\n    @pytest.fixture\n    def latin_hocr(self):\n        \"\"\"Return path to Latin HOCR test file.\"\"\"\n        return RESOURCES / \"latin.hocr\"\n\n    def test_render_latin_basic(\n        self, latin_hocr, multi_font_manager, tmp_path, pdftotext\n    ):\n        \"\"\"Test rendering Latin script with various diacritics.\"\"\"\n        parser = HocrParser(latin_hocr)\n        page = parser.parse()\n\n        assert page is not None\n        paras = list(page.paragraphs)\n        assert len(paras) == 3  # English, French, German\n\n        # Check languages\n        assert paras[0].language == 'eng'\n        assert paras[1].language == 'fra'\n        assert paras[2].language == 'deu'\n\n        # Render to PDF\n        output_pdf = tmp_path / \"latin_output.pdf\"\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=300.0,\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n        )\n        renderer.render(output_pdf)\n\n        assert output_pdf.exists()\n        assert output_pdf.stat().st_size > 0\n\n        # Extract text and verify\n        text = pdftotext(output_pdf)\n\n        # English words\n        assert 'quick' in text or 'brown' in text or 'fox' in text\n\n        # French with diacritics\n        assert 'Café' in text or 'résumé' in text or 'naïve' in text\n\n        # German with umlauts and eszett\n        assert 'Größe' in text or 'Zürich' in text or 'Ärger' in text\n\n    def test_latin_font_selection(self, latin_hocr, multi_font_manager):\n        \"\"\"Test that NotoSans is selected for Latin text.\"\"\"\n        parser = HocrParser(latin_hocr)\n        page = parser.parse()\n\n        for line in 
page.lines:\n            for word in line.children:\n                if word.text:\n                    font = multi_font_manager.select_font_for_word(\n                        word.text, line.language\n                    )\n                    assert font is not None\n                    # Latin text should use NotoSans-Regular\n                    assert multi_font_manager.has_all_glyphs(\n                        'NotoSans-Regular', word.text\n                    )\n\n\n# =============================================================================\n# Arabic Script Tests\n# =============================================================================\n\n\nclass TestArabicScript:\n    \"\"\"Tests for Arabic script (Arabic, Persian, etc.).\"\"\"\n\n    @pytest.fixture\n    def arabic_hocr(self):\n        \"\"\"Return path to Arabic HOCR test file.\"\"\"\n        return RESOURCES / \"arabic.hocr\"\n\n    def test_render_arabic_basic(\n        self, arabic_hocr, multi_font_manager_arabic, tmp_path, pdftotext\n    ):\n        \"\"\"Test rendering Arabic script text.\"\"\"\n        parser = HocrParser(arabic_hocr)\n        page = parser.parse()\n\n        assert page is not None\n        paras = list(page.paragraphs)\n        assert len(paras) == 3  # Arabic paragraphs and Persian\n\n        # Render to PDF\n        output_pdf = tmp_path / \"arabic_output.pdf\"\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=300.0,\n            multi_font_manager=multi_font_manager_arabic,\n            invisible_text=False,\n        )\n        renderer.render(output_pdf)\n\n        assert output_pdf.exists()\n        assert output_pdf.stat().st_size > 0\n\n        # Extract text and verify Arabic content\n        text = pdftotext(output_pdf)\n\n        # Arabic words: مرحبا بالعالم (Hello world)\n        assert 'مرحبا' in text or 'بالعالم' in text\n        # هذا نص عربي (This is Arabic text)\n        assert 'عربي' in text or 'نص' in text\n\n    def 
test_arabic_font_selection(self, arabic_hocr, multi_font_manager_arabic):\n        \"\"\"Test that NotoSansArabic is selected for Arabic text.\"\"\"\n        parser = HocrParser(arabic_hocr)\n        page = parser.parse()\n\n        for line in page.lines:\n            for word in line.children:\n                if word.text and line.language in ('ara', 'per'):\n                    font = multi_font_manager_arabic.select_font_for_word(\n                        word.text, line.language\n                    )\n                    assert font is not None\n                    # Arabic text should use NotoSansArabic\n                    assert multi_font_manager_arabic.has_all_glyphs(\n                        'NotoSansArabic-Regular', word.text\n                    ), f\"NotoSansArabic cannot render '{word.text}'\"\n\n    def test_arabic_rtl_handling(self, arabic_hocr):\n        \"\"\"Test that RTL direction is correctly parsed from hOCR.\"\"\"\n        parser = HocrParser(arabic_hocr)\n        page = parser.parse()\n\n        for para in page.paragraphs:\n            if para.language in ('ara', 'per'):\n                # Arabic paragraphs should have RTL direction\n                assert (\n                    para.direction == 'rtl'\n                ), \"Arabic paragraph should have RTL direction\"\n\n\n# =============================================================================\n# CJK Script Tests\n# =============================================================================\n\n\ndef _latin_font_works(multi_font_manager) -> bool:\n    \"\"\"Check if Latin font is available.\"\"\"\n    return multi_font_manager.has_font('NotoSans-Regular')\n\n\ndef _arabic_font_works(multi_font_manager) -> bool:\n    \"\"\"Check if Arabic font is available.\"\"\"\n    return multi_font_manager.has_font('NotoSansArabic-Regular')\n\n\ndef _devanagari_font_works(multi_font_manager) -> bool:\n    \"\"\"Check if Devanagari font is available.\"\"\"\n    return 
multi_font_manager.has_font('NotoSansDevanagari-Regular')\n\n\ndef _cjk_font_works(multi_font_manager) -> bool:\n    \"\"\"Check if CJK font is working (not corrupted).\"\"\"\n    return multi_font_manager.has_font('NotoSansCJK-Regular')\n\n\nclass TestCJKScript:\n    \"\"\"Tests for CJK scripts (Chinese, Japanese, Korean).\"\"\"\n\n    @pytest.fixture\n    def cjk_hocr(self):\n        \"\"\"Return path to CJK HOCR test file.\"\"\"\n        return RESOURCES / \"cjk.hocr\"\n\n    def test_render_cjk_basic(self, cjk_hocr, multi_font_manager, tmp_path, pdftotext):\n        \"\"\"Test rendering CJK script text.\"\"\"\n        if not _cjk_font_works(multi_font_manager):\n            pytest.skip(\"CJK font not available or corrupted\")\n\n        parser = HocrParser(cjk_hocr)\n        page = parser.parse()\n\n        assert page is not None\n        paras = list(page.paragraphs)\n        assert len(paras) == 4  # Chinese Simplified, Traditional, Japanese, Korean\n\n        # Check languages\n        languages = [p.language for p in paras]\n        assert 'chi_sim' in languages\n        assert 'chi_tra' in languages\n        assert 'jpn' in languages\n        assert 'kor' in languages\n\n        # Render to PDF\n        output_pdf = tmp_path / \"cjk_output.pdf\"\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=300.0,\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n        )\n        renderer.render(output_pdf)\n\n        assert output_pdf.exists()\n        assert output_pdf.stat().st_size > 0\n\n        # Extract text and verify CJK content\n        text = pdftotext(output_pdf)\n\n        # Chinese: 你好 世界 (Hello world)\n        assert '你好' in text or '世界' in text\n        # Japanese: こんにちは (Hello)\n        assert 'こんにちは' in text or '世界' in text\n        # Korean: 안녕하세요 (Hello)\n        assert '안녕하세요' in text or '세계' in text\n\n    def test_cjk_font_selection(self, cjk_hocr, multi_font_manager):\n        
\"\"\"Test that NotoSansCJK is selected for CJK text.\"\"\"\n        if not _cjk_font_works(multi_font_manager):\n            pytest.skip(\"CJK font not available or corrupted\")\n\n        parser = HocrParser(cjk_hocr)\n        page = parser.parse()\n\n        cjk_languages = {'chi_sim', 'chi_tra', 'jpn', 'kor', 'zho', 'chi'}\n\n        for line in page.lines:\n            for word in line.children:\n                if word.text and line.language in cjk_languages:\n                    font = multi_font_manager.select_font_for_word(\n                        word.text, line.language\n                    )\n                    assert font is not None\n                    # CJK text should use NotoSansCJK\n                    assert multi_font_manager.has_all_glyphs(\n                        'NotoSansCJK-Regular', word.text\n                    ), f\"NotoSansCJK cannot render '{word.text}'\"\n\n\n# =============================================================================\n# Devanagari Script Tests\n# =============================================================================\n\n\nclass TestDevanagariScript:\n    \"\"\"Tests for Devanagari script (Hindi, Sanskrit, etc.).\"\"\"\n\n    @pytest.fixture\n    def devanagari_hocr(self):\n        \"\"\"Return path to Devanagari HOCR test file.\"\"\"\n        return RESOURCES / \"devanagari.hocr\"\n\n    def test_render_devanagari_basic(\n        self, devanagari_hocr, multi_font_manager, tmp_path, pdftotext\n    ):\n        \"\"\"Test rendering Devanagari script text.\"\"\"\n        parser = HocrParser(devanagari_hocr)\n        page = parser.parse()\n\n        assert page is not None\n        paras = list(page.paragraphs)\n        assert len(paras) == 3  # Hindi paragraphs and Sanskrit\n\n        # Render to PDF\n        output_pdf = tmp_path / \"devanagari_output.pdf\"\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=300.0,\n            multi_font_manager=multi_font_manager,\n            
invisible_text=False,\n        )\n        renderer.render(output_pdf)\n\n        assert output_pdf.exists()\n        assert output_pdf.stat().st_size > 0\n\n        # Extract text and verify Devanagari content\n        text = pdftotext(output_pdf)\n\n        # Hindi: नमस्ते दुनिया (Hello world)\n        assert 'नमस्ते' in text or 'दुनिया' in text\n        # यह हिंदी पाठ है (This is Hindi text)\n        assert 'हिंदी' in text or 'पाठ' in text\n\n    def test_devanagari_font_selection(self, devanagari_hocr, multi_font_manager):\n        \"\"\"Test that NotoSansDevanagari is selected for Devanagari text.\"\"\"\n        if not multi_font_manager.has_font('NotoSansDevanagari-Regular'):\n            pytest.skip(\"Devanagari font not available\")\n        parser = HocrParser(devanagari_hocr)\n        page = parser.parse()\n\n        devanagari_languages = {'hin', 'san', 'mar', 'nep'}\n\n        for line in page.lines:\n            for word in line.children:\n                if word.text and line.language in devanagari_languages:\n                    font = multi_font_manager.select_font_for_word(\n                        word.text, line.language\n                    )\n                    assert font is not None\n                    # Devanagari text should use NotoSansDevanagari\n                    assert multi_font_manager.has_all_glyphs(\n                        'NotoSansDevanagari-Regular', word.text\n                    ), f\"NotoSansDevanagari cannot render '{word.text}'\"\n\n\n# =============================================================================\n# Mixed Language / Multilingual Tests\n# =============================================================================\n\n\nclass TestMultilingual:\n    \"\"\"Tests for mixed-language documents.\"\"\"\n\n    @pytest.fixture\n    def multilingual_hocr(self):\n        \"\"\"Return path to multilingual HOCR test file.\"\"\"\n        return RESOURCES / \"multilingual.hocr\"\n\n    def 
test_render_multilingual_hocr_basic(\n        self, multilingual_hocr, multi_font_manager_arabic, tmp_path, pdftotext\n    ):\n        \"\"\"Test rendering multilingual HOCR file with English and Arabic text.\"\"\"\n        parser = HocrParser(multilingual_hocr)\n        page = parser.parse()\n\n        assert page is not None\n        assert len(list(page.paragraphs)) == 2  # English and Arabic paragraphs\n\n        # Check languages\n        paras = list(page.paragraphs)\n        assert paras[0].language == 'eng'\n        assert paras[1].language == 'ara'\n\n        # Render to PDF\n        output_pdf = tmp_path / \"multilingual_output.pdf\"\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=300.0,\n            multi_font_manager=multi_font_manager_arabic,\n            invisible_text=False,\n        )\n        renderer.render(output_pdf)\n\n        assert output_pdf.exists()\n        assert output_pdf.stat().st_size > 0\n\n        # Extract text from PDF\n        text = pdftotext(output_pdf)\n\n        # Verify both English and Arabic text are present\n        assert 'English' in text or 'Text' in text or 'Here' in text\n        # Arabic text: مرحبا بك\n        assert 'مرحبا' in text or 'بك' in text\n\n    def test_render_multilingual_with_debug_options(\n        self, multilingual_hocr, multi_font_manager, tmp_path\n    ):\n        \"\"\"Test rendering with debug visualization enabled.\"\"\"\n        parser = HocrParser(multilingual_hocr)\n        page = parser.parse()\n\n        # Render with debug options\n        output_pdf = tmp_path / \"multilingual_debug.pdf\"\n        debug_options = DebugRenderOptions(\n            render_baseline=True,\n            render_line_bbox=True,\n            render_word_bbox=True,\n        )\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=300.0,\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n            
debug_render_options=debug_options,\n        )\n        renderer.render(output_pdf)\n\n        assert output_pdf.exists()\n        assert output_pdf.stat().st_size > 0\n\n    def test_multilingual_invisible_text(\n        self, multilingual_hocr, multi_font_manager, tmp_path, pdftotext\n    ):\n        \"\"\"Test rendering with invisible text (default OCR mode).\"\"\"\n        parser = HocrParser(multilingual_hocr)\n        page = parser.parse()\n\n        # Render with invisible text (standard for OCR layer)\n        output_pdf = tmp_path / \"multilingual_invisible.pdf\"\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=300.0,\n            multi_font_manager=multi_font_manager,\n            invisible_text=True,\n        )\n        renderer.render(output_pdf)\n\n        assert output_pdf.exists()\n\n        # Text should still be extractable even though invisible\n        text = pdftotext(output_pdf)\n        assert len(text.strip()) > 0\n\n    def test_multilingual_font_selection(\n        self, multilingual_hocr, multi_font_manager_arabic\n    ):\n        \"\"\"Test that correct fonts are selected for each language.\"\"\"\n        parser = HocrParser(multilingual_hocr)\n        page = parser.parse()\n\n        # Get all words\n        words = []\n        for line in page.lines:\n            for word in line.children:\n                if word.text:\n                    words.append((word.text, line.language))\n\n        # Verify we have both English and Arabic words\n        eng_words = [w for w, lang in words if lang == 'eng']\n        ara_words = [w for w, lang in words if lang == 'ara']\n\n        assert len(eng_words) > 0, \"Should have English words\"\n        assert len(ara_words) > 0, \"Should have Arabic words\"\n\n        # Test font selection\n        for text, lang in words:\n            font_mgr = multi_font_manager_arabic.select_font_for_word(text, lang)\n            assert font_mgr is not None, f\"No font selected for 
'{text}' ({lang})\"\n\n            if lang == 'ara':\n                assert multi_font_manager_arabic.has_all_glyphs(\n                    'NotoSansArabic-Regular', text\n                ), f\"NotoSansArabic cannot render '{text}'\"\n\n\n# =============================================================================\n# Baseline and Structure Tests\n# =============================================================================\n\n\nclass TestBaselineHandling:\n    \"\"\"Tests for baseline and hOCR structure handling.\"\"\"\n\n    @pytest.fixture\n    def multilingual_hocr(self):\n        \"\"\"Return path to multilingual HOCR test file.\"\"\"\n        return RESOURCES / \"multilingual.hocr\"\n\n    def test_multilingual_baseline_handling(self, multilingual_hocr):\n        \"\"\"Test that baseline information is correctly parsed from hOCR.\"\"\"\n        parser = HocrParser(multilingual_hocr)\n        page = parser.parse()\n\n        for line in page.lines:\n            if line.baseline:\n                # Baseline should be reasonable\n                assert (\n                    -1.0 <= line.baseline.slope <= 1.0\n                ), \"Baseline slope should be reasonable\"\n\n\n# =============================================================================\n# Font Coverage Tests\n# =============================================================================\n\n\nclass TestFontCoverage:\n    \"\"\"Tests verifying font coverage for various scripts.\"\"\"\n\n    def test_noto_sans_latin_coverage(self, multi_font_manager):\n        \"\"\"Test NotoSans covers common Latin characters and diacritics.\"\"\"\n        if not _latin_font_works(multi_font_manager):\n            pytest.skip(\"NotoSans font not available\")\n\n        latin_samples = [\n            \"Hello World\",\n            \"Café résumé naïve\",\n            \"Größe Zürich Ärger\",\n            \"ÀÁÂÃÄÅÆÇÈÉÊË\",\n            \"àáâãäåæçèéêë\",\n        ]\n\n        for sample in latin_samples:\n           
 assert multi_font_manager.has_all_glyphs(\n                'NotoSans-Regular', sample\n            ), f\"NotoSans should cover: {sample}\"\n\n    def test_noto_sans_arabic_coverage(self, multi_font_manager_arabic):\n        \"\"\"Test NotoSansArabic covers Arabic characters.\"\"\"\n        arabic_samples = [\n            \"مرحبا\",  # Hello\n            \"بالعالم\",  # World\n            \"العربية\",  # Arabic\n        ]\n\n        for sample in arabic_samples:\n            assert multi_font_manager_arabic.has_all_glyphs(\n                'NotoSansArabic-Regular', sample\n            ), f\"NotoSansArabic should cover: {sample}\"\n\n    def test_noto_sans_devanagari_coverage(self, multi_font_manager):\n        \"\"\"Test NotoSansDevanagari covers Devanagari characters.\"\"\"\n        if not _devanagari_font_works(multi_font_manager):\n            pytest.skip(\"NotoSansDevanagari font not available\")\n\n        devanagari_samples = [\n            \"नमस्ते\",  # Hello\n            \"हिंदी\",  # Hindi\n            \"संस्कृत\",  # Sanskrit\n        ]\n\n        for sample in devanagari_samples:\n            assert multi_font_manager.has_all_glyphs(\n                'NotoSansDevanagari-Regular', sample\n            ), f\"NotoSansDevanagari should cover: {sample}\"\n\n    def test_noto_sans_cjk_coverage(self, multi_font_manager):\n        \"\"\"Test NotoSansCJK covers CJK characters.\"\"\"\n        if not _cjk_font_works(multi_font_manager):\n            pytest.skip(\"CJK font not available or corrupted\")\n\n        cjk_samples = [\n            \"你好\",  # Chinese: Hello\n            \"世界\",  # Chinese: World\n            \"こんにちは\",  # Japanese: Hello\n            \"안녕하세요\",  # Korean: Hello\n        ]\n\n        for sample in cjk_samples:\n            assert multi_font_manager.has_all_glyphs(\n                'NotoSansCJK-Regular', sample\n            ), f\"NotoSansCJK should cover: {sample}\"\n\n\nif __name__ == \"__main__\":\n    # Allow running this test directly 
for quick iteration\n    import sys\n\n    sys.exit(pytest.main([__file__, \"-v\", \"-s\"]))\n"
  },
  {
    "path": "tests/test_null_ocr_engine.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Unit tests for NullOcrEngine (--ocr-engine none).\n\nTests verify that the Null OCR engine exists and functions correctly\nfor scenarios where users want PDF processing without OCR.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom pathlib import Path\nfrom unittest.mock import MagicMock\n\nimport pytest\n\n\nclass TestNullOcrEngineExists:\n    \"\"\"Test that NullOcrEngine plugin exists and is loadable.\"\"\"\n\n    def test_null_ocr_module_importable(self):\n        \"\"\"null_ocr module should be importable.\"\"\"\n        from ocrmypdf.builtin_plugins import null_ocr\n\n        assert null_ocr is not None\n\n    def test_null_ocr_engine_class_exists(self):\n        \"\"\"NullOcrEngine class should exist.\"\"\"\n        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine\n\n        assert NullOcrEngine is not None\n\n\nclass TestNullOcrEngineInterface:\n    \"\"\"Test NullOcrEngine implements OcrEngine interface.\"\"\"\n\n    def test_version_returns_none(self):\n        \"\"\"NullOcrEngine.version() should return 'none'.\"\"\"\n        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine\n\n        assert NullOcrEngine.version() == \"none\"\n\n    def test_creator_tag(self):\n        \"\"\"NullOcrEngine.creator_tag() should indicate no OCR.\"\"\"\n        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine\n\n        tag = NullOcrEngine.creator_tag(MagicMock())\n        tag_lower = tag.lower()\n        assert \"no ocr\" in tag_lower or \"null\" in tag_lower or \"none\" in tag_lower\n\n    def test_languages_returns_empty_set(self):\n        \"\"\"NullOcrEngine.languages() should return empty set.\"\"\"\n        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine\n\n        langs = NullOcrEngine.languages(MagicMock())\n        assert langs == set()\n\n    def test_supports_generate_ocr_returns_true(self):\n        
\"\"\"NullOcrEngine should support generate_ocr().\"\"\"\n        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine\n\n        assert NullOcrEngine.supports_generate_ocr() is True\n\n    def test_get_orientation_returns_zero(self):\n        \"\"\"NullOcrEngine.get_orientation() should return angle=0.\"\"\"\n        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine\n\n        result = NullOcrEngine.get_orientation(Path(\"test.png\"), MagicMock())\n        assert result.angle == 0\n\n    def test_get_deskew_returns_zero(self):\n        \"\"\"NullOcrEngine.get_deskew() should return 0.0.\"\"\"\n        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine\n\n        result = NullOcrEngine.get_deskew(Path(\"test.png\"), MagicMock())\n        assert result == 0.0\n\n\nclass TestNullOcrEngineGenerateOcr:\n    \"\"\"Test NullOcrEngine.generate_ocr() output.\"\"\"\n\n    @pytest.fixture\n    def sample_image(self, tmp_path):\n        \"\"\"Create a simple test image.\"\"\"\n        from PIL import Image\n\n        img_path = tmp_path / \"test.png\"\n        img = Image.new('RGB', (100, 100), color='white')\n        img.save(img_path, dpi=(300, 300))\n        return img_path\n\n    def test_generate_ocr_returns_tuple(self, sample_image):\n        \"\"\"generate_ocr() should return (OcrElement, str) tuple.\"\"\"\n        from ocrmypdf import OcrElement\n        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine\n\n        result = NullOcrEngine.generate_ocr(sample_image, MagicMock(), 0)\n\n        assert isinstance(result, tuple)\n        assert len(result) == 2\n        assert isinstance(result[0], OcrElement)\n        assert isinstance(result[1], str)\n\n    def test_generate_ocr_returns_empty_text(self, sample_image):\n        \"\"\"generate_ocr() should return empty text string.\"\"\"\n        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine\n\n        _, text = NullOcrEngine.generate_ocr(sample_image, MagicMock(), 0)\n\n     
   assert text == \"\"\n\n    def test_generate_ocr_returns_page_element(self, sample_image):\n        \"\"\"generate_ocr() should return OcrElement with ocr_class PAGE.\"\"\"\n        from ocrmypdf import OcrClass\n        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine\n\n        ocr_tree, _ = NullOcrEngine.generate_ocr(sample_image, MagicMock(), 0)\n\n        assert ocr_tree.ocr_class == OcrClass.PAGE\n\n    def test_generate_ocr_page_has_correct_dimensions(self, sample_image):\n        \"\"\"generate_ocr() page element should have image dimensions.\"\"\"\n        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine\n\n        ocr_tree, _ = NullOcrEngine.generate_ocr(sample_image, MagicMock(), 0)\n\n        # Image is 100x100\n        assert ocr_tree.bbox.right == 100\n        assert ocr_tree.bbox.bottom == 100\n\n\nclass TestOcrEngineOption:\n    \"\"\"Test --ocr-engine CLI option.\"\"\"\n\n    def test_ocr_engine_option_accepted(self):\n        \"\"\"CLI should accept --ocr-engine option.\"\"\"\n        from ocrmypdf.cli import get_parser\n\n        parser = get_parser()\n\n        # Should not raise\n        args = parser.parse_args(['--ocr-engine', 'none', 'in.pdf', 'out.pdf'])\n        assert args.ocr_engine == 'none'\n\n    def test_ocr_engine_choices_include_none(self):\n        \"\"\"--ocr-engine should include 'none' as a choice.\"\"\"\n        from ocrmypdf.cli import get_parser\n\n        parser = get_parser()\n\n        # Find the --ocr-engine action\n        for action in parser._actions:\n            if '--ocr-engine' in action.option_strings:\n                assert 'none' in action.choices\n                break\n        else:\n            pytest.fail(\"--ocr-engine option not found\")\n\n    def test_ocr_engine_choices_include_auto(self):\n        \"\"\"--ocr-engine should include 'auto' as default.\"\"\"\n        from ocrmypdf.cli import get_parser\n\n        parser = get_parser()\n\n        for action in parser._actions:\n   
         if '--ocr-engine' in action.option_strings:\n                assert 'auto' in action.choices\n                assert action.default == 'auto'\n                break\n"
  },
  {
    "path": "tests/test_ocr_element.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Unit tests for OcrElement dataclass and related classes.\"\"\"\n\nfrom __future__ import annotations\n\nimport pytest\n\nfrom ocrmypdf.hocrtransform import (\n    Baseline,\n    BoundingBox,\n    FontInfo,\n    OcrClass,\n    OcrElement,\n)\n\n\nclass TestBoundingBox:\n    \"\"\"Tests for BoundingBox dataclass.\"\"\"\n\n    def test_basic_creation(self):\n        bbox = BoundingBox(left=10, top=20, right=100, bottom=50)\n        assert bbox.left == 10\n        assert bbox.top == 20\n        assert bbox.right == 100\n        assert bbox.bottom == 50\n\n    def test_width_height(self):\n        bbox = BoundingBox(left=10, top=20, right=110, bottom=70)\n        assert bbox.width == 100\n        assert bbox.height == 50\n\n    def test_zero_size_box(self):\n        bbox = BoundingBox(left=10, top=20, right=10, bottom=20)\n        assert bbox.width == 0\n        assert bbox.height == 0\n\n    def test_invalid_left_right(self):\n        with pytest.raises(ValueError, match=\"right.*left\"):\n            BoundingBox(left=100, top=20, right=10, bottom=50)\n\n    def test_invalid_top_bottom(self):\n        with pytest.raises(ValueError, match=\"bottom.*top\"):\n            BoundingBox(left=10, top=50, right=100, bottom=20)\n\n\nclass TestBaseline:\n    \"\"\"Tests for Baseline dataclass.\"\"\"\n\n    def test_defaults(self):\n        baseline = Baseline()\n        assert baseline.slope == 0.0\n        assert baseline.intercept == 0.0\n\n    def test_with_values(self):\n        baseline = Baseline(slope=0.01, intercept=-5)\n        assert baseline.slope == 0.01\n        assert baseline.intercept == -5\n\n\nclass TestFontInfo:\n    \"\"\"Tests for FontInfo dataclass.\"\"\"\n\n    def test_defaults(self):\n        font = FontInfo()\n        assert font.name is None\n        assert font.size is None\n        assert font.bold is False\n        assert font.italic is 
False\n\n    def test_with_values(self):\n        font = FontInfo(name=\"Arial\", size=12.0, bold=True)\n        assert font.name == \"Arial\"\n        assert font.size == 12.0\n        assert font.bold is True\n        assert font.italic is False\n\n\nclass TestOcrElement:\n    \"\"\"Tests for OcrElement dataclass.\"\"\"\n\n    def test_minimal_element(self):\n        elem = OcrElement(ocr_class=OcrClass.WORD, text=\"hello\")\n        assert elem.ocr_class == \"ocrx_word\"\n        assert elem.text == \"hello\"\n        assert elem.bbox is None\n        assert elem.children == []\n\n    def test_element_with_bbox(self):\n        bbox = BoundingBox(left=0, top=0, right=100, bottom=50)\n        elem = OcrElement(ocr_class=OcrClass.LINE, bbox=bbox)\n        assert elem.bbox == bbox\n        assert elem.bbox.width == 100\n\n    def test_element_hierarchy(self):\n        word1 = OcrElement(ocr_class=OcrClass.WORD, text=\"Hello\")\n        word2 = OcrElement(ocr_class=OcrClass.WORD, text=\"World\")\n        line = OcrElement(ocr_class=OcrClass.LINE, children=[word1, word2])\n        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, children=[line])\n        page = OcrElement(ocr_class=OcrClass.PAGE, children=[paragraph])\n\n        assert len(page.children) == 1\n        assert len(page.children[0].children) == 1\n        assert len(page.children[0].children[0].children) == 2\n\n    def test_iter_by_class_single(self):\n        word = OcrElement(ocr_class=OcrClass.WORD, text=\"test\")\n        line = OcrElement(ocr_class=OcrClass.LINE, children=[word])\n        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])\n\n        words = page.iter_by_class(OcrClass.WORD)\n        assert len(words) == 1\n        assert words[0].text == \"test\"\n\n    def test_iter_by_class_multiple(self):\n        words = [\n            OcrElement(ocr_class=OcrClass.WORD, text=\"one\"),\n            OcrElement(ocr_class=OcrClass.WORD, text=\"two\"),\n            
OcrElement(ocr_class=OcrClass.WORD, text=\"three\"),\n        ]\n        line = OcrElement(ocr_class=OcrClass.LINE, children=words)\n        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])\n\n        result = page.iter_by_class(OcrClass.WORD)\n        assert len(result) == 3\n        assert [w.text for w in result] == [\"one\", \"two\", \"three\"]\n\n    def test_iter_by_class_multiple_types(self):\n        line = OcrElement(ocr_class=OcrClass.LINE)\n        header = OcrElement(ocr_class=OcrClass.HEADER)\n        caption = OcrElement(ocr_class=OcrClass.CAPTION)\n        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line, header, caption])\n\n        result = page.iter_by_class(OcrClass.LINE, OcrClass.HEADER)\n        assert len(result) == 2\n\n    def test_find_by_class(self):\n        word = OcrElement(ocr_class=OcrClass.WORD, text=\"found\")\n        line = OcrElement(ocr_class=OcrClass.LINE, children=[word])\n        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])\n\n        result = page.find_by_class(OcrClass.WORD)\n        assert result is not None\n        assert result.text == \"found\"\n\n    def test_find_by_class_not_found(self):\n        line = OcrElement(ocr_class=OcrClass.LINE)\n        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])\n\n        result = page.find_by_class(OcrClass.WORD)\n        assert result is None\n\n    def test_get_text_recursive_leaf(self):\n        word = OcrElement(ocr_class=OcrClass.WORD, text=\"hello\")\n        assert word.get_text_recursive() == \"hello\"\n\n    def test_get_text_recursive_nested(self):\n        word1 = OcrElement(ocr_class=OcrClass.WORD, text=\"Hello\")\n        word2 = OcrElement(ocr_class=OcrClass.WORD, text=\"World\")\n        line = OcrElement(ocr_class=OcrClass.LINE, children=[word1, word2])\n\n        assert line.get_text_recursive() == \"Hello World\"\n\n    def test_words_property(self):\n        words = [\n            OcrElement(ocr_class=OcrClass.WORD, 
text=\"a\"),\n            OcrElement(ocr_class=OcrClass.WORD, text=\"b\"),\n        ]\n        line = OcrElement(ocr_class=OcrClass.LINE, children=words)\n        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])\n\n        assert len(page.words) == 2\n        assert page.words[0].text == \"a\"\n\n    def test_lines_property(self):\n        line1 = OcrElement(ocr_class=OcrClass.LINE)\n        line2 = OcrElement(ocr_class=OcrClass.HEADER)  # Also a line type\n        par = OcrElement(ocr_class=OcrClass.PARAGRAPH, children=[line1, line2])\n        page = OcrElement(ocr_class=OcrClass.PAGE, children=[par])\n\n        assert len(page.lines) == 2\n\n    def test_paragraphs_property(self):\n        par1 = OcrElement(ocr_class=OcrClass.PARAGRAPH)\n        par2 = OcrElement(ocr_class=OcrClass.PARAGRAPH)\n        page = OcrElement(ocr_class=OcrClass.PAGE, children=[par1, par2])\n\n        assert len(page.paragraphs) == 2\n\n    def test_direction_ltr(self):\n        elem = OcrElement(ocr_class=OcrClass.PARAGRAPH, direction=\"ltr\")\n        assert elem.direction == \"ltr\"\n\n    def test_direction_rtl(self):\n        elem = OcrElement(ocr_class=OcrClass.PARAGRAPH, direction=\"rtl\")\n        assert elem.direction == \"rtl\"\n\n    def test_language(self):\n        elem = OcrElement(ocr_class=OcrClass.PARAGRAPH, language=\"eng\")\n        assert elem.language == \"eng\"\n\n    def test_baseline(self):\n        baseline = Baseline(slope=0.01, intercept=-3)\n        elem = OcrElement(ocr_class=OcrClass.LINE, baseline=baseline)\n        assert elem.baseline.slope == 0.01\n        assert elem.baseline.intercept == -3\n\n    def test_textangle(self):\n        elem = OcrElement(ocr_class=OcrClass.LINE, textangle=5.0)\n        assert elem.textangle == 5.0\n\n    def test_confidence(self):\n        elem = OcrElement(ocr_class=OcrClass.WORD, confidence=0.95)\n        assert elem.confidence == 0.95\n\n    def test_page_properties(self):\n        elem = OcrElement(\n         
   ocr_class=OcrClass.PAGE,\n            dpi=300.0,\n            page_number=0,\n            logical_page_number=1,\n        )\n        assert elem.dpi == 300.0\n        assert elem.page_number == 0\n        assert elem.logical_page_number == 1\n\n\nclass TestOcrClass:\n    \"\"\"Tests for OcrClass constants.\"\"\"\n\n    def test_class_values(self):\n        assert OcrClass.PAGE == \"ocr_page\"\n        assert OcrClass.PARAGRAPH == \"ocr_par\"\n        assert OcrClass.LINE == \"ocr_line\"\n        assert OcrClass.WORD == \"ocrx_word\"\n        assert OcrClass.HEADER == \"ocr_header\"\n        assert OcrClass.CAPTION == \"ocr_caption\"\n\n    def test_line_types_frozenset(self):\n        assert OcrClass.LINE in OcrClass.LINE_TYPES\n        assert OcrClass.HEADER in OcrClass.LINE_TYPES\n        assert OcrClass.CAPTION in OcrClass.LINE_TYPES\n        assert OcrClass.WORD not in OcrClass.LINE_TYPES\n"
  },
  {
    "path": "tests/test_ocr_engine_interface.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Unit tests for OcrEngine interface extensions.\n\nThese tests verify that the OcrEngine ABC has the new generate_ocr() method\nand that OcrElement classes are exported from the public API.\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom pathlib import Path\nfrom unittest.mock import MagicMock\n\nimport pytest\n\nfrom ocrmypdf.pluginspec import OcrEngine\n\n\nclass TestOcrEngineInterface:\n    \"\"\"Test that OcrEngine ABC has required methods.\"\"\"\n\n    def test_generate_ocr_method_exists(self):\n        \"\"\"OcrEngine must have generate_ocr() method signature.\"\"\"\n        assert hasattr(OcrEngine, 'generate_ocr')\n\n    def test_supports_generate_ocr_method_exists(self):\n        \"\"\"OcrEngine must have supports_generate_ocr() method.\"\"\"\n        assert hasattr(OcrEngine, 'supports_generate_ocr')\n\n    def test_supports_generate_ocr_default_false(self):\n        \"\"\"Default supports_generate_ocr() should return False.\"\"\"\n        from ocrmypdf.pluginspec import OrientationConfidence\n\n        # Create a minimal concrete implementation\n        class MinimalEngine(OcrEngine):\n            @staticmethod\n            def version():\n                return \"1.0\"\n\n            @staticmethod\n            def creator_tag(options):\n                return \"test\"\n\n            def __str__(self):\n                return \"test\"\n\n            @staticmethod\n            def languages(options):\n                return set()\n\n            @staticmethod\n            def get_orientation(input_file, options):\n                return OrientationConfidence(0, 0.0)\n\n            @staticmethod\n            def get_deskew(input_file, options):\n                return 0.0\n\n            @staticmethod\n            def generate_hocr(input_file, output_hocr, output_text, options):\n                pass\n\n            @staticmethod\n            def 
generate_pdf(input_file, output_pdf, output_text, options):\n                pass\n\n        engine = MinimalEngine()\n        assert engine.supports_generate_ocr() is False\n\n    def test_generate_ocr_raises_not_implemented_by_default(self):\n        \"\"\"Default generate_ocr() should raise NotImplementedError.\"\"\"\n        from ocrmypdf.pluginspec import OrientationConfidence\n\n        class MinimalEngine(OcrEngine):\n            @staticmethod\n            def version():\n                return \"1.0\"\n\n            @staticmethod\n            def creator_tag(options):\n                return \"test\"\n\n            def __str__(self):\n                return \"test\"\n\n            @staticmethod\n            def languages(options):\n                return set()\n\n            @staticmethod\n            def get_orientation(input_file, options):\n                return OrientationConfidence(0, 0.0)\n\n            @staticmethod\n            def get_deskew(input_file, options):\n                return 0.0\n\n            @staticmethod\n            def generate_hocr(input_file, output_hocr, output_text, options):\n                pass\n\n            @staticmethod\n            def generate_pdf(input_file, output_pdf, output_text, options):\n                pass\n\n        engine = MinimalEngine()\n        with pytest.raises(NotImplementedError):\n            engine.generate_ocr(Path(\"test.png\"), MagicMock(), 0)\n\n\nclass TestOcrElementExport:\n    \"\"\"Test that OcrElement is exported from public API.\"\"\"\n\n    def test_ocrelement_importable_from_ocrmypdf(self):\n        \"\"\"OcrElement should be importable from ocrmypdf package.\"\"\"\n        from ocrmypdf import OcrElement\n\n        assert OcrElement is not None\n\n    def test_ocrclass_importable_from_ocrmypdf(self):\n        \"\"\"OcrClass should be importable from ocrmypdf package.\"\"\"\n        from ocrmypdf import OcrClass\n\n        assert OcrClass is not None\n\n    def 
test_boundingbox_importable_from_ocrmypdf(self):\n        \"\"\"BoundingBox should be importable from ocrmypdf package.\"\"\"\n        from ocrmypdf import BoundingBox\n\n        assert BoundingBox is not None\n"
  },
  {
    "path": "tests/test_ocr_engine_selection.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Unit tests for OCR engine selection mechanism.\n\nTests verify that the --ocr-engine option works correctly and that\nengine-specific options are available.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport pytest\n\n\nclass TestOcrEngineCliOption:\n    \"\"\"Test --ocr-engine CLI option.\"\"\"\n\n    def test_ocr_engine_option_exists(self):\n        \"\"\"CLI should have --ocr-engine option.\"\"\"\n        from ocrmypdf.cli import get_parser\n\n        parser = get_parser()\n\n        option_strings = []\n        for action in parser._actions:\n            option_strings.extend(action.option_strings)\n\n        assert '--ocr-engine' in option_strings\n\n    def test_ocr_engine_accepts_tesseract(self):\n        \"\"\"--ocr-engine should accept 'tesseract'.\"\"\"\n        from ocrmypdf.cli import get_parser\n\n        parser = get_parser()\n\n        args = parser.parse_args(['--ocr-engine', 'tesseract', 'in.pdf', 'out.pdf'])\n        assert args.ocr_engine == 'tesseract'\n\n    def test_ocr_engine_accepts_auto(self):\n        \"\"\"--ocr-engine should accept 'auto'.\"\"\"\n        from ocrmypdf.cli import get_parser\n\n        parser = get_parser()\n\n        args = parser.parse_args(['--ocr-engine', 'auto', 'in.pdf', 'out.pdf'])\n        assert args.ocr_engine == 'auto'\n\n    def test_ocr_engine_accepts_none(self):\n        \"\"\"--ocr-engine should accept 'none'.\"\"\"\n        from ocrmypdf.cli import get_parser\n\n        parser = get_parser()\n\n        args = parser.parse_args(['--ocr-engine', 'none', 'in.pdf', 'out.pdf'])\n        assert args.ocr_engine == 'none'\n\n    def test_ocr_engine_default_is_auto(self):\n        \"\"\"--ocr-engine should default to 'auto'.\"\"\"\n        from ocrmypdf.cli import get_parser\n\n        parser = get_parser()\n\n        args = parser.parse_args(['in.pdf', 'out.pdf'])\n        assert args.ocr_engine == 
'auto'\n\n    def test_ocr_engine_rejects_invalid(self):\n        \"\"\"--ocr-engine should reject invalid values.\"\"\"\n        from ocrmypdf.cli import get_parser\n\n        parser = get_parser()\n\n        with pytest.raises(SystemExit):\n            parser.parse_args(['--ocr-engine', 'invalid_engine', 'in.pdf', 'out.pdf'])\n\n\nclass TestOcrEngineOptionsModel:\n    \"\"\"Test OcrOptions has ocr_engine field.\"\"\"\n\n    def test_ocr_options_has_ocr_engine_field(self):\n        \"\"\"OcrOptions should have ocr_engine field.\"\"\"\n        from ocrmypdf._options import OcrOptions\n\n        # Check field exists in model\n        assert 'ocr_engine' in OcrOptions.model_fields\n\n\nclass TestOcrEnginePluginSelection:\n    \"\"\"Test that get_ocr_engine() hook selects correct engine based on options.\"\"\"\n\n    def test_tesseract_selected_when_auto(self):\n        \"\"\"TesseractOcrEngine should be returned when ocr_engine='auto'.\"\"\"\n        from unittest.mock import MagicMock\n\n        from ocrmypdf.builtin_plugins import tesseract_ocr\n        from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine\n\n        options = MagicMock()\n        options.ocr_engine = 'auto'\n\n        engine = tesseract_ocr.get_ocr_engine(options=options)\n        assert isinstance(engine, TesseractOcrEngine)\n\n    def test_tesseract_selected_when_tesseract(self):\n        \"\"\"TesseractOcrEngine should be returned when ocr_engine='tesseract'.\"\"\"\n        from unittest.mock import MagicMock\n\n        from ocrmypdf.builtin_plugins import tesseract_ocr\n        from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine\n\n        options = MagicMock()\n        options.ocr_engine = 'tesseract'\n\n        engine = tesseract_ocr.get_ocr_engine(options=options)\n        assert isinstance(engine, TesseractOcrEngine)\n\n    def test_null_selected_when_none(self):\n        \"\"\"NullOcrEngine should be returned when ocr_engine='none'.\"\"\"\n        from 
unittest.mock import MagicMock\n\n        from ocrmypdf.builtin_plugins import null_ocr\n        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine\n\n        options = MagicMock()\n        options.ocr_engine = 'none'\n\n        engine = null_ocr.get_ocr_engine(options=options)\n        assert isinstance(engine, NullOcrEngine)\n\n    def test_null_returns_none_when_auto(self):\n        \"\"\"null_ocr.get_ocr_engine() should return None when ocr_engine='auto'.\"\"\"\n        from unittest.mock import MagicMock\n\n        from ocrmypdf.builtin_plugins import null_ocr\n\n        options = MagicMock()\n        options.ocr_engine = 'auto'\n\n        engine = null_ocr.get_ocr_engine(options=options)\n        assert engine is None\n"
  },
  {
    "path": "tests/test_optimize.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nfrom io import BytesIO\nfrom os import fspath\nfrom pathlib import Path\nfrom unittest.mock import patch\n\nimport img2pdf\nimport pikepdf\nimport pytest\nfrom pikepdf import Array, Dictionary, Name\nfrom PIL import Image, ImageDraw\n\nfrom ocrmypdf import optimize as opt\nfrom ocrmypdf._exec import jbig2enc, pngquant\nfrom ocrmypdf._exec.ghostscript import rasterize_pdf\nfrom ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution\nfrom ocrmypdf.optimize import PdfImage, extract_image_filter\nfrom ocrmypdf.pluginspec import GhostscriptRasterDevice\nfrom tests.conftest import check_ocrmypdf\n\nneeds_pngquant = pytest.mark.skipif(\n    not pngquant.available(), reason=\"pngquant not installed\"\n)\nneeds_jbig2enc = pytest.mark.skipif(\n    not jbig2enc.available(), reason=\"jbig2enc not installed\"\n)\n\n\n# pylint:disable=redefined-outer-name\n\n\n@pytest.fixture(scope=\"session\")\ndef palette(resources):\n    return resources / 'palette.pdf'\n\n\n@needs_pngquant\n@pytest.mark.parametrize('pdf', ['multipage', 'palette'])\ndef test_basic(multipage, palette, pdf, outpdf):\n    infile = multipage if pdf == 'multipage' else palette\n    opt.main(infile, outpdf, level=3)\n\n    assert 0.98 * Path(outpdf).stat().st_size <= Path(infile).stat().st_size\n\n\n@needs_pngquant\ndef test_mono_not_inverted(resources, outdir):\n    infile = resources / '2400dpi.pdf'\n    opt.main(infile, outdir / 'out.pdf', level=3)\n\n    rasterize_pdf(\n        outdir / 'out.pdf',\n        outdir / 'im.png',\n        raster_device=GhostscriptRasterDevice.PNGGRAY,\n        raster_dpi=Resolution(10, 10),\n    )\n\n    with Image.open(fspath(outdir / 'im.png')) as im:\n        assert im.getpixel((0, 0)) > 240, \"Expected white background\"\n\n\n@needs_pngquant\ndef test_jpg_png_params(resources, outpdf):\n    check_ocrmypdf(\n        resources / 'crom.png',\n      
  outpdf,\n        '--image-dpi',\n        '200',\n        '--optimize',\n        '3',\n        '--jpg-quality',\n        '50',\n        '--png-quality',\n        '20',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n\n@needs_jbig2enc\ndef test_jbig2_lossless(resources, outpdf):\n    \"\"\"Test that JBIG2 lossless encoding works without JBIG2Globals.\"\"\"\n    args = [\n        resources / 'ccitt.pdf',\n        outpdf,\n        '--image-dpi',\n        '200',\n        '--optimize',\n        '3',\n        '--jpg-quality',\n        '50',\n        '--png-quality',\n        '20',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n        '--jbig2-threshold',\n        '0.7',\n    ]\n\n    check_ocrmypdf(*args)\n\n    with pikepdf.open(outpdf) as pdf:\n        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))\n        assert pim.filters[0] == '/JBIG2Decode'\n        # Lossless JBIG2 has no JBIG2Globals (no shared symbol dictionary)\n        assert len(pim.decode_parms) == 0\n\n\n@needs_pngquant\n@needs_jbig2enc\ndef test_flate_to_jbig2(resources, outdir):\n    # This test requires an image that pngquant is capable of converting to\n    # to 1bpp - so use an existing 1bpp image, convert up, confirm it can\n    # convert down\n    with Image.open(fspath(resources / 'typewriter.png')) as im:\n        assert im.mode in ('1', 'P')\n        im = im.convert('L')\n        im.save(fspath(outdir / 'type8.png'))\n\n    check_ocrmypdf(\n        outdir / 'type8.png',\n        outdir / 'out.pdf',\n        '--image-dpi',\n        '100',\n        '--png-quality',\n        '50',\n        '--optimize',\n        '3',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n    with pikepdf.open(outdir / 'out.pdf') as pdf:\n        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))\n        assert pim.filters[0] == '/JBIG2Decode'\n\n\n@needs_pngquant\ndef test_multiple_pngs(resources, outdir):\n    with 
Path.open(outdir / 'in.pdf', 'wb') as inpdf:\n        img2pdf.convert(\n            fspath(resources / 'baiona_colormapped.png'),\n            fspath(resources / 'baiona_gray.png'),\n            outputstream=inpdf,\n            **IMG2PDF_KWARGS,\n        )\n\n    def mockquant(input_file, output_file, *_args):\n        with Image.open(input_file) as im:\n            draw = ImageDraw.Draw(im)\n            draw.rectangle((0, 0, im.width, im.height), fill=128)\n            im.save(output_file)\n\n    with patch('ocrmypdf.optimize.pngquant.quantize') as mock:\n        mock.side_effect = mockquant\n        check_ocrmypdf(\n            outdir / 'in.pdf',\n            outdir / 'out.pdf',\n            '--optimize',\n            '3',\n            '--jobs',\n            '1',\n            '--use-threads',\n            '--output-type',\n            'pdf',\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        )\n        mock.assert_called()\n\n    with (\n        pikepdf.open(outdir / 'in.pdf') as inpdf,\n        pikepdf.open(outdir / 'out.pdf') as outpdf,\n    ):\n        for n in range(len(inpdf.pages)):\n            inim = next(iter(inpdf.pages[n].images.values()))\n            outim = next(iter(outpdf.pages[n].images.values()))\n            assert len(outim.read_raw_bytes()) < len(inim.read_raw_bytes()), n\n\n\ndef test_optimize_off(resources, outpdf):\n    check_ocrmypdf(\n        resources / 'trivial.pdf',\n        outpdf,\n        '--optimize=0',\n        '--output-type',\n        'pdf',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n\ndef test_group3(resources):\n    with pikepdf.open(resources / 'ccitt.pdf') as pdf:\n        im = pdf.pages[0].Resources.XObject['/Im1']\n        assert (\n            opt.extract_image_filter(im, im.objgen[0]) is not None\n        ), \"Group 4 should be allowed\"\n\n        im.DecodeParms['/K'] = 0\n        assert (\n            opt.extract_image_filter(im, im.objgen[0]) is None\n    
    ), \"Group 3 should be disallowed\"\n\n\ndef test_find_formx(resources):\n    with pikepdf.open(resources / 'formxobject.pdf') as pdf:\n        working, pagenos = opt._find_image_xrefs(pdf)\n        assert len(working) == 1\n        xref = next(iter(working))\n        assert pagenos[xref] == 0\n\n\ndef test_extract_image_filter_with_pdf_image():\n    image = Dictionary()\n    image.Subtype = Name.Image\n    image.Length = 200\n    image.Width = 10\n    image.Height = 10\n    image.Filter = [Name.FlateDecode, Name.DCTDecode]\n    pdf_image = PdfImage(image)\n    image.BitsPerComponent = 8\n    assert extract_image_filter(image, None) == (\n        pdf_image,\n        pdf_image.filter_decodeparms[1],\n    )\n\n\ndef test_extract_image_filter_with_non_image():\n    image = Dictionary()\n    image.Subtype = Name.Form\n    assert extract_image_filter(image, None) is None\n\n\ndef test_extract_image_filter_with_small_stream_size():\n    image = Dictionary()\n    image.Subtype = Name.Image\n    image.Length = 50\n    assert extract_image_filter(image, None) is None\n\n\ndef test_extract_image_filter_with_small_dimensions():\n    image = Dictionary()\n    image.Subtype = Name.Image\n    image.Length = 200\n    image.Width = 5\n    image.Height = 5\n    assert extract_image_filter(image, None) is None\n\n\ndef test_extract_image_filter_with_multiple_compression_filters():\n    image = Dictionary()\n    image.Subtype = Name.Image\n    image.Length = 200\n    image.Width = 10\n    image.Height = 10\n    image.BitsPerComponent = 8\n    image.Filter = [Name.ASCII85Decode, Name.FlateDecode, Name.DCTDecode]\n    assert extract_image_filter(image, None) is None\n\n\ndef test_extract_image_filter_with_wide_gamut_image():\n    image = Dictionary()\n    image.Subtype = Name.Image\n    image.Length = 200\n    image.Width = 10\n    image.Height = 10\n    image.BitsPerComponent = 16\n    image.Filter = Name.FlateDecode\n    assert extract_image_filter(image, None) is None\n\n\ndef 
test_extract_image_filter_with_jpeg2000_image():\n    im = Image.new('RGB', (10, 10))\n    bio = BytesIO()\n    im.save(bio, format='JPEG2000')\n    pdf = pikepdf.new()\n    stream = pdf.make_stream(\n        data=bio.getvalue(),\n        Subtype=Name.Image,\n        Length=200,\n        Width=10,\n        Height=10,\n        BitsPerComponent=8,\n        Filter=Name.JPXDecode,\n    )\n    assert extract_image_filter(stream, None) is None\n\n\ndef test_extract_image_filter_with_ccitt_group_3_image():\n    image = Dictionary()\n    image.Subtype = Name.Image\n    image.Length = 200\n    image.Width = 10\n    image.Height = 10\n    image.BitsPerComponent = 1\n    image.Filter = Name.CCITTFaxDecode\n    image.DecodeParms = Array([Dictionary(K=1)])\n    assert extract_image_filter(image, None) is None\n\n\n# Triggers pikepdf bug\n# def test_extract_image_filter_with_decode_table():\n#     image = Dictionary()\n#     image.Subtype = Name.Image\n#     image.Length = 200\n#     image.Width = 10\n#     image.Height = 10\n#     image.Filter = Name.FlateDecode\n#     image.BitsPerComponent = 8\n#     image.ColorSpace = Name.DeviceGray\n#     image.Decode = [42, 0]\n#     assert extract_image_filter(image, None) is None\n\n\ndef test_extract_image_filter_with_rgb_smask_matte():\n    image = Dictionary()\n    image.Subtype = Name.Image\n    image.Length = 200\n    image.Width = 10\n    image.Height = 10\n    image.Filter = Name.FlateDecode\n    image.BitsPerComponent = 8\n    image.ColorSpace = Name.DeviceRGB\n    image.SMask = Dictionary(\n        Type=Name.Image,\n        Subtype=Name.Image,\n        Length=200,\n        Width=10,\n        Height=10,\n        Filter=Name.FlateDecode,\n        BitsPerComponent=8,\n        ColorSpace=Name.DeviceGray,\n        Matte=Array([1, 2, 3]),\n    )\n    assert extract_image_filter(image, None) is None\n"
  },
  {
    "path": "tests/test_page_boxes.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport pikepdf\nimport pytest\n\nfrom ocrmypdf._exec import verapdf\n\nfrom .conftest import check_ocrmypdf\n\npage_rect = [0, 0, 612, 792]\ninset_rect = [200, 200, 612, 792]\nwh_rect = [0, 0, 412, 592]\n\nneg_rect = [-100, -100, 512, 692]\n\n# When speculative PDF/A succeeds (verapdf available), MediaBox is preserved.\n# Ghostscript would normalize MediaBox to start at origin, but speculative\n# conversion bypasses Ghostscript.\n_pdfa_inset_expected = inset_rect if verapdf.available() else wh_rect\n\nmediabox_testdata = [\n    ('fpdf2', 'pdfa', 'ccitt.pdf', None, inset_rect, _pdfa_inset_expected),\n    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, _pdfa_inset_expected),\n    ('fpdf2', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),\n    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),\n    (\n        'fpdf2',\n        'pdfa',\n        'ccitt.pdf',\n        '--force-ocr',\n        inset_rect,\n        wh_rect,\n    ),\n    (\n        'fpdf2',\n        'pdf',\n        'ccitt.pdf',\n        '--force-ocr',\n        inset_rect,\n        wh_rect,\n    ),\n    ('fpdf2', 'pdfa', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),\n    ('fpdf2', 'pdf', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),\n]\n\n\n@pytest.mark.parametrize(\n    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', mediabox_testdata\n)\ndef test_media_box(\n    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected\n):\n    with pikepdf.open(resources / in_pdf) as pdf:\n        page = pdf.pages[0]\n        page.MediaBox = crop_to\n        pdf.save(outdir / 'cropped.pdf')\n    args = [\n        '--jobs',\n        '1',\n        '--pdf-renderer',\n        renderer,\n        '--output-type',\n        output_type,\n    ]\n    if mode:\n        args.append(mode)\n\n    check_ocrmypdf(outdir / 'cropped.pdf', 
outdir / 'processed.pdf', *args)\n\n    with pikepdf.open(outdir / 'processed.pdf') as pdf:\n        page = pdf.pages[0]\n        assert [float(x) for x in page.mediabox] == crop_expected\n\n\ncropbox_testdata = [\n    ('fpdf2', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),\n    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),\n    ('fpdf2', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),\n    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),\n    (\n        'fpdf2',\n        'pdfa',\n        'ccitt.pdf',\n        '--force-ocr',\n        inset_rect,\n        inset_rect,\n    ),\n    (\n        'fpdf2',\n        'pdf',\n        'ccitt.pdf',\n        '--force-ocr',\n        inset_rect,\n        inset_rect,\n    ),\n]\n\n\n@pytest.mark.parametrize(\n    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', cropbox_testdata\n)\ndef test_crop_box(\n    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected\n):\n    with pikepdf.open(resources / in_pdf) as pdf:\n        page = pdf.pages[0]\n        page.CropBox = crop_to\n        pdf.save(outdir / 'cropped.pdf')\n    args = [\n        '--jobs',\n        '1',\n        '--pdf-renderer',\n        renderer,\n        '--output-type',\n        output_type,\n        '--optimize',\n        '0',\n    ]\n    if mode:\n        args.append(mode)\n\n    check_ocrmypdf(outdir / 'cropped.pdf', outdir / 'processed.pdf', *args)\n\n    with pikepdf.open(outdir / 'processed.pdf') as pdf:\n        page = pdf.pages[0]\n        assert [float(x) for x in page.cropbox] == crop_expected\n"
  },
  {
    "path": "tests/test_page_numbers.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport pytest\n\nimport ocrmypdf\nfrom ocrmypdf._options import _pages_from_ranges\nfrom ocrmypdf.exceptions import BadArgsError\nfrom ocrmypdf.pdfinfo import PdfInfo\n\n\n@pytest.mark.parametrize(\n    'pages, result',\n    [\n        ['1', {0}],\n        ['1,2', {0, 1}],\n        ['1-3', {0, 1, 2}],\n        ['2,5,6', {1, 4, 5}],\n        ['11-15, 18, ', {10, 11, 12, 13, 14, 17}],\n        [',,3', {2}],\n        ['3, 3, 3, 3,', {2}],\n        ['3, 2, 1, 42', {0, 1, 2, 41}],\n        ['-1', BadArgsError],\n        ['1,3,-11', BadArgsError],\n        ['1-,', BadArgsError],\n        ['start-end', BadArgsError],\n        ['1-0', BadArgsError],\n        ['99-98', BadArgsError],\n        ['0-0', BadArgsError],\n        ['1-0,3-4', BadArgsError],\n        [',', BadArgsError],\n        ['', BadArgsError],\n    ],\n)\ndef test_pages(pages, result):\n    if isinstance(result, type):\n        with pytest.raises(result):\n            _pages_from_ranges(pages)\n    else:\n        assert _pages_from_ranges(pages) == result\n\n\ndef test_nonmonotonic_warning(caplog):\n    pages = _pages_from_ranges('1, 3, 2')\n    assert pages == {0, 1, 2}\n    assert 'out of order' in caplog.text\n\n\ndef test_limited_pages(multipage, outpdf):\n    ocrmypdf.ocr(\n        multipage,\n        outpdf,\n        pages='5-6',\n        optimize=0,\n        output_type='pdf',\n        plugins=['tests/plugins/tesseract_cache.py'],\n    )\n    pi = PdfInfo(outpdf)\n    assert not pi.pages[0].has_text\n    assert pi.pages[4].has_text\n    assert pi.pages[5].has_text\n"
  },
  {
    "path": "tests/test_pdf_renderer.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Unit tests for Fpdf2PdfRenderer class.\"\"\"\n\nfrom __future__ import annotations\n\nfrom io import StringIO\nfrom pathlib import Path\n\nimport pytest\nfrom pdfminer.converter import TextConverter\nfrom pdfminer.layout import LAParams\nfrom pdfminer.pdfdocument import PDFDocument\nfrom pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager\nfrom pdfminer.pdfpage import PDFPage\nfrom pdfminer.pdfparser import PDFParser\n\nfrom ocrmypdf.font import MultiFontManager\nfrom ocrmypdf.fpdf_renderer import DebugRenderOptions, Fpdf2PdfRenderer\nfrom ocrmypdf.helpers import check_pdf\nfrom ocrmypdf.hocrtransform import (\n    Baseline,\n    BoundingBox,\n    OcrClass,\n    OcrElement,\n)\n\n\ndef text_from_pdf(filename: Path) -> str:\n    \"\"\"Extract text from a PDF file using pdfminer.\"\"\"\n    output_string = StringIO()\n    with open(filename, 'rb') as in_file:\n        parser = PDFParser(in_file)\n        doc = PDFDocument(parser)\n        rsrcmgr = PDFResourceManager()\n        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())\n        interpreter = PDFPageInterpreter(rsrcmgr, device)\n        for page in PDFPage.create_pages(doc):\n            interpreter.process_page(page)\n    return output_string.getvalue()\n\n\n@pytest.fixture\ndef font_dir():\n    \"\"\"Get the font directory.\"\"\"\n    return Path(__file__).parent.parent / \"src\" / \"ocrmypdf\" / \"data\"\n\n\n@pytest.fixture\ndef multi_font_manager(font_dir):\n    \"\"\"Create a MultiFontManager for tests.\"\"\"\n    return MultiFontManager(font_dir)\n\n\ndef create_simple_page(\n    width: float = 1000,\n    height: float = 500,\n    words: list[tuple[str, tuple[float, float, float, float]]] | None = None,\n) -> OcrElement:\n    \"\"\"Create a simple OcrElement page for testing.\n\n    Args:\n        width: Page width in pixels\n        height: Page height in pixels\n   
     words: List of (text, (left, top, right, bottom)) tuples\n\n    Returns:\n        OcrElement representing the page\n    \"\"\"\n    if words is None:\n        words = [(\"Hello\", (100, 100, 200, 150)), (\"World\", (250, 100, 350, 150))]\n\n    word_elements = [\n        OcrElement(\n            ocr_class=OcrClass.WORD,\n            text=text,\n            bbox=BoundingBox(left=bbox[0], top=bbox[1], right=bbox[2], bottom=bbox[3]),\n        )\n        for text, bbox in words\n    ]\n\n    line = OcrElement(\n        ocr_class=OcrClass.LINE,\n        bbox=BoundingBox(left=100, top=100, right=900, bottom=150),\n        baseline=Baseline(slope=0.0, intercept=0),\n        children=word_elements,\n    )\n\n    paragraph = OcrElement(\n        ocr_class=OcrClass.PARAGRAPH,\n        bbox=BoundingBox(left=100, top=100, right=900, bottom=150),\n        direction=\"ltr\",\n        language=\"eng\",\n        children=[line],\n    )\n\n    page = OcrElement(\n        ocr_class=OcrClass.PAGE,\n        bbox=BoundingBox(left=0, top=0, right=width, bottom=height),\n        children=[paragraph],\n    )\n\n    return page\n\n\nclass TestFpdf2PdfRendererBasic:\n    \"\"\"Basic Fpdf2PdfRenderer functionality tests.\"\"\"\n\n    def test_render_simple_page(self, tmp_path, multi_font_manager):\n        \"\"\"Test rendering a simple page with two words.\"\"\"\n        page = create_simple_page()\n        output_pdf = tmp_path / \"simple.pdf\"\n\n        renderer = Fpdf2PdfRenderer(\n            page=page, dpi=72.0, multi_font_manager=multi_font_manager\n        )\n        renderer.render(output_pdf)\n\n        assert output_pdf.exists()\n        check_pdf(str(output_pdf))\n\n    def test_rendered_text_extractable(self, tmp_path, multi_font_manager):\n        \"\"\"Test that rendered text can be extracted from the PDF.\"\"\"\n        page = create_simple_page()\n        output_pdf = tmp_path / \"extractable.pdf\"\n\n        renderer = Fpdf2PdfRenderer(\n            page=page, 
dpi=72.0, multi_font_manager=multi_font_manager\n        )\n        renderer.render(output_pdf)\n\n        extracted_text = text_from_pdf(output_pdf)\n        assert \"Hello\" in extracted_text\n        assert \"World\" in extracted_text\n\n    def test_invisible_text_mode(self, tmp_path, multi_font_manager):\n        \"\"\"Test that invisible_text=True creates a valid PDF.\"\"\"\n        page = create_simple_page()\n        output_pdf = tmp_path / \"invisible.pdf\"\n\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=72.0,\n            multi_font_manager=multi_font_manager,\n            invisible_text=True,\n        )\n        renderer.render(output_pdf)\n\n        # Text should still be extractable even when invisible\n        extracted_text = text_from_pdf(output_pdf)\n        assert \"Hello\" in extracted_text\n\n    def test_visible_text_mode(self, tmp_path, multi_font_manager):\n        \"\"\"Test that invisible_text=False creates a valid PDF with visible text.\"\"\"\n        page = create_simple_page()\n        output_pdf = tmp_path / \"visible.pdf\"\n\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=72.0,\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n        )\n        renderer.render(output_pdf)\n\n        # Text should be extractable\n        extracted_text = text_from_pdf(output_pdf)\n        assert \"Hello\" in extracted_text\n\n\nclass TestFpdf2PdfRendererPageSize:\n    \"\"\"Test page size calculations.\"\"\"\n\n    def test_page_dimensions(self, tmp_path, multi_font_manager):\n        \"\"\"Test that page dimensions are calculated correctly.\"\"\"\n        # 1000x500 pixels at 72 dpi = 1000x500 points\n        page = create_simple_page(width=1000, height=500)\n        output_pdf = tmp_path / \"dimensions.pdf\"\n\n        renderer = Fpdf2PdfRenderer(\n            page=page, dpi=72.0, multi_font_manager=multi_font_manager\n        )\n        assert 
renderer.coord_transform.page_width_pt == pytest.approx(1000.0)\n        assert renderer.coord_transform.page_height_pt == pytest.approx(500.0)\n\n        renderer.render(output_pdf)\n\n    def test_high_dpi_page(self, tmp_path, multi_font_manager):\n        \"\"\"Test page dimensions at higher DPI.\"\"\"\n        # 720x360 pixels at 144 dpi = 360x180 points\n        page = create_simple_page(width=720, height=360)\n        output_pdf = tmp_path / \"high_dpi.pdf\"\n\n        renderer = Fpdf2PdfRenderer(\n            page=page, dpi=144.0, multi_font_manager=multi_font_manager\n        )\n        assert renderer.coord_transform.page_width_pt == pytest.approx(360.0)\n        assert renderer.coord_transform.page_height_pt == pytest.approx(180.0)\n\n        renderer.render(output_pdf)\n        check_pdf(str(output_pdf))\n\n\nclass TestFpdf2PdfRendererMultiLine:\n    \"\"\"Test rendering of multi-line content.\"\"\"\n\n    def test_multiple_lines(self, tmp_path, multi_font_manager):\n        \"\"\"Test rendering multiple lines of text.\"\"\"\n        line1_words = [\n            OcrElement(\n                ocr_class=OcrClass.WORD,\n                text=\"Line\",\n                bbox=BoundingBox(left=100, top=100, right=180, bottom=150),\n            ),\n            OcrElement(\n                ocr_class=OcrClass.WORD,\n                text=\"one\",\n                bbox=BoundingBox(left=190, top=100, right=250, bottom=150),\n            ),\n        ]\n        line1 = OcrElement(\n            ocr_class=OcrClass.LINE,\n            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),\n            baseline=Baseline(slope=0.0, intercept=0),\n            children=line1_words,\n        )\n\n        line2_words = [\n            OcrElement(\n                ocr_class=OcrClass.WORD,\n                text=\"Line\",\n                bbox=BoundingBox(left=100, top=200, right=180, bottom=250),\n            ),\n            OcrElement(\n                
ocr_class=OcrClass.WORD,\n                text=\"two\",\n                bbox=BoundingBox(left=190, top=200, right=250, bottom=250),\n            ),\n        ]\n        line2 = OcrElement(\n            ocr_class=OcrClass.LINE,\n            bbox=BoundingBox(left=100, top=200, right=900, bottom=250),\n            baseline=Baseline(slope=0.0, intercept=0),\n            children=line2_words,\n        )\n\n        paragraph = OcrElement(\n            ocr_class=OcrClass.PARAGRAPH,\n            bbox=BoundingBox(left=100, top=100, right=900, bottom=250),\n            direction=\"ltr\",\n            language=\"eng\",\n            children=[line1, line2],\n        )\n\n        page = OcrElement(\n            ocr_class=OcrClass.PAGE,\n            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),\n            children=[paragraph],\n        )\n\n        output_pdf = tmp_path / \"multiline.pdf\"\n        renderer = Fpdf2PdfRenderer(\n            page=page, dpi=72.0, multi_font_manager=multi_font_manager\n        )\n        renderer.render(output_pdf)\n\n        extracted_text = text_from_pdf(output_pdf)\n        assert \"Line\" in extracted_text\n        assert \"one\" in extracted_text\n        assert \"two\" in extracted_text\n\n\nclass TestFpdf2PdfRendererTextDirection:\n    \"\"\"Test rendering of different text directions.\"\"\"\n\n    def test_ltr_text(self, tmp_path, multi_font_manager):\n        \"\"\"Test rendering LTR text.\"\"\"\n        page = create_simple_page()\n        output_pdf = tmp_path / \"ltr.pdf\"\n\n        renderer = Fpdf2PdfRenderer(\n            page=page, dpi=72.0, multi_font_manager=multi_font_manager\n        )\n        renderer.render(output_pdf)\n\n        check_pdf(str(output_pdf))\n\n    def test_rtl_text(self, tmp_path, multi_font_manager):\n        \"\"\"Test rendering RTL text.\"\"\"\n        word = OcrElement(\n            ocr_class=OcrClass.WORD,\n            text=\"مرحبا\",\n            bbox=BoundingBox(left=100, top=100, right=200, 
bottom=150),\n        )\n        line = OcrElement(\n            ocr_class=OcrClass.LINE,\n            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),\n            baseline=Baseline(slope=0.0, intercept=0),\n            direction=\"rtl\",\n            children=[word],\n        )\n        paragraph = OcrElement(\n            ocr_class=OcrClass.PARAGRAPH,\n            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),\n            direction=\"rtl\",\n            language=\"ara\",\n            children=[line],\n        )\n        page = OcrElement(\n            ocr_class=OcrClass.PAGE,\n            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),\n            children=[paragraph],\n        )\n\n        output_pdf = tmp_path / \"rtl.pdf\"\n        renderer = Fpdf2PdfRenderer(\n            page=page, dpi=72.0, multi_font_manager=multi_font_manager\n        )\n        renderer.render(output_pdf)\n\n        check_pdf(str(output_pdf))\n\n\nclass TestFpdf2PdfRendererBaseline:\n    \"\"\"Test baseline handling in rendering.\"\"\"\n\n    def test_sloped_baseline(self, tmp_path, multi_font_manager):\n        \"\"\"Test rendering with a sloped baseline.\"\"\"\n        word = OcrElement(\n            ocr_class=OcrClass.WORD,\n            text=\"Sloped\",\n            bbox=BoundingBox(left=100, top=100, right=200, bottom=150),\n        )\n        line = OcrElement(\n            ocr_class=OcrClass.LINE,\n            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),\n            baseline=Baseline(slope=0.02, intercept=-5),\n            children=[word],\n        )\n        paragraph = OcrElement(\n            ocr_class=OcrClass.PARAGRAPH,\n            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),\n            direction=\"ltr\",\n            language=\"eng\",\n            children=[line],\n        )\n        page = OcrElement(\n            ocr_class=OcrClass.PAGE,\n            bbox=BoundingBox(left=0, top=0, right=1000, 
bottom=500),\n            children=[paragraph],\n        )\n\n        output_pdf = tmp_path / \"sloped.pdf\"\n        renderer = Fpdf2PdfRenderer(\n            page=page, dpi=72.0, multi_font_manager=multi_font_manager\n        )\n        renderer.render(output_pdf)\n\n        check_pdf(str(output_pdf))\n        extracted_text = text_from_pdf(output_pdf)\n        assert \"Sloped\" in extracted_text\n\n\nclass TestFpdf2PdfRendererTextangle:\n    \"\"\"Test textangle (rotation) handling in rendering.\"\"\"\n\n    def test_rotated_text(self, tmp_path, multi_font_manager):\n        \"\"\"Test rendering rotated text.\"\"\"\n        word = OcrElement(\n            ocr_class=OcrClass.WORD,\n            text=\"Rotated\",\n            bbox=BoundingBox(left=100, top=100, right=200, bottom=150),\n        )\n        line = OcrElement(\n            ocr_class=OcrClass.LINE,\n            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),\n            baseline=Baseline(slope=0.0, intercept=0),\n            textangle=5.0,\n            children=[word],\n        )\n        paragraph = OcrElement(\n            ocr_class=OcrClass.PARAGRAPH,\n            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),\n            direction=\"ltr\",\n            language=\"eng\",\n            children=[line],\n        )\n        page = OcrElement(\n            ocr_class=OcrClass.PAGE,\n            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),\n            children=[paragraph],\n        )\n\n        output_pdf = tmp_path / \"rotated.pdf\"\n        renderer = Fpdf2PdfRenderer(\n            page=page, dpi=72.0, multi_font_manager=multi_font_manager\n        )\n        renderer.render(output_pdf)\n\n        check_pdf(str(output_pdf))\n        extracted_text = text_from_pdf(output_pdf)\n        assert \"Rotated\" in extracted_text\n\n\nclass TestFpdf2PdfRendererWordBreaks:\n    \"\"\"Test word rendering.\"\"\"\n\n    def test_word_breaks_english(self, tmp_path, 
multi_font_manager):\n        \"\"\"Test that words are rendered for English text.\"\"\"\n        page = create_simple_page()\n        output_pdf = tmp_path / \"english.pdf\"\n\n        renderer = Fpdf2PdfRenderer(\n            page=page, dpi=72.0, multi_font_manager=multi_font_manager\n        )\n        renderer.render(output_pdf)\n\n        extracted_text = text_from_pdf(output_pdf)\n        # Words should be present\n        assert \"Hello\" in extracted_text\n        assert \"World\" in extracted_text\n\n    def test_cjk_text(self, tmp_path, multi_font_manager):\n        \"\"\"Test rendering CJK text.\"\"\"\n        words = [\n            OcrElement(\n                ocr_class=OcrClass.WORD,\n                text=\"你好\",\n                bbox=BoundingBox(left=100, top=100, right=150, bottom=150),\n            ),\n            OcrElement(\n                ocr_class=OcrClass.WORD,\n                text=\"世界\",\n                bbox=BoundingBox(left=160, top=100, right=210, bottom=150),\n            ),\n        ]\n        line = OcrElement(\n            ocr_class=OcrClass.LINE,\n            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),\n            baseline=Baseline(slope=0.0, intercept=0),\n            children=words,\n        )\n        paragraph = OcrElement(\n            ocr_class=OcrClass.PARAGRAPH,\n            bbox=BoundingBox(left=100, top=100, right=900, bottom=150),\n            direction=\"ltr\",\n            language=\"chi_sim\",  # Simplified Chinese\n            children=[line],\n        )\n        page = OcrElement(\n            ocr_class=OcrClass.PAGE,\n            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),\n            children=[paragraph],\n        )\n\n        output_pdf = tmp_path / \"chinese.pdf\"\n        renderer = Fpdf2PdfRenderer(\n            page=page, dpi=72.0, multi_font_manager=multi_font_manager\n        )\n        renderer.render(output_pdf)\n\n        check_pdf(str(output_pdf))\n\n\nclass 
TestFpdf2PdfRendererDebugOptions:\n    \"\"\"Test debug rendering options.\"\"\"\n\n    def test_debug_render_options_default(self, multi_font_manager):\n        \"\"\"Test that debug options are disabled by default.\"\"\"\n        page = create_simple_page()\n        renderer = Fpdf2PdfRenderer(\n            page=page, dpi=72.0, multi_font_manager=multi_font_manager\n        )\n\n        assert renderer.debug_options.render_baseline is False\n        assert renderer.debug_options.render_word_bbox is False\n        assert renderer.debug_options.render_line_bbox is False\n\n    def test_debug_render_options_enabled(self, tmp_path, multi_font_manager):\n        \"\"\"Test rendering with debug options enabled.\"\"\"\n        page = create_simple_page()\n        output_pdf = tmp_path / \"debug.pdf\"\n\n        debug_opts = DebugRenderOptions(\n            render_baseline=True,\n            render_word_bbox=True,\n            render_line_bbox=True,\n        )\n\n        renderer = Fpdf2PdfRenderer(\n            page=page,\n            dpi=72.0,\n            multi_font_manager=multi_font_manager,\n            invisible_text=False,\n            debug_render_options=debug_opts,\n        )\n        renderer.render(output_pdf)\n\n        check_pdf(str(output_pdf))\n        # Text should still be extractable\n        extracted_text = text_from_pdf(output_pdf)\n        assert \"Hello\" in extracted_text\n\n\nclass TestFpdf2PdfRendererErrors:\n    \"\"\"Test error handling in Fpdf2PdfRenderer.\"\"\"\n\n    def test_invalid_ocr_class(self, multi_font_manager):\n        \"\"\"Test that non-page elements are rejected.\"\"\"\n        line = OcrElement(\n            ocr_class=OcrClass.LINE, bbox=BoundingBox(left=0, top=0, right=100, bottom=50)\n        )\n\n        with pytest.raises(ValueError, match=\"ocr_page\"):\n            Fpdf2PdfRenderer(page=line, dpi=72.0, multi_font_manager=multi_font_manager)\n\n    def test_page_without_bbox(self, multi_font_manager):\n        
\"\"\"Test that pages without bbox are rejected.\"\"\"\n        page = OcrElement(ocr_class=OcrClass.PAGE)\n\n        with pytest.raises(ValueError, match=\"bounding box\"):\n            Fpdf2PdfRenderer(page=page, dpi=72.0, multi_font_manager=multi_font_manager)\n\n\nclass TestFpdf2PdfRendererLineTypes:\n    \"\"\"Test rendering of different line types.\"\"\"\n\n    def test_header_line(self, tmp_path, multi_font_manager):\n        \"\"\"Test rendering header lines.\"\"\"\n        word = OcrElement(\n            ocr_class=OcrClass.WORD,\n            text=\"Header\",\n            bbox=BoundingBox(left=100, top=50, right=200, bottom=100),\n        )\n        header = OcrElement(\n            ocr_class=OcrClass.HEADER,\n            bbox=BoundingBox(left=100, top=50, right=900, bottom=100),\n            baseline=Baseline(slope=0.0, intercept=0),\n            children=[word],\n        )\n        paragraph = OcrElement(\n            ocr_class=OcrClass.PARAGRAPH,\n            bbox=BoundingBox(left=100, top=50, right=900, bottom=100),\n            direction=\"ltr\",\n            language=\"eng\",\n            children=[header],\n        )\n        page = OcrElement(\n            ocr_class=OcrClass.PAGE,\n            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),\n            children=[paragraph],\n        )\n\n        output_pdf = tmp_path / \"header.pdf\"\n        renderer = Fpdf2PdfRenderer(\n            page=page, dpi=72.0, multi_font_manager=multi_font_manager\n        )\n        renderer.render(output_pdf)\n\n        check_pdf(str(output_pdf))\n        extracted_text = text_from_pdf(output_pdf)\n        assert \"Header\" in extracted_text\n\n    def test_caption_line(self, tmp_path, multi_font_manager):\n        \"\"\"Test rendering caption lines.\"\"\"\n        word = OcrElement(\n            ocr_class=OcrClass.WORD,\n            text=\"Caption\",\n            bbox=BoundingBox(left=100, top=300, right=200, bottom=350),\n        )\n        caption = 
OcrElement(\n            ocr_class=OcrClass.CAPTION,\n            bbox=BoundingBox(left=100, top=300, right=900, bottom=350),\n            baseline=Baseline(slope=0.0, intercept=0),\n            children=[word],\n        )\n        paragraph = OcrElement(\n            ocr_class=OcrClass.PARAGRAPH,\n            bbox=BoundingBox(left=100, top=300, right=900, bottom=350),\n            direction=\"ltr\",\n            language=\"eng\",\n            children=[caption],\n        )\n        page = OcrElement(\n            ocr_class=OcrClass.PAGE,\n            bbox=BoundingBox(left=0, top=0, right=1000, bottom=500),\n            children=[paragraph],\n        )\n\n        output_pdf = tmp_path / \"caption.pdf\"\n        renderer = Fpdf2PdfRenderer(\n            page=page, dpi=72.0, multi_font_manager=multi_font_manager\n        )\n        renderer.render(output_pdf)\n\n        check_pdf(str(output_pdf))\n        extracted_text = text_from_pdf(output_pdf)\n        assert \"Caption\" in extracted_text\n"
  },
  {
    "path": "tests/test_pdfa.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport os\n\nimport pikepdf\nimport pytest\n\nfrom ocrmypdf.exceptions import MissingDependencyError\n\nfrom .conftest import check_ocrmypdf\n\n\n@pytest.mark.parametrize('optimize', (0, 3))\n@pytest.mark.parametrize('pdfa_level', (1, 2, 3))\ndef test_pdfa(resources, outpdf, optimize, pdfa_level):\n    try:\n        check_ocrmypdf(\n            resources / 'francais.pdf',\n            outpdf,\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n            f'--output-type=pdfa-{pdfa_level}',\n            f'--optimize={optimize}',\n        )\n    except MissingDependencyError as e:\n        if 'pngquant' in str(e) and optimize in (2, 3) and os.name == 'nt':\n            pytest.xfail(\"pngquant currently not available on Windows\")\n    if pdfa_level in (2, 3):\n        # PDF/A-2 allows ObjStm\n        assert b'/ObjStm' in outpdf.read_bytes()\n    elif pdfa_level == 1:\n        # PDF/A-1 might allow ObjStm, but Acrobat does not approve it, so\n        # we don't use it\n        assert b'/ObjStm' not in outpdf.read_bytes()\n\n    with pikepdf.open(outpdf) as pdf, pdf.open_metadata() as m:\n        assert m.pdfa_status == f'{pdfa_level}B'\n"
  },
  {
    "path": "tests/test_pdfinfo.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport pickle\nimport warnings\nfrom io import BytesIO\nfrom math import isclose\n\nimport img2pdf\nimport pikepdf\nimport pytest\nfrom PIL import Image\nfrom reportlab.lib.units import inch\nfrom reportlab.pdfgen.canvas import Canvas\n\nfrom ocrmypdf import pdfinfo\nfrom ocrmypdf.exceptions import InputFileError\nfrom ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution\nfrom ocrmypdf.pdfinfo import Colorspace, Encoding\nfrom ocrmypdf.pdfinfo._contentstream import _interpret_contents\nfrom ocrmypdf.pdfinfo.layout import PDFPage\n\nwarnings.filterwarnings(\n    \"ignore\", category=DeprecationWarning, module=\"reportlab.lib.rl_safe_eval\"\n)\n\n# pylint: disable=protected-access\n\n\n@pytest.fixture\ndef single_page_text(outdir):\n    filename = outdir / 'text.pdf'\n    pdf = Canvas(str(filename), pagesize=(8 * inch, 6 * inch))\n    text = pdf.beginText()\n    text.setFont('Helvetica', 12)\n    text.setTextOrigin(1 * inch, 3 * inch)\n    text.textLine(\n        \"Methink'st thou art a general offence and every man should beat thee.\"\n    )\n    pdf.drawText(text)\n    pdf.showPage()\n    pdf.save()\n    return filename\n\n\ndef test_single_page_text(single_page_text):\n    info = pdfinfo.PdfInfo(single_page_text)\n\n    assert len(info) == 1\n    page = info[0]\n\n    assert page.has_text\n    assert len(page.images) == 0\n\n\n@pytest.fixture(scope='session')\ndef eight_by_eight():\n    im = Image.new('1', (8, 8), 0)\n    for n in range(8):\n        im.putpixel((n, n), 1)\n    return im\n\n\n@pytest.fixture\ndef eight_by_eight_regular_image(eight_by_eight, outpdf):\n    im = eight_by_eight\n    bio = BytesIO()\n    im.save(bio, format='PNG')\n    bio.seek(0)\n\n    imgsize = ((img2pdf.ImgSize.dpi, 8), (img2pdf.ImgSize.dpi, 8))\n    layout_fun = img2pdf.get_layout_fun(None, imgsize, None, None, None)\n\n    with outpdf.open('wb') 
as f:\n        img2pdf.convert(\n            bio,\n            producer=\"img2pdf\",\n            layout_fun=layout_fun,\n            outputstream=f,\n            **IMG2PDF_KWARGS,\n        )\n    return outpdf\n\n\ndef test_single_page_image(eight_by_eight_regular_image):\n    info = pdfinfo.PdfInfo(eight_by_eight_regular_image)\n\n    assert len(info) == 1\n    page = info[0]\n\n    assert not page.has_text\n    assert len(page.images) == 1\n\n    pdfimage = page.images[0]\n    assert pdfimage.width == 8\n    assert pdfimage.color == Colorspace.gray\n\n    # DPI in a 1\"x1\" is the image width\n    assert isclose(pdfimage.dpi.x, 8)\n    assert isclose(pdfimage.dpi.y, 8)\n\n\n@pytest.fixture\ndef eight_by_eight_inline_image(eight_by_eight, outpdf):\n    pdf = Canvas(str(outpdf), pagesize=(8 * 72, 6 * 72))\n    # Draw image in a 72x72 pt or 1\"x1\" area\n    pdf.drawInlineImage(eight_by_eight, 0, 0, width=72, height=72)\n    pdf.showPage()\n    pdf.save()\n    return outpdf\n\n\ndef test_single_page_inline_image(eight_by_eight_inline_image):\n    info = pdfinfo.PdfInfo(eight_by_eight_inline_image)\n    print(info)\n    pdfimage = info[0].images[0]\n    assert isclose(pdfimage.dpi.x, 8)\n    assert pdfimage.color == Colorspace.gray\n    assert pdfimage.width == 8\n\n\ndef test_jpeg(resources):\n    filename = resources / 'c02-22.pdf'\n\n    pdf = pdfinfo.PdfInfo(filename)\n\n    pdfimage = pdf[0].images[0]\n    assert pdfimage.enc == Encoding.jpeg\n    assert isclose(pdfimage.dpi.x, 150)\n\n\n@pytest.fixture\ndef flate_jpeg_pdf(outpdf):\n    \"\"\"Create a PDF with a FlateDecode+DCTDecode (flate+jpeg) encoded image.\n\n    This simulates what OCRmyPDF's optimizer does when it deflates JPEGs.\n    \"\"\"\n    from zlib import compress\n\n    # Create an RGB image and save as JPEG\n    im = Image.new('RGB', (64, 64), color=(128, 64, 192))\n    bio = BytesIO()\n    im.save(bio, format='JPEG')\n    jpeg_data = bio.getvalue()\n\n    # Compress the JPEG data with flate\n  
  flate_jpeg_data = compress(jpeg_data)\n\n    # Create a PDF with the flate+jpeg image\n    with pikepdf.Pdf.new() as pdf:\n        pdf.add_blank_page(page_size=(72, 72))\n        image_dict = pikepdf.Stream(\n            pdf,\n            flate_jpeg_data,\n            BitsPerComponent=8,\n            ColorSpace=pikepdf.Name.DeviceRGB,\n            Filter=[pikepdf.Name.FlateDecode, pikepdf.Name.DCTDecode],\n            Height=64,\n            Subtype=pikepdf.Name.Image,\n            Type=pikepdf.Name.XObject,\n            Width=64,\n        )\n        objname = pdf.pages[0].add_resource(\n            image_dict, pikepdf.Name.XObject, pikepdf.Name.Im0\n        )\n        pdf.pages[0].Contents = pikepdf.Stream(\n            pdf, b\"q 72 0 0 72 0 0 cm %s Do Q\" % bytes(objname)\n        )\n        pdf.save(outpdf)\n    return outpdf\n\n\ndef test_flate_jpeg(flate_jpeg_pdf):\n    \"\"\"Test that pdfinfo correctly identifies FlateDecode+DCTDecode as flate_jpeg.\"\"\"\n    pdf = pdfinfo.PdfInfo(flate_jpeg_pdf)\n\n    pdfimage = pdf[0].images[0]\n    assert pdfimage.enc == Encoding.flate_jpeg\n\n\ndef test_form_xobject(resources):\n    filename = resources / 'formxobject.pdf'\n\n    pdf = pdfinfo.PdfInfo(filename)\n    pdfimage = pdf[0].images[0]\n    assert pdfimage.width == 50\n\n\ndef test_no_contents(resources):\n    filename = resources / 'no_contents.pdf'\n\n    pdf = pdfinfo.PdfInfo(filename)\n    assert len(pdf[0].images) == 0\n    assert not pdf[0].has_text\n\n\ndef test_oversized_page(resources):\n    pdf = pdfinfo.PdfInfo(resources / 'poster.pdf')\n    image = pdf[0].images[0]\n    assert image.width * image.dpi.x > 200, \"this is supposed to be oversized\"\n\n\ndef test_pickle(resources):\n    # For multiprocessing we must be able to pickle our information - if\n    # this fails then we are probably storing some unpickleabe pikepdf or\n    # other external data around\n    filename = resources / 'graph_ocred.pdf'\n    pdf = pdfinfo.PdfInfo(filename)\n    
pickle.dumps(pdf)\n\n\ndef test_vector(resources):\n    filename = resources / 'vector.pdf'\n    pdf = pdfinfo.PdfInfo(filename)\n    assert pdf[0].has_vector\n    assert not pdf[0].has_text\n\n\ndef test_ocr_detection(resources):\n    filename = resources / 'graph_ocred.pdf'\n    pdf = pdfinfo.PdfInfo(filename)\n    assert not pdf[0].has_vector\n    assert pdf[0].has_text\n\n\n@pytest.mark.parametrize(\n    'testfile', ('truetype_font_nomapping.pdf', 'type3_font_nomapping.pdf')\n)\ndef test_corrupt_font_detection(resources, testfile):\n    filename = resources / testfile\n    pdf = pdfinfo.PdfInfo(filename, detailed_analysis=True)\n    assert pdf[0].has_corrupt_text\n\n\ndef test_stack_abuse():\n    p = pikepdf.Pdf.new()\n\n    stream = pikepdf.Stream(p, b'q ' * 35)\n    with pytest.warns(UserWarning, match=\"overflowed\"):\n        _interpret_contents(stream)\n\n    stream = pikepdf.Stream(p, b'q Q Q Q Q')\n    with pytest.warns(UserWarning, match=\"underflowed\"):\n        _interpret_contents(stream)\n\n    stream = pikepdf.Stream(p, b'q ' * 135)\n    with pytest.warns(UserWarning), pytest.raises(RuntimeError):\n        _interpret_contents(stream)\n\n\ndef test_pages_issue700(monkeypatch, resources):\n    def get_no_pages(*args, **kwargs):\n        return iter([])\n\n    monkeypatch.setattr(PDFPage, 'get_pages', get_no_pages)\n\n    with pytest.raises(InputFileError, match=\"pdfminer\"):\n        pi = pdfinfo.PdfInfo(\n            resources / 'cardinal.pdf',\n            detailed_analysis=True,\n            progbar=False,\n            max_workers=1,\n        )\n        pi._miner_state.get_page_analysis(0)\n\n\n@pytest.fixture\ndef image_scale0(resources, outpdf):\n    with pikepdf.open(resources / 'cmyk.pdf') as cmyk:\n        xobj = cmyk.pages[0].as_form_xobject()\n\n        p = pikepdf.Pdf.new()\n        p.add_blank_page(page_size=(72, 72))\n        objname = p.pages[0].add_resource(\n            p.copy_foreign(xobj), pikepdf.Name.XObject, pikepdf.Name.Im0\n   
     )\n        print(objname)\n        p.pages[0].Contents = pikepdf.Stream(\n            p, b\"q 0 0 0 0 0 0 cm %s Do Q\" % bytes(objname)\n        )\n        p.save(outpdf)\n    return outpdf\n\n\ndef test_image_scale0(image_scale0):\n    pi = pdfinfo.PdfInfo(\n        image_scale0, detailed_analysis=True, progbar=False, max_workers=1\n    )\n    assert not pi.pages[0]._images[0].dpi.is_finite\n    assert pi.pages[0].dpi == Resolution(0, 0)\n"
  },
  {
    "path": "tests/test_pipeline.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport warnings\nfrom unittest.mock import Mock\n\nimport pytest\nfrom PIL import Image\nfrom reportlab.lib.units import inch\nfrom reportlab.lib.utils import ImageReader\nfrom reportlab.pdfgen.canvas import Canvas\n\nfrom ocrmypdf import _pipeline, pdfinfo\nfrom ocrmypdf.helpers import Resolution\nfrom ocrmypdf.pdfinfo import Encoding\n\nwarnings.filterwarnings(\n    \"ignore\", category=DeprecationWarning, module=\"reportlab.lib.rl_safe_eval\"\n)\n\n\n@pytest.fixture(scope='session')\ndef rgb_image():\n    im = Image.new('RGB', (8, 8))\n    im.putpixel((4, 4), (255, 0, 0))\n    im.putpixel((5, 5), (0, 255, 0))\n    im.putpixel((6, 6), (0, 0, 255))\n    return ImageReader(im)\n\n\nDUMMY_OVERSAMPLE_RESOLUTION = Resolution(42.0, 42.0)\nVECTOR_RESOLUTION = Resolution(_pipeline.VECTOR_PAGE_DPI, _pipeline.VECTOR_PAGE_DPI)\n\n\n@pytest.mark.parametrize(\n    'image, text, vector, result',\n    [\n        (False, False, False, VECTOR_RESOLUTION),\n        (False, True, False, VECTOR_RESOLUTION),\n        (True, False, False, DUMMY_OVERSAMPLE_RESOLUTION),\n        (True, True, False, VECTOR_RESOLUTION),\n        (False, False, True, VECTOR_RESOLUTION),\n        (False, True, True, VECTOR_RESOLUTION),\n        (True, False, True, VECTOR_RESOLUTION),\n        (True, True, True, VECTOR_RESOLUTION),\n    ],\n)\ndef test_dpi_needed(image, text, vector, result, rgb_image, outdir):\n    c = Canvas(str(outdir / 'dpi.pdf'), pagesize=(5 * inch, 5 * inch))\n    if image:\n        c.drawImage(rgb_image, 1 * inch, 1 * inch, width=1 * inch, height=1 * inch)\n    if text:\n        c.drawString(1 * inch, 4 * inch, \"Actual text\")\n    if vector:\n        c.ellipse(3 * inch, 3 * inch, 4 * inch, 4 * inch)\n    c.showPage()\n    c.save()\n\n    pi = pdfinfo.PdfInfo(outdir / 'dpi.pdf')\n    pageinfo = pi[0]\n    ctx = Mock()\n    ctx.options.oversample 
= DUMMY_OVERSAMPLE_RESOLUTION[0]\n    ctx.pageinfo = pageinfo\n\n    assert _pipeline.get_canvas_square_dpi(ctx) == result\n    assert _pipeline.get_page_square_dpi(ctx) == result\n\n\n@pytest.mark.parametrize(\n    # Name for nicer -v output\n    'name,input,output',\n    (\n        (\n            'empty_input',\n            # Input:\n            (),\n            # Output:\n            (),\n        ),\n        (\n            'no_values',\n            # Input:\n            ('', '', '', '', ''),\n            # Output:\n            (((1, 5), None),),\n        ),\n        (\n            'no_empty_values',\n            # Input:\n            ('v', 'w', 'x', 'y', 'z'),\n            # Output:\n            (\n                ((1, 1), 'v'),\n                ((2, 2), 'w'),\n                ((3, 3), 'x'),\n                ((4, 4), 'y'),\n                ((5, 5), 'z'),\n            ),\n        ),\n        (\n            'skip_head',\n            # Input:\n            ('', '', 'x', 'y', 'z'),\n            # Output:\n            (\n                ((1, 2), None),\n                ((3, 3), 'x'),\n                ((4, 4), 'y'),\n                ((5, 5), 'z'),\n            ),\n        ),\n        (\n            'skip_tail',\n            # Input:\n            ('x', 'y', 'z', '', ''),\n            # Output:\n            (\n                ((1, 1), 'x'),\n                ((2, 2), 'y'),\n                ((3, 3), 'z'),\n                ((4, 5), None),\n            ),\n        ),\n        (\n            'range_in_middle',\n            # Input:\n            ('x', '', '', '', 'y'),\n            # Output:\n            (\n                ((1, 1), 'x'),\n                ((2, 4), None),\n                ((5, 5), 'y'),\n            ),\n        ),\n        (\n            'range_in_middle_2',\n            # Input:\n            ('x', '', '', 'y', '', '', '', 'z'),\n            # Output:\n            (\n                ((1, 1), 'x'),\n                ((2, 3), None),\n                ((4, 4), 
'y'),\n                ((5, 7), None),\n                ((8, 8), 'z'),\n            ),\n        ),\n    ),\n)\ndef test_enumerate_compress_ranges(name, input, output):\n    assert output == tuple(_pipeline.enumerate_compress_ranges(input))\n\n\n@pytest.mark.parametrize(\n    'encodings, expected',\n    [\n        # Empty images list returns False\n        ([], False),\n        # Single JPEG returns True\n        ([Encoding.jpeg], True),\n        # Single flate_jpeg returns True\n        ([Encoding.flate_jpeg], True),\n        # Mix of jpeg and flate_jpeg returns True\n        ([Encoding.jpeg, Encoding.flate_jpeg], True),\n        # Non-JPEG encoding returns False\n        ([Encoding.flate], False),\n        # Mix with non-JPEG returns False\n        ([Encoding.jpeg, Encoding.flate], False),\n        ([Encoding.flate_jpeg, Encoding.flate], False),\n    ],\n)\ndef test_should_visible_page_image_use_jpg(encodings, expected):\n    \"\"\"Test that should_visible_page_image_use_jpg correctly handles flate_jpeg.\"\"\"\n    pageinfo = Mock()\n    pageinfo.images = [Mock(enc=enc) for enc in encodings]\n    assert _pipeline.should_visible_page_image_use_jpg(pageinfo) == expected\n"
  },
  {
    "path": "tests/test_pipeline_generate_ocr.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Unit tests for pipeline support of generate_ocr().\n\nThese tests verify that the pipeline supports the new generate_ocr() API\nalongside the existing hOCR path.\n\"\"\"\n\nfrom __future__ import annotations\n\nimport dataclasses\nfrom pathlib import Path\nfrom unittest.mock import MagicMock, patch\n\nfrom ocrmypdf import BoundingBox, OcrElement\n\n\nclass TestOcrEngineDirect:\n    \"\"\"Test the ocr_engine_direct() pipeline function.\"\"\"\n\n    def test_ocr_engine_direct_function_exists(self):\n        \"\"\"ocr_engine_direct function should exist in _pipeline module.\"\"\"\n        from ocrmypdf import _pipeline\n\n        assert hasattr(_pipeline, 'ocr_engine_direct')\n\n    def test_ocr_engine_direct_returns_tuple(self, tmp_path):\n        \"\"\"ocr_engine_direct should return (OcrElement, Path) tuple.\"\"\"\n        from ocrmypdf._pipeline import ocr_engine_direct\n\n        # Mock page context with an engine that supports generate_ocr\n        mock_context = MagicMock()\n        mock_engine = MagicMock()\n        mock_engine.supports_generate_ocr.return_value = True\n        mock_engine.generate_ocr.return_value = (\n            OcrElement(ocr_class='ocr_page', bbox=BoundingBox(0, 0, 100, 100)),\n            \"test text\",\n        )\n        mock_context.plugin_manager.get_ocr_engine.return_value = mock_engine\n        mock_context.get_path.return_value = tmp_path / Path(\"test.txt\")\n        mock_context.pageno = 0\n\n        with patch('builtins.open', MagicMock()):\n            result = ocr_engine_direct(Path(\"test.png\"), mock_context)\n\n        assert isinstance(result, tuple)\n        assert len(result) == 2\n\n\nclass TestPageResultExtension:\n    \"\"\"Test PageResult NamedTuple extension.\"\"\"\n\n    def test_page_result_has_ocr_tree_field(self):\n        \"\"\"PageResult should have ocr_tree field.\"\"\"\n        from 
ocrmypdf._pipelines._common import PageResult\n\n        # PageResult is a NamedTuple, use _fields\n        assert 'ocr_tree' in PageResult._fields\n\n    def test_page_result_ocr_tree_default_none(self):\n        \"\"\"PageResult.ocr_tree should default to None.\"\"\"\n        from ocrmypdf._pipelines._common import PageResult\n\n        result = PageResult(pageno=0)\n        assert result.ocr_tree is None\n\n\nclass TestFpdf2DirectPage:\n    \"\"\"Test Fpdf2DirectPage dataclass for direct OcrElement input.\"\"\"\n\n    def test_fpdf2_direct_page_exists(self):\n        \"\"\"Fpdf2DirectPage dataclass should exist.\"\"\"\n        from ocrmypdf._graft import Fpdf2DirectPage\n\n        assert Fpdf2DirectPage is not None\n\n    def test_fpdf2_direct_page_has_ocr_tree(self):\n        \"\"\"Fpdf2DirectPage should have ocr_tree field.\"\"\"\n        from ocrmypdf._graft import Fpdf2DirectPage\n\n        fields = {f.name for f in dataclasses.fields(Fpdf2DirectPage)}\n        assert 'ocr_tree' in fields\n\n\nclass TestHOCRResultExtension:\n    \"\"\"Test HOCRResult dataclass extension.\"\"\"\n\n    def test_hocr_result_has_ocr_tree_field(self):\n        \"\"\"HOCRResult should have ocr_tree field.\"\"\"\n        from ocrmypdf._pipelines._common import HOCRResult\n\n        fields = {f.name for f in dataclasses.fields(HOCRResult)}\n        assert 'ocr_tree' in fields\n\n    def test_hocr_result_ocr_tree_default_none(self):\n        \"\"\"HOCRResult.ocr_tree should default to None.\"\"\"\n        from ocrmypdf._pipelines._common import HOCRResult\n\n        result = HOCRResult(pageno=0)\n        assert result.ocr_tree is None\n"
  },
  {
    "path": "tests/test_preprocessing.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nfrom math import isclose\n\nimport pytest\nfrom PIL import Image\n\nfrom ocrmypdf._exec import ghostscript, tesseract\nfrom ocrmypdf.exceptions import ExitCode\nfrom ocrmypdf.helpers import Resolution\nfrom ocrmypdf.pdfinfo import PdfInfo\nfrom ocrmypdf.pluginspec import GhostscriptRasterDevice\n\nfrom .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf\n\nRENDERERS = ['fpdf2', 'sandwich']\n\n\ndef test_deskew(resources, outdir):\n    # Run with deskew\n    deskewed_pdf = check_ocrmypdf(resources / 'skew.pdf', outdir / 'skew.pdf', '-d')\n\n    # Now render as an image again...\n    deskewed_png = outdir / 'deskewed.png'\n\n    ghostscript.rasterize_pdf(\n        deskewed_pdf,\n        deskewed_png,\n        raster_device=GhostscriptRasterDevice.PNGMONO,\n        raster_dpi=Resolution(150, 150),\n        pageno=1,\n    )\n\n    # ...and use Tesseract to find the skew angle to confirm that it was deskewed\n    skew_angle = tesseract.get_deskew(deskewed_png, [], None, 5.0)\n    print(skew_angle)\n    assert -0.5 < skew_angle < 0.5, \"Deskewing failed\"\n\n\ndef test_deskew_blank_page(resources, outpdf):\n    # Tesseract doesn't like blank pages - make sure we can get through\n    check_ocrmypdf(resources / 'blank.pdf', outpdf, '--deskew')\n\n\n@pytest.mark.xfail(reason=\"remove background disabled\")\ndef test_remove_background(resources, outdir):\n    # Ensure the input image does not contain pure white/black\n    with Image.open(resources / 'baiona_color.jpg') as im:\n        assert im.getextrema() != ((0, 255), (0, 255), (0, 255))\n\n    output_pdf = check_ocrmypdf(\n        resources / 'baiona_color.jpg',\n        outdir / 'test_remove_bg.pdf',\n        '--remove-background',\n        '--image-dpi',\n        '150',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n    output_png = outdir / 
'remove_bg.png'\n\n    ghostscript.rasterize_pdf(\n        output_pdf,\n        output_png,\n        raster_device=GhostscriptRasterDevice.PNG16M,\n        raster_dpi=Resolution(100, 100),\n        pageno=1,\n    )\n\n    # The output image should contain pure white and black\n    with Image.open(output_png) as im:\n        assert im.getextrema() == ((0, 255), (0, 255), (0, 255))\n\n\n# This will run 5 * 2 * 2 = 20 test cases\n@pytest.mark.parametrize(\n    \"pdf\", ['palette.pdf', 'cmyk.pdf', 'ccitt.pdf', 'jbig2.pdf', 'lichtenstein.pdf']\n)\n@pytest.mark.parametrize(\"renderer\", ['sandwich', 'fpdf2'])\n@pytest.mark.parametrize(\"output_type\", ['pdf', 'pdfa'])\ndef test_exotic_image(pdf, renderer, output_type, resources, outdir):\n    outfile = outdir / f'test_{pdf}_{renderer}.pdf'\n    check_ocrmypdf(\n        resources / pdf,\n        outfile,\n        '-dc' if have_unpaper() else '-d',\n        '-v',\n        '1',\n        '--output-type',\n        output_type,\n        '--sidecar',\n        '--skip-text',\n        '--pdf-renderer',\n        renderer,\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n\n    assert outfile.with_suffix('.pdf.txt').exists()\n\n\n@pytest.mark.parametrize('renderer', RENDERERS)\ndef test_non_square_resolution(renderer, resources, outpdf):\n    # Confirm input image is non-square resolution\n    in_pageinfo = PdfInfo(resources / 'aspect.pdf')\n    assert in_pageinfo[0].dpi.x != in_pageinfo[0].dpi.y\n\n    proc = run_ocrmypdf(\n        resources / 'aspect.pdf',\n        outpdf,\n        '--pdf-renderer',\n        renderer,\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n    # PDF/A conversion can fail for this file if Ghostscript >= 10.3, so don't test\n    # exit code in that case\n    if proc.returncode != ExitCode.pdfa_conversion_failed:\n        proc.check_returncode()\n\n    out_pageinfo = PdfInfo(outpdf)\n\n    # Confirm resolution was kept the same\n    assert in_pageinfo[0].dpi 
== out_pageinfo[0].dpi\n\n\n@pytest.mark.parametrize('renderer', RENDERERS)\ndef test_convert_to_square_resolution(renderer, resources, outpdf):\n    # Confirm input image is non-square resolution\n    in_pageinfo = PdfInfo(resources / 'aspect.pdf')\n    assert in_pageinfo[0].dpi.x != in_pageinfo[0].dpi.y\n\n    # --force-ocr means forced conversion to square resolution\n    check_ocrmypdf(\n        resources / 'aspect.pdf',\n        outpdf,\n        '--force-ocr',\n        '--pdf-renderer',\n        renderer,\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n\n    out_pageinfo = PdfInfo(outpdf)\n\n    in_p0, out_p0 = in_pageinfo[0], out_pageinfo[0]\n\n    # Resolution should now be equal\n    assert out_p0.dpi.x == out_p0.dpi.y\n\n    # Page size should match input page size\n    assert isclose(in_p0.width_inches, out_p0.width_inches)\n    assert isclose(in_p0.height_inches, out_p0.height_inches)\n\n    # Because we rasterized the page to produce a new image, it should occupy\n    # the entire page\n    out_im_w = out_p0.images[0].width / out_p0.images[0].dpi.x\n    out_im_h = out_p0.images[0].height / out_p0.images[0].dpi.y\n    assert isclose(out_p0.width_inches, out_im_w)\n    assert isclose(out_p0.height_inches, out_im_h)\n"
  },
  {
    "path": "tests/test_quality.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nfrom ocrmypdf import quality as qual\n\n\ndef test_quality_measurement():\n    oqd = qual.OcrQualityDictionary(\n        wordlist=[\"words\", \"words\", \"quick\", \"brown\", \"fox\", \"dog\", \"lazy\"]\n    )\n    assert len(oqd.dictionary) == 6  # 6 unique\n\n    assert (\n        oqd.measure_words_matched(\"The quick brown fox jumps quickly over the lazy dog\")\n        == 0.5\n    )\n    assert oqd.measure_words_matched(\"12345 10% _f  7fox -brown   | words\") == 1.0\n\n    assert oqd.measure_words_matched(\"quick quick quick\") == 1.0\n"
  },
  {
    "path": "tests/test_rasterizer.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Tests for the --rasterizer CLI option.\"\"\"\n\nfrom __future__ import annotations\n\nfrom io import BytesIO\n\nimport img2pdf\nimport pikepdf\nimport pytest\nfrom PIL import Image\n\nfrom ocrmypdf._options import OcrOptions\nfrom ocrmypdf._plugin_manager import get_plugin_manager\nfrom ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution\n\nfrom .conftest import check_ocrmypdf\n\n# Check if pypdfium2 is available\ntry:\n    import pypdfium2  # noqa: F401\n\n    PYPDFIUM_AVAILABLE = True\nexcept ImportError:\n    PYPDFIUM_AVAILABLE = False\n\n\nclass TestRasterizerOption:\n    \"\"\"Test the --rasterizer CLI option.\"\"\"\n\n    def test_rasterizer_auto_default(self, resources, outpdf):\n        \"\"\"Test that --rasterizer auto (default) works.\"\"\"\n        check_ocrmypdf(\n            resources / 'graph.pdf',\n            outpdf,\n            '--rasterizer',\n            'auto',\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        )\n\n    def test_rasterizer_ghostscript(self, resources, outpdf):\n        \"\"\"Test that --rasterizer ghostscript works.\"\"\"\n        check_ocrmypdf(\n            resources / 'graph.pdf',\n            outpdf,\n            '--rasterizer',\n            'ghostscript',\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        )\n\n    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason=\"pypdfium2 not installed\")\n    def test_rasterizer_pypdfium(self, resources, outpdf):\n        \"\"\"Test that --rasterizer pypdfium works when pypdfium2 is installed.\"\"\"\n        check_ocrmypdf(\n            resources / 'graph.pdf',\n            outpdf,\n            '--rasterizer',\n            'pypdfium',\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        )\n\n    def test_rasterizer_invalid(self):\n        \"\"\"Test that an invalid rasterizer value is 
rejected.\"\"\"\n        with pytest.raises(ValueError, match=\"rasterizer must be one of\"):\n            OcrOptions(\n                input_file='test.pdf', output_file='out.pdf', rasterizer='invalid'\n            )\n\n\nclass TestRasterizerWithRotation:\n    \"\"\"Test --rasterizer interaction with --rotate-pages.\"\"\"\n\n    def test_ghostscript_with_rotation(self, resources, outpdf):\n        \"\"\"Test Ghostscript rasterizer with page rotation.\"\"\"\n        check_ocrmypdf(\n            resources / 'cardinal.pdf',\n            outpdf,\n            '--rasterizer',\n            'ghostscript',\n            '--rotate-pages',\n            '--rotate-pages-threshold',\n            '0.1',\n            '--plugin',\n            'tests/plugins/tesseract_cache.py',\n        )\n\n    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason=\"pypdfium2 not installed\")\n    def test_pypdfium_with_rotation(self, resources, outpdf):\n        \"\"\"Test pypdfium rasterizer with page rotation.\"\"\"\n        check_ocrmypdf(\n            resources / 'cardinal.pdf',\n            outpdf,\n            '--rasterizer',\n            'pypdfium',\n            '--rotate-pages',\n            '--rotate-pages-threshold',\n            '0.1',\n            '--plugin',\n            'tests/plugins/tesseract_cache.py',\n        )\n\n    def test_auto_with_rotation(self, resources, outpdf):\n        \"\"\"Test auto rasterizer with page rotation.\"\"\"\n        check_ocrmypdf(\n            resources / 'cardinal.pdf',\n            outpdf,\n            '--rasterizer',\n            'auto',\n            '--rotate-pages',\n            '--rotate-pages-threshold',\n            '0.1',\n            '--plugin',\n            'tests/plugins/tesseract_cache.py',\n        )\n\n\nclass TestRasterizerHookDirect:\n    \"\"\"Test rasterize_pdf_page hook directly with different rasterizer options.\"\"\"\n\n    def test_ghostscript_hook_respects_option(self, resources, tmp_path):\n        \"\"\"Test that Ghostscript hook 
returns None when pypdfium is requested.\"\"\"\n        pm = get_plugin_manager([])\n\n        # Create options requesting pypdfium\n        options = OcrOptions(\n            input_file=resources / 'graph.pdf',\n            output_file=tmp_path / 'out.pdf',\n            rasterizer='pypdfium',\n        )\n\n        img = tmp_path / 'ghostscript_test.png'\n        result = pm.rasterize_pdf_page(\n            input_file=resources / 'graph.pdf',\n            output_file=img,\n            raster_device='pngmono',\n            raster_dpi=Resolution(50, 50),\n            page_dpi=Resolution(50, 50),\n            pageno=1,\n            rotation=0,\n            filter_vector=False,\n            stop_on_soft_error=True,\n            options=options,\n            use_cropbox=False,\n        )\n        # When pypdfium is requested:\n        # - If pypdfium IS available, pypdfium handles it and returns the path\n        # - If pypdfium is NOT available, both plugins return None\n        #   (ghostscript returns None because pypdfium was requested,\n        #    pypdfium returns None because it's not installed)\n        if PYPDFIUM_AVAILABLE:\n            assert result == img\n        else:\n            assert result is None\n\n    def test_pypdfium_hook_respects_option(self, resources, tmp_path):\n        \"\"\"Test that pypdfium hook returns None when ghostscript is requested.\"\"\"\n        pm = get_plugin_manager([])\n\n        # Create options requesting ghostscript\n        options = OcrOptions(\n            input_file=resources / 'graph.pdf',\n            output_file=tmp_path / 'out.pdf',\n            rasterizer='ghostscript',\n        )\n\n        img = tmp_path / 'pypdfium_test.png'\n        result = pm.rasterize_pdf_page(\n            input_file=resources / 'graph.pdf',\n            output_file=img,\n            raster_device='pngmono',\n            raster_dpi=Resolution(50, 50),\n            page_dpi=Resolution(50, 50),\n            pageno=1,\n            
rotation=0,\n            filter_vector=False,\n            stop_on_soft_error=True,\n            options=options,\n            use_cropbox=False,\n        )\n        # Ghostscript should handle it\n        assert result == img\n        assert img.exists()\n\n    def test_auto_uses_pypdfium_when_available(self, resources, tmp_path):\n        \"\"\"Test that auto mode uses pypdfium when available.\"\"\"\n        pm = get_plugin_manager([])\n\n        options = OcrOptions(\n            input_file=resources / 'graph.pdf',\n            output_file=tmp_path / 'out.pdf',\n            rasterizer='auto',\n        )\n\n        img = tmp_path / 'auto_test.png'\n        result = pm.rasterize_pdf_page(\n            input_file=resources / 'graph.pdf',\n            output_file=img,\n            raster_device='pngmono',\n            raster_dpi=Resolution(50, 50),\n            page_dpi=Resolution(50, 50),\n            pageno=1,\n            rotation=0,\n            filter_vector=False,\n            stop_on_soft_error=True,\n            options=options,\n            use_cropbox=False,\n        )\n        assert result == img\n        assert img.exists()\n\n\ndef _create_gradient_image(width: int, height: int) -> Image.Image:\n    \"\"\"Create an image with multiple gradients to detect rasterization errors.\n\n    The image contains:\n    - Horizontal gradient from red to blue\n    - Vertical gradient overlay from green to transparent\n    - Diagonal bands for edge detection\n    \"\"\"\n    img = Image.new('RGB', (width, height))\n    pixels = img.load()\n\n    for y in range(height):\n        for x in range(width):\n            # Horizontal gradient: red to blue\n            r = int(255 * (1 - x / width))\n            b = int(255 * (x / width))\n\n            # Vertical gradient: add green component\n            g = int(255 * (y / height))\n\n            # Add diagonal bands for edge detection\n            band = ((x + y) // 20) % 2\n            if band:\n                r = 
min(255, r + 40)\n                g = min(255, g + 40)\n                b = min(255, b + 40)\n\n            pixels[x, y] = (r, g, b)\n\n    return img\n\n\n@pytest.fixture\ndef pdf_with_nonstandard_boxes(tmp_path):\n    \"\"\"Create a PDF with nonstandard MediaBox, TrimBox and CropBox.\"\"\"\n    # Create an image with gradients to detect rasterization errors\n    img = _create_gradient_image(200, 300)\n    img_bytes = BytesIO()\n    img.save(img_bytes, format='PNG')\n    img_bytes.seek(0)\n\n    # Convert to PDF\n    pdf_bytes = BytesIO()\n    img2pdf.convert(\n        img_bytes.read(),\n        layout_fun=img2pdf.get_fixed_dpi_layout_fun((72, 72)),\n        outputstream=pdf_bytes,\n        **IMG2PDF_KWARGS,\n    )\n    pdf_bytes.seek(0)\n\n    # Modify the PDF to have nonstandard boxes\n    pdf_path = tmp_path / 'nonstandard_boxes.pdf'\n    with pikepdf.open(pdf_bytes) as pdf:\n        page = pdf.pages[0]\n        # Set MediaBox larger than content\n        page.MediaBox = pikepdf.Array([0, 0, 400, 500])\n        # Set CropBox smaller - this is what viewers typically show\n        page.CropBox = pikepdf.Array([50, 50, 350, 450])\n        # Set TrimBox even smaller - indicates intended trim area\n        page.TrimBox = pikepdf.Array([75, 75, 325, 425])\n        pdf.save(pdf_path)\n\n    return pdf_path\n\n\n@pytest.fixture\ndef pdf_with_negative_mediabox(tmp_path):\n    \"\"\"Create a PDF with MediaBox that has negative origin coordinates.\"\"\"\n    # Create an image with gradients to detect rasterization errors\n    img = _create_gradient_image(200, 300)\n    img_bytes = BytesIO()\n    img.save(img_bytes, format='PNG')\n    img_bytes.seek(0)\n\n    pdf_bytes = BytesIO()\n    img2pdf.convert(\n        img_bytes.read(),\n        layout_fun=img2pdf.get_fixed_dpi_layout_fun((72, 72)),\n        outputstream=pdf_bytes,\n        **IMG2PDF_KWARGS,\n    )\n    pdf_bytes.seek(0)\n\n    pdf_path = tmp_path / 'negative_mediabox.pdf'\n    with pikepdf.open(pdf_bytes) as 
pdf:\n        page = pdf.pages[0]\n        # MediaBox with negative origin (valid PDF but unusual)\n        page.MediaBox = pikepdf.Array([-100, -100, 300, 400])\n        pdf.save(pdf_path)\n\n    return pdf_path\n\n\nclass TestRasterizerWithNonStandardBoxes:\n    \"\"\"Test rasterizers with PDFs having nonstandard MediaBox/TrimBox/CropBox.\"\"\"\n\n    def test_ghostscript_nonstandard_boxes(self, pdf_with_nonstandard_boxes, outpdf):\n        \"\"\"Test Ghostscript handles nonstandard page boxes correctly.\"\"\"\n        check_ocrmypdf(\n            pdf_with_nonstandard_boxes,\n            outpdf,\n            '--rasterizer',\n            'ghostscript',\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        )\n\n    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason=\"pypdfium2 not installed\")\n    def test_pypdfium_nonstandard_boxes(self, pdf_with_nonstandard_boxes, outpdf):\n        \"\"\"Test pypdfium handles nonstandard page boxes correctly.\"\"\"\n        check_ocrmypdf(\n            pdf_with_nonstandard_boxes,\n            outpdf,\n            '--rasterizer',\n            'pypdfium',\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        )\n\n    def test_ghostscript_negative_mediabox(self, pdf_with_negative_mediabox, outpdf):\n        \"\"\"Test Ghostscript handles negative MediaBox origin.\"\"\"\n        check_ocrmypdf(\n            pdf_with_negative_mediabox,\n            outpdf,\n            '--rasterizer',\n            'ghostscript',\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        )\n\n    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason=\"pypdfium2 not installed\")\n    def test_pypdfium_negative_mediabox(self, pdf_with_negative_mediabox, outpdf):\n        \"\"\"Test pypdfium handles negative MediaBox origin.\"\"\"\n        check_ocrmypdf(\n            pdf_with_negative_mediabox,\n            outpdf,\n            '--rasterizer',\n            'pypdfium',\n          
  '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        )\n\n    def test_compare_rasterizers_nonstandard_boxes(\n        self, pdf_with_nonstandard_boxes, tmp_path\n    ):\n        \"\"\"Compare output dimensions between rasterizers for nonstandard boxes.\"\"\"\n        pm = get_plugin_manager([])\n\n        options_gs = OcrOptions(\n            input_file=pdf_with_nonstandard_boxes,\n            output_file=tmp_path / 'out_gs.pdf',\n            rasterizer='ghostscript',\n        )\n\n        img_gs = tmp_path / 'gs.png'\n        pm.rasterize_pdf_page(\n            input_file=pdf_with_nonstandard_boxes,\n            output_file=img_gs,\n            raster_device='png16m',\n            raster_dpi=Resolution(72, 72),\n            page_dpi=Resolution(72, 72),\n            pageno=1,\n            rotation=0,\n            filter_vector=False,\n            stop_on_soft_error=True,\n            options=options_gs,\n            use_cropbox=False,\n        )\n\n        with Image.open(img_gs) as im_gs:\n            gs_size = im_gs.size\n\n        if PYPDFIUM_AVAILABLE:\n            options_pdfium = OcrOptions(\n                input_file=pdf_with_nonstandard_boxes,\n                output_file=tmp_path / 'out_pdfium.pdf',\n                rasterizer='pypdfium',\n            )\n\n            img_pdfium = tmp_path / 'pdfium.png'\n            pm.rasterize_pdf_page(\n                input_file=pdf_with_nonstandard_boxes,\n                output_file=img_pdfium,\n                raster_device='png16m',\n                raster_dpi=Resolution(72, 72),\n                page_dpi=Resolution(72, 72),\n                pageno=1,\n                rotation=0,\n                filter_vector=False,\n                stop_on_soft_error=True,\n                options=options_pdfium,\n                use_cropbox=False,\n            )\n\n            with Image.open(img_pdfium) as im_pdfium:\n                pdfium_size = im_pdfium.size\n\n            # Both rasterizers should now 
produce MediaBox dimensions (400x500)\n            # when use_cropbox=False (the default)\n            assert gs_size == (400, 500), f\"Ghostscript size: {gs_size}\"\n            assert pdfium_size == (400, 500), f\"pypdfium size: {pdfium_size}\"\n\n\nclass TestRasterizerWithRotationAndBoxes:\n    \"\"\"Test rasterizer + rotation + nonstandard boxes combinations.\"\"\"\n\n    # The pdf_with_nonstandard_boxes fixture creates a PDF with:\n    # - MediaBox: [0, 0, 400, 500] → 400x500 points\n    # - CropBox: [50, 50, 350, 450] → 300x400 points\n    # - TrimBox: [75, 75, 325, 425] → 250x350 points\n    #\n    # With use_cropbox=False (default), both rasterizers use MediaBox\n    MEDIABOX_WIDTH = 400\n    MEDIABOX_HEIGHT = 500\n\n    def _get_expected_size(self, rotation: int) -> tuple[int, int]:\n        \"\"\"Get expected image dimensions after rotation.\"\"\"\n        width, height = self.MEDIABOX_WIDTH, self.MEDIABOX_HEIGHT\n\n        if rotation in (0, 180):\n            return (width, height)\n        else:  # 90, 270\n            return (height, width)\n\n    def test_ghostscript_rotation_dimensions(\n        self, pdf_with_nonstandard_boxes, tmp_path\n    ):\n        \"\"\"Test Ghostscript produces correct dimensions with rotation.\"\"\"\n        pm = get_plugin_manager([])\n\n        options = OcrOptions(\n            input_file=pdf_with_nonstandard_boxes,\n            output_file=tmp_path / 'out.pdf',\n            rasterizer='ghostscript',\n        )\n\n        for rotation in [0, 90, 180, 270]:\n            img_path = tmp_path / f'gs_rot{rotation}.png'\n            pm.rasterize_pdf_page(\n                input_file=pdf_with_nonstandard_boxes,\n                output_file=img_path,\n                raster_device='png16m',\n                raster_dpi=Resolution(72, 72),\n                page_dpi=Resolution(72, 72),\n                pageno=1,\n                rotation=rotation,\n                filter_vector=False,\n                stop_on_soft_error=True,\n     
           options=options,\n                use_cropbox=False,\n            )\n            assert img_path.exists(), f\"Failed to rasterize with rotation {rotation}\"\n\n            with Image.open(img_path) as img:\n                expected = self._get_expected_size(rotation)\n                # Allow small tolerance for rounding\n                assert abs(img.size[0] - expected[0]) <= 2, (\n                    f\"Width mismatch at {rotation}°: got {img.size[0]}, \"\n                    f\"expected {expected[0]}\"\n                )\n                assert abs(img.size[1] - expected[1]) <= 2, (\n                    f\"Height mismatch at {rotation}°: got {img.size[1]}, \"\n                    f\"expected {expected[1]}\"\n                )\n\n    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason=\"pypdfium2 not installed\")\n    def test_pypdfium_rotation_dimensions(self, pdf_with_nonstandard_boxes, tmp_path):\n        \"\"\"Test pypdfium produces correct dimensions with rotation.\"\"\"\n        pm = get_plugin_manager([])\n\n        options = OcrOptions(\n            input_file=pdf_with_nonstandard_boxes,\n            output_file=tmp_path / 'out.pdf',\n            rasterizer='pypdfium',\n        )\n\n        for rotation in [0, 90, 180, 270]:\n            img_path = tmp_path / f'pdfium_rot{rotation}.png'\n            pm.rasterize_pdf_page(\n                input_file=pdf_with_nonstandard_boxes,\n                output_file=img_path,\n                raster_device='png16m',\n                raster_dpi=Resolution(72, 72),\n                page_dpi=Resolution(72, 72),\n                pageno=1,\n                rotation=rotation,\n                filter_vector=False,\n                stop_on_soft_error=True,\n                options=options,\n                use_cropbox=False,\n            )\n            assert img_path.exists(), f\"Failed to rasterize with rotation {rotation}\"\n\n            with Image.open(img_path) as img:\n                expected = 
self._get_expected_size(rotation)\n                # Allow small tolerance for rounding\n                assert abs(img.size[0] - expected[0]) <= 2, (\n                    f\"Width mismatch at {rotation}°: got {img.size[0]}, \"\n                    f\"expected {expected[0]}\"\n                )\n                assert abs(img.size[1] - expected[1]) <= 2, (\n                    f\"Height mismatch at {rotation}°: got {img.size[1]}, \"\n                    f\"expected {expected[1]}\"\n                )\n\n    @pytest.mark.skipif(not PYPDFIUM_AVAILABLE, reason=\"pypdfium2 not installed\")\n    def test_rasterizers_produce_same_dimensions(\n        self, pdf_with_nonstandard_boxes, tmp_path\n    ):\n        \"\"\"Verify ghostscript and pypdfium produce the same MediaBox dimensions.\n\n        With use_cropbox=False (the default), both rasterizers should render\n        to the MediaBox and produce identical dimensions.\n        \"\"\"\n        pm = get_plugin_manager([])\n\n        for rotation in [0, 90, 180, 270]:\n            # Rasterize with Ghostscript\n            gs_options = OcrOptions(\n                input_file=pdf_with_nonstandard_boxes,\n                output_file=tmp_path / 'out.pdf',\n                rasterizer='ghostscript',\n            )\n            gs_img_path = tmp_path / f'gs_cmp_rot{rotation}.png'\n            pm.rasterize_pdf_page(\n                input_file=pdf_with_nonstandard_boxes,\n                output_file=gs_img_path,\n                raster_device='png16m',\n                raster_dpi=Resolution(72, 72),\n                page_dpi=Resolution(72, 72),\n                pageno=1,\n                rotation=rotation,\n                filter_vector=False,\n                stop_on_soft_error=True,\n                options=gs_options,\n                use_cropbox=False,\n            )\n\n            # Rasterize with pypdfium\n            pdfium_options = OcrOptions(\n                input_file=pdf_with_nonstandard_boxes,\n                
output_file=tmp_path / 'out.pdf',\n                rasterizer='pypdfium',\n            )\n            pdfium_img_path = tmp_path / f'pdfium_cmp_rot{rotation}.png'\n            pm.rasterize_pdf_page(\n                input_file=pdf_with_nonstandard_boxes,\n                output_file=pdfium_img_path,\n                raster_device='png16m',\n                raster_dpi=Resolution(72, 72),\n                page_dpi=Resolution(72, 72),\n                pageno=1,\n                rotation=rotation,\n                filter_vector=False,\n                stop_on_soft_error=True,\n                options=pdfium_options,\n                use_cropbox=False,\n            )\n\n            # Verify both produce the same MediaBox dimensions\n            with (\n                Image.open(gs_img_path) as gs_img,\n                Image.open(pdfium_img_path) as pdfium_img,\n            ):\n                expected = self._get_expected_size(rotation)\n\n                assert abs(gs_img.size[0] - expected[0]) <= 2, (\n                    f\"GS width at {rotation}°: {gs_img.size[0]}, \"\n                    f\"expected {expected[0]}\"\n                )\n                assert abs(gs_img.size[1] - expected[1]) <= 2, (\n                    f\"GS height at {rotation}°: {gs_img.size[1]}, \"\n                    f\"expected {expected[1]}\"\n                )\n                assert abs(pdfium_img.size[0] - expected[0]) <= 2, (\n                    f\"pdfium width at {rotation}°: {pdfium_img.size[0]}, \"\n                    f\"expected {expected[0]}\"\n                )\n                assert abs(pdfium_img.size[1] - expected[1]) <= 2, (\n                    f\"pdfium height at {rotation}°: {pdfium_img.size[1]}, \"\n                    f\"expected {expected[1]}\"\n                )\n"
  },
  {
    "path": "tests/test_rotation.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport operator\nfrom io import BytesIO\nfrom math import cos, pi, sin\nfrom os import fspath\nfrom subprocess import run\n\nimport img2pdf\nimport pikepdf\nimport pytest\nfrom PIL import Image, ImageChops\nfrom reportlab.pdfgen.canvas import Canvas\n\nfrom ocrmypdf._exec import ghostscript\nfrom ocrmypdf._plugin_manager import get_plugin_manager\nfrom ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution\nfrom ocrmypdf.pdfinfo import PdfInfo\nfrom ocrmypdf.pluginspec import GhostscriptRasterDevice\n\nfrom .conftest import check_ocrmypdf, run_ocrmypdf_api\n\n# pylintx: disable=unused-variable\n\nRENDERERS = ['fpdf2', 'sandwich']\n\n\ndef compare_images_monochrome(\n    outdir, reference_pdf, reference_pageno, test_pdf, test_pageno\n):\n    reference_png = outdir / f'{reference_pdf.name}.ref{reference_pageno:04d}.png'\n    test_png = outdir / f'{test_pdf.name}.test{test_pageno:04d}.png'\n\n    def rasterize(pdf, pageno, png):\n        if png.exists():\n            print(png)\n            return\n        ghostscript.rasterize_pdf(\n            pdf,\n            png,\n            raster_device=GhostscriptRasterDevice.PNGMONO,\n            raster_dpi=Resolution(100, 100),\n            pageno=pageno,\n            rotation=0,\n        )\n\n    rasterize(reference_pdf, reference_pageno, reference_png)\n    rasterize(test_pdf, test_pageno, test_png)\n\n    with Image.open(reference_png) as reference_im, Image.open(test_png) as test_im:\n        assert reference_im.mode == test_im.mode == '1'\n        assert reference_im.size == test_im.size, \"Images must be the same size\"\n\n        # XOR the images: matching pixels become 0, different pixels become 1\n        difference = ImageChops.logical_xor(reference_im, test_im)\n\n        # Count matching pixels directly using getcolors()\n        # For a binary image, getcolors returns [(count, 
0), (count, 255)] or subset\n        colors = difference.getcolors()\n        color_counts = {color: count for count, color in colors}\n        count_same = color_counts.get(0, 0)  # 0 = matching pixels (XOR result is 0)\n        count_different = color_counts.get(255, 0)  # 255 = different pixels\n        total = count_same + count_different\n\n        return count_same / total\n\n\ndef test_monochrome_comparison(resources, outdir):\n    # Self test: check that an incorrect rotated image has poor\n    # comparison with reference\n    cmp = compare_images_monochrome(\n        outdir,\n        reference_pdf=resources / 'cardinal.pdf',\n        reference_pageno=1,  # north facing page\n        test_pdf=resources / 'cardinal.pdf',\n        test_pageno=3,  # south facing page\n    )\n    assert cmp < 0.90\n    cmp = compare_images_monochrome(\n        outdir,\n        reference_pdf=resources / 'cardinal.pdf',\n        reference_pageno=2,\n        test_pdf=resources / 'cardinal.pdf',\n        test_pageno=2,\n    )\n    assert cmp > 0.95\n\n\n@pytest.mark.slow\n@pytest.mark.parametrize('renderer', RENDERERS)\ndef test_autorotate(renderer, resources, outdir):\n    # cardinal.pdf contains four copies of an image rotated in each cardinal\n    # direction - these ones are \"burned in\" not tagged with /Rotate\n    check_ocrmypdf(\n        resources / 'cardinal.pdf',\n        outdir / 'out.pdf',\n        '-r',\n        '-v',\n        '1',\n        '--pdf-renderer',\n        renderer,\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n    for n in range(1, 4 + 1):\n        cmp = compare_images_monochrome(\n            outdir,\n            reference_pdf=resources / 'cardinal.pdf',\n            reference_pageno=1,\n            test_pdf=outdir / 'out.pdf',\n            test_pageno=n,\n        )\n        assert cmp > 0.95\n\n\n@pytest.mark.parametrize(\n    'threshold, op, comparison_threshold',\n    [\n        ('1', operator.ge, 0.95),  # Low thresh -> always 
rotate -> high score\n        ('99', operator.le, 0.90),  # High thresh -> never rotate -> low score\n    ],\n)\ndef test_autorotate_threshold(threshold, op, comparison_threshold, resources, outdir):\n    check_ocrmypdf(\n        resources / 'cardinal.pdf',\n        outdir / 'out.pdf',\n        '--rotate-pages-threshold',\n        threshold,\n        '-r',\n        # '-v',\n        # '1',\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n\n    cmp = compare_images_monochrome(  # pylint: disable=unused-variable\n        outdir,\n        reference_pdf=resources / 'cardinal.pdf',\n        reference_pageno=1,\n        test_pdf=outdir / 'out.pdf',\n        test_pageno=3,\n    )\n\n    assert op(cmp, comparison_threshold)\n\n\n@pytest.mark.parametrize('rasterizer', ['pypdfium', 'ghostscript'])\ndef test_rotated_skew_timeout(resources, outpdf, rasterizer):\n    \"\"\"Check rotated skew timeout.\n\n    This document contains an image that is rotated 90 into place with a\n    /Rotate tag and intentionally skewed by altering the transformation matrix.\n\n    This tests for a bug where the combination of preprocessing and a tesseract\n    timeout produced a page whose dimensions did not match the original's.\n    \"\"\"\n    input_file = resources / 'rotated_skew.pdf'\n    in_pageinfo = PdfInfo(input_file)[0]\n\n    assert (\n        in_pageinfo.height_pixels < in_pageinfo.width_pixels\n    ), \"Expected the input page to be landscape\"\n    assert in_pageinfo.rotation == 90, \"Expected a rotated page\"\n\n    out = check_ocrmypdf(\n        input_file,\n        outpdf,\n        '--pdf-renderer',\n        'fpdf2',\n        '--deskew',\n        '--tesseract-timeout',\n        '0',\n        '--rasterizer',\n        rasterizer,\n    )\n\n    out_pageinfo = PdfInfo(out)[0]\n    w, h = out_pageinfo.width_pixels, out_pageinfo.height_pixels\n\n    assert h > w, \"Expected the output page to be portrait\"\n\n    assert out_pageinfo.rotation == 0, \"Expected no 
page rotation for output\"\n\n    assert (\n        in_pageinfo.width_pixels == h and in_pageinfo.height_pixels == w\n    ), \"Expected page rotation to be baked in\"\n\n\n@pytest.mark.parametrize('rasterizer', ['pypdfium', 'ghostscript'])\ndef test_rotate_deskew_ocr_timeout(resources, outdir, rasterizer):\n    check_ocrmypdf(\n        resources / 'rotated_skew.pdf',\n        outdir / 'deskewed.pdf',\n        '--rotate-pages',\n        '--rotate-pages-threshold',\n        '0',\n        '--deskew',\n        '--tesseract-timeout',\n        '0',\n        '--pdf-renderer',\n        'fpdf2',\n        '--rasterizer',\n        rasterizer,\n    )\n\n    cmp = compare_images_monochrome(\n        outdir,\n        reference_pdf=resources / 'ccitt.pdf',\n        reference_pageno=1,\n        test_pdf=outdir / 'deskewed.pdf',\n        test_pageno=1,\n    )\n\n    # Confirm that the page still got deskewed\n    # pypdfium anti-aliases so gets better visual quality, but lower score (0.88)\n    # on monochrome comparison; ghostscript looks ugly but gets > 0.95\n    assert cmp > 0.85\n\n\ndef make_rotate_test(imagefile, outdir, prefix, image_angle, page_angle, cropbox=None):\n    memimg = BytesIO()\n    with Image.open(fspath(imagefile)) as im:\n        if image_angle != 0:\n            ccw_angle = -image_angle % 360\n            im = im.transpose(getattr(Image.Transpose, f'ROTATE_{ccw_angle}'))\n        im.save(memimg, format='PNG')\n    memimg.seek(0)\n    mempdf = BytesIO()\n    img2pdf.convert(\n        memimg.read(),\n        layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),\n        outputstream=mempdf,\n        **IMG2PDF_KWARGS,\n    )\n    mempdf.seek(0)\n    with pikepdf.open(mempdf) as pdf:\n        pdf.pages[0].Rotate = page_angle\n        target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'\n        if cropbox:\n            pdf.pages[0].CropBox = cropbox\n        pdf.save(target)\n        return 
target\n\n\n@pytest.mark.slow\n@pytest.mark.parametrize('page_angle', (0, 90, 180, 270))\n@pytest.mark.parametrize('image_angle', (0, 90, 180, 270))\ndef test_rotate_page_level(image_angle, page_angle, resources, outdir, caplog):\n    reference = make_rotate_test(resources / 'typewriter.png', outdir, 'ref', 0, 0)\n    test = make_rotate_test(\n        resources / 'typewriter.png', outdir, 'test', image_angle, page_angle\n    )\n    out = test.with_suffix('.out.pdf')\n\n    exitcode = run_ocrmypdf_api(\n        test,\n        out,\n        '-O0',\n        '--rotate-pages',\n        '--rotate-pages-threshold',\n        '0.001',\n    )\n    assert exitcode == 0, caplog.text\n\n    assert compare_images_monochrome(outdir, reference, 1, out, 1) > 0.2\n\n\n@pytest.mark.slow\n@pytest.mark.parametrize('page_rotate_angle', (0, 90, 180, 270))\ndef test_page_rotate_tag(page_rotate_angle, resources, outdir, caplog):\n    # Check that pages that have an image that is misrotated but restored to\n    # correct rotation with a /Rotate will be processed correctly and yield text.\n    test = make_rotate_test(\n        resources / 'crom.png', outdir, 'test', -page_rotate_angle, page_rotate_angle\n    )\n    out = test.with_suffix('.out.pdf')\n    exitcode = run_ocrmypdf_api(\n        test,\n        out,\n        '-O0',\n    )\n    assert exitcode == 0, caplog.text\n\n    def pdftotext(filename):\n        return (\n            run(['pdftotext', '-enc', 'UTF-8', filename, '-'], capture_output=True)\n            .stdout.strip()\n            .decode('utf-8')\n        )\n\n    test_text = pdftotext(out)\n    assert 'is a' in test_text, test_text\n\n\n@pytest.mark.parametrize('page_rotate_angle', (0, 90, 180, 270))\n@pytest.mark.parametrize('renderer', ['sandwich', 'fpdf2'])\n@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])\ndef test_rotate_and_crop(\n    resources, outdir, page_rotate_angle, renderer, output_type, caplog\n):\n    cropbox = (100, 200, 1000, 800)\n    reference = 
make_rotate_test(\n        resources / 'typewriter.png', outdir, 'ref', 0, 0, cropbox\n    )\n    test = make_rotate_test(\n        resources / 'typewriter.png',\n        outdir,\n        'test',\n        -page_rotate_angle,\n        page_rotate_angle,\n        cropbox,\n    )\n    out = test.with_suffix('.out.pdf')\n\n    exitcode = run_ocrmypdf_api(\n        test,\n        out,\n        '-O0',\n        '--rotate-pages',\n        '--rotate-pages-threshold',\n        '0',\n        '--pdf-renderer',\n        renderer,\n        '--output-type',\n        output_type,\n        '--no-progress-bar',\n    )\n    assert exitcode == 0, caplog.text\n\n    assert compare_images_monochrome(outdir, reference, 1, out, 1) > 0.9\n\n\n@pytest.mark.parametrize('rasterizer', ['pypdfium', 'ghostscript'])\ndef test_rasterize_rotates(resources, tmp_path, rasterizer):\n    from ocrmypdf._options import OcrOptions\n\n    pm = get_plugin_manager([])\n\n    options = OcrOptions(\n        input_file=resources / 'graph.pdf',\n        output_file=tmp_path / 'out.pdf',\n        rasterizer=rasterizer,\n    )\n\n    img = tmp_path / 'img90.png'\n    pm.rasterize_pdf_page(\n        input_file=resources / 'graph.pdf',\n        output_file=img,\n        raster_device=GhostscriptRasterDevice.PNGMONO,\n        raster_dpi=Resolution(20, 20),\n        page_dpi=Resolution(20, 20),\n        pageno=1,\n        rotation=90,\n        filter_vector=False,\n        stop_on_soft_error=True,\n        options=options,\n        use_cropbox=False,\n    )\n    with Image.open(img) as im:\n        assert im.size == (83, 200), \"Image not rotated\"\n\n    img = tmp_path / 'img180.png'\n    pm.rasterize_pdf_page(\n        input_file=resources / 'graph.pdf',\n        output_file=img,\n        raster_device=GhostscriptRasterDevice.PNGMONO,\n        raster_dpi=Resolution(20, 20),\n        page_dpi=Resolution(20, 20),\n        pageno=1,\n        rotation=180,\n        filter_vector=False,\n        
stop_on_soft_error=True,\n        options=options,\n        use_cropbox=False,\n    )\n    assert Image.open(img).size == (200, 83), \"Image not rotated\"\n\n\ndef test_simulated_scan(outdir):\n    canvas = Canvas(\n        fspath(outdir / 'fakescan.pdf'),\n        pagesize=(209.8, 297.6),\n    )\n\n    page_vars = [(2, 36, 250), (91, 170, 240), (179, 190, 36), (271, 36, 36)]\n\n    for n, page_var in enumerate(page_vars):\n        text = canvas.beginText()\n        text.setFont('Helvetica', 20)\n\n        angle, x, y = page_var\n        cos_a, sin_a = cos(angle / 180.0 * pi), sin(angle / 180.0 * pi)\n\n        text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, x, y)\n        text.textOut(f'Page {n + 1}')\n        canvas.drawText(text)\n        canvas.showPage()\n    canvas.save()\n\n    check_ocrmypdf(\n        outdir / 'fakescan.pdf',\n        outdir / 'out.pdf',\n        '--force-ocr',\n        '--deskew',\n        '--rotate-pages',\n        '--plugin',\n        'tests/plugins/tesseract_debug_rotate.py',\n    )\n\n    with pikepdf.open(outdir / 'out.pdf') as pdf:\n        assert (\n            pdf.pages[1].mediabox[2] > pdf.pages[1].mediabox[3]\n        ), \"Wrong orientation: not landscape\"\n        assert (\n            pdf.pages[3].mediabox[2] > pdf.pages[3].mediabox[3]\n        ), \"Wrong orientation: Not landscape\"\n\n        assert (\n            pdf.pages[0].mediabox[2] < pdf.pages[0].mediabox[3]\n        ), \"Wrong orientation: Not portrait\"\n        assert (\n            pdf.pages[2].mediabox[2] < pdf.pages[2].mediabox[3]\n        ), \"Wrong orientation: Not portrait\"\n"
  },
  {
    "path": "tests/test_semfree.py",
    "content": "# SPDX-FileCopyrightText: 2023 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport sys\n\nimport pytest\n\nfrom ocrmypdf.exceptions import ExitCode\n\nfrom .conftest import is_linux, run_ocrmypdf_api\n\n\n@pytest.mark.skipif(not is_linux(), reason='semfree plugin only works on Linux')\n@pytest.mark.skipif(\n    sys.version_info >= (3, 14),\n    reason='semfree plugin only works on Python 3.13 or earlier',\n)\ndef test_semfree(resources, outpdf):\n    with pytest.warns(DeprecationWarning, match=\"semfree.py is deprecated\"):\n        exitcode = run_ocrmypdf_api(\n            resources / 'multipage.pdf',\n            outpdf,\n            '--skip-text',\n            '--skip-big',\n            '2',\n            '--plugin',\n            'ocrmypdf.extra_plugins.semfree',\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        )\n        assert exitcode in (ExitCode.ok, ExitCode.pdfa_conversion_failed)\n"
  },
  {
    "path": "tests/test_soft_error.py",
    "content": "# SPDX-FileCopyrightText: 2023 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport os\n\nimport pytest\n\nfrom ocrmypdf.exceptions import ExitCode\n\nfrom .conftest import run_ocrmypdf_api\n\n\ndef test_raster_continue_on_soft_error(resources, outpdf):\n    exitcode = run_ocrmypdf_api(\n        resources / 'francais.pdf',\n        outpdf,\n        '--continue-on-soft-render-error',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n        '--plugin',\n        'tests/plugins/gs_raster_soft_error.py',\n    )\n    assert exitcode == ExitCode.ok\n\n\ndef test_raster_stop_on_soft_error(resources, outpdf):\n    exitcode = run_ocrmypdf_api(\n        resources / 'francais.pdf',\n        outpdf,\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n        '--plugin',\n        'tests/plugins/gs_raster_soft_error.py',\n    )\n    assert exitcode == ExitCode.child_process_error\n\n\ndef test_render_continue_on_soft_error(resources, outpdf):\n    exitcode = run_ocrmypdf_api(\n        resources / 'francais.pdf',\n        outpdf,\n        '--output-type',\n        'pdfa',  # Required to trigger Ghostscript PDF/A generation\n        '--continue-on-soft-render-error',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n        '--plugin',\n        'tests/plugins/gs_render_soft_error.py',\n    )\n    assert exitcode == ExitCode.ok\n\n\n@pytest.mark.skipif(os.name == 'nt', reason='Ghostscript on Windows errors out')\ndef test_render_stop_on_soft_error(resources, outpdf):\n    exitcode = run_ocrmypdf_api(\n        resources / 'francais.pdf',\n        outpdf,\n        '--output-type',\n        'pdfa',  # Required to trigger Ghostscript PDF/A generation\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n        '--plugin',\n        'tests/plugins/gs_render_soft_error.py',\n    )\n    assert exitcode == ExitCode.child_process_error\n"
  },
  {
    "path": "tests/test_stdio.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport os\nfrom subprocess import DEVNULL, PIPE, run\n\nimport pytest\n\nfrom ocrmypdf.helpers import check_pdf\n\nfrom .conftest import run_ocrmypdf\n\n\ndef test_stdin(ocrmypdf_exec, resources, outpdf):\n    input_file = str(resources / 'francais.pdf')\n    output_file = str(outpdf)\n\n    # Runs: ocrmypdf - output.pdf < testfile.pdf\n    with open(input_file, 'rb') as input_stream:\n        p_args = ocrmypdf_exec + [\n            '-',\n            output_file,\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        ]\n        run(p_args, capture_output=True, stdin=input_stream, check=True)\n\n\ndef test_stdout(ocrmypdf_exec, resources, outpdf):\n    if 'COV_CORE_DATAFILE' in os.environ:\n        pytest.skip(\"Coverage uses stdout\")\n\n    input_file = str(resources / 'francais.pdf')\n    output_file = str(outpdf)\n\n    # Runs: ocrmypdf francais.pdf - > test_stdout.pdf\n    with open(output_file, 'wb') as output_stream:\n        p_args = ocrmypdf_exec + [\n            input_file,\n            '-',\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        ]\n        run(p_args, stdout=output_stream, stderr=PIPE, stdin=DEVNULL, check=True)\n\n    assert check_pdf(output_file)\n\n\n@pytest.mark.skipif(os.name == 'nt', reason='Windows does not support /dev/null')\ndef test_dev_null(resources):\n    if 'COV_CORE_DATAFILE' in os.environ:\n        pytest.skip(\"Coverage uses stdout\")\n\n    p = run_ocrmypdf(\n        resources / 'trivial.pdf',\n        os.devnull,\n        '--force-ocr',\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n    assert p.returncode == 0, \"could not send output to /dev/null\"\n    assert len(p.stdout) == 0, \"wrote to stdout\"\n"
  },
  {
    "path": "tests/test_system_font_provider.py",
    "content": "# SPDX-FileCopyrightText: 2025 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\n\"\"\"Unit tests for SystemFontProvider and ChainedFontProvider.\"\"\"\n\nfrom __future__ import annotations\n\nimport sys\nfrom pathlib import Path\nfrom unittest.mock import MagicMock, patch\n\nimport pytest\n\nfrom ocrmypdf.font import (\n    BuiltinFontProvider,\n    ChainedFontProvider,\n    SystemFontProvider,\n)\n\n# --- SystemFontProvider Platform Detection Tests ---\n\n\nclass TestSystemFontProviderPlatform:\n    \"\"\"Test platform detection in SystemFontProvider.\"\"\"\n\n    def test_get_platform_linux(self):\n        \"\"\"Test Linux platform detection.\"\"\"\n        provider = SystemFontProvider()\n        with patch.object(sys, 'platform', 'linux'):\n            assert provider._get_platform() == 'linux'\n\n    def test_get_platform_darwin(self):\n        \"\"\"Test macOS platform detection.\"\"\"\n        provider = SystemFontProvider()\n        with patch.object(sys, 'platform', 'darwin'):\n            assert provider._get_platform() == 'darwin'\n\n    def test_get_platform_windows(self):\n        \"\"\"Test Windows platform detection.\"\"\"\n        provider = SystemFontProvider()\n        with patch.object(sys, 'platform', 'win32'):\n            assert provider._get_platform() == 'windows'\n\n    def test_get_platform_freebsd(self):\n        \"\"\"Test FreeBSD platform detection.\"\"\"\n        provider = SystemFontProvider()\n        with patch.object(sys, 'platform', 'freebsd13'):\n            assert provider._get_platform() == 'freebsd'\n\n\nclass TestSystemFontProviderDirectories:\n    \"\"\"Test font directory resolution.\"\"\"\n\n    def test_linux_font_dirs(self):\n        \"\"\"Test Linux font directories.\"\"\"\n        provider = SystemFontProvider()\n        with patch.object(sys, 'platform', 'linux'):\n            provider._font_dirs = None  # Reset cache\n            dirs = provider._get_font_dirs()\n            assert 
Path('/usr/share/fonts') in dirs\n            assert Path('/usr/local/share/fonts') in dirs\n\n    def test_darwin_font_dirs(self):\n        \"\"\"Test macOS font directories.\"\"\"\n        provider = SystemFontProvider()\n        with patch.object(sys, 'platform', 'darwin'):\n            provider._font_dirs = None  # Reset cache\n            dirs = provider._get_font_dirs()\n            assert Path('/Library/Fonts') in dirs\n            assert Path('/System/Library/Fonts') in dirs\n\n    def test_windows_font_dirs_with_windir(self):\n        \"\"\"Test Windows font directory from WINDIR env var.\"\"\"\n        provider = SystemFontProvider()\n        with (\n            patch.object(sys, 'platform', 'win32'),\n            patch.dict('os.environ', {'WINDIR': r'D:\\Windows'}),\n        ):\n            provider._font_dirs = None  # Reset cache\n            dirs = provider._get_font_dirs()\n            # Check that Fonts subdir of WINDIR is included\n            # Use str comparison to avoid Path normalization issues across platforms\n            dir_strs = [str(d) for d in dirs]\n            assert any('Fonts' in d for d in dir_strs)\n\n    def test_windows_font_dirs_default(self):\n        \"\"\"Test Windows font directory with default path.\"\"\"\n        provider = SystemFontProvider()\n        with (\n            patch.object(sys, 'platform', 'win32'),\n            patch.dict('os.environ', {}, clear=True),\n        ):\n            provider._font_dirs = None  # Reset cache\n            dirs = provider._get_font_dirs()\n            # Check that Windows\\Fonts is included (default fallback)\n            dir_strs = [str(d) for d in dirs]\n            assert any('Windows' in d and 'Fonts' in d for d in dir_strs)\n\n    def test_windows_font_dirs_with_localappdata(self):\n        \"\"\"Test Windows user fonts directory from LOCALAPPDATA env var.\"\"\"\n        provider = SystemFontProvider()\n        with (\n            patch.object(sys, 'platform', 'win32'),\n        
    patch.dict(\n                'os.environ',\n                {'WINDIR': r'C:\\Windows', 'LOCALAPPDATA': r'C:\\Users\\Test\\AppData\\Local'},\n            ),\n        ):\n            provider._font_dirs = None  # Reset cache\n            dirs = provider._get_font_dirs()\n            dir_strs = [str(d) for d in dirs]\n            # Should have both system and user font directories\n            assert len(dirs) == 2\n            assert any('Windows' in d and 'Fonts' in d for d in dir_strs)\n            assert any(\n                'AppData' in d and 'Local' in d and 'Fonts' in d\n                for d in dir_strs\n            )\n\n    def test_font_dirs_cached(self):\n        \"\"\"Test that font directories are cached.\"\"\"\n        provider = SystemFontProvider()\n        dirs1 = provider._get_font_dirs()\n        dirs2 = provider._get_font_dirs()\n        assert dirs1 is dirs2  # Same object, not recomputed\n\n\nclass TestSystemFontProviderLazyLoading:\n    \"\"\"Test lazy loading behavior.\"\"\"\n\n    def test_no_scanning_on_init(self):\n        \"\"\"Test that no directory scanning happens during initialization.\"\"\"\n        provider = SystemFontProvider()\n        # Caches should be empty\n        assert len(provider._font_cache) == 0\n        assert len(provider._not_found) == 0\n\n    def test_get_font_unknown_name_returns_none(self):\n        \"\"\"Test that unknown font names return None.\"\"\"\n        provider = SystemFontProvider()\n        result = provider.get_font('UnknownFont-Regular')\n        assert result is None\n        # Unknown fonts are added to not_found to cache the negative result\n        assert 'UnknownFont-Regular' in provider._not_found\n\n    def test_negative_cache(self):\n        \"\"\"Test that not-found results are cached.\"\"\"\n        provider = SystemFontProvider()\n        # Mock _find_font_file to return None\n        with patch.object(provider, '_find_font_file', return_value=None):\n            result1 = 
provider.get_font('NotoSansCJK-Regular')\n            assert result1 is None\n            assert 'NotoSansCJK-Regular' in provider._not_found\n\n            # Second call should not call _find_font_file again\n            provider._find_font_file = MagicMock(return_value=None)\n            result2 = provider.get_font('NotoSansCJK-Regular')\n            assert result2 is None\n            provider._find_font_file.assert_not_called()\n\n    def test_positive_cache(self):\n        \"\"\"Test that found fonts are cached.\"\"\"\n        provider = SystemFontProvider()\n        font_dir = Path(__file__).parent.parent / \"src\" / \"ocrmypdf\" / \"data\"\n        font_path = font_dir / \"NotoSans-Regular.ttf\"\n\n        if not font_path.exists():\n            pytest.skip(\"Test font not available\")\n\n        with patch.object(provider, '_find_font_file', return_value=font_path):\n            result1 = provider.get_font('NotoSans-Regular')\n            assert result1 is not None\n            assert 'NotoSans-Regular' in provider._font_cache\n\n            # Second call should use cache\n            provider._find_font_file = MagicMock()\n            result2 = provider.get_font('NotoSans-Regular')\n            assert result2 is result1\n            provider._find_font_file.assert_not_called()\n\n\nclass TestSystemFontProviderAvailableFonts:\n    \"\"\"Test get_available_fonts method.\"\"\"\n\n    def test_returns_all_patterns(self):\n        \"\"\"Test that get_available_fonts returns all known font patterns.\"\"\"\n        provider = SystemFontProvider()\n        fonts = provider.get_available_fonts()\n        assert 'NotoSans-Regular' in fonts\n        assert 'NotoSansCJK-Regular' in fonts\n        assert 'NotoSansArabic-Regular' in fonts\n        assert 'NotoSansThai-Regular' in fonts\n\n    def test_fallback_font_raises(self):\n        \"\"\"Test that get_fallback_font raises NotImplementedError.\"\"\"\n        provider = SystemFontProvider()\n        with 
pytest.raises(NotImplementedError):\n            provider.get_fallback_font()\n\n\n# --- ChainedFontProvider Tests ---\n\n\nclass TestChainedFontProvider:\n    \"\"\"Test ChainedFontProvider.\"\"\"\n\n    def test_requires_at_least_one_provider(self):\n        \"\"\"Test that empty provider list raises error.\"\"\"\n        with pytest.raises(ValueError, match=\"At least one provider\"):\n            ChainedFontProvider([])\n\n    def test_get_font_tries_providers_in_order(self):\n        \"\"\"Test that get_font tries providers in order.\"\"\"\n        provider1 = MagicMock()\n        provider1.get_font.return_value = None\n\n        provider2 = MagicMock()\n        mock_font = MagicMock()\n        provider2.get_font.return_value = mock_font\n\n        chain = ChainedFontProvider([provider1, provider2])\n        result = chain.get_font('TestFont')\n\n        provider1.get_font.assert_called_once_with('TestFont')\n        provider2.get_font.assert_called_once_with('TestFont')\n        assert result is mock_font\n\n    def test_get_font_stops_on_first_match(self):\n        \"\"\"Test that get_font stops after first successful match.\"\"\"\n        mock_font = MagicMock()\n        provider1 = MagicMock()\n        provider1.get_font.return_value = mock_font\n\n        provider2 = MagicMock()\n\n        chain = ChainedFontProvider([provider1, provider2])\n        result = chain.get_font('TestFont')\n\n        provider1.get_font.assert_called_once()\n        provider2.get_font.assert_not_called()\n        assert result is mock_font\n\n    def test_get_font_returns_none_if_all_fail(self):\n        \"\"\"Test that get_font returns None if all providers fail.\"\"\"\n        provider1 = MagicMock()\n        provider1.get_font.return_value = None\n\n        provider2 = MagicMock()\n        provider2.get_font.return_value = None\n\n        chain = ChainedFontProvider([provider1, provider2])\n        result = chain.get_font('TestFont')\n\n        assert result is None\n\n    
def test_get_available_fonts_combines_providers(self):\n        \"\"\"Test that get_available_fonts combines all providers.\"\"\"\n        provider1 = MagicMock()\n        provider1.get_available_fonts.return_value = ['Font1', 'Font2']\n\n        provider2 = MagicMock()\n        provider2.get_available_fonts.return_value = ['Font2', 'Font3']\n\n        chain = ChainedFontProvider([provider1, provider2])\n        fonts = chain.get_available_fonts()\n\n        assert fonts == ['Font1', 'Font2', 'Font3']  # Deduplicated, order preserved\n\n    def test_get_fallback_font_from_first_provider(self):\n        \"\"\"Test that get_fallback_font uses first available fallback.\"\"\"\n        mock_font = MagicMock()\n        provider1 = MagicMock()\n        provider1.get_fallback_font.return_value = mock_font\n\n        provider2 = MagicMock()\n\n        chain = ChainedFontProvider([provider1, provider2])\n        result = chain.get_fallback_font()\n\n        assert result is mock_font\n        provider2.get_fallback_font.assert_not_called()\n\n    def test_get_fallback_font_skips_not_implemented(self):\n        \"\"\"Test that get_fallback_font skips providers that raise.\"\"\"\n        provider1 = MagicMock()\n        provider1.get_fallback_font.side_effect = NotImplementedError()\n\n        mock_font = MagicMock()\n        provider2 = MagicMock()\n        provider2.get_fallback_font.return_value = mock_font\n\n        chain = ChainedFontProvider([provider1, provider2])\n        result = chain.get_fallback_font()\n\n        assert result is mock_font\n\n    def test_get_fallback_font_raises_if_none_available(self):\n        \"\"\"Test that get_fallback_font raises if no provider has fallback.\"\"\"\n        provider1 = MagicMock()\n        provider1.get_fallback_font.side_effect = NotImplementedError()\n\n        provider2 = MagicMock()\n        provider2.get_fallback_font.side_effect = KeyError()\n\n        chain = ChainedFontProvider([provider1, provider2])\n        with 
pytest.raises(RuntimeError, match=\"No fallback font available\"):\n            chain.get_fallback_font()\n\n\nclass TestChainedFontProviderIntegration:\n    \"\"\"Integration tests with real providers.\"\"\"\n\n    @pytest.fixture\n    def font_dir(self):\n        \"\"\"Return path to font directory.\"\"\"\n        return Path(__file__).parent.parent / \"src\" / \"ocrmypdf\" / \"data\"\n\n    def test_builtin_then_system_chain(self, font_dir):\n        \"\"\"Test chaining BuiltinFontProvider with SystemFontProvider.\"\"\"\n        builtin = BuiltinFontProvider(font_dir)\n        system = SystemFontProvider()\n\n        chain = ChainedFontProvider([builtin, system])\n\n        # Should find NotoSans from builtin\n        font = chain.get_font('NotoSans-Regular')\n        assert font is not None\n\n        # Should get fallback from builtin\n        fallback = chain.get_fallback_font()\n        assert fallback is not None\n\n    def test_system_fonts_extend_builtin(self, font_dir):\n        \"\"\"Test that system fonts add to builtin fonts.\"\"\"\n        builtin = BuiltinFontProvider(font_dir)\n        system = SystemFontProvider()\n\n        chain = ChainedFontProvider([builtin, system])\n\n        builtin_fonts = set(builtin.get_available_fonts())\n        chain_fonts = set(chain.get_available_fonts())\n\n        # Chain should have at least as many fonts as builtin\n        assert chain_fonts >= builtin_fonts\n"
  },
  {
    "path": "tests/test_tagged.py",
    "content": "# SPDX-FileCopyrightText: 2023 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport pytest\n\nimport ocrmypdf\n\n\ndef test_block_tagged(resources):\n    with pytest.raises(ocrmypdf.exceptions.TaggedPDFError):\n        ocrmypdf.ocr(resources / 'tagged.pdf', '_.pdf')\n\n\ndef test_force_tagged_warns(resources, outpdf, caplog):\n    caplog.set_level('WARNING')\n    ocrmypdf.ocr(\n        resources / 'tagged.pdf',\n        outpdf,\n        force_ocr=True,\n        plugins=['tests/plugins/tesseract_noop.py'],\n    )\n    assert 'marked as a Tagged PDF' in caplog.text\n\n\ndef test_tagged_pdf_mode_ignore_with_skip_text(resources, outpdf, caplog):\n    \"\"\"Ignore tagged_pdf_mode should warn but not error.\"\"\"\n    caplog.set_level('WARNING')\n    ocrmypdf.ocr(\n        resources / 'tagged.pdf',\n        outpdf,\n        tagged_pdf_mode='ignore',\n        skip_text=True,  # Tagged PDF has text, so skip pages with text\n        plugins=['tests/plugins/tesseract_noop.py'],\n    )\n    assert 'marked as a Tagged PDF' in caplog.text\n\n\ndef test_tagged_pdf_mode_ignore_with_force(resources, outpdf, caplog):\n    \"\"\"Ignore tagged_pdf_mode with force mode should warn.\"\"\"\n    caplog.set_level('WARNING')\n    ocrmypdf.ocr(\n        resources / 'tagged.pdf',\n        outpdf,\n        tagged_pdf_mode='ignore',\n        force_ocr=True,\n        plugins=['tests/plugins/tesseract_noop.py'],\n    )\n    assert 'marked as a Tagged PDF' in caplog.text\n"
  },
  {
    "path": "tests/test_tesseract.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nimport subprocess\nfrom os import fspath\nfrom pathlib import Path\n\nimport pytest\n\nfrom ocrmypdf import pdfinfo\nfrom ocrmypdf._exec import tesseract\nfrom ocrmypdf.exceptions import BadArgsError, MissingDependencyError\n\nfrom .conftest import check_ocrmypdf, run_ocrmypdf_api\n\n# pylint: disable=redefined-outer-name\n\n\n@pytest.mark.parametrize('basename', ['graph_ocred.pdf', 'cardinal.pdf'])\ndef test_skip_pages_does_not_replicate(resources, basename, outdir):\n    infile = resources / basename\n    outpdf = outdir / basename\n\n    check_ocrmypdf(\n        infile,\n        outpdf,\n        '--pdf-renderer',\n        'sandwich',\n        '--force-ocr',\n        '--tesseract-timeout',\n        '0',\n    )\n\n    info_in = pdfinfo.PdfInfo(infile)\n\n    info = pdfinfo.PdfInfo(outpdf)\n    for page in info:\n        assert len(page.images) == 1, \"skipped page was replicated\"\n\n    for n, info_out_n in enumerate(info):\n        assert info_out_n.width_inches == info_in[n].width_inches, \"output resized\"\n        assert info_out_n.height_inches == info_in[n].height_inches, \"output resized\"\n\n\ndef test_content_preservation(resources, outpdf):\n    infile = resources / 'masks.pdf'\n\n    check_ocrmypdf(\n        infile, outpdf, '--pdf-renderer', 'fpdf2', '--tesseract-timeout', '0'\n    )\n\n    info = pdfinfo.PdfInfo(outpdf)\n    page = info[0]\n    assert len(page.images) > 1, \"masks were rasterized\"\n\n\n@pytest.mark.skipif(\n    tesseract.version() >= tesseract.TesseractVersion('5'), reason=\"doesn't fool Tess 5\"\n)\ndef test_no_languages(tmp_path, monkeypatch):\n    (tmp_path / 'tessdata').mkdir()\n    monkeypatch.setenv('TESSDATA_PREFIX', fspath(tmp_path))\n    with pytest.raises(MissingDependencyError):\n        tesseract.get_languages()\n\n\ndef 
test_image_too_large_hocr(monkeypatch, resources, outdir):\n    def dummy_run(args, *, env=None, **kwargs):\n        raise subprocess.CalledProcessError(1, 'tesseract', output=b'Image too large')\n\n    monkeypatch.setattr(tesseract, 'run', dummy_run)\n    tesseract.generate_hocr(\n        input_file=resources / 'crom.png',\n        output_hocr=outdir / 'out.hocr',\n        output_text=outdir / 'out.txt',\n        languages=['eng'],\n        engine_mode=None,\n        tessconfig=[],\n        timeout=180.0,\n        pagesegmode=None,\n        thresholding=0,\n        user_words=None,\n        user_patterns=None,\n    )\n    assert Path(outdir / 'out.hocr').read_text() == ''\n\n\ndef test_image_too_large_pdf(monkeypatch, resources, outdir):\n    def dummy_run(args, *, env=None, **kwargs):\n        raise subprocess.CalledProcessError(1, 'tesseract', output=b'Image too large')\n\n    monkeypatch.setattr(tesseract, 'run', dummy_run)\n    tesseract.generate_pdf(\n        input_file=resources / 'crom.png',\n        output_pdf=outdir / 'pdf.pdf',\n        output_text=outdir / 'txt.txt',\n        languages=['eng'],\n        engine_mode=None,\n        tessconfig=[],\n        timeout=180.0,\n        pagesegmode=None,\n        thresholding=0,\n        user_words=None,\n        user_patterns=None,\n    )\n    assert Path(outdir / 'txt.txt').read_text() == '[skipped page]'\n    if os.name != 'nt':  # different semantics\n        assert Path(outdir / 'pdf.pdf').stat().st_size == 0\n\n\ndef test_timeout(caplog):\n    tesseract.page_timedout(5)\n    assert \"took too long\" in caplog.text\n\n\n@pytest.mark.parametrize(\n    'in_, logged',\n    [\n        (b'Tesseract Open Source', ''),\n        (b'lots of diacritics blah blah', 'diacritics'),\n        (b'Warning in pixReadMem', ''),\n        (b'OSD: Weak margin', 'unsure about page orientation'),\n        (b'Error in pixScanForForeground', ''),\n        (b'Error in boxClipToRectangle', ''),\n        (b'an unexpected error', 'an 
unexpected error'),\n        (b'a dire warning', 'a dire warning'),\n        (b'read_params_file something', 'read_params_file'),\n        (b'an innocent message', 'innocent'),\n        (b'\\x7f\\x7f\\x80innocent unicode failure', 'innocent'),\n    ],\n)\ndef test_tesseract_log_output(caplog, in_, logged):\n    caplog.set_level(logging.INFO)\n    tesseract.tesseract_log_output(in_)\n    if logged == '':\n        assert caplog.text == ''\n    else:\n        assert logged in caplog.text\n\n\ndef test_tesseract_log_output_raises(caplog):\n    with pytest.raises(tesseract.TesseractConfigError):\n        tesseract.tesseract_log_output(b'parameter not found: moo')\n    assert 'not found' in caplog.text\n\n\ndef test_blocked_language(resources, no_outpdf):\n    infile = resources / 'masks.pdf'\n    for bad_lang in ['osd', 'equ']:\n        with pytest.raises(BadArgsError):\n            run_ocrmypdf_api(infile, no_outpdf, '-l', bad_lang)\n"
  },
  {
    "path": "tests/test_unpaper.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport logging\nfrom os import fspath\nfrom unittest.mock import Mock, patch\n\nimport pytest\nfrom packaging.version import Version\nfrom pydantic import ValidationError\n\nfrom ocrmypdf._exec import unpaper\nfrom ocrmypdf._validation import check_options\nfrom ocrmypdf.cli import get_options_and_plugins\nfrom ocrmypdf.exceptions import ExitCode, MissingDependencyError\n\nfrom .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf_api\n\n# pylint: disable=redefined-outer-name\n\nneeds_unpaper = pytest.mark.skipif(not have_unpaper(), reason=\"requires unpaper\")\n\n\ndef test_no_unpaper(resources, no_outpdf):\n    input_ = fspath(resources / \"c02-22.pdf\")\n    output = fspath(no_outpdf)\n\n    options, pm = get_options_and_plugins([\"--clean\", input_, output])\n    with patch(\"ocrmypdf._exec.unpaper.version\") as mock:\n        mock.side_effect = FileNotFoundError(\"unpaper\")\n\n        with pytest.raises(MissingDependencyError):\n            check_options(options, pm)\n        mock.assert_called()\n\n\ndef test_old_unpaper(resources, no_outpdf):\n    input_ = fspath(resources / \"c02-22.pdf\")\n    output = fspath(no_outpdf)\n\n    options, pm = get_options_and_plugins([\"--clean\", input_, output])\n    with patch(\"ocrmypdf._exec.unpaper.version\") as mock:\n        mock.return_value = Version('0.5')\n\n        with pytest.raises(MissingDependencyError):\n            check_options(options, pm)\n        mock.assert_called()\n\n\ndef test_unpaper_version_chatter(resources, no_outpdf):\n    input_ = fspath(resources / \"c02-22.pdf\")\n    output = fspath(no_outpdf)\n\n    options, pm = get_options_and_plugins([\"--clean\", input_, output])\n    with patch(\"ocrmypdf.subprocess.run\") as mock:\n        mock.return_value = Mock(stdout='Warning: using insecure memory!\\n7.0.0\\n')\n\n        with 
pytest.raises(MissingDependencyError):\n            check_options(options, pm)\n        mock.assert_called()\n\n\n@needs_unpaper\ndef test_clean(resources, outpdf):\n    check_ocrmypdf(\n        resources / \"skew.pdf\",\n        outpdf,\n        \"-c\",\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n\n@needs_unpaper\ndef test_unpaper_args_valid(resources, outpdf):\n    check_ocrmypdf(\n        resources / \"skew.pdf\",\n        outpdf,\n        \"-c\",\n        \"--unpaper-args\",\n        \"--layout double\",  # Spaces required here\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n\n\n@needs_unpaper\ndef test_unpaper_args_invalid_filename(resources, outpdf, caplog):\n    with pytest.raises(ValidationError, match=\"No filenames allowed\"):\n        run_ocrmypdf_api(\n            resources / \"skew.pdf\",\n            outpdf,\n            \"-c\",\n            \"--unpaper-args\",\n            \"/etc/passwd\",\n            '--plugin',\n            'tests/plugins/tesseract_noop.py',\n        )\n\n\n@needs_unpaper\ndef test_unpaper_args_invalid(resources, outpdf):\n    exitcode = run_ocrmypdf_api(\n        resources / \"skew.pdf\",\n        outpdf,\n        \"-c\",\n        \"--unpaper-args\",\n        \"unpaper is not going to like these arguments\",\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n    # Can't tell difference between unpaper choking on bad arguments or some\n    # other unpaper failure\n    assert exitcode == ExitCode.child_process_error\n\n\n@needs_unpaper\ndef test_unpaper_image_too_big(resources, outdir, caplog):\n    with patch('ocrmypdf._exec.unpaper.UNPAPER_IMAGE_PIXEL_LIMIT', 42):\n        infile = resources / 'crom.png'\n        assert unpaper.clean(infile, outdir / 'out.png', dpi=300) == infile\n\n        assert any(\n            'too large for cleaning' in rec.message\n            for rec in caplog.get_records('call')\n            if rec.levelno == logging.WARNING\n  
      )\n\n\n@needs_unpaper\ndef test_palette_image(resources, outpdf):\n    check_ocrmypdf(\n        resources / \"palette.pdf\",\n        outpdf,\n        \"-c\",\n        '--plugin',\n        'tests/plugins/tesseract_noop.py',\n    )\n"
  },
  {
    "path": "tests/test_userunit.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nfrom math import isclose\n\nimport pytest\n\nfrom ocrmypdf.pdfinfo import PdfInfo\n\nfrom .conftest import check_ocrmypdf\n\n# pylint: disable=redefined-outer-name\n\n\n@pytest.fixture\ndef poster(resources):\n    return resources / 'poster.pdf'\n\n\n@pytest.mark.parametrize(\"mode\", ['pdf', 'pdfa'])\ndef test_userunit_pdf_passes(mode, poster, outpdf):\n    before = PdfInfo(poster)\n    check_ocrmypdf(\n        poster,\n        outpdf,\n        f'--output-type={mode}',\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n\n    after = PdfInfo(outpdf)\n    assert isclose(before[0].width_inches, after[0].width_inches)\n\n\ndef test_rotate_interaction(poster, outpdf):\n    check_ocrmypdf(\n        poster,\n        outpdf,\n        '--output-type=pdf',\n        '--rotate-pages',\n        '--plugin',\n        'tests/plugins/tesseract_cache.py',\n    )\n"
  },
  {
    "path": "tests/test_validation.py",
    "content": "# SPDX-FileCopyrightText: 2022 James R. Barlow\n# SPDX-License-Identifier: MPL-2.0\n\nfrom __future__ import annotations\n\nimport logging\nimport os\nfrom unittest.mock import patch\n\nimport pikepdf\nimport pytest\n\nfrom ocrmypdf import _validation as vd\nfrom ocrmypdf._concurrent import NullProgressBar, SerialExecutor\nfrom ocrmypdf._exec.tesseract import TesseractVersion\nfrom ocrmypdf._options import OcrOptions\nfrom ocrmypdf.api import create_options, setup_plugin_infrastructure\nfrom ocrmypdf.cli import get_parser\nfrom ocrmypdf.exceptions import BadArgsError, MissingDependencyError\nfrom ocrmypdf.pdfinfo import PdfInfo\n\nfrom .conftest import run_ocrmypdf_api\n\n\ndef make_opts_pm(input_file='a.pdf', output_file='b.pdf', language='eng', **kwargs):\n    if language is not None:\n        kwargs['language'] = language\n    parser = get_parser()\n    pm = setup_plugin_infrastructure(plugins=kwargs.get('plugins', []))\n    pm.add_options(parser=parser)\n    return (\n        create_options(\n            input_file=input_file, output_file=output_file, parser=parser, **kwargs\n        ),\n        pm,\n    )\n\n\ndef make_opts(*args, **kwargs):\n    opts, _pm = make_opts_pm(*args, **kwargs)\n    return opts\n\n\ndef make_ocr_opts(input_file='a.pdf', output_file='b.pdf', **kwargs):\n    \"\"\"Create OcrOptions directly for testing Pydantic validation.\"\"\"\n    return OcrOptions(input_file=input_file, output_file=output_file, **kwargs)\n\n\ndef test_old_tesseract_error():\n    with patch(\n        'ocrmypdf._exec.tesseract.version',\n        return_value=TesseractVersion('4.00.00alpha'),\n    ), pytest.raises(MissingDependencyError):\n        vd.check_options(*make_opts_pm(pdf_renderer='sandwich', language='eng'))\n\n\ndef test_tesseract_not_installed(caplog):\n    with patch('ocrmypdf.subprocess.run') as not_found:\n        not_found.side_effect = FileNotFoundError('tesseract')\n        with pytest.raises(MissingDependencyError, match=\"Could not 
find program\"):\n            vd.check_options(*make_opts_pm())\n            assert (\n                \"'tesseract' could not be executed\" in caplog.text\n            ), \"Error message not printed\"\n            assert 'install' in caplog.text, \"Install advice not printed\"\n        not_found.assert_called()\n\n\ndef test_lossless_redo():\n    with pytest.raises(ValueError, match=\"--redo-ocr.*is not currently compatible\"):\n        make_ocr_opts(redo_ocr=True, deskew=True)\n\n\ndef test_mutex_options():\n    with pytest.raises(\n        ValueError, match=\"Choose only one of --force-ocr, --skip-text, --redo-ocr\"\n    ):\n        make_ocr_opts(force_ocr=True, skip_text=True)\n    with pytest.raises(\n        ValueError, match=\"Choose only one of --force-ocr, --skip-text, --redo-ocr\"\n    ):\n        make_ocr_opts(redo_ocr=True, skip_text=True)\n    with pytest.raises(\n        ValueError, match=\"Choose only one of --force-ocr, --skip-text, --redo-ocr\"\n    ):\n        make_ocr_opts(redo_ocr=True, force_ocr=True)\n\n\ndef test_optimizing(caplog):\n    vd.check_options(\n        *make_opts_pm(optimize=0, png_quality=18, jpeg_quality=10)\n    )\n    assert 'will be ignored because' in caplog.text\n\n\ndef test_pillow_options():\n    # Test that max_image_mpixels=0 is valid (validation now in OcrOptions)\n    opts = make_ocr_opts(max_image_mpixels=0)\n    assert opts.max_image_mpixels == 0\n\n    # Test that negative values are rejected\n    with pytest.raises(ValueError, match=\"max_image_mpixels must be non-negative\"):\n        make_ocr_opts(max_image_mpixels=-1)\n\n\ndef test_output_tty():\n    with patch('sys.stdout.isatty', return_value=True), pytest.raises(BadArgsError):\n        vd.check_requested_output_file(make_opts(output_file='-'))\n\n\ndef test_report_file_size(tmp_path, caplog):\n    logging.getLogger('pikepdf._qpdf').setLevel(logging.CRITICAL)  # Suppress logging\n\n    in_ = tmp_path / 'a.pdf'\n    out = tmp_path / 'b.pdf'\n    pdf = 
pikepdf.new()\n    pdf.save(in_)\n    pdf.save(out)\n    opts = make_opts(output_type='pdf')\n    vd.report_output_file_size(opts, in_, out)\n    assert caplog.text == ''\n    caplog.clear()\n\n    waste_of_space = b'Dummy' * 5000\n    pdf.Root.Dummy = waste_of_space\n    pdf.save(in_)\n    pdf.Root.Dummy2 = waste_of_space + waste_of_space\n    pdf.save(out)\n\n    vd.report_output_file_size(opts, in_, out, ['The optional dependency...'])\n    assert 'optional dependency' in caplog.text\n    caplog.clear()\n\n    vd.report_output_file_size(opts, in_, out, [])\n    assert 'No reason' in caplog.text\n    caplog.clear()\n\n    opts = make_opts(in_, out, optimize=0, output_type='pdf')\n    vd.report_output_file_size(opts, in_, out, [\"Optimization was disabled.\"])\n    assert 'disabled' in caplog.text\n    caplog.clear()\n\n\ndef test_false_action_store_true():\n    opts = make_opts(keep_temporary_files=True)\n    assert opts.keep_temporary_files\n    opts = make_opts(keep_temporary_files=False)\n    assert not opts.keep_temporary_files\n\n\n@pytest.mark.parametrize('progress_bar', [True, False])\ndef test_no_progress_bar(progress_bar, resources):\n    opts, pm = make_opts_pm(\n        progress_bar=progress_bar, input_file=(resources / 'trivial.pdf')\n    )\n    vd.check_options(opts, pm)\n\n    pbar_disabled = None\n\n    class CheckProgressBar(NullProgressBar):\n        def __init__(self, disable, **kwargs):\n            nonlocal pbar_disabled\n            pbar_disabled = disable\n            super().__init__(disable=disable, **kwargs)\n\n    executor = SerialExecutor(pbar_class=CheckProgressBar)\n    pdfinfo = PdfInfo(opts.input_file, progbar=opts.progress_bar, executor=executor)\n\n    assert pdfinfo is not None\n    assert pbar_disabled is not None and pbar_disabled != progress_bar\n\n\ndef make_version(version):\n    def _make_version():\n        return TesseractVersion(version)\n\n    return _make_version\n\n\ndef test_version_comparison():\n    
vd.check_external_program(\n        program=\"dummy_basic\",\n        package=\"dummy\",\n        version_checker=make_version('9.0'),\n        need_version='8.0.2',\n    )\n    vd.check_external_program(\n        program=\"dummy_doubledigit\",\n        package=\"dummy\",\n        version_checker=make_version('10.0'),\n        need_version='8.0.2',\n    )\n    with pytest.raises(MissingDependencyError):\n        vd.check_external_program(\n            program=\"tesseract\",\n            package=\"tesseract\",\n            version_checker=make_version('4.0.0-beta.1'),\n            need_version='4.1.1',\n            version_parser=TesseractVersion,\n        )\n    vd.check_external_program(\n        program=\"tesseract\",\n        package=\"tesseract\",\n        version_checker=make_version('v5.0.0-alpha.20200201'),\n        need_version='4.1.1',\n        version_parser=TesseractVersion,\n    )\n    vd.check_external_program(\n        program=\"tesseract\",\n        package=\"tesseract\",\n        version_checker=make_version('5.0.0-rc1.20211030'),\n        need_version='4.1.1',\n        version_parser=TesseractVersion,\n    )\n    vd.check_external_program(\n        program=\"tesseract\",\n        package=\"tesseract\",\n        version_checker=make_version('v4.1.1.20181030'),  # Used in some Windows builds\n        need_version='4.1.1',\n        version_parser=TesseractVersion,\n    )\n    vd.check_external_program(\n        program=\"gs\",\n        package=\"ghostscript\",\n        version_checker=make_version('10.0'),\n        need_version='9.50',\n    )\n    with pytest.raises(MissingDependencyError):\n        vd.check_external_program(\n            program=\"tesseract\",\n            package=\"tesseract\",\n            version_checker=make_version('4.1.1-rc2-25-g9707'),\n            need_version='4.1.1',\n            version_parser=TesseractVersion,\n        )\n    with pytest.raises(MissingDependencyError):\n        vd.check_external_program(\n            
program=\"dummy_fails\",\n            package=\"dummy\",\n            version_checker=make_version('1.0'),\n            need_version='2.0',\n        )\n\n\ndef test_optional_program_recommended(caplog):\n    caplog.clear()\n\n    def raiser():\n        raise FileNotFoundError('jbig2')\n\n    with caplog.at_level(logging.WARNING):\n        vd.check_external_program(\n            program=\"jbig2\",\n            package=\"jbig2enc\",\n            version_checker=raiser,\n            need_version='42',\n            required_for='this test case',\n            recommended=True,\n        )\n        assert any(\n            (loglevel == logging.WARNING and \"recommended\" in msg)\n            for _logger_name, loglevel, msg in caplog.record_tuples\n        )\n\n\ndef test_pagesegmode_warning(caplog):\n    opts = make_opts(tesseract_pagesegmode='0')\n    plugin_manager = setup_plugin_infrastructure(plugins=opts.plugins or [])\n    vd.check_options(opts, plugin_manager)\n    assert 'disable OCR' in caplog.text\n\n\ndef test_two_languages():\n    vd.check_options_languages(\n        create_options(\n            input_file='a.pdf',\n            output_file='b.pdf',\n            parser=get_parser(),\n            languages=['fakelang1', 'fakelang2'],\n        ),\n        ['fakelang1', 'fakelang2'],\n    )\n\n\ndef test_sidecar_equals_output(resources, no_outpdf):\n    op = no_outpdf\n    with pytest.raises(BadArgsError, match=r'--sidecar'):\n        run_ocrmypdf_api(resources / 'trivial.pdf', op, '--sidecar', op)\n\n\ndef test_devnull_sidecar(resources):\n    with pytest.raises(BadArgsError, match=r'--sidecar.*NUL'):\n        run_ocrmypdf_api(resources / 'trivial.pdf', os.devnull, '--sidecar')\n"
  },
  {
    "path": "tests/test_verapdf.py",
    "content": "# SPDX-FileCopyrightText: 2024 James R. Barlow\n# SPDX-License-Identifier: CC-BY-SA-4.0\n\n\"\"\"Tests for verapdf wrapper and speculative PDF/A conversion.\"\"\"\n\nfrom __future__ import annotations\n\nimport pikepdf\nimport pytest\nfrom pikepdf import Name\n\nfrom ocrmypdf._exec import verapdf\nfrom ocrmypdf.pdfa import (\n    _pdfa_part_conformance,\n    add_pdfa_metadata,\n    add_srgb_output_intent,\n    speculative_pdfa_conversion,\n)\n\n\nclass TestVerapdfModule:\n    \"\"\"Tests for verapdf wrapper module.\"\"\"\n\n    def test_output_type_to_flavour(self):\n        assert verapdf.output_type_to_flavour('pdfa') == '2b'\n        assert verapdf.output_type_to_flavour('pdfa-1') == '1b'\n        assert verapdf.output_type_to_flavour('pdfa-2') == '2b'\n        assert verapdf.output_type_to_flavour('pdfa-3') == '3b'\n        # Unknown should default to 2b\n        assert verapdf.output_type_to_flavour('unknown') == '2b'\n\n    @pytest.mark.skipif(not verapdf.available(), reason='verapdf not installed')\n    def test_version(self):\n        ver = verapdf.version()\n        assert ver.major >= 1\n\n    @pytest.mark.skipif(not verapdf.available(), reason='verapdf not installed')\n    def test_validate_non_pdfa(self, tmp_path):\n        \"\"\"Test validation of a non-PDF/A file returns invalid.\"\"\"\n        test_pdf = tmp_path / 'test.pdf'\n        with pikepdf.new() as pdf:\n            pdf.add_blank_page()\n            pdf.save(test_pdf)\n\n        result = verapdf.validate(test_pdf, '2b')\n        assert not result.valid\n        assert result.failed_rules > 0\n\n\nclass TestPdfaPartConformance:\n    \"\"\"Tests for _pdfa_part_conformance helper.\"\"\"\n\n    def test_pdfa_part_conformance(self):\n        assert _pdfa_part_conformance('pdfa') == ('2', 'B')\n        assert _pdfa_part_conformance('pdfa-1') == ('1', 'B')\n        assert _pdfa_part_conformance('pdfa-2') == ('2', 'B')\n        assert _pdfa_part_conformance('pdfa-3') == ('3', 'B')\n   
     # Unknown should default to 2B\n        assert _pdfa_part_conformance('unknown') == ('2', 'B')\n\n\nclass TestAddPdfaMetadata:\n    \"\"\"Tests for add_pdfa_metadata function.\"\"\"\n\n    def test_add_pdfa_metadata(self, tmp_path):\n        \"\"\"Test adding PDF/A XMP metadata.\"\"\"\n        test_pdf = tmp_path / 'test.pdf'\n        with pikepdf.new() as pdf:\n            pdf.add_blank_page()\n            pdf.save(test_pdf)\n\n        with pikepdf.open(test_pdf, allow_overwriting_input=True) as pdf:\n            add_pdfa_metadata(pdf, '2', 'B')\n            with pdf.open_metadata() as meta:\n                assert meta.pdfa_status == '2B'\n            pdf.save(test_pdf)\n\n        # Verify it persists after save\n        with pikepdf.open(test_pdf) as pdf, pdf.open_metadata() as meta:\n            assert meta.pdfa_status == '2B'\n\n\nclass TestAddSrgbOutputIntent:\n    \"\"\"Tests for add_srgb_output_intent function.\"\"\"\n\n    def test_add_srgb_output_intent(self, tmp_path):\n        \"\"\"Test adding sRGB OutputIntent to a PDF.\"\"\"\n        test_pdf = tmp_path / 'test.pdf'\n        with pikepdf.new() as pdf:\n            pdf.add_blank_page()\n            pdf.save(test_pdf)\n\n        with pikepdf.open(test_pdf, allow_overwriting_input=True) as pdf:\n            add_srgb_output_intent(pdf)\n            assert Name.OutputIntents in pdf.Root\n            assert len(pdf.Root.OutputIntents) == 1\n            intent = pdf.Root.OutputIntents[0]\n            assert str(intent.get(Name.OutputConditionIdentifier)) == 'sRGB'\n            pdf.save(test_pdf)\n\n    def test_add_srgb_output_intent_idempotent(self, tmp_path):\n        \"\"\"Test that adding OutputIntent twice doesn't duplicate.\"\"\"\n        test_pdf = tmp_path / 'test.pdf'\n        with pikepdf.new() as pdf:\n            pdf.add_blank_page()\n            pdf.save(test_pdf)\n\n        with pikepdf.open(test_pdf, allow_overwriting_input=True) as pdf:\n            add_srgb_output_intent(pdf)\n         
   add_srgb_output_intent(pdf)  # Second call should be a no-op\n            assert len(pdf.Root.OutputIntents) == 1\n            pdf.save(test_pdf)\n\n\nclass TestSpeculativePdfaConversion:\n    \"\"\"Tests for speculative PDF/A conversion.\"\"\"\n\n    def test_speculative_conversion_creates_pdfa_structures(self, tmp_path, resources):\n        \"\"\"Test that speculative conversion adds PDF/A structures.\"\"\"\n        input_pdf = resources / 'graph.pdf'\n        output_pdf = tmp_path / 'output.pdf'\n\n        result = speculative_pdfa_conversion(input_pdf, output_pdf, 'pdfa-2')\n\n        assert result.exists()\n        with pikepdf.open(result) as pdf:\n            assert Name.OutputIntents in pdf.Root\n            with pdf.open_metadata() as meta:\n                assert meta.pdfa_status == '2B'\n\n    def test_speculative_conversion_different_parts(self, tmp_path, resources):\n        \"\"\"Test speculative conversion with different PDF/A parts.\"\"\"\n        input_pdf = resources / 'graph.pdf'\n\n        for output_type, expected_status in [\n            ('pdfa-1', '1B'),\n            ('pdfa-2', '2B'),\n            ('pdfa-3', '3B'),\n        ]:\n            output_pdf = tmp_path / f'output_{output_type}.pdf'\n            speculative_pdfa_conversion(input_pdf, output_pdf, output_type)\n\n            with pikepdf.open(output_pdf) as pdf, pdf.open_metadata() as meta:\n                assert meta.pdfa_status == expected_status\n\n\n@pytest.mark.skipif(not verapdf.available(), reason='verapdf not installed')\nclass TestVerapdfIntegration:\n    \"\"\"Integration tests requiring verapdf.\"\"\"\n\n    def test_speculative_conversion_validation(self, tmp_path, resources):\n        \"\"\"Test that speculative conversion can be validated by verapdf.\n\n        Note: Most test PDFs will fail validation because they have issues\n        that require Ghostscript to fix (fonts, colorspaces, etc.). 
This test\n        verifies the validation pipeline works, not that all PDFs pass.\n        \"\"\"\n        input_pdf = resources / 'graph.pdf'\n        output_pdf = tmp_path / 'output.pdf'\n\n        speculative_pdfa_conversion(input_pdf, output_pdf, 'pdfa-2')\n\n        # The converted file can be validated (even if it fails)\n        result = verapdf.validate(output_pdf, '2b')\n        assert isinstance(result.valid, bool)\n        assert isinstance(result.failed_rules, int)\n"
  },
  {
    "path": "tests/test_watcher.py",
    "content": "from __future__ import annotations\n\nimport datetime as dt\nimport os\nimport shutil\nimport subprocess\nimport sys\nimport time\nfrom pathlib import Path\n\nimport pytest\n\nwatchdog = pytest.importorskip('watchdog')\n\n\n@pytest.mark.parametrize('year_month', [True, False])\ndef test_watcher(tmp_path, resources, year_month):\n    input_dir = tmp_path / 'input'\n    input_dir.mkdir()\n    output_dir = tmp_path / 'output'\n    output_dir.mkdir()\n    processed_dir = tmp_path / 'processed'\n    processed_dir.mkdir()\n\n    env_extra = {'OCR_OUTPUT_DIRECTORY_YEAR_MONTH': '1'} if year_month else {}\n    proc = subprocess.Popen(\n        [\n            sys.executable,\n            Path(__file__).parent.parent / 'misc' / 'watcher.py',\n            str(input_dir),\n            str(output_dir),\n            str(processed_dir),\n        ],\n        cwd=str(tmp_path),\n        env=os.environ.copy() | env_extra,\n    )\n    time.sleep(5)\n\n    shutil.copy(resources / 'trivial.pdf', input_dir / 'trivial.pdf')\n    time.sleep(5)\n\n    if year_month:\n        assert (\n            output_dir\n            / f'{dt.date.today().year}'\n            / f'{dt.date.today().month:02d}'\n            / 'trivial.pdf'\n        ).exists()\n    else:\n        assert (output_dir / 'trivial.pdf').exists()\n\n    proc.terminate()\n    proc.wait()\n"
  }
]