Repository: funstory-ai/BabelDOC
Branch: main
Commit: 34739ea88118
Files: 156
Total size: 1.9 MB

Directory structure:
gitextract_4xv94fs_/

├── .cursorignore
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.yaml
│   │   └── feature_request.yaml
│   ├── PULL_REQUEST_TEMPLATE/
│   │   └── pr_form.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── dependabot.yml
│   ├── labels.yml
│   ├── release-drafter.yml
│   └── workflows/
│       ├── codeql.yml
│       ├── docs.yml
│       ├── labeler.yml
│       ├── lint.yml
│       ├── pr-lint.yml
│       ├── publish-to-pypi.yml
│       └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── babeldoc/
│   ├── __init__.py
│   ├── assets/
│   │   ├── assets.py
│   │   └── embedding_assets_metadata.py
│   ├── asynchronize/
│   │   └── __init__.py
│   ├── babeldoc_exception/
│   │   ├── BabelDOCException.py
│   │   └── __init__.py
│   ├── const.py
│   ├── docvision/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── base_doclayout.py
│   │   ├── doclayout.py
│   │   ├── rpc_doclayout.py
│   │   ├── rpc_doclayout2.py
│   │   ├── rpc_doclayout3.py
│   │   ├── rpc_doclayout4.py
│   │   ├── rpc_doclayout5.py
│   │   ├── rpc_doclayout6.py
│   │   ├── rpc_doclayout7.py
│   │   └── table_detection/
│   │       └── rapidocr.py
│   ├── format/
│   │   ├── __init__.py
│   │   └── pdf/
│   │       ├── __init__.py
│   │       ├── babelpdf/
│   │       │   ├── base14.py
│   │       │   ├── cidfont.py
│   │       │   ├── cmap.py
│   │       │   ├── encoding.py
│   │       │   ├── type3.py
│   │       │   ├── utils.py
│   │       │   └── win_core.py
│   │       ├── converter.py
│   │       ├── document_il/
│   │       │   ├── __init__.py
│   │       │   ├── backend/
│   │       │   │   ├── __init__.py
│   │       │   │   └── pdf_creater.py
│   │       │   ├── frontend/
│   │       │   │   ├── __init__.py
│   │       │   │   └── il_creater.py
│   │       │   ├── il_version_1.py
│   │       │   ├── il_version_1.rnc
│   │       │   ├── il_version_1.rng
│   │       │   ├── il_version_1.xsd
│   │       │   ├── midend/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── add_debug_information.py
│   │       │   │   ├── automatic_term_extractor.py
│   │       │   │   ├── detect_scanned_file.py
│   │       │   │   ├── il_translator.py
│   │       │   │   ├── il_translator_llm_only.py
│   │       │   │   ├── layout_parser.py
│   │       │   │   ├── paragraph_finder.py
│   │       │   │   ├── remove_descent.py
│   │       │   │   ├── styles_and_formulas.py
│   │       │   │   ├── table_parser.py
│   │       │   │   └── typesetting.py
│   │       │   ├── utils/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── extract_char.py
│   │       │   │   ├── fontmap.py
│   │       │   │   ├── formular_helper.py
│   │       │   │   ├── layout_helper.py
│   │       │   │   ├── matrix_helper.py
│   │       │   │   ├── mupdf_helper.py
│   │       │   │   ├── paragraph_helper.py
│   │       │   │   ├── spatial_analyzer.py
│   │       │   │   ├── style_helper.py
│   │       │   │   └── zstd_helper.py
│   │       │   └── xml_converter.py
│   │       ├── high_level.py
│   │       ├── pdfinterp.py
│   │       ├── result_merger.py
│   │       ├── split_manager.py
│   │       └── translation_config.py
│   ├── glossary.py
│   ├── main.py
│   ├── pdfminer/
│   │   ├── LICENSE
│   │   ├── __init__.py
│   │   ├── _saslprep.py
│   │   ├── arcfour.py
│   │   ├── ascii85.py
│   │   ├── casting.py
│   │   ├── ccitt.py
│   │   ├── cmap/
│   │   │   └── README.txt
│   │   ├── cmapdb.py
│   │   ├── converter.py
│   │   ├── data_structures.py
│   │   ├── encodingdb.py
│   │   ├── fontmetrics.py
│   │   ├── glyphlist.py
│   │   ├── high_level.py
│   │   ├── image.py
│   │   ├── jbig2.py
│   │   ├── latin_enc.py
│   │   ├── layout.py
│   │   ├── lzw.py
│   │   ├── pdfcolor.py
│   │   ├── pdfdevice.py
│   │   ├── pdfdocument.py
│   │   ├── pdfexceptions.py
│   │   ├── pdffont.py
│   │   ├── pdfinterp.py
│   │   ├── pdfpage.py
│   │   ├── pdfparser.py
│   │   ├── pdftypes.py
│   │   ├── psexceptions.py
│   │   ├── psparser.py
│   │   ├── py.typed
│   │   ├── runlength.py
│   │   ├── settings.py
│   │   └── utils.py
│   ├── progress_monitor.py
│   ├── tools/
│   │   ├── generate_cmap_metadata.py
│   │   ├── generate_font_metadata.py
│   │   ├── italic_assistance.py
│   │   └── italic_recognize_tool.py
│   ├── translator/
│   │   ├── __init__.py
│   │   ├── cache.py
│   │   └── translator.py
│   └── utils/
│       ├── __init__.py
│       ├── atomic_integer.py
│       ├── memory.py
│       └── priority_thread_pool_executor.py
├── docs/
│   ├── CODE_OF_CONDUCT.md
│   ├── CONTRIBUTING.md
│   ├── CONTRIBUTOR_REWARD.md
│   ├── ImplementationDetails/
│   │   ├── AsyncTranslate/
│   │   │   └── AsyncTranslate.md
│   │   ├── ILTranslator/
│   │   │   └── ILTranslator.md
│   │   ├── PDFCreation/
│   │   │   └── PDFCreation.md
│   │   ├── PDFParsing/
│   │   │   └── PDFParsing.md
│   │   ├── ParagraphFinding/
│   │   │   └── ParagraphFinding.md
│   │   ├── README.md
│   │   ├── StylesAndFormulas/
│   │   │   └── StylesAndFormulas.md
│   │   └── Typesetting/
│   │       └── Typesetting.md
│   ├── README.md
│   ├── deploy.sh
│   ├── example/
│   │   └── demo_glossary.csv
│   ├── index.md
│   ├── intro-to-pdf-object.md
│   ├── requirements.txt
│   └── supported_languages.md
├── mkdocs.yml
├── pyproject.toml
└── tests/
    └── test_translation_cache_cleanup.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .cursorignore
================================================
# Project notes and templates
xnotes/


================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.yaml
================================================
name: "🐞 Bug Report"
description: Create a report to help us improve
labels: ['bug']
body:
  - type: checkboxes
    id: checks
    attributes:
      label: Before you submit
      options:
        - label: I have searched existing issues
          required: true
        - label: I spent at least 5 minutes investigating and preparing this report
          required: true
        - label: I confirmed this is not caused by a network issue
          required: true
        - label: I have fully read and understood the [README](https://github.com/funstory-ai/BabelDOC/blob/main/README.md)
          required: true
        - label: I am certain that this issue is with BabelDOC itself and can be reproduced through the BabelDOC cli
          required: true
        - label: I have uploaded the original file, or confirmed that this issue is unrelated to the original file
          required: true
        - label: I have uploaded the log.
          required: true
        - label: I confirm that the latest version of BabelDOC is being used.
          required: true
        - label: I am aware that the issue section of this project is only for submitting bugs that are clearly related to the BabelDOC core, with complete reproduction steps and relevant logs attached. Otherwise, issues will be closed directly.
          required: true

  - type: markdown
    attributes:
      value: |
        Thank you for using **BabelDOC** and helping us improve it! 🙏
        Please confirm once more that you have carefully completed every item in the checklist above! (If you have not, the issue will be closed directly without any response.)

        Please also note:
        - If you are using a downstream project like pdf2zh-next, please submit the issue directly to that downstream project. Submit an issue here only after you have confirmed that the problem lies in the BabelDOC core library.
        - The CLI is only used for debugging purposes, we do not provide any technical support for CLI usage.

  - type: markdown
    attributes:
      value: |
        Please note! Users of immersive translate online services should contact customer service and provide their translation ID. **Feedback related to online services is not handled here.**

  - type: textarea
    id: environment
    attributes:
      label: Environment
      description: Provide your system details (required)
      value: |
        - OS:
        - Python:
        - BabelDOC:
      render: markdown
    validations:
      required: true

  - type: textarea
    id: describe
    attributes:
      label: Describe the bug
      description: A clear and concise description of what the bug is.
    validations:
      required: true

  - type: textarea
    id: reproduce
    attributes:
      label: Steps to Reproduce
      description: Help us reproduce the issue. Issues that do not provide reproduction steps will be closed directly.
      value: |
        1. Go to '...'
        2. Click on '...'
        3. See error
    validations:
      required: false

  - type: textarea
    id: expected
    attributes:
      label: Expected Behavior
      description: What did you expect to happen?
    validations:
      required: false

  - type: textarea
    id: logs
    attributes:
      label: Relevant Log Output or Screenshots
      description: Copy and paste any logs or attach screenshots. This will be formatted automatically.
      render: text
    validations:
      required: false

  - type: textarea
    id: pdf
    attributes:
      label: Original PDF File
      description: Upload the input PDF if applicable. (Issues related to specific PDFs but without uploaded files will be closed directly.)
    validations:
      required: false

  - type: textarea
    id: others
    attributes:
      label: Additional Context
      description: Anything else we should know?
    validations:
      required: false


================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.yaml
================================================
name: "✨ Feature Request"
description: Suggest a new idea or improvement for BabelDOC
labels: ['enhancement']
body:
  - type: markdown
    attributes:
      value: |
        Thank you for helping improve **BabelDOC**! Please fill out the form below to suggest a feature.

  - type: checkboxes
    id: checks
    attributes:
      label: Before you submit
      options:
        - label: I have searched existing issues
          required: true
        - label: I have fully read and understood the [README](https://github.com/funstory-ai/BabelDOC/blob/main/README.md)
          required: true
        - label: This feature is not related to BabelDOC CLI. The CLI is only used for debugging purposes, we do not accept any feature requests related to the CLI.
          required: true
  
  - type: markdown
    attributes:
      value: |
        如果您想自部署 BabelDOC，请使用 [PDFMathTranslate-next](https://github.com/PDFMathTranslate-next/PDFMathTranslate-next) 代替。若其功能无法满足，请向 [PDFMathTranslate-next](https://github.com/PDFMathTranslate-next/PDFMathTranslate-next) 提交功能请求。
        If you wish to self-host BabelDOC, please use [PDFMathTranslate-next](https://github.com/PDFMathTranslate-next/PDFMathTranslate-next) instead. If its features do not meet your needs, please submit a feature request to [PDFMathTranslate-next](https://github.com/PDFMathTranslate-next/PDFMathTranslate-next).


  - type: textarea
    id: describe
    attributes:
      label: Is your feature request related to a problem?
      description: If applicable, describe what problem this feature would solve.
      placeholder: Ex. I'm always frustrated when ...
    validations:
      required: false

  - type: textarea
    id: solution
    attributes:
      label: Describe the solution you'd like
      description: What would you like to see happen?
    validations:
      required: true

  - type: textarea
    id: alternatives
    attributes:
      label: Describe alternatives you've considered
      description: Have you thought of other ways to solve this?
    validations:
      required: false

  - type: textarea
    id: additional
    attributes:
      label: Additional context
      description: Any other context, examples, or screenshots?
    validations:
      required: false


================================================
FILE: .github/PULL_REQUEST_TEMPLATE/pr_form.yml
================================================
name: Pull Request
description: Submit a pull request to contribute to BabelDOC
title: "[PR] <Your concise title here>"
labels:
  - needs triage
body:
  - type: markdown
    attributes:
      value: |
        ## 👋 Thanks for contributing to **BabelDOC**!

        Please fill out this form to help us review your pull request effectively.

  - type: input
    id: issue
    attributes:
      label: Related Issue(s)
      description: If this pull request closes or is related to one or more issues, list them here (e.g., #37)
      placeholder: "#37"
    validations:
      required: false

  - type: textarea
    id: summary
    attributes:
      label: Description
      description: Describe the purpose of this pull request and what was changed.
      placeholder: |
        - What does this PR introduce or fix?
        - What is the motivation behind it?
    validations:
      required: true

  - type: dropdown
    id: pr_type
    attributes:
      label: PR Type
      description: What kind of change is this?
      multiple: true
      options:
        - enhancement
        - bug
        - documentation
        - refactor
        - test
        - chore
    validations:
      required: true

  - type: checkboxes
    id: checklist
    attributes:
      label: Contributor Checklist
      options:
        - label: I’ve fully read and understood the **[CONTRIBUTING.md](https://funstory-ai.github.io/BabelDOC/CONTRIBUTING/)** guide
          required: true
        - label: My changes follow the project’s code style and guidelines
          required: true
        - label: I’ve linked the related issue(s) in the description above
        - label: I’ve updated relevant documentation (if applicable)
        - label: I’ve added necessary tests (if applicable)
        - label: All new and existing tests passed locally
        - label: I understand that due to limited maintainer resources, only small pull requests are accepted. Suggestions with proof-of-concept patches are appreciated, and my patch may be rewritten if necessary.

  - type: textarea
    id: testing
    attributes:
      label: Testing Instructions
      description: Provide step-by-step instructions on how to test your changes
      placeholder: |
        1. Run `...`
        2. Visit `...`
        3. Click `...`
        4. Verify `...`
    validations:
      required: false

  - type: textarea
    id: screenshots
    attributes:
      label: Screenshots (if applicable)
      description: If UI changes were made, please attach before/after screenshots.
    validations:
      required: false

  - type: textarea
    id: notes
    attributes:
      label: Additional Notes
      description: Anything else the reviewer should know?
    validations:
      required: false


================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
### PR Title

<!-- Please fill in a concise and clear PR title below -->
[PR] <Your concise title here>

### Related Issue(s)

<!-- If this PR closes or is related to one or more issues, please list them here (e.g., #37) -->
<!-- e.g.: Closes #37, Relates to #42 -->

### Motivation and Context

<!-- Why is this change required? What problem does it solve? -->
<!-- If it fixes an open issue, please link to the issue here. -->

### Summary of Changes

<!-- What does this PR introduce or fix? Please describe concisely. -->

### PR Type

<!-- What kind of change is this? Please select one or more -->
- [ ] ✨ Enhancement
- [ ] 🐛 Bug Fix
- [ ] 📚 Documentation
- [ ] 🏗️ Refactor
- [ ] 🧪 Test
- [ ] 🧹 Chore

### Breaking Changes

<!-- Does this PR introduce any breaking changes? If so, please describe them. -->
<!-- - [ ] Yes, this PR introduces breaking changes. -->
<!-- - [ ] No, this PR does not introduce breaking changes. -->
<!-- Detailed description of breaking changes (if any): -->

### Contributor Checklist

- [ ] I have fully read and understood the **[CONTRIBUTING.md](https://funstory-ai.github.io/BabelDOC/CONTRIBUTING/)** guide.
- [ ] I have performed a self-review of my own code.
- [ ] My changes follow the project's code style and guidelines
- [ ] I have linked the related issue(s) in the description above (if applicable)
- [ ] I have updated relevant documentation (if applicable)
- [ ] I have added necessary tests that prove my fix is effective or that my feature works (if applicable)
- [ ] All new and existing tests passed locally with my changes
- [ ] My changes generate no new warnings or errors
- [ ] I understand that due to limited maintainer resources, only small PRs are accepted. Suggestions with proof-of-concept patches are appreciated, and my patch may be rewritten if necessary.

### Testing Instructions

<!-- Please provide clear and concise step-by-step instructions on how to test your changes. -->
<!-- e.g.: -->
<!-- 1. Check out this branch. -->
<!-- 2. Run `...` to install dependencies. -->
<!-- 3. Run `...` to start the application/run the script. -->
<!-- 4. Navigate to `...` or observe `...` -->
<!-- 5. Verify that `...` (expected outcome). -->

### Screenshots (if applicable)

<!-- If your changes include UI modifications, please add screenshots or GIFs to show the before and after. -->

### Additional Notes

<!-- Is there anything else the reviewer should know? For example, any dependencies, or potential impacts. --> 

================================================
FILE: .github/dependabot.yml
================================================
version: 2
updates:
  - package-ecosystem: github-actions
    directory: "/"
    schedule:
      interval: weekly
  # - package-ecosystem: pip
  #   directory: "/.github/workflows"
  #   schedule:
  #     interval: weekly
  # - package-ecosystem: pip
  #   directory: "/docs"
  #   schedule:
  #     interval: weekly
  - package-ecosystem: pip
    directory: "/"
    schedule:
      interval: weekly
    versioning-strategy: lockfile-only
    allow:
      - dependency-type: "all"

================================================
FILE: .github/labels.yml
================================================
---
# Label names are important: Release Drafter uses them to decide where to
# record changes in the changelog, or whether to skip them.
#
# The repository labels will be automatically configured using this file and
# the GitHub Action https://github.com/marketplace/actions/github-labeler.
- name: breaking
  description: Breaking Changes
  color: "bfd4f2"
- name: bug
  description: Something isn't working
  color: "d73a4a"
- name: build
  description: Build System and Dependencies
  color: "bfdadc"
- name: ci
  description: Continuous Integration
  color: "4a97d6"
- name: dependencies
  description: Pull requests that update a dependency file
  color: "0366d6"
- name: documentation
  description: Improvements or additions to documentation
  color: "0075ca"
- name: duplicate
  description: This issue or pull request already exists
  color: "cfd3d7"
- name: enhancement
  description: New feature or request
  color: "a2eeef"
- name: github_actions
  description: Pull requests that update Github_actions code
  color: "000000"
- name: good first issue
  description: Good for newcomers
  color: "7057ff"
- name: help wanted
  description: Extra attention is needed
  color: "008672"
- name: invalid
  description: This doesn't seem right
  color: "e4e669"
- name: performance
  description: Performance
  color: "016175"
- name: python
  description: Pull requests that update Python code
  color: "2b67c6"
- name: question
  description: Further information is requested
  color: "d876e3"
- name: refactoring
  description: Refactoring
  color: "ef67c4"
- name: removal
  description: Removals and Deprecations
  color: "9ae7ea"
- name: style
  description: Style
  color: "c120e5"
- name: testing
  description: Testing
  color: "b1fc6f"
- name: wontfix
  description: This will not be worked on
  color: "ffffff"

================================================
FILE: .github/release-drafter.yml
================================================
name-template: 'v$RESOLVED_VERSION'
tag-template: 'v$RESOLVED_VERSION'
categories:
  - title: '🚀 Features'
    labels:
      - 'feature'
      - 'enhancement'
  - title: '🐛 Bug Fixes'
    labels:
      - 'fix'
      - 'bugfix'
      - 'bug'
  - title: '🧰 Maintenance'
    labels:
      - 'chore'
      - 'maintenance'
      - 'refactor'
  - title: '📝 Documentation'
    labels:
      - 'docs'
      - 'documentation'
change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
change-title-escapes: '\<*_&' # You can add # and @ to disable mentions
version-resolver:
  major:
    labels:
      - 'major'
  minor:
    labels:
      - 'minor'
  patch:
    labels:
      - 'patch'
  default: patch
template: |
  ## Changes

  $CHANGES

  ## Contributors
  
  $CONTRIBUTORS


================================================
FILE: .github/workflows/codeql.yml
================================================
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL Advanced"

on:
  push:
  pull_request:
    branches: [ "main" ]
  schedule:
    - cron: '36 14 * * 1'

jobs:
  analyze:
    name: Analyze (${{ matrix.language }})
    # Runner size impacts CodeQL analysis time. To learn more, please see:
    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
    #   - https://gh.io/supported-runners-and-hardware-resources
    #   - https://gh.io/using-larger-runners (GitHub.com only)
    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
    permissions:
      # required for all workflows
      security-events: write

      # required to fetch internal or private CodeQL packs
      packages: read

      # only required for workflows in private repositories
      actions: read
      contents: read

    strategy:
      fail-fast: false
      matrix:
        include:
        - language: python
          build-mode: none
        - language: actions
        # CodeQL supports the following keywords for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
        # Use `c-cpp` to analyze code written in C, C++ or both
        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
    steps:
    - name: Checkout repository
      uses: actions/checkout@v5

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
      uses: github/codeql-action/init@v4
      with:
        languages: ${{ matrix.language }}
        build-mode: ${{ matrix.build-mode }}
        # If you wish to specify custom queries, you can do so here or in a config file.
        # By default, queries listed here will override any specified in a config file.
        # Prefix the list here with "+" to use these queries and those in the config file.

        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
        # queries: security-extended,security-and-quality

    # If the analyze step fails for one of the languages you are analyzing with
    # "We were unable to automatically build your code", modify the matrix above
    # to set the build mode to "manual" for that language. Then modify this step
    # to build your code.
    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
    - if: matrix.build-mode == 'manual'
      shell: bash
      run: |
        echo 'If you are using a "manual" build mode for one or more of the' \
          'languages you are analyzing, replace this with the commands to build' \
          'your code, for example:'
        echo '  make bootstrap'
        echo '  make release'
        exit 1

    - name: Perform CodeQL Analysis
      uses: github/codeql-action/analyze@v4
      with:
        category: "/language:${{matrix.language}}"


================================================
FILE: .github/workflows/docs.yml
================================================
name: docs
on:
  push:
    branches:
      - main
permissions:
  contents: write
jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
        with:
          fetch-depth: 0
      - name: Configure Git Credentials
        run: |
          git config user.name github-actions[bot]
          git config user.email 41898282+github-actions[bot]@users.noreply.github.com
      - name: Setup uv with Python 3.12
        uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2
        with:
          python-version: "3.12"
          enable-cache: true
          cache-dependency-glob: "uv.lock"
          activate-environment: true
      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 
      - uses: actions/cache@v4
        with:
          key: mkdocs-material-${{ env.cache_id }}
          path: .cache
          restore-keys: |
            mkdocs-material-
      - run: uv sync
      - run: uv run mkdocs gh-deploy --force

================================================
FILE: .github/workflows/labeler.yml
================================================
name: Labeler

on:
  push:
    branches:
      - 'main'
    paths:
      - '.github/labels.yml'
      - '.github/workflows/labeler.yml'
  pull_request:
    paths:
      - '.github/labels.yml'
      - '.github/workflows/labeler.yml'

permissions:
  contents: read
  issues: write
  pull-requests: write

jobs:
  labeler:
    runs-on: ubuntu-latest
    steps:
      - name: Check out the repository
        uses: actions/checkout@v5

      - name: Run Labeler
        uses: crazy-max/ghaction-github-labeler@24d110aa46a59976b8a7f35518cb7f14f434c916 # v5.3.0
        with:
          skip-delete: true
          dry-run: ${{ github.event_name == 'pull_request' }}
          github-token: ${{ secrets.GITHUB_TOKEN }}
          yaml-file: .github/labels.yml
          exclude: |
            help*
            *issue

================================================
FILE: .github/workflows/lint.yml
================================================
name: Lint Code
permissions:
  contents: read
  pull-requests: write
on: [push]

jobs:
  lint:
    strategy:
      fail-fast: false
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
      - name: Ruff
        uses: astral-sh/ruff-action@v3
      - name: AutoCorrect
        uses: huacnlee/autocorrect-action@main


================================================
FILE: .github/workflows/pr-lint.yml
================================================
name: Lint Code and Review Dog Report

on: [pull_request]
permissions:
  contents: read
  pull-requests: write
jobs:
  ruff:
    name: runner / ruff
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
      
      - name: Install Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.11'
          
      - name: Install ruff
        run: pip install ruff
        
      - name: Install reviewdog
        uses: reviewdog/action-setup@d8edfce3dd5e1ec6978745e801f9c50b5ef80252 # v1.4.0
        with:
          reviewdog_version: latest
          
      - name: Run ruff with reviewdog
        env:
          REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          ruff check . --output-format=rdjson | reviewdog -f=rdjson -reporter=github-pr-review -fail-on-error
          
  autocorrect:
    name: runner / autocorrect
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5
      - name: AutoCorrect
        uses: huacnlee/autocorrect-action@bf91ab3904c2908dd8e71312a8a83ed1eb632997 # v2.13.3
      - name: Report ReviewDog
        if: failure()
        uses: huacnlee/autocorrect-action@bf91ab3904c2908dd8e71312a8a83ed1eb632997 # v2.13.3
        env:
          REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          reviewdog: true

================================================
FILE: .github/workflows/publish-to-pypi.yml
================================================
name: Release

on:
  push:
    branches:
      - main
      - master

permissions:
  id-token: write
  contents: write
  pull-requests: write

jobs:
  check-repository:
    name: Check if running in main repository
    runs-on: ubuntu-latest
    outputs:
      is_main_repo: ${{ github.repository == 'funstory-ai/BabelDOC' }}
    steps:
      - run: echo "Running repository check"

  build:
    name: Build distribution 📦
    needs: check-repository
    if: needs.check-repository.outputs.is_main_repo == 'true'
    runs-on: ubuntu-latest
    outputs:
      is_release: ${{ steps.check-version.outputs.tag }}
    steps:
      - uses: actions/checkout@v5
        with:
          persist-credentials: true
          fetch-depth: 2
          token: ${{ secrets.GITHUB_TOKEN }}
          
      - name: Setup uv with Python 3.12
        uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2
        with:
          python-version: "3.12"
          enable-cache: true
          cache-dependency-glob: "uv.lock"
          activate-environment: true

      - name: Check if there is a parent commit
        id: check-parent-commit
        run: |
          echo "sha=$(git rev-parse --verify --quiet HEAD^)" >> $GITHUB_OUTPUT

      - name: Detect and tag new version
        id: check-version
        if: steps.check-parent-commit.outputs.sha
        uses: salsify/action-detect-and-tag-new-version@b1778166f13188a9d478e2d1198f993011ba9864 # v2.0.3
        with:
          version-command: |
            cat pyproject.toml | grep "version = " | head -n 1 | awk -F'"' '{print $2}'

      - name: Install Dependencies
        run: |
          uv sync

      - name: Bump version for developmental release
        if: "! steps.check-version.outputs.tag"
        run: |
          version=$(uv run bumpver update --patch --tag=final --dry 2>&1 | grep "New Version" | awk '{print $NF}') &&
          uv run bumpver update --set-version $version.dev$(date +%s)

      - name: Build package
        run: "uv build"

      - name: Store the distribution packages
        uses: actions/upload-artifact@v4.6.2
        with:
          name: python-package-distributions
          path: dist/

  # Publishes the built distributions to PyPI. Runs only when the build job
  # reported a release tag (non-empty `is_release` output).
  publish-to-pypi:
    name: Publish Python 🐍 distribution 📦 to PyPI
    if: needs.build.outputs.is_release != ''
    needs:
      - check-repository
      - build
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/BabelDOC

    # No API token is passed to the publish step below; presumably the upload
    # authenticates through PyPI trusted publishing using the OIDC token this
    # permission allows the job to request (confirm in the PyPI project settings).
    permissions:
      id-token: write

    steps:
      # Fetches the artifact uploaded by the build job into dist/.
      - name: Download all the dists
        uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
        with:
          name: python-package-distributions
          path: dist/

      - name: Publish distribution 📦 to PyPI
        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0

  # Publishes the built distributions to TestPyPI. Runs only when the build job
  # reported no release tag (empty `is_release` output), i.e. for the
  # development builds whose version was given a .dev suffix.
  publish-to-testpypi:
    name: Publish Python 🐍 distribution 📦 to TestPyPI
    if: needs.build.outputs.is_release == ''
    needs:
      - check-repository
      - build
    runs-on: ubuntu-latest
    environment:
      name: testpypi
      url: https://test.pypi.org/p/BabelDOC

    # No API token is passed to the publish step below; presumably the upload
    # authenticates through trusted publishing using the OIDC token this
    # permission allows the job to request (confirm in the TestPyPI settings).
    permissions:
      id-token: write

    steps:
      # Fetches the artifact uploaded by the build job into dist/.
      - name: Download all the dists
        uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
        with:
          name: python-package-distributions
          path: dist/

      - name: Publish distribution 📦 to TestPyPI
        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0
        with:
          # Upload endpoint of the TestPyPI index instead of the default index.
          repository-url: https://test.pypi.org/legacy/

  # Runs release-drafter once either publish job has succeeded.
  post-release:
    name: Post Release Tasks
    needs:
      - check-repository
      - build
      - publish-to-pypi
      - publish-to-testpypi
    # The two publish jobs have complementary conditions, so one of them is
    # always skipped. always() keeps this job from being skipped along with it;
    # the remaining clauses restrict it to the main repository and require that
    # at least one publish job actually succeeded.
    if: |
      always() && needs.check-repository.outputs.is_main_repo == 'true' && 
      (needs.publish-to-pypi.result == 'success' || needs.publish-to-testpypi.result == 'success')
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - uses: actions/checkout@v5
        with:
          persist-credentials: true
          fetch-depth: 2
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Publish the release notes
        uses: release-drafter/release-drafter@b1476f6e6eb133afa41ed8589daba6dc69b4d3f5 # v6.1.0
        with:
          # `publish` is true only when the build job produced a release tag;
          # `tag` is that tag (empty for development builds).
          publish: ${{ needs.build.outputs.is_release != '' }}
          tag: ${{ needs.build.outputs.is_release }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

================================================
FILE: .github/workflows/test.yml
================================================
# CI smoke test: on Python 3.10-3.13, sets up the project with uv, runs the
# babeldoc CLI against a sample PDF through an OpenAI-compatible endpoint, and
# exercises the offline-assets generate/restore round trip.
name: Run Tests 🧪

on:
  push:
  pull_request:
    branches: ["main"]

permissions:
  contents: read
  pull-requests: read

jobs:
  test:
    name: Run Python Tests
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12", "3.13"]

    steps:
      - uses: actions/checkout@v5
        with:
          persist-credentials: false
      # Caches ~/.cache/babeldoc, keyed on the hash of the assets metadata
      # module so that a change to that file yields a fresh cache entry.
      - name: Cached Assets
        id: cache-assets
        uses: actions/cache@v4.2.0
        with:
          path: ~/.cache/babeldoc
          key: babeldoc-assets-${{ hashFiles('babeldoc/assets/embedding_assets_metadata.py') }}
      # NOTE(review): the repository's .gitignore lists uv.lock. If the lockfile
      # is not tracked, the cache-dependency-glob below matches nothing at setup
      # time -- confirm, or key the cache on pyproject.toml instead.
      - name: Setup uv with Python ${{ matrix.python-version }}
        uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2
        with:
          python-version: ${{ matrix.python-version }}
          enable-cache: true
          cache-dependency-glob: "uv.lock"
          activate-environment: true
      # Presumably pre-populates ~/.cache/babeldoc with the assets the CLI
      # needs -- confirm against the --warmup implementation.
      - name: Warm up cache
        run: |
          uv run babeldoc --warmup
      - name: Run tests
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAIAPIKEY }}
          OPENAI_BASE_URL: ${{ secrets.OPENAIAPIURL }}
          OPENAI_MODEL: ${{ secrets.OPENAIMODEL }}
        # The values are read as quoted shell variables instead of being
        # spliced into the script text through inline workflow expressions:
        # special characters in a value can then neither be interpreted by the
        # shell nor split the argument list.
        run: |
          uv run babeldoc --help
          uv run babeldoc --openai --files examples/ci/test.pdf \
            --openai-api-key "$OPENAI_API_KEY" \
            --openai-base-url "$OPENAI_BASE_URL" \
            --openai-model "$OPENAI_MODEL"
      - name: Generate offline assets package
        run: |
          uv run babeldoc --generate-offline-assets /tmp/offline_assets
      # Deletes the cache directory first so the restore starts from nothing.
      - name: Restore offline assets package
        run: |
          rm -rf ~/.cache/babeldoc
          uv run babeldoc --restore-offline-assets /tmp/offline_assets
      # Removes the offline package plus the cache database and working
      # directory under ~/.cache/babeldoc; presumably this keeps run-specific
      # data out of the cache saved by the "Cached Assets" step -- confirm.
      - name: Clean up
        run: |
          rm -rf /tmp/offline_assets
          rm -rf ~/.cache/babeldoc/cache.v1.db
          rm -rf ~/.cache/babeldoc/working


================================================
FILE: .gitignore
================================================
# Logs
web/logs
web/*.log
web/npm-debug.log*
web/yarn-debug.log*
web/yarn-error.log*
web/pnpm-debug.log*
web/lerna-debug.log*

# Dependencies, build output and local overrides under web/
web/node_modules
web/dist
web/dist-ssr
web/*.local

# Generated files, documents, editor/tool state and local scratch paths.
# NOTE(review): *.pdf, *.json and examples/ are ignored repository-wide, while
# the test workflow reads examples/ci/test.pdf -- presumably that file was
# force-added; confirm before relying on new files under these patterns.
memray*
**/*.so
*.pdf
*.docx
*.json
**/*.pyc
.venv
.idea
*.egg-info
.DS_Store
.vscode
__pycache__
.ruff_cache
yadt.toml
examples/
/make_gif.py
/dist
.cache
.cursor/rules/_*.mdc
/.cursor
/xnotes
/docs/workflow-rules.md
babeldoc/format/txt
/profile.svg


# uv
# NOTE(review): the workflows pass "uv.lock" as cache-dependency-glob; with the
# lockfile ignored here that glob may match nothing in a fresh checkout -- confirm.
uv.lock

# Claude Code memory file
CLAUDE.md
/.claude
# Other local-only paths
babeldoc/format/playground
temp.jpg
AGENTS.md


================================================
FILE: .pre-commit-config.yaml
================================================
# pre-commit configuration: runs the Ruff linter (with autofix) and the Ruff
# formatter. The top-level `files` pattern limits every hook to paths ending
# in .py.
files: '^.*\.py$'
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.9.5
    hooks:
      # Run the linter.
      # --fix applies the available automatic fixes. Ignored rules:
      #   E203 whitespace before punctuation
      #   E261 fewer than two spaces before an inline comment
      #   E501 line too long
      #   E741 ambiguous variable name
      #   F841 local variable assigned but never used
      - id: ruff
        args: [ "--fix",
                "--ignore=E203,E261,E501,E741,F841" ]
      # Run the formatter.
      - id: ruff-format


================================================
FILE: LICENSE
================================================
                    GNU AFFERO GENERAL PUBLIC LICENSE
                       Version 3, 19 November 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works.  By contrast,
our General Public Licenses are intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  Developers that use our General Public Licenses protect your rights
with two steps: (1) assert copyright on the software, and (2) offer
you this License which gives you legal permission to copy, distribute
and/or modify the software.

  A secondary benefit of defending all users' freedom is that
improvements made in alternate versions of the program, if they
receive widespread use, become available for other developers to
incorporate.  Many developers of free software are heartened and
encouraged by the resulting cooperation.  However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.

  The GNU Affero General Public License is designed specifically to
ensure that, in such cases, the modified source code becomes available
to the community.  It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server.  Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.

  An older license, called the Affero General Public License and
published by Affero, was designed to accomplish similar goals.  This is
a different license, not a version of the Affero GPL, but Affero has
released a new version of the Affero GPL which permits relicensing under
this license.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU Affero General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License.  Each licensee is addressed as "you".  "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy.  The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy.  Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies.  Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License.  If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it.  "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form.  A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities.  However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work.  For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met.  This License explicitly affirms your unlimited
permission to run the unmodified Program.  The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work.  This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force.  You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright.  Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below.  Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit.  Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling.  In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage.  For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product.  A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source.  The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information.  But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed.  Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law.  If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it.  (Additional permissions may be written to require their own
removal in certain cases when you modify the work.)  You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10.  If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term.  If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License.  Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.

  Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

  Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License.  If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.

  9. Acceptance Not Required for Having Copies.

  You are not required to accept this License in order to receive or
run a copy of the Program.  Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance.  However,
nothing other than this License grants you permission to propagate or
modify any covered work.  These actions infringe copyright if you do
not accept this License.  Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.

  10. Automatic Licensing of Downstream Recipients.

  Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License.  You are not responsible
for enforcing compliance by third parties with this License.

  An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations.  If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.

  You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License.  For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.

  11. Patents.

  A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based.  The
work thus licensed is called the contributor's "contributor version".

  A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version.  For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.

  Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.

  In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement).  To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.

  If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients.  "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.

  If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.

  A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License.  You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.

  Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.

  12. No Surrender of Others' Freedom.

  If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License.  If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all.  For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.

  13. Remote Network Interaction; Use with the GNU General Public License.

  Notwithstanding any other provision of this License, if you modify the
Program, your modified version must prominently offer all users
interacting with it remotely through a computer network (if your version
supports such interaction) an opportunity to receive the Corresponding
Source of your version by providing access to the Corresponding Source
from a network server at no charge, through some standard or customary
means of facilitating copying of software.  This Corresponding Source
shall include the Corresponding Source for any work covered by version 3
of the GNU General Public License that is incorporated pursuant to the
following paragraph.

  Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU General Public License into a single
combined work, and to convey the resulting work.  The terms of this
License will continue to apply to the part which is the covered work,
but the work with which it is combined will remain governed by version
3 of the GNU General Public License.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
the GNU Affero General Public License from time to time.  Such new versions
will be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

  Each version is given a distinguishing version number.  If the
Program specifies that a certain numbered version of the GNU Affero General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation.  If the Program does not specify a version number of the
GNU Affero General Public License, you may choose any version ever published
by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
versions of the GNU Affero General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.

  Later license versions may give you additional or different
permissions.  However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.

  15. Disclaimer of Warranty.

  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

  16. Limitation of Liability.

  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.

  17. Interpretation of Sections 15 and 16.

  If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    BabelDOC is library for ultimated document translation solution.
    Copyright (C) 2024  <funstory.ai limited>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published
    by the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

  If your software can interact with users remotely through a computer
network, you should also make sure that it provides a way for users to
get its source.  For example, if your program is a web application, its
interface could display a "Source" link that leads users to an archive
of the code.  There are many ways you could offer source, and different
solutions will be better for different programs; see section 13 for the
specific requirements.

  You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU AGPL, see
<https://www.gnu.org/licenses/>.


================================================
FILE: README.md
================================================
<!-- # Yet Another Document Translator -->

<div align="center">
<!-- <img src="https://s.immersivetranslate.com/assets/r2-uploads/images/babeldoc-banner.png" width="320px"  alt="YADT"/> -->

<br/>

<picture>
  <source media="(prefers-color-scheme: dark)" srcset="https://s.immersivetranslate.com/assets/uploads/babeldoc-big-logo-darkmode-with-transparent-background-IKuNO1.svg" width="320px" alt="BabelDOC"/>
  <img src="https://s.immersivetranslate.com/assets/uploads/babeldoc-big-logo-with-transparent-background-2xweBr.svg" width="320px" alt="BabelDOC"/>
</picture>

<!-- <h2 id="title">BabelDOC</h2> -->

<p>
  <!-- PyPI -->
  <a href="https://pypi.org/project/BabelDOC/">
    <img src="https://img.shields.io/pypi/v/BabelDOC"></a>
  <a href="https://pepy.tech/projects/BabelDOC">
    <img src="https://static.pepy.tech/badge/BabelDOC"></a>
  <!-- <a href="https://github.com/funstory-ai/BabelDOC/pulls">
    <img src="https://img.shields.io/badge/contributions-welcome-green"></a> -->
  <!-- License -->
  <a href="./LICENSE">
    <img src="https://img.shields.io/github/license/funstory-ai/BabelDOC"></a>
  <a href="https://t.me/+Z9_SgnxmsmA5NzBl">
    <img src="https://img.shields.io/badge/Telegram-2CA5E0?style=flat-square&logo=telegram&logoColor=white"></a>
  <a href="https://deepwiki.com/funstory-ai/BabelDOC"><img src="https://deepwiki.com/badge.svg" alt="Ask DeepWiki"></a>
</p>

<a href="https://trendshift.io/repositories/13358" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13358" alt="funstory-ai%2FBabelDOC | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>

</div>

PDF scientific paper translation and bilingual comparison library.

- **Online Service**: Beta version launched [Immersive Translate - BabelDOC](https://app.immersivetranslate.com/babel-doc/) Free usage quota is available; please refer to the FAQ section on the page for details.
- **Self-deployment**: [PDFMathTranslate-next](https://github.com/PDFMathTranslate-next/PDFMathTranslate-next) support for BabelDOC, available for self-deployment + WebUI with more translation services.
- Provides a simple [command line interface](#getting-started).
- Provides a [Python API](#python-api).
- Mainly designed to be embedded into other programs, but can also be used directly for simple translation tasks.

> [!TIP]
>
> How to use BabelDOC in Zotero
>
> 1. Immersive Translate Pro members can use the [immersive-translate/zotero-immersivetranslate](https://github.com/immersive-translate/zotero-immersivetranslate) plugin
>
> 2. PDFMathTranslate self-deployed users can use the [guaguastandup/zotero-pdf2zh](https://github.com/guaguastandup/zotero-pdf2zh) plugin

[Supported Language](https://funstory-ai.github.io/BabelDOC/supported_languages/)

## Preview

<div align="center">
<img src="https://s.immersivetranslate.com/assets/r2-uploads/images/babeldoc-preview.png" width="80%"/>
</div>

## We are hiring

See details: [EN](https://github.com/funstory-ai/jobs) | [ZH](https://github.com/funstory-ai/jobs/blob/main/README_ZH.md)

## Getting Started

### Install from PyPI

We recommend using the Tool feature of [uv](https://github.com/astral-sh/uv) to install BabelDOC.

1. First, you need to refer to [uv installation](https://github.com/astral-sh/uv#installation) to install uv and set up the `PATH` environment variable as prompted.

2. Use the following command to install BabelDOC:

```bash
uv tool install --python 3.12 BabelDOC

babeldoc --help
```

3. Use the `babeldoc` command. For example:

```bash
babeldoc --openai --openai-model "gpt-4o-mini" --openai-base-url "https://api.openai.com/v1" --openai-api-key "your-api-key-here"  --files example.pdf

# multiple files
babeldoc --openai --openai-model "gpt-4o-mini" --openai-base-url "https://api.openai.com/v1" --openai-api-key "your-api-key-here"  --files example1.pdf --files example2.pdf
```

### Install from Source

We still recommend using [uv](https://github.com/astral-sh/uv) to manage virtual environments.

1. First, you need to refer to [uv installation](https://github.com/astral-sh/uv#installation) to install uv and set up the `PATH` environment variable as prompted.

2. Use the following commands to clone and run BabelDOC:

```bash
# clone the project
git clone https://github.com/funstory-ai/BabelDOC

# enter the project directory
cd BabelDOC

# install dependencies and run babeldoc
uv run babeldoc --help
```

3. Use the `uv run babeldoc` command. For example:

```bash
uv run babeldoc --files example.pdf --openai --openai-model "gpt-4o-mini" --openai-base-url "https://api.openai.com/v1" --openai-api-key "your-api-key-here"

# multiple files
uv run babeldoc --files example.pdf --files example2.pdf --openai --openai-model "gpt-4o-mini" --openai-base-url "https://api.openai.com/v1" --openai-api-key "your-api-key-here"
```

> [!TIP]
> The absolute path is recommended.

## Advanced Options

> [!NOTE]
> This CLI is mainly for debugging purposes. Although end users can use this CLI to translate files, we do not provide any technical support for this purpose.
>
> End users should directly use **Online Service**: Beta version launched [Immersive Translate - BabelDOC](https://app.immersivetranslate.com/babel-doc/) 1000 free pages per month.
>
> End users who need self-deployment should use [PDFMathTranslate 2.0](https://github.com/PDFMathTranslate/PDFMathTranslate-next)
> 
> If you find that an option is not listed below, it means that this option is a debugging option for maintainers. Please do not use these options.


### Language Options

- `--lang-in`, `-li`: Source language code (default: en)
- `--lang-out`, `-lo`: Target language code (default: zh)

> [!TIP]
> Currently, this project mainly focuses on English-to-Chinese translation, and other scenarios have not been tested yet.
> 
> (2025.3.1 update): Basic support for English as the target language has been added, primarily to minimize line breaks within words (`[0-9A-Za-z]+`).
> 
> [HELP WANTED: Collecting word regular expressions for more languages](https://github.com/funstory-ai/BabelDOC/issues/129)

### PDF Processing Options

- `--files`: One or more file paths to input PDF documents.
- `--pages`, `-p`: Specify pages to translate (e.g., "1,2,1-,-3,3-5"). If not set, translate all pages
- `--split-short-lines`: Force split short lines into different paragraphs (may cause poor typesetting & bugs)
- `--short-line-split-factor`: Split threshold factor (default: 0.8). The actual threshold is the median length of all lines on the current page \* this factor
- `--skip-clean`: Skip PDF cleaning step
- `--dual-translate-first`: Put translated pages first in dual PDF mode (default: original pages first)
- `--disable-rich-text-translate`: Disable rich text translation (may help improve compatibility with some PDFs)
- `--enhance-compatibility`: Enable all compatibility enhancement options (equivalent to --skip-clean --dual-translate-first --disable-rich-text-translate)
- `--use-alternating-pages-dual`: Use alternating pages mode for dual PDF. When enabled, original and translated pages are arranged in alternate order. When disabled (default), original and translated pages are shown side by side on the same page.
- `--watermark-output-mode`: Control watermark output mode: 'watermarked' (default) adds watermark to translated PDF, 'no_watermark' doesn't add watermark, 'both' outputs both versions.
- `--max-pages-per-part`: Maximum number of pages per part for split translation. If not set, no splitting will be performed.
- `--no-watermark`: [DEPRECATED] Use --watermark-output-mode=no_watermark instead.
- `--translate-table-text`: Translate table text (experimental, default: False)
- `--formular-font-pattern`: Font pattern to identify formula text (default: None)
- `--formular-char-pattern`: Character pattern to identify formula text (default: None)
- `--show-char-box`: Show character bounding boxes (debug only, default: False)
- `--skip-scanned-detection`: Skip scanned document detection (default: False). When using split translation, only the first part performs detection if not skipped.
- `--ocr-workaround`: Use OCR workaround (default: False). Only suitable for documents with black text on white background. When enabled, white rectangular blocks will be added below the translation to cover the original text content, and all text will be forced to black color.
- `--auto-enable-ocr-workaround`: Enable automatic OCR workaround (default: False). If a document is detected as heavily scanned, this will attempt to enable OCR processing and skip further scan detection. See "Important Interaction Note" below for crucial details on how this interacts with `--ocr-workaround` and `--skip-scanned-detection`.
- `--primary-font-family`: Override primary font family for translated text. Choices: 'serif' for serif fonts, 'sans-serif' for sans-serif fonts, 'script' for script/italic fonts. If not specified, uses automatic font selection based on original text properties.
- `--only-include-translated-page`: Only include translated pages in the output PDF. This option is only effective when `--pages` is used. (default: False)
- `--merge-alternating-line-numbers`: Enable post-processing to merge alternating line-number layouts (keep the number paragraph as an independent paragraph b; merge adjacent text paragraphs a and c across it when `layout_id` and `xobj_id` match, digits are ASCII and spaces only). Default: off.
- `--skip-form-render`: Skip form rendering (default: False). When enabled, PDF forms will not be rendered in the output.
- `--skip-curve-render`: Skip curve rendering (default: False). When enabled, PDF curves will not be rendered in the output.
- `--only-parse-generate-pdf`: Only parse PDF and generate output PDF without translation (default: False). This skips all translation-related processing including layout analysis, paragraph finding, style processing, and translation itself. Useful for testing PDF parsing and reconstruction functionality.
- `--remove-non-formula-lines`: Remove non-formula lines from paragraph areas (default: False). This removes decorative lines that are not part of formulas, while protecting lines in figure/table areas. Useful for cleaning up documents with decorative elements that interfere with text flow.
- `--non-formula-line-iou-threshold`: IoU threshold for detecting paragraph overlap when removing non-formula lines (default: 0.9). Higher values are more conservative and will remove fewer lines.
- `--figure-table-protection-threshold`: IoU threshold for protecting lines in figure/table areas when removing non-formula lines (default: 0.9). Higher values provide more protection for structural elements in figures and tables.

- `--rpc-doclayout`: RPC service host address for document layout analysis (default: None)
- `--working-dir`: Working directory for translation. If not set, use temp directory.
- `--no-auto-extract-glossary`: Disable automatic term extraction. If this flag is present, the step is skipped. Defaults to enabled.
- `--save-auto-extracted-glossary`: Save automatically extracted glossary to the specified file. If not set, the glossary will not be saved.

> [!TIP]
> - Both `--skip-clean` and `--dual-translate-first` may help improve compatibility with some PDF readers
> - `--disable-rich-text-translate` can also help with compatibility by simplifying translation input
> - However, using `--skip-clean` will result in larger file sizes
> - If you encounter any compatibility issues, try using `--enhance-compatibility` first
> - Use `--max-pages-per-part` for large documents to split them into smaller parts for translation and automatically merge them back.
> - Use `--skip-scanned-detection` to speed up processing when you know your document is not a scanned PDF.
> - Use `--ocr-workaround` to fill background for scanned PDF. (Current assumption: background is pure white, text is pure black, this option will also auto enable `--skip-scanned-detection`)

### Translation Service Options

- `--qps`: QPS (Queries Per Second) limit for translation service (default: 4)
- `--ignore-cache`: Ignore translation cache and force retranslation
- `--no-dual`: Do not output bilingual PDF files
- `--no-mono`: Do not output monolingual PDF files
- `--min-text-length`: Minimum text length to translate (default: 5)
- `--openai`: Use OpenAI for translation (default: False)
- `--custom-system-prompt`: Custom system prompt for translation.
- `--add-formula-placehold-hint`: Add formula placeholder hint for translation. (Currently not recommended, it may affect translation quality, default: False)
- `--disable-same-text-fallback`: Disable fallback translation when LLM output matches input text. (default: False)
- `--pool-max-workers`: Maximum number of worker threads for internal task processing pools. If not specified, defaults to QPS value. This parameter directly sets the worker count, replacing previous QPS-based dynamic calculations.
- `--no-auto-extract-glossary`: Disable automatic term extraction. If this flag is present, the step is skipped. Defaults to enabled.

> [!TIP]
>
> 1. Currently, only OpenAI-compatible LLM is supported. For more translator support, please use [PDFMathTranslate 2.0](https://github.com/PDFMathTranslate/PDFMathTranslate-next).
> 2. It is recommended to use models with strong compatibility with OpenAI, such as: `glm-4-flash`, `deepseek-chat`, etc.
> 3. Currently, it has not been optimized for traditional translation engines like Bing/Google, it is recommended to use LLMs.
> 4. You can use [litellm](https://github.com/BerriAI/litellm) to access multiple models.
> 5. `--custom-system-prompt`: It is mainly used to add the `/no_think` instruction of Qwen 3 in the prompt. For example: `--custom-system-prompt "/no_think You are a professional, authentic machine translation engine."`

### OpenAI Specific Options

- `--openai-model`: OpenAI model to use (default: gpt-4o-mini)
- `--openai-base-url`: Base URL for OpenAI API
- `--openai-api-key`: API key for OpenAI service
- `--enable-json-mode-if-requested`: Enable JSON mode for OpenAI requests (default: False)
- `--term-pool-max-workers`: Maximum number of worker threads dedicated to automatic term extraction. If not specified, this defaults to the value of `--pool-max-workers`, which itself defaults to the QPS value when unset.

> [!TIP]
>
> 1. This tool supports any OpenAI-compatible API endpoints. Just set the correct base URL and API key. (e.g. `https://xxx.custom.xxx/v1`)
> 2. For local models like Ollama, you can use any value as the API key (e.g. `--openai-api-key a`).

### Glossary Options

- `--glossary-files`: Comma-separated paths to glossary CSV files.
  - Each CSV file should have the columns: `source`, `target`, and an optional `tgt_lng`.
  - The `source` column contains the term in the original language.
  - The `target` column contains the term in the target language.
  - The `tgt_lng` column (optional) specifies the target language for that specific entry (e.g., "zh-CN", "en-US").
    - If `tgt_lng` is provided for an entry, that entry will only be loaded and used if its (normalized) `tgt_lng` matches the (normalized) overall target language specified by `--lang-out`. Normalization involves lowercasing and replacing hyphens (`-`) with underscores (`_`).
    - If `tgt_lng` is omitted for an entry, that entry is considered applicable for any `--lang-out`.
  - The name of each glossary (used in LLM prompts) is derived from its filename (without the .csv extension).
  - During translation, the system will check the input text against the loaded glossaries. If terms from a glossary are found in the current text segment, that glossary (with the relevant terms) will be included in the prompt to the language model, along with an instruction to adhere to it.

### Output Control

- `--output`, `-o`: Output directory for translated files. If not set, use current working directory.
- `--debug`: Enable debug logging level and export detailed intermediate results in `~/.cache/yadt/working`.
- `--report-interval`: Progress report interval in seconds (default: 0.1).

### General Options

- `--warmup`: Only download and verify required assets then exit (default: False)

### Offline Assets Management

- `--generate-offline-assets`: Generate an offline assets package in the specified directory. This creates a zip file containing all required models and fonts.
- `--restore-offline-assets`: Restore an offline assets package from the specified file. This extracts models and fonts from a previously generated package.

> [!TIP]
> 
> 1. Offline assets packages are useful for environments without internet access or to speed up installation on multiple machines.
> 2. Generate a package once with `babeldoc --generate-offline-assets /path/to/output/dir` and then distribute it.
> 3. Restore the package on target machines with `babeldoc --restore-offline-assets /path/to/offline_assets_*.zip`.
> 4. The offline assets package name cannot be modified because the file list hash is encoded in the name.
> 5. If you provide a directory path to `--restore-offline-assets`, the tool will automatically look for the correct offline assets package file in that directory.
> 6. The package contains all necessary fonts and models required for document processing, ensuring consistent results across different environments.
> 7. The integrity of all assets is verified using SHA3-256 hashes during both packaging and restoration.
> 8. If you're deploying in an air-gapped environment, make sure to generate the package on a machine with internet access first.

### Configuration File

- `--config`, `-c`: Configuration file path. Use the TOML format.

Example Configuration:

```toml
[babeldoc]
# Basic settings
debug = true
lang-in = "en-US"
lang-out = "zh-CN"
qps = 10
output = "/path/to/output/dir"

# PDF processing options
split-short-lines = false
short-line-split-factor = 0.8
skip-clean = false
dual-translate-first = false
disable-rich-text-translate = false
use-alternating-pages-dual = false
watermark-output-mode = "watermarked"  # Choices: "watermarked", "no_watermark", "both"
max-pages-per-part = 50  # Automatically split the document for translation and merge it back.
only_include_translated_page = false # Only include translated pages in the output PDF. Effective only when `pages` is used.
# no-watermark = false  # DEPRECATED: Use watermark-output-mode instead
skip-scanned-detection = false  # Skip scanned document detection for faster processing
auto_extract_glossary = true # Set to false to disable automatic term extraction
formular_font_pattern = "" # Font pattern for formula text
formular_char_pattern = "" # Character pattern for formula text
show_char_box = false # Show character bounding boxes (debug)
ocr_workaround = false # Use OCR workaround for scanned PDFs
rpc_doclayout = "" # RPC service host for document layout analysis
working_dir = "" # Working directory for translation
auto_enable_ocr_workaround = false # Enable automatic OCR workaround for scanned PDFs. See docs for interaction with ocr_workaround and skip_scanned_detection.
skip_form_render = false # Skip form rendering (default: False)
skip_curve_render = false # Skip curve rendering (default: False)
only_parse_generate_pdf = false # Only parse PDF and generate output PDF without translation (default: False)
remove_non_formula_lines = false # Remove non-formula lines from paragraph areas (default: False)
non_formula_line_iou_threshold = 0.2 # IoU threshold for paragraph overlap detection (default: 0.2)
figure_table_protection_threshold = 0.3 # IoU threshold for figure/table protection (default: 0.3)

# Translation service
openai = true
openai-model = "gpt-4o-mini"
openai-base-url = "https://api.openai.com/v1"
openai-api-key = "your-api-key-here"
enable-json-mode-if-requested = false  # Enable JSON mode when requested (default: false)
disable_same_text_fallback = false # Disable fallback translation when LLM output matches input text (default: false)
pool-max-workers = 8  # Maximum worker threads for task processing (defaults to QPS value if not set)

# Glossary Options (Optional)
# glossary-files = "/path/to/glossary1.csv,/path/to/glossary2.csv"

# Output control
no-dual = false
no-mono = false
min-text-length = 5
report-interval = 0.5

# Offline assets management
# Uncomment one of these options as needed:
# generate-offline-assets = "/path/to/output/dir"
# restore-offline-assets = "/path/to/offline_assets_package.zip"
```

## Python API

The current recommended way to call BabelDOC in Python is to call the `high_level.do_translate_async_stream` function of [pdf2zh next](https://github.com/PDFMathTranslate/PDFMathTranslate-next).

> [!WARNING]
> **All APIs of BabelDOC should be considered as internal APIs, and any direct use of BabelDOC is not supported.**

## Background

There are a lot of projects and teams working to make document editing and translation easier, such as:

- [mathpix](https://mathpix.com/)
- [Doc2X](https://doc2x.noedgeai.com/)
- [minerU](https://github.com/opendatalab/MinerU)
- [PDFMathTranslate](https://github.com/Byaidu/PDFMathTranslate)

There are also some solutions to solve specific parts of the problem like:

- [layoutreader](https://github.com/microsoft/unilm/tree/master/layoutreader): the reading order of the text blocks in a PDF
- [Surya](https://github.com/surya-is/surya): the structure of the pdf

This project hopes to promote a standard pipeline and interface to solve the problem.

In fact, there are two main stages of a PDF parser or translator:

- **Parsing**: The parsing stage extracts the structure of the PDF, such as text blocks, images, tables, etc.
- **Rendering**: The rendering stage renders that structure into a new PDF or another format.

For a service like mathpix, it will parse the PDF into a structure, possibly in an XML format, and then render it using a single-column reading order as [layoutreader](https://github.com/microsoft/unilm/tree/master/layoutreader) does. The bad news is that the original structure is lost.

Some people use Adobe PDF Parser because it generates a Word document and keeps the original structure. But it is somewhat expensive.
And, as you know, a PDF or Word document is not a good format for reading on mobile devices.

We offer an intermediate representation of the parser's results that can be rendered into a new PDF or another format. The pipeline is also a plugin-based system to which everybody can add their own model, OCR, renderer, etc.

## Roadmap

- [ ] Add line support
- [ ] Add table support
- [ ] Add cross-page/cross-column paragraph support
- [ ] More advanced typesetting features
- [ ] Outline support
- [ ] ...

Our goal for the first 1.0 version is to complete the translation of [PDF Reference, Version 1.7](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf) into the following languages:

- Simplified Chinese
- Traditional Chinese
- Japanese
- Spanish

And meet the following requirements:

- layout error less than 1%
- content loss less than 1%

## Version Number Explanation

This project uses a combination of [Semantic Versioning](https://semver.org/) and [Pride Versioning](https://pridever.org/). The version number format is: "0.MAJOR.MINOR".

> [!NOTE]
>
> The API compatibility here mainly refers to the compatibility with [pdf2zh_next](https://github.com/PDFMathTranslate/PDFMathTranslate-next).


- MAJOR: Incremented by 1 when API incompatible changes are made or when proud improvements are implemented.

- MINOR: Incremented by 1 when any API compatible changes are made.

## Known Issues

1. Parsing errors in the author and reference sections; they get merged into one paragraph after translation.
2. Lines are not supported.
3. Does not support drop caps.
4. Large pages will be skipped.

## How to Contribute

We encourage you to contribute to BabelDOC! Please check out the [CONTRIBUTING](https://github.com/funstory-ai/yadt/blob/main/docs/CONTRIBUTING.md) guide.

Everyone interacting in BabelDOC and its sub-projects' codebases, issue trackers, chat rooms, and mailing lists is expected to follow the BabelDOC [Code of Conduct](https://github.com/funstory-ai/yadt/blob/main/docs/CODE_OF_CONDUCT.md).

[Immersive Translation](https://immersivetranslate.com) sponsors monthly Pro membership redemption codes for active contributors to this project, see details at: [CONTRIBUTOR_REWARD.md](https://github.com/funstory-ai/BabelDOC/blob/main/docs/CONTRIBUTOR_REWARD.md)

## Acknowledgements

- [PDFMathTranslate](https://github.com/Byaidu/PDFMathTranslate)
- [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
- [pdfminer](https://github.com/pdfminer/pdfminer.six)
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
- [Asynchronize](https://github.com/multimeric/Asynchronize/tree/master?tab=readme-ov-file)
- [PriorityThreadPoolExecutor](https://github.com/oleglpts/PriorityThreadPoolExecutor)

<h2 id="star_hist">Star History</h2>

<a href="https://star-history.com/#funstory-ai/babeldoc&Date">
 <picture>
   <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=funstory-ai/babeldoc&type=Date&theme=dark" />
   <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=funstory-ai/babeldoc&type=Date" />
   <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=funstory-ai/babeldoc&type=Date"/>
 </picture>
</a>

> [!WARNING]
> **Important Interaction Note for `--auto-enable-ocr-workaround`:**
>
> When `--auto-enable-ocr-workaround` is set to `true` (either via command line or config file):
>
> 1.  During the initial setup, the values for `ocr_workaround` and `skip_scanned_detection` will be forced to `false` by `TranslationConfig`, regardless of whether you also set `--ocr-workaround` or `--skip-scanned-detection` flags.
> 2.  Then, during the scanned document detection phase (`DetectScannedFile` stage):
>     *   If the document is identified as heavily scanned (e.g., >80% scanned pages) AND `auto_enable_ocr_workaround` is `true` (i.e., `translation_config.auto_enable_ocr_workaround` is true), the system will then attempt to set both `ocr_workaround` to `true` and `skip_scanned_detection` to `true`.
>
> This means that `--auto-enable-ocr-workaround` effectively gives the system control to enable OCR processing for scanned documents, potentially overriding manual settings for `--ocr-workaround` and `--skip-scanned-detection` based on its detection results. If the document is *not* detected as heavily scanned, then the initial `false` values for `ocr_workaround` and `skip_scanned_detection` (forced by `--auto-enable-ocr-workaround` at the `TranslationConfig` initialization stage) will remain in effect unless changed by other logic.


================================================
FILE: babeldoc/__init__.py
================================================
# Version string of the babeldoc package.
__version__ = "0.5.23"


================================================
FILE: babeldoc/assets/assets.py
================================================
import asyncio
import hashlib
import json
import logging
import threading
import zipfile
from pathlib import Path

import httpx
from babeldoc.assets import embedding_assets_metadata
from babeldoc.assets.embedding_assets_metadata import CMAP_METADATA
from babeldoc.assets.embedding_assets_metadata import CMAP_URL_BY_UPSTREAM
from babeldoc.assets.embedding_assets_metadata import DOC_LAYOUT_ONNX_MODEL_URL
from babeldoc.assets.embedding_assets_metadata import (
    DOCLAYOUT_YOLO_DOCSTRUCTBENCH_IMGSZ1024ONNX_SHA3_256,
)
from babeldoc.assets.embedding_assets_metadata import EMBEDDING_FONT_METADATA
from babeldoc.assets.embedding_assets_metadata import FONT_METADATA_URL
from babeldoc.assets.embedding_assets_metadata import FONT_URL_BY_UPSTREAM
from babeldoc.assets.embedding_assets_metadata import (
    TABLE_DETECTION_RAPIDOCR_MODEL_SHA3_256,
)
from babeldoc.assets.embedding_assets_metadata import TABLE_DETECTION_RAPIDOCR_MODEL_URL
from babeldoc.assets.embedding_assets_metadata import TIKTOKEN_CACHES
from babeldoc.const import get_cache_file_path
from tenacity import retry
from tenacity import stop_after_attempt
from tenacity import wait_exponential

logger = logging.getLogger(__name__)


# Guards the first fastest-upstream probe in get_fastest_upstream_for_font()
# so that concurrent callers wait for one probe instead of each starting one.
# NOTE(review): this lock is created at import time, while run_coro() starts a
# fresh event loop (in a new thread) per call — confirm that sharing a single
# asyncio.Lock across those loops is safe.
_FASTEST_FONT_UPSTREAM_LOCK = asyncio.Lock()
# Name of the upstream that won the probe; None until a probe has succeeded.
_FASTEST_FONT_UPSTREAM: str | None = None
# Font metadata JSON returned by that upstream, cached together with its name.
_FASTEST_FONT_METADATA: dict | None = None


class ResultContainer:
    """Mutable holder for a single value, exposed as ``result``."""

    def __init__(self) -> None:
        # Nothing stored yet; stays None until set_result() is called.
        self.result = None

    def set_result(self, result) -> None:
        """Store ``result``, replacing whatever was held before."""
        self.result = result


def run_in_another_thread(coro):
    """Run ``coro`` to completion with ``asyncio.run`` in a helper thread.

    The call blocks until the helper thread has finished. Presumably a
    separate thread is used so this also works when the calling thread already
    has a running event loop (``asyncio.run`` refuses to nest) — TODO confirm.

    Args:
        coro: The coroutine object to execute.

    Returns:
        Whatever the coroutine returns.

    Raises:
        BaseException: Anything raised while running the coroutine, including
            ``SystemExit`` triggered by ``exit()``, is re-raised in the calling
            thread. Previously such failures were lost inside the helper
            thread and the caller silently received ``None``.
    """
    outcome: dict = {}

    def _wrapper():
        try:
            outcome["result"] = asyncio.run(coro)
        except BaseException as exc:  # forwarded to the caller below
            outcome["error"] = exc

    thread = threading.Thread(target=_wrapper)
    thread.start()
    thread.join()
    if "error" in outcome:
        raise outcome["error"]
    return outcome.get("result")


def run_coro(coro):
    """Execute ``coro`` synchronously and return its result.

    Delegates to ``run_in_another_thread``.
    """
    outcome = run_in_another_thread(coro)
    return outcome


def _retry_if_not_cancelled_and_failed(retry_state):
    """Decide whether a failed attempt should be retried.

    Returns True only when the attempt raised one of the retryable errors
    (``httpx.HTTPError``, ``ConnectionError``, ``ValueError``,
    ``TimeoutError``). A successful attempt, a cancellation, or any other
    exception is not retried.
    """
    if not retry_state.outcome.failed:
        # Don't retry on success
        return False

    error = retry_state.outcome.exception()

    # Don't retry on CancelledError
    if isinstance(error, asyncio.CancelledError):
        logger.debug("Operation was cancelled, not retrying")
        return False

    # Retry on network related errors
    retryable = (httpx.HTTPError, ConnectionError, ValueError, TimeoutError)
    if isinstance(error, retryable):
        logger.warning(f"Network error occurred: {error}, will retry")
        return True

    return False


def verify_file(path: Path, sha3_256: str):
    """Check a file against an expected SHA3-256 hex digest.

    Args:
        path: File to check.
        sha3_256: Expected hex digest.

    Returns:
        False when ``path`` does not exist; otherwise whether the file's
        SHA3-256 hex digest equals ``sha3_256``.
    """
    if not path.exists():
        return False

    digest = hashlib.sha3_256()
    with path.open("rb") as stream:
        # Hash in 1 MiB pieces so the whole file is never held in memory.
        while block := stream.read(1024 * 1024):
            digest.update(block)
    return digest.hexdigest() == sha3_256


@retry(
    retry=_retry_if_not_cancelled_and_failed,
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=15),
    before_sleep=lambda retry_state: logger.warning(
        f"Download file failed, retrying in {retry_state.next_action.sleep} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
async def download_file(
    client: httpx.AsyncClient | None = None,
    url: str = None,
    path: Path = None,
    sha3_256: str = None,
):
    """Download ``url`` into ``path`` and verify the result.

    Args:
        client: HTTP client to reuse; a temporary one is created when None.
        url: Address to fetch (redirects are followed).
        path: Destination file, overwritten with the response body.
        sha3_256: Expected SHA3-256 hex digest of the downloaded file.

    Raises:
        httpx.HTTPStatusError: If the response status signals an error.
        ValueError: If the written file does not match ``sha3_256``; the file
            is removed before raising.
    """
    if client is not None:
        response = await client.get(url, follow_redirects=True)
    else:
        # No shared client supplied: use a short-lived one for this request.
        async with httpx.AsyncClient() as temporary_client:
            response = await temporary_client.get(url, follow_redirects=True)

    response.raise_for_status()
    path.write_bytes(response.content)

    if verify_file(path, sha3_256):
        return
    path.unlink(missing_ok=True)
    raise ValueError(f"File {path} is corrupted")


@retry(
    retry=_retry_if_not_cancelled_and_failed,
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=15),
    before_sleep=lambda retry_state: logger.warning(
        f"Get font metadata failed, retrying in {retry_state.next_action.sleep} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
async def get_font_metadata(
    client: httpx.AsyncClient | None = None, upstream: str = None
):
    """Fetch the font metadata JSON published by ``upstream``.

    Args:
        client: HTTP client to reuse; a temporary one is created when None.
        upstream: Key of ``FONT_METADATA_URL``; any other value logs a
            critical message and exits with status 1.

    Returns:
        A ``(upstream, metadata)`` tuple, where ``metadata`` is the decoded
        JSON body of the response.
    """
    if upstream not in FONT_METADATA_URL:
        logger.critical(f"Invalid upstream: {upstream}")
        exit(1)

    metadata_url = FONT_METADATA_URL[upstream]
    if client is not None:
        response = await client.get(metadata_url, follow_redirects=True)
    else:
        # No shared client supplied: use a short-lived one for this request.
        async with httpx.AsyncClient() as temporary_client:
            response = await temporary_client.get(metadata_url, follow_redirects=True)

    response.raise_for_status()
    logger.debug(f"Get font metadata from {upstream} success")
    return upstream, response.json()


async def _get_fastest_upstream_for_font_internal(
    client: httpx.AsyncClient | None = None, exclude_upstream: list[str] | None = None
) -> tuple[str | None, dict | None]:
    """Probe every allowed upstream at once and return the first success.

    The module-level cache is neither read nor written here.

    Args:
        client: HTTP client handed to each metadata request.
        exclude_upstream: Upstream names that must not be probed.

    Returns:
        The ``(upstream, metadata)`` pair of the first request that succeeds,
        or ``(None, None)`` when none of them does.
    """
    skipped = exclude_upstream or ()
    probes: list[asyncio.Task[tuple[str, dict]]] = [
        asyncio.create_task(get_font_metadata(client, candidate))
        for candidate in FONT_METADATA_URL
        if candidate not in skipped
    ]

    for next_done in asyncio.as_completed(probes):
        try:
            winner = await next_done
        except Exception as e:
            logger.exception(f"Error getting font metadata: {e}")
            continue

        # The first success wins; the probes still in flight are cancelled.
        for probe in probes:
            if not probe.done():
                probe.cancel()
        return winner

    logger.error("All upstreams failed")
    return None, None


async def get_fastest_upstream_for_font(
    client: httpx.AsyncClient | None = None, exclude_upstream: list[str] | None = None
) -> tuple[str | None, dict | None]:
    """Return the fastest font upstream and its metadata, caching the answer.

    When ``exclude_upstream`` is given, the cache is bypassed entirely: the
    upstreams are probed afresh and the result is not stored. Otherwise the
    first successful probe is remembered in module globals and reused.

    Returns:
        ``(upstream, metadata)``, or ``(None, None)`` if every probe failed.
    """
    global _FASTEST_FONT_UPSTREAM, _FASTEST_FONT_METADATA

    if exclude_upstream is not None:
        # Do not use or update cache when exclude_upstream is provided.
        return await _get_fastest_upstream_for_font_internal(client, exclude_upstream)

    if _FASTEST_FONT_UPSTREAM is not None:
        return _FASTEST_FONT_UPSTREAM, _FASTEST_FONT_METADATA

    async with _FASTEST_FONT_UPSTREAM_LOCK:
        # Another caller may have filled the cache while we waited for the lock.
        if _FASTEST_FONT_UPSTREAM is not None:
            return _FASTEST_FONT_UPSTREAM, _FASTEST_FONT_METADATA

        winner, winner_metadata = await _get_fastest_upstream_for_font_internal(
            client
        )
        if winner is not None:
            _FASTEST_FONT_UPSTREAM, _FASTEST_FONT_METADATA = winner, winner_metadata
            logger.info(f"Fastest font upstream determined: {winner}")
        return winner, winner_metadata


async def get_fastest_upstream_for_model(client: httpx.AsyncClient | None = None):
    """Probe for the fastest upstream while leaving ``"github"`` out.

    Returns the same ``(upstream, metadata)`` pair as
    ``get_fastest_upstream_for_font``.
    """
    excluded = ["github"]
    return await get_fastest_upstream_for_font(client, exclude_upstream=excluded)


async def get_fastest_upstream(client: httpx.AsyncClient | None = None):
    """Resolve the upstreams to use for fonts and for models.

    Exits with status 1 when no upstream can be determined.

    Returns:
        ``(online_font_metadata, font_upstream, model_upstream)``.
    """
    font_upstream, online_font_metadata = await get_fastest_upstream_for_font(client)
    if font_upstream is None:
        logger.error("Failed to get fastest upstream")
        exit(1)

    # By default models come from the same upstream as the fonts.
    model_upstream = font_upstream
    if font_upstream == "github":
        # since github is only store font, we need to get the fastest upstream for model
        model_upstream, _ = await get_fastest_upstream_for_model(client)
        if model_upstream is None:
            logger.error("Failed to get fastest upstream")
            exit(1)

    return online_font_metadata, font_upstream, model_upstream


async def get_doclayout_onnx_model_path_async(client: httpx.AsyncClient | None = None):
    """Return the cached doclayout ONNX model path, downloading it if needed.

    The cached file is reused when it matches the expected SHA3-256 digest;
    otherwise it is fetched from the fastest model upstream. Exits with
    status 1 when no upstream is reachable.
    """
    expected_hash = DOCLAYOUT_YOLO_DOCSTRUCTBENCH_IMGSZ1024ONNX_SHA3_256
    onnx_path = get_cache_file_path(
        "doclayout_yolo_docstructbench_imgsz1024.onnx", "models"
    )
    if not verify_file(onnx_path, expected_hash):
        logger.info("doclayout onnx model not found or corrupted, downloading...")
        upstream, _ = await get_fastest_upstream_for_model(client)
        if upstream is None:
            logger.error("Failed to get fastest upstream")
            exit(1)

        model_url = DOC_LAYOUT_ONNX_MODEL_URL[upstream]
        await download_file(client, model_url, onnx_path, expected_hash)
        logger.info(f"Download doclayout onnx model from {upstream} success")
    return onnx_path


async def get_table_detection_rapidocr_model_path_async(
    client: httpx.AsyncClient | None = None,
):
    """Return the cached RapidOCR detection model path, downloading if needed.

    The cached file is reused when it matches the expected SHA3-256 digest;
    otherwise it is fetched from the fastest model upstream. Exits with
    status 1 when no upstream is reachable.
    """
    expected_hash = TABLE_DETECTION_RAPIDOCR_MODEL_SHA3_256
    onnx_path = get_cache_file_path("ch_PP-OCRv4_det_infer.onnx", "models")
    if not verify_file(onnx_path, expected_hash):
        logger.info(
            "table detection rapidocr model not found or corrupted, downloading..."
        )
        upstream, _ = await get_fastest_upstream_for_model(client)
        if upstream is None:
            logger.error("Failed to get fastest upstream")
            exit(1)

        model_url = TABLE_DETECTION_RAPIDOCR_MODEL_URL[upstream]
        await download_file(client, model_url, onnx_path, expected_hash)
        logger.info(f"Download table detection rapidocr model from {upstream} success")
    return onnx_path


def get_doclayout_onnx_model_path():
    """Blocking wrapper around ``get_doclayout_onnx_model_path_async``."""
    pending = get_doclayout_onnx_model_path_async()
    return run_coro(pending)


def get_table_detection_rapidocr_model_path():
    """Blocking wrapper around ``get_table_detection_rapidocr_model_path_async``."""
    pending = get_table_detection_rapidocr_model_path_async()
    return run_coro(pending)


def get_font_url_by_name_and_upstream(font_file_name: str, upstream: str):
    """Build the download URL of ``font_file_name`` on ``upstream``.

    Logs a critical message and exits with status 1 when ``upstream`` is not
    a key of ``FONT_URL_BY_UPSTREAM``.
    """
    build_url = FONT_URL_BY_UPSTREAM.get(upstream)
    if build_url is None:
        logger.critical(f"Invalid upstream: {upstream}")
        exit(1)

    return build_url(font_file_name)


async def get_font_and_metadata_async(
    font_file_name: str,
    client: httpx.AsyncClient | None = None,
    fastest_upstream: str | None = None,
    font_metadata: dict | None = None,
):
    """Return the cached path and metadata of a font, downloading if needed.

    The embedded metadata is tried first. If the cached file does not match
    it, the online metadata is consulted: taken from the caller when
    ``fastest_upstream`` is given, otherwise fetched from the fastest
    upstream. The font is downloaded unless the cached file already matches
    the online digest.

    Args:
        font_file_name: File name of the font, used as the metadata key.
        client: HTTP client to reuse for network requests.
        fastest_upstream: Upstream to download from; resolved here when None.
        font_metadata: Online metadata belonging to ``fastest_upstream``.
            Must be provided whenever ``fastest_upstream`` is provided.

    Returns:
        ``(cache_file_path, metadata_entry)`` for the font.

    Exits with status 1 when no upstream is reachable, or when the online
    metadata has no usable entry for ``font_file_name``. The latter used to
    surface as a bare ``KeyError`` when the caller supplied the metadata.
    """
    cache_file_path = get_cache_file_path(font_file_name, "fonts")
    if font_file_name in EMBEDDING_FONT_METADATA and verify_file(
        cache_file_path, EMBEDDING_FONT_METADATA[font_file_name]["sha3_256"]
    ):
        return cache_file_path, EMBEDDING_FONT_METADATA[font_file_name]

    logger.info(f"Font {cache_file_path} not found or corrupted, downloading...")
    if fastest_upstream is None:
        fastest_upstream, font_metadata = await get_fastest_upstream_for_font(client)
        if fastest_upstream is None:
            logger.critical("Failed to get fastest upstream")
            exit(1)

    assert font_metadata is not None

    # Validate the entry once, for caller-supplied and fetched metadata alike.
    font_entry = font_metadata.get(font_file_name)
    if font_entry is None or "sha3_256" not in font_entry:
        logger.critical(f"Font {font_file_name} not found in {font_metadata}")
        exit(1)
    expected_hash = font_entry["sha3_256"]

    # The online digest may differ from the embedded one; skip the download
    # when the cached file already matches it.
    if verify_file(cache_file_path, expected_hash):
        return cache_file_path, font_entry

    logger.info(f"download {font_file_name} from {fastest_upstream}")
    url = get_font_url_by_name_and_upstream(font_file_name, fastest_upstream)
    await download_file(client, url, cache_file_path, expected_hash)
    return cache_file_path, font_entry


def get_font_and_metadata(font_file_name: str):
    """Blocking wrapper around ``get_font_and_metadata_async``."""
    pending = get_font_and_metadata_async(font_file_name)
    return run_coro(pending)


async def get_cmap_file_path_async(
    name: str, client: httpx.AsyncClient | None = None
) -> Path:
    """Return the cache path of a cmap file, fetching it when necessary.

    ``name`` may be given with or without the ``.json`` suffix. Exits with
    status 1 when the cmap is unknown or still fails verification after the
    download.
    """
    file_name = name if name.endswith(".json") else f"{name}.json"

    if file_name not in CMAP_METADATA:
        logger.critical(f"CMap {file_name} not found in CMAP_METADATA")
        exit(1)

    expected_hash = CMAP_METADATA[file_name]["sha3_256"]
    cache_file_path = get_cache_file_path(file_name, "cmap")
    if verify_file(cache_file_path, expected_hash):
        return cache_file_path

    logger.info(f"CMap {cache_file_path} not found or corrupted, downloading...")
    await download_cmap_file_async(file_name, client)
    if not verify_file(cache_file_path, expected_hash):
        logger.critical(f"Failed to verify downloaded cmap file: {cache_file_path}")
        exit(1)
    return cache_file_path


async def download_cmap_file_async(
    file_name: str, client: httpx.AsyncClient | None = None
) -> Path:
    """Fetch one cmap file from the fastest upstream into the cache.

    Exits with status 1 when ``file_name`` is not listed in
    ``CMAP_METADATA``, when no upstream is reachable, or when the chosen
    upstream has no cmap URL builder.

    Returns:
        The cache path the file was written to.
    """
    if file_name not in CMAP_METADATA:
        logger.critical(f"CMap {file_name} not found in CMAP_METADATA")
        exit(1)

    fastest_upstream, _ = await get_fastest_upstream_for_font(client)
    if fastest_upstream is None:
        logger.critical("Failed to get fastest upstream for cmap")
        exit(1)

    if fastest_upstream not in CMAP_URL_BY_UPSTREAM:
        logger.critical(f"Invalid fastest upstream for cmap: {fastest_upstream}")
        exit(1)

    build_url = CMAP_URL_BY_UPSTREAM[fastest_upstream]
    target = get_cache_file_path(file_name, "cmap")
    await download_file(
        client, build_url(file_name), target, CMAP_METADATA[file_name]["sha3_256"]
    )
    return target


async def get_cmap_data_async(
    name: str, client: httpx.AsyncClient | None = None
) -> dict:
    """Load cmap JSON data from the cached file, downloading it if necessary.

    Args:
        name: Cmap name, with or without the ``.json`` suffix.
        client: HTTP client to reuse when a download is needed.

    Returns:
        The decoded JSON content of the cmap file.
    """
    path = await get_cmap_file_path_async(name, client)
    # Pass the raw bytes so json detects the UTF-8/16/32 encoding itself.
    # Path.read_text() without an explicit encoding decodes with the locale's
    # default, which is not guaranteed to be UTF-8 on every platform.
    return json.loads(path.read_bytes())


def get_cmap_file_path(name: str):
    """Blocking wrapper around ``get_cmap_file_path_async``."""
    pending = get_cmap_file_path_async(name)
    return run_coro(pending)


def get_cmap_data(name: str):
    """Blocking wrapper around ``get_cmap_data_async``."""
    pending = get_cmap_data_async(name)
    return run_coro(pending)


def get_font_family(lang_code: str):
    """Return what ``embedding_assets_metadata.get_font_family`` gives for ``lang_code``."""
    return embedding_assets_metadata.get_font_family(lang_code)


async def download_all_fonts_async(client: httpx.AsyncClient | None = None):
    """Make sure every font in ``EMBEDDING_FONT_METADATA`` is in the cache.

    Returns immediately when all cached fonts verify. Otherwise the fastest
    upstream is resolved once and every font is requested concurrently
    through ``get_font_and_metadata_async``. Exits with status 1 when no
    upstream is reachable.
    """
    everything_cached = all(
        verify_file(get_cache_file_path(font_file_name, "fonts"), info["sha3_256"])
        for font_file_name, info in EMBEDDING_FONT_METADATA.items()
    )
    if everything_cached:
        logger.debug("All fonts are already downloaded")
        return

    fastest_upstream, font_metadata = await get_fastest_upstream_for_font(client)
    if fastest_upstream is None:
        logger.error("Failed to get fastest upstream")
        exit(1)
    logger.info(f"Downloading fonts from {fastest_upstream}")

    pending = []
    for font_file_name in EMBEDDING_FONT_METADATA:
        request = get_font_and_metadata_async(
            font_file_name, client, fastest_upstream, font_metadata
        )
        pending.append(asyncio.create_task(request))
    await asyncio.gather(*pending)


async def download_all_cmaps_async(client: httpx.AsyncClient | None = None):
    """Make sure every cmap file in ``CMAP_METADATA`` is in the cache.

    Returns immediately when all cached cmaps verify. Otherwise every cmap is
    requested concurrently through ``get_cmap_file_path_async``. Exits with
    status 1 when no upstream is reachable.
    """
    something_missing = any(
        not verify_file(get_cache_file_path(cmap_file_name, "cmap"), meta["sha3_256"])
        for cmap_file_name, meta in CMAP_METADATA.items()
    )
    if not something_missing:
        logger.debug("All cmaps are already downloaded")
        return

    fastest_upstream, _ = await get_fastest_upstream_for_font(client)
    if fastest_upstream is None:
        logger.error("Failed to get fastest upstream for cmap")
        exit(1)
    logger.info(f"Downloading cmaps from {fastest_upstream}")

    pending = []
    for cmap_file_name in CMAP_METADATA:
        request = get_cmap_file_path_async(cmap_file_name, client)
        pending.append(asyncio.create_task(request))
    await asyncio.gather(*pending)


async def async_warmup():
    """Fetch every asset concurrently: both ONNX models, all fonts, all cmaps.

    A single HTTP client is shared by the four download jobs.
    """
    logger.info("Downloading all assets...")
    from tiktoken import encoding_for_model

    # The return value is deliberately discarded; presumably the call is made
    # only to pre-populate tiktoken's local cache — TODO confirm.
    encoding_for_model("gpt-4o")

    async with httpx.AsyncClient() as client:
        jobs = [
            asyncio.create_task(get_doclayout_onnx_model_path_async(client)),
            asyncio.create_task(get_table_detection_rapidocr_model_path_async(client)),
            asyncio.create_task(download_all_fonts_async(client)),
            asyncio.create_task(download_all_cmaps_async(client)),
        ]
        await asyncio.gather(*jobs)


def warmup():
    """Blocking wrapper around ``async_warmup``; returns nothing."""
    pending = async_warmup()
    run_coro(pending)


def generate_all_assets_file_list():
    """List every known asset file together with its SHA3-256 digest.

    Returns:
        A dict with the keys ``"fonts"``, ``"models"``, ``"tiktoken"`` and
        ``"cmap"`` (in that order), each mapping to a list of
        ``{"name": ..., "sha3_256": ...}`` entries.
    """
    fonts = [
        {"name": font_file_name, "sha3_256": info["sha3_256"]}
        for font_file_name, info in EMBEDDING_FONT_METADATA.items()
    ]
    models = [
        {
            "name": "doclayout_yolo_docstructbench_imgsz1024.onnx",
            "sha3_256": DOCLAYOUT_YOLO_DOCSTRUCTBENCH_IMGSZ1024ONNX_SHA3_256,
        },
        {
            "name": "ch_PP-OCRv4_det_infer.onnx",
            "sha3_256": TABLE_DETECTION_RAPIDOCR_MODEL_SHA3_256,
        },
    ]
    tiktoken_files = [
        {"name": tiktoken_file, "sha3_256": digest}
        for tiktoken_file, digest in TIKTOKEN_CACHES.items()
    ]
    cmaps = [
        {"name": cmap_file_name, "sha3_256": meta["sha3_256"]}
        for cmap_file_name, meta in CMAP_METADATA.items()
    ]

    # Key order is kept as fonts, models, tiktoken, cmap; callers iterate it.
    result: dict[str, list[dict[str, str]]] = {
        "fonts": fonts,
        "models": models,
        "tiktoken": tiktoken_files,
        "cmap": cmaps,
    }
    return result


async def generate_offline_assets_package_async(output_directory: Path | None = None):
    """Download all assets and bundle them into one offline zip package.

    Args:
        output_directory: Directory to place the package in (created when
            missing). When None, the package goes to the cache's ``assets``
            location.

    The archive is named ``offline_assets_<tag>.zip`` and stores each file as
    ``<file_type>/<file_name>``. Exits with status 1 if a cached file fails
    verification.
    """
    await async_warmup()
    logger.info("Generating offline assets package...")
    file_list = generate_all_assets_file_list()
    package_name = f"offline_assets_{get_offline_assets_tag(file_list)}.zip"

    if output_directory is not None:
        output_directory.mkdir(parents=True, exist_ok=True)
        output_path = output_directory / package_name
    else:
        output_path = get_cache_file_path(package_name, "assets")

    with zipfile.ZipFile(
        output_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=9
    ) as archive:
        for file_type, file_descs in file_list.items():
            for file_desc in file_descs:
                file_name = file_desc["name"]
                file_path = get_cache_file_path(file_name, file_type)
                if not verify_file(file_path, file_desc["sha3_256"]):
                    logger.error(f"File {file_path} is corrupted")
                    exit(1)

                archive.writestr(f"{file_type}/{file_name}", file_path.read_bytes())
    logger.info(f"Offline assets package generated at {output_path}")


async def restore_offline_assets_package_async(input_path: Path | None = None):
    """Restore missing or corrupted cached assets from an offline package.

    Args:
        input_path: The zip package, or a directory containing the package
            named for the current assets tag. When None, the package is
            looked up in the cache's ``assets`` location.

    Every cached file that fails verification is overwritten from the archive
    and verified again; files that already verify are left untouched.

    Exits with status 1 when a supplied package does not exist, when its file
    name does not carry the expected tag (including names that do not follow
    the ``offline_assets_<tag>.zip`` pattern at all, which previously crashed
    with ``AttributeError``), or when a restored file fails verification.
    """
    file_list = generate_all_assets_file_list()
    offline_assets_tag = get_offline_assets_tag(file_list)
    if input_path is None:
        input_path = get_cache_file_path(
            f"offline_assets_{offline_assets_tag}.zip", "assets"
        )
    else:
        if input_path.exists() and input_path.is_dir():
            input_path = input_path / f"offline_assets_{offline_assets_tag}.zip"
        if not input_path.exists():
            logger.critical(f"Offline assets package not found: {input_path}")
            exit(1)

        import re

        tag_match = re.match(r"offline_assets_(.*)\.zip", input_path.name)
        if tag_match is None:
            # re.match returns None for a non-matching name; report it instead
            # of failing on ``None.group``.
            logger.critical(
                f"Offline assets package name is invalid: {input_path.name}"
            )
            exit(1)
        offline_assets_tag_from_input_path = tag_match.group(1)
        if offline_assets_tag != offline_assets_tag_from_input_path:
            logger.critical(
                f"Offline assets tag mismatch: {offline_assets_tag} != {offline_assets_tag_from_input_path}"
            )
            exit(1)
    nothing_changed = True
    with zipfile.ZipFile(input_path, "r") as zipf:
        for file_type, file_descs in file_list.items():
            for file_desc in file_descs:
                file_name = file_desc["name"]
                file_path = get_cache_file_path(file_name, file_type)

                # Files that already verify are kept as they are.
                if verify_file(file_path, file_desc["sha3_256"]):
                    continue
                nothing_changed = False
                with zipf.open(f"{file_type}/{file_name}", "r") as f:
                    with file_path.open("wb") as f2:
                        f2.write(f.read())
                if not verify_file(file_path, file_desc["sha3_256"]):
                    logger.critical(
                        "Offline assets package is corrupted, please delete it and try again"
                    )
                    exit(1)
    if not nothing_changed:
        logger.info(f"Offline assets package restored from {input_path}")


def get_offline_assets_tag(file_list: dict | None = None):
    """Compute the tag that identifies a particular set of asset files.

    Args:
        file_list: Asset listing to hash; built with
            ``generate_all_assets_file_list`` when None.

    Returns:
        The SHA3-256 hex digest of the listing serialized by orjson with
        sorted keys, two-space indentation and a trailing newline.
    """
    if file_list is None:
        file_list = generate_all_assets_file_list()
    import orjson

    dump_options = (
        orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS
    )
    # noinspection PyTypeChecker
    serialized = orjson.dumps(file_list, option=dump_options)
    return hashlib.sha3_256(serialized).hexdigest()


def generate_offline_assets_package(output_directory: Path | None = None):
    """Blocking wrapper around ``generate_offline_assets_package_async``."""
    pending = generate_offline_assets_package_async(output_directory)
    return run_coro(pending)


def restore_offline_assets_package(input_path: Path | None = None):
    """Blocking wrapper around ``restore_offline_assets_package_async``."""
    pending = restore_offline_assets_package_async(input_path)
    return run_coro(pending)


if __name__ == "__main__":
    # Manual entry point: as written it only configures logging. The asset
    # operations below are commented out and have to be enabled by hand.
    from rich.logging import RichHandler

    logging.basicConfig(level=logging.DEBUG, handlers=[RichHandler()])
    # Keep the HTTP client libraries at WARNING while everything else is DEBUG.
    logging.getLogger("httpx").setLevel(logging.WARNING)
    logging.getLogger("httpcore").setLevel(logging.WARNING)
    # warmup()
    # generate_offline_assets_package()
    # restore_offline_assets_package(Path(
    #     '/Users/aw/.cache/babeldoc/assets/offline_assets_33971e4940e90ba0c35baacda44bbe83b214f4703a7bdb8b837de97d0383508c.zip'))
    # warmup()


================================================
FILE: babeldoc/assets/embedding_assets_metadata.py
================================================
import itertools

# Expected SHA3-256 hex digest of the doclayout ONNX model file.
DOCLAYOUT_YOLO_DOCSTRUCTBENCH_IMGSZ1024ONNX_SHA3_256 = (
    "60be061226930524958b5465c8c04af3d7c03bcb0beb66454f5da9f792e3cf2a"
)

# Expected SHA3-256 hex digest of the RapidOCR table-detection model file.
TABLE_DETECTION_RAPIDOCR_MODEL_SHA3_256 = (
    "062f4619afe91b33147c033acadecbb53f2a7b99ac703d157b96d5b10948da5e"
)

# tiktoken cache file name -> expected SHA3-256 hex digest of that file.
TIKTOKEN_CACHES = {
    "fb374d419588a4632f3f557e76b4b70aebbca790": "cb04bcda5782cfbbe77f2f991d92c0ea785d9496ef1137c91dfc3c8c324528d6"
}

# Upstream name -> URL of that upstream's font_metadata.json.
# "hf-mirror" is disabled here although the per-file URL tables below still
# carry an entry for it.
FONT_METADATA_URL = {
    "github": "https://raw.githubusercontent.com/funstory-ai/BabelDOC-Assets/refs/heads/main/font_metadata.json",
    "huggingface": "https://huggingface.co/datasets/awwaawwa/BabelDOC-Assets/resolve/main/font_metadata.json?download=true",
    # "hf-mirror": "https://hf-mirror.com/datasets/awwaawwa/BabelDOC-Assets/resolve/main/font_metadata.json?download=true",
    "modelscope": "https://www.modelscope.cn/datasets/awwaawwa/BabelDOCAssets/resolve/master/font_metadata.json",
}

# Upstream name -> function building the download URL of a font file name.
FONT_URL_BY_UPSTREAM = {
    "github": lambda name: f"https://raw.githubusercontent.com/funstory-ai/BabelDOC-Assets/refs/heads/main/fonts/{name}",
    "huggingface": lambda name: f"https://huggingface.co/datasets/awwaawwa/BabelDOC-Assets/resolve/main/fonts/{name}?download=true",
    "hf-mirror": lambda name: f"https://hf-mirror.com/datasets/awwaawwa/BabelDOC-Assets/resolve/main/fonts/{name}?download=true",
    "modelscope": lambda name: f"https://www.modelscope.cn/datasets/awwaawwa/BabelDOCAssets/resolve/master/fonts/{name}",
}

# Upstream name -> function building the download URL of a cmap file name.
CMAP_URL_BY_UPSTREAM = {
    "github": lambda name: f"https://raw.githubusercontent.com/funstory-ai/BabelDOC-Assets/refs/heads/main/cmap/{name}",
    "huggingface": lambda name: f"https://huggingface.co/datasets/awwaawwa/BabelDOC-Assets/resolve/main/cmap/{name}?download=true",
    "hf-mirror": lambda name: f"https://hf-mirror.com/datasets/awwaawwa/BabelDOC-Assets/resolve/main/cmap/{name}?download=true",
    "modelscope": lambda name: f"https://www.modelscope.cn/datasets/awwaawwa/BabelDOCAssets/resolve/master/cmap/{name}",
}

# Upstream name -> URL of the doclayout ONNX model. There is no "github" entry.
DOC_LAYOUT_ONNX_MODEL_URL = {
    "huggingface": "https://huggingface.co/wybxc/DocLayout-YOLO-DocStructBench-onnx/resolve/main/doclayout_yolo_docstructbench_imgsz1024.onnx?download=true",
    "hf-mirror": "https://hf-mirror.com/wybxc/DocLayout-YOLO-DocStructBench-onnx/resolve/main/doclayout_yolo_docstructbench_imgsz1024.onnx?download=true",
    "modelscope": "https://www.modelscope.cn/models/AI-ModelScope/DocLayout-YOLO-DocStructBench-onnx/resolve/master/doclayout_yolo_docstructbench_imgsz1024.onnx",
}

# Upstream name -> URL of the RapidOCR text-detection model. There is no
# "github" entry.
TABLE_DETECTION_RAPIDOCR_MODEL_URL = {
    "huggingface": "https://huggingface.co/spaces/RapidAI/RapidOCR/resolve/main/models/text_det/ch_PP-OCRv4_det_infer.onnx",
    "hf-mirror": "https://hf-mirror.com/spaces/RapidAI/RapidOCR/resolve/main/models/text_det/ch_PP-OCRv4_det_infer.onnx",
    "modelscope": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/master/onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx",
}

# from https://github.com/funstory-ai/BabelDOC-Assets/blob/main/font_metadata.json
EMBEDDING_FONT_METADATA = {
    "GoNotoKurrent-Bold.ttf": {
        "ascent": 1069,
        "bold": 1,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "GoNotoKurrent-Bold.ttf",
        "font_name": "Go Noto Kurrent-Bold Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "000b37f592477945b27b7702dcad39f73e23e140e66ddff9847eb34f32389566",
        "size": 15303772,
    },
    "GoNotoKurrent-Regular.ttf": {
        "ascent": 1069,
        "bold": 0,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "GoNotoKurrent-Regular.ttf",
        "font_name": "Go Noto Kurrent-Regular Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "4324a60d507c691e6efc97420647f4d2c2d86d9de35009d1c769861b76074ae6",
        "size": 15515760,
    },
    "KleeOne-Regular.ttf": {
        "ascent": 1160,
        "bold": 0,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "KleeOne-Regular.ttf",
        "font_name": "Klee One Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "8585c29f89b322d937f83739f61ede5d84297873e1465cad9a120a208ac55ce0",
        "size": 8724704,
    },
    "LXGWWenKai-Regular.1.520.ttf": {
        "ascent": 928,
        "bold": 0,
        "descent": -256,
        "encoding_length": 2,
        "file_name": "LXGWWenKai-Regular.1.520.ttf",
        "font_name": "LXGW WenKai Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "708b4fd6cfae62a26f71016724d38e862210732f101b9225225a1d5e8205f94d",
        "size": 24744500,
    },
    "LXGWWenKaiGB-Regular.1.520.ttf": {
        "ascent": 928,
        "bold": 0,
        "descent": -256,
        "encoding_length": 2,
        "file_name": "LXGWWenKaiGB-Regular.1.520.ttf",
        "font_name": "LXGW WenKai GB Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "0671656b00992e317f9e20610e7145b024e664ada9f272d4f8e497196af98005",
        "size": 24903712,
    },
    "LXGWWenKaiGB-Regular.ttf": {
        "ascent": 928,
        "bold": 0,
        "descent": -256,
        "encoding_length": 2,
        "file_name": "LXGWWenKaiGB-Regular.ttf",
        "font_name": "LXGW WenKai GB Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "b563a5e8d9db4cd15602a3a3700b01925e80a21f99fb88e1b763b1fb8685f8ee",
        "size": 19558756,
    },
    "LXGWWenKaiMonoTC-Regular.ttf": {
        "ascent": 928,
        "bold": 0,
        "descent": -241,
        "encoding_length": 2,
        "file_name": "LXGWWenKaiMonoTC-Regular.ttf",
        "font_name": "LXGW WenKai Mono TC Regular",
        "italic": 0,
        "monospace": 1,
        "serif": 0,
        "sha3_256": "596b278d11418d374a1cfa3a50cbfb82b31db82d3650cfacae8f94311b27fdc5",
        "size": 13115416,
    },
    "LXGWWenKaiTC-Regular.1.520.ttf": {
        "ascent": 928,
        "bold": 0,
        "descent": -256,
        "encoding_length": 2,
        "file_name": "LXGWWenKaiTC-Regular.1.520.ttf",
        "font_name": "LXGW WenKai TC Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "347d3d4bd88c2afcb194eba186d2c6c0b95d18b2145220feb1c88abf761f1398",
        "size": 15348376,
    },
    "LXGWWenKaiTC-Regular.ttf": {
        "ascent": 928,
        "bold": 0,
        "descent": -256,
        "encoding_length": 2,
        "file_name": "LXGWWenKaiTC-Regular.ttf",
        "font_name": "LXGW WenKai TC Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "66ccd0ffe8e56cd585dabde8d1292c3f551b390d8ed85f81d7a844825f9c2379",
        "size": 13100328,
    },
    "MaruBuri-Regular.ttf": {
        "ascent": 800,
        "bold": 0,
        "descent": -200,
        "encoding_length": 2,
        "file_name": "MaruBuri-Regular.ttf",
        "font_name": "MaruBuri Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "abb672dde7b89e06914ce27c59159b7a2933f26207bfcc47981c67c11c41e6d1",
        "size": 3268988,
    },
    "NotoSans-Bold.ttf": {
        "ascent": 1069,
        "bold": 1,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSans-Bold.ttf",
        "font_name": "Noto Sans Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "ecd38d472c1cad07d8a5dffd2b5a0f72edcd40fff2b4e68d770da8f2ef343a82",
        "size": 630964,
    },
    "NotoSans-BoldItalic.ttf": {
        "ascent": 1069,
        "bold": 1,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSans-BoldItalic.ttf",
        "font_name": "Noto Sans Bold Italic",
        "italic": 1,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "0b6c690a4a6b7d605b2ecbde00c7ac1a23e60feb17fa30d8b972d61ec3ff732b",
        "size": 644340,
    },
    "NotoSans-Italic.ttf": {
        "ascent": 1069,
        "bold": 0,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSans-Italic.ttf",
        "font_name": "Noto Sans Italic",
        "italic": 1,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "830652f61724c017e5a29a96225b484a2ccbd25f69a1b3f47e5f466a2dbed1ad",
        "size": 642344,
    },
    "NotoSans-Regular.ttf": {
        "ascent": 1069,
        "bold": 0,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSans-Regular.ttf",
        "font_name": "Noto Sans Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "7dfe2bbf97dc04c852d1223b220b63430e6ad03b0dbb28ebe6328a20a2d45eb8",
        "size": 629024,
    },
    "NotoSerif-Bold.ttf": {
        "ascent": 1069,
        "bold": 1,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSerif-Bold.ttf",
        "font_name": "Noto Serif Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "28d88d924285eadb9f9ce49f2d2b95473f89a307b226c5f6ebed87a654898312",
        "size": 506864,
    },
    "NotoSerif-BoldItalic.ttf": {
        "ascent": 1069,
        "bold": 1,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSerif-BoldItalic.ttf",
        "font_name": "Noto Serif Bold Italic",
        "italic": 1,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "b69ee56af6351b2fb4fbce623f8e1c1f9fb19170686a9e5db2cf260b8cf24ac7",
        "size": 535724,
    },
    "NotoSerif-Italic.ttf": {
        "ascent": 1069,
        "bold": 0,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSerif-Italic.ttf",
        "font_name": "Noto Serif Italic",
        "italic": 1,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "9b7773c24ab8a29e3c1c03efa4ab652d051e4c209134431953463aa946d62868",
        "size": 535340,
    },
    "NotoSerif-Regular.ttf": {
        "ascent": 1069,
        "bold": 0,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSerif-Regular.ttf",
        "font_name": "Noto Serif Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "c2bbe984e65bafd3bcd38b3cb1e1344f3b7b79d6beffc7a3d883b57f8358559d",
        "size": 504932,
    },
    "SourceHanSansCN-Bold.ttf": {
        "ascent": 1160,
        "bold": 1,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansCN-Bold.ttf",
        "font_name": "Source Han Sans CN Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "82314c11016a04ef03e7afd00abe0ccc8df54b922dee79abf6424f3002a31825",
        "size": 10174460,
    },
    "SourceHanSansCN-Regular.ttf": {
        "ascent": 1160,
        "bold": 0,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansCN-Regular.ttf",
        "font_name": "Source Han Sans CN Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "b45a80cf3650bfc62aa014e58243c6325e182c4b0c5819e41a583c699cce9a8f",
        "size": 10397552,
    },
    "SourceHanSansHK-Bold.ttf": {
        "ascent": 1160,
        "bold": 1,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansHK-Bold.ttf",
        "font_name": "Source Han Sans HK Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "3eecd57457ba9a0fbad6c794f40e7ae704c4f825091aef2ac18902ffdde50608",
        "size": 6856692,
    },
    "SourceHanSansHK-Regular.ttf": {
        "ascent": 1160,
        "bold": 0,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansHK-Regular.ttf",
        "font_name": "Source Han Sans HK Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "5fe4141f9164c03616323400b2936ee4c8265314492e2b822c3a6fbfb63ffe08",
        "size": 6999792,
    },
    "SourceHanSansJP-Bold.ttf": {
        "ascent": 1160,
        "bold": 1,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansJP-Bold.ttf",
        "font_name": "Source Han Sans JP Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "fb05bd84d62e8064117ee357ab6a4481e1cde931e8e984c0553c8c4b09dc3938",
        "size": 5603068,
    },
    "SourceHanSansJP-Regular.ttf": {
        "ascent": 1160,
        "bold": 0,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansJP-Regular.ttf",
        "font_name": "Source Han Sans JP Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "722cfbdcc0fd83fe07a3d1b10e9e64343c924a351d02cfe8dbb6ec4c6bc38230",
        "size": 5723960,
    },
    "SourceHanSansKR-Bold.ttf": {
        "ascent": 1160,
        "bold": 1,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansKR-Bold.ttf",
        "font_name": "Source Han Sans KR Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "02959eb2c1eea0786a736aeb50b6e61f2ab873cd69c659389b7511f80f734838",
        "size": 5858892,
    },
    "SourceHanSansKR-Regular.ttf": {
        "ascent": 1160,
        "bold": 0,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansKR-Regular.ttf",
        "font_name": "Source Han Sans KR Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "aba70109eff718e8f796f0185f8dca38026c1661b43c195883c84577e501adf2",
        "size": 5961704,
    },
    "SourceHanSansTW-Bold.ttf": {
        "ascent": 1160,
        "bold": 1,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansTW-Bold.ttf",
        "font_name": "Source Han Sans TW Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "4a92730e644a1348e87bba7c77e9b462f257f381bd6abbeac5860d8f8306aee6",
        "size": 6883224,
    },
    "SourceHanSansTW-Regular.ttf": {
        "ascent": 1160,
        "bold": 0,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansTW-Regular.ttf",
        "font_name": "Source Han Sans TW Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "6129b68ff4b0814624cac7edca61fbacf8f4d79db6f4c3cfc46b1c48ea2f81ac",
        "size": 7024812,
    },
    "SourceHanSerifCN-Bold.ttf": {
        "ascent": 1150,
        "bold": 1,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifCN-Bold.ttf",
        "font_name": "Source Han Serif CN Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "77816a54957616e140e25a36a41fc061ddb505a1107de4e6a65f561e5dcf8310",
        "size": 14134156,
    },
    "SourceHanSerifCN-Regular.ttf": {
        "ascent": 1150,
        "bold": 0,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifCN-Regular.ttf",
        "font_name": "Source Han Serif CN Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "c8bf74da2c3b7457c9d887465b42fb6f80d3d84f361cfe5b0673a317fb1f85ad",
        "size": 14047768,
    },
    "SourceHanSerifHK-Bold.ttf": {
        "ascent": 1150,
        "bold": 1,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifHK-Bold.ttf",
        "font_name": "Source Han Serif HK Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "0f81296f22846b622a26f7342433d6c5038af708a32fc4b892420c150227f4bb",
        "size": 9532580,
    },
    "SourceHanSerifHK-Regular.ttf": {
        "ascent": 1150,
        "bold": 0,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifHK-Regular.ttf",
        "font_name": "Source Han Serif HK Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "d5232ec3adf4fb8604bb4779091169ec9bd9d574b513e4a75752e614193afebe",
        "size": 9467292,
    },
    "SourceHanSerifJP-Bold.ttf": {
        "ascent": 1150,
        "bold": 1,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifJP-Bold.ttf",
        "font_name": "Source Han Serif JP Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "a4a8c22e8ec7bb6e66b9caaff1e12c7a52b5a4201eec3d074b35957c0126faef",
        "size": 7811832,
    },
    "SourceHanSerifJP-Regular.ttf": {
        "ascent": 1150,
        "bold": 0,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifJP-Regular.ttf",
        "font_name": "Source Han Serif JP Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "3d1f9933c7f3abc8c285e317119a533e6dcfe6027d1f5f066ba71b3eb9161e9c",
        "size": 7748816,
    },
    "SourceHanSerifKR-Bold.ttf": {
        "ascent": 1150,
        "bold": 1,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifKR-Bold.ttf",
        "font_name": "Source Han Serif KR Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "b071b1aecb042aa779e1198767048438dc756d0da8f90660408abb421393f5cb",
        "size": 12387920,
    },
    "SourceHanSerifKR-Regular.ttf": {
        "ascent": 1150,
        "bold": 0,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifKR-Regular.ttf",
        "font_name": "Source Han Serif KR Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "a85913439f0a49024ca77c02dfede4318e503ee6b2b7d8fef01eb42435f27b61",
        "size": 12459924,
    },
    "SourceHanSerifTW-Bold.ttf": {
        "ascent": 1150,
        "bold": 1,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifTW-Bold.ttf",
        "font_name": "Source Han Serif TW Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "562eea88895ab79ffefab7eabb4d322352a7b1963764c524c6d5242ca456bb6e",
        "size": 9551724,
    },
    "SourceHanSerifTW-Regular.ttf": {
        "ascent": 1150,
        "bold": 0,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifTW-Regular.ttf",
        "font_name": "Source Han Serif TW Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "85c1d6460b2e169b3d53ac60f6fb7a219fb99923027d78fb64b679475e2ddae4",
        "size": 9486772,
    },
}

CMAP_METADATA = {
    "78-EUC-H.json": {
        "file_name": "78-EUC-H.json",
        "sha3_256": "657006ae4360ac584316dbda94f2223d7dd4cf7c721021b78b470ed712d22a3d",
        "size": 15035,
    },
    "78-EUC-V.json": {
        "file_name": "78-EUC-V.json",
        "sha3_256": "ffd0610937d3893cd6b9f10007033dab4c846d6a50914b3e0b5b1a1d5a446483",
        "size": 704,
    },
    "78-H.json": {
        "file_name": "78-H.json",
        "sha3_256": "07960a71bd7f2dc8501bfff6ebacb5d179961accbb8d043837d6d213d4e7c43f",
        "size": 14993,
    },
    "78-RKSJ-H.json": {
        "file_name": "78-RKSJ-H.json",
        "sha3_256": "2cea4cbf474c08d99420790509473f48960d14df27e37155c0833150eff0310c",
        "size": 15054,
    },
    "78-RKSJ-V.json": {
        "file_name": "78-RKSJ-V.json",
        "sha3_256": "0005485dc7cb41b9911d651a31a008ff4d8f707f3a271f5eb900640415255f58",
        "size": 705,
    },
    "78-V.json": {
        "file_name": "78-V.json",
        "sha3_256": "6ec527dfdd6f8176719db47aea208d96c8427ff2c44bb6d6adcf215e3599c7dd",
        "size": 700,
    },
    "78ms-RKSJ-H.json": {
        "file_name": "78ms-RKSJ-H.json",
        "sha3_256": "781802e72f8e79d599d58a81445333d005df5117b10c9b8392459729e51bbec7",
        "size": 17125,
    },
    "78ms-RKSJ-V.json": {
        "file_name": "78ms-RKSJ-V.json",
        "sha3_256": "1854ff118f30bdee044813bf764f44123697cb2c2dfcfacb10e1aa161d7db16b",
        "size": 1928,
    },
    "83pv-RKSJ-H.json": {
        "file_name": "83pv-RKSJ-H.json",
        "sha3_256": "2b6dd0a63fc97f3b33767a1b16a49b30ba0cb97a1ff01deb6ca5592d90e79815",
        "size": 5277,
    },
    "90ms-RKSJ-H.json": {
        "file_name": "90ms-RKSJ-H.json",
        "sha3_256": "ebacf23e35e924a65b45afb6276f645289f68b122f1b32ab4dbc64f9c7903ccf",
        "size": 4117,
    },
    "90ms-RKSJ-V.json": {
        "file_name": "90ms-RKSJ-V.json",
        "sha3_256": "0e08ffc0c46d93912870ad12a863081bcea12db09038e3929e1e015cfc1663da",
        "size": 1928,
    },
    "90msp-RKSJ-H.json": {
        "file_name": "90msp-RKSJ-H.json",
        "sha3_256": "3098d897f17b1723d5915518d281d3c5d4f46f0b83dbde8b8001073e0f882d32",
        "size": 4096,
    },
    "90msp-RKSJ-V.json": {
        "file_name": "90msp-RKSJ-V.json",
        "sha3_256": "a7ad430c32de4dbce2667fff874efc5d4114c685107f026788eee4ec83992fc8",
        "size": 1929,
    },
    "90pv-RKSJ-H.json": {
        "file_name": "90pv-RKSJ-H.json",
        "sha3_256": "2c1720cc7343f95ccb87e073df0c7788d33bc8811b703b709a0230e79ecb2341",
        "size": 6314,
    },
    "90pv-RKSJ-V.json": {
        "file_name": "90pv-RKSJ-V.json",
        "sha3_256": "487bf100397d4f0bcfa86dbfea149cac54faa59c0b449d65284cc43123d99023",
        "size": 1283,
    },
    "Add-H.json": {
        "file_name": "Add-H.json",
        "sha3_256": "3bd6fbbe961dffa3a6395d1e3823da665efc74363f44ff6083d98fc5ae22433a",
        "size": 15174,
    },
    "Add-RKSJ-H.json": {
        "file_name": "Add-RKSJ-H.json",
        "sha3_256": "bde048bae5dc9c43570bff29ff4691e03372e029dde66edc5e8de64a891dd53b",
        "size": 15259,
    },
    "Add-RKSJ-V.json": {
        "file_name": "Add-RKSJ-V.json",
        "sha3_256": "1a81852c30ebf3101e1e0b0b5eff2e4f19211373c513d7c42b0933ded6b6e59b",
        "size": 1426,
    },
    "Add-V.json": {
        "file_name": "Add-V.json",
        "sha3_256": "6a4f7a4ee2d7a04ce0500b93453859faf3fc3f11b3f55cb61753ef79846b419b",
        "size": 1421,
    },
    "B5-H.json": {
        "file_name": "B5-H.json",
        "sha3_256": "f1b984aa231df737628663a56d380c93fe3172a243792db6d36921b964a118db",
        "size": 5960,
    },
    "B5-V.json": {
        "file_name": "B5-V.json",
        "sha3_256": "0fafc3f78a34f2bf2377a89b2679469505a35ae42df95bf6765f743344f9a94c",
        "size": 334,
    },
    "B5pc-H.json": {
        "file_name": "B5pc-H.json",
        "sha3_256": "07f0c25086768b9731971ba164d88cb10202a9d36e79a076c43233351f61c52f",
        "size": 6015,
    },
    "B5pc-V.json": {
        "file_name": "B5pc-V.json",
        "sha3_256": "f5e44d8eeeda40e8c3a81858dfb823eeed3f5e834e985544d1e56fb79260b8f8",
        "size": 336,
    },
    "CNS-EUC-H.json": {
        "file_name": "CNS-EUC-H.json",
        "sha3_256": "2add6b8cd4750db8bf6b029595232fecb8f1e54a0bad56590d4aa46401085e44",
        "size": 11342,
    },
    "CNS-EUC-V.json": {
        "file_name": "CNS-EUC-V.json",
        "sha3_256": "1ff26a35f10467a99957886c482de267658b9132a704b547381c90fc37c90820",
        "size": 12592,
    },
    "CNS1-H.json": {
        "file_name": "CNS1-H.json",
        "sha3_256": "e64c524f07718603b6bd84fd6799f875cc13c00137fbaa2b41215d518e96c87a",
        "size": 3728,
    },
    "CNS1-V.json": {
        "file_name": "CNS1-V.json",
        "sha3_256": "57a1d2aabe6ab9db9a323ab43c37e3aa1ba9b3eb71841dfec4d8568d657d503a",
        "size": 332,
    },
    "CNS2-H.json": {
        "file_name": "CNS2-H.json",
        "sha3_256": "90831af5d65fae9565d705fc8f1fccd091e33a67a1e544552410e39d7558daed",
        "size": 2053,
    },
    "CNS2-V.json": {
        "file_name": "CNS2-V.json",
        "sha3_256": "c4d2aae661b26120030754901abced51766fa4bce638433a7aa7130a3d5eabb0",
        "size": 54,
    },
    "ETHK-B5-H.json": {
        "file_name": "ETHK-B5-H.json",
        "sha3_256": "3ef2e9ef0364675c2fb9ccbfd37ed9227d416457ee8cadb9e59b2db4354d88ea",
        "size": 25660,
    },
    "ETHK-B5-V.json": {
        "file_name": "ETHK-B5-V.json",
        "sha3_256": "a12c5917b6f3400793e7d6ea2e217e9af05a28621a937cfef4da9f5184a03578",
        "size": 364,
    },
    "ETen-B5-H.json": {
        "file_name": "ETen-B5-H.json",
        "sha3_256": "57f29290c730277b221ad074709d4f76c429d5410931131c9da7157ebae76951",
        "size": 6205,
    },
    "ETen-B5-V.json": {
        "file_name": "ETen-B5-V.json",
        "sha3_256": "d07d9af9e30a8fc3ca7e52158f854226b831ab9ef552cda46219819e47950680",
        "size": 364,
    },
    "ETenms-B5-H.json": {
        "file_name": "ETenms-B5-H.json",
        "sha3_256": "0659f282182ebdaa6abb38062bc3428a3b7b5907513fd499980d1b49930a9b9e",
        "size": 72,
    },
    "ETenms-B5-V.json": {
        "file_name": "ETenms-B5-V.json",
        "sha3_256": "74b107f8950456b2df294a089091837bf802892c1bc3136c403da2a427130c33",
        "size": 429,
    },
    "EUC-H.json": {
        "file_name": "EUC-H.json",
        "sha3_256": "b6df6e254254eb5a2254b0d581f4820d2b3553cd372136ec88f605521683c44a",
        "size": 2910,
    },
    "EUC-V.json": {
        "file_name": "EUC-V.json",
        "sha3_256": "e81c0f409365f2fd60232f6e5c84bf52c8a6b9c6336d4c96fb554f213dbdfaf6",
        "size": 701,
    },
    "Ext-H.json": {
        "file_name": "Ext-H.json",
        "sha3_256": "629359cf115575acb68b59c82373a1a3958001212a854d0a5b98e6fe1efe81db",
        "size": 15891,
    },
    "Ext-RKSJ-H.json": {
        "file_name": "Ext-RKSJ-H.json",
        "sha3_256": "3336a4a77a75924588f13c5a24157680c9c5b6a46298063dcdb461b90bb55da0",
        "size": 15975,
    },
    "Ext-RKSJ-V.json": {
        "file_name": "Ext-RKSJ-V.json",
        "sha3_256": "f2915039ff32992094ff6521fa24c3f41c27f55f3f071730eea732e261a2a553",
        "size": 994,
    },
    "Ext-V.json": {
        "file_name": "Ext-V.json",
        "sha3_256": "e2fb58ec483aee0910b0733dcb6220f10f9f4d2553c8c139a523e3992363f93e",
        "size": 989,
    },
    "GB-EUC-H.json": {
        "file_name": "GB-EUC-H.json",
        "sha3_256": "4a0b5fda367993409663ec1d4be57c207a3500d778373546b729d143d789c191",
        "size": 2178,
    },
    "GB-EUC-V.json": {
        "file_name": "GB-EUC-V.json",
        "sha3_256": "b45a8a562304c2c388fd1574c3a1a0af6f49e4849f7904ba07d57967d9625917",
        "size": 520,
    },
    "GB-H.json": {
        "file_name": "GB-H.json",
        "sha3_256": "a50b5d6461c95a667ccbc44c507ff5e6686e4f1bbd8bfae69486396b4ed03510",
        "size": 2139,
    },
    "GB-V.json": {
        "file_name": "GB-V.json",
        "sha3_256": "1f043042065f2df4590ebbd27fbc8f93802ea66caeb0b8ba92823575842743e5",
        "size": 516,
    },
    "GBK-EUC-H.json": {
        "file_name": "GBK-EUC-H.json",
        "sha3_256": "4502e7abe2edfb6256b5a4308dfca940aaa92a2d951c4b44942ce7bdb9eda877",
        "size": 99532,
    },
    "GBK-EUC-V.json": {
        "file_name": "GBK-EUC-V.json",
        "sha3_256": "c71f6281bb59897dcf48f587136d002d5caa8a0ed89f9b490a6a288765ec674d",
        "size": 521,
    },
    "GBK2K-H.json": {
        "file_name": "GBK2K-H.json",
        "sha3_256": "0a2a975da25641067ea2743f15407df20895b28804a1e64c12cd9fd0f306b1a9",
        "size": 109298,
    },
    "GBK2K-V.json": {
        "file_name": "GBK2K-V.json",
        "sha3_256": "0febb4a13f8f73dc949d159b4f37e886d1c3d1514aaf53d3492e0b5e21523f52",
        "size": 1044,
    },
    "GBKp-EUC-H.json": {
        "file_name": "GBKp-EUC-H.json",
        "sha3_256": "50d628304aff1f13ded3790cc3b8bd48502267768cac5e72cb3be8a46f9a5436",
        "size": 99510,
    },
    "GBKp-EUC-V.json": {
        "file_name": "GBKp-EUC-V.json",
        "sha3_256": "8c540fc12dfed309896544f8153fa52b793708a85e3882985567dcae86fb1732",
        "size": 522,
    },
    "GBT-EUC-H.json": {
        "file_name": "GBT-EUC-H.json",
        "sha3_256": "5fbe99ec7638de5216ea452788d3ef40cfd8c110c8b8ae936b57db6221d9b9d9",
        "size": 54802,
    },
    "GBT-EUC-V.json": {
        "file_name": "GBT-EUC-V.json",
        "sha3_256": "4cc3a48b1f7c8ab088391aa78131289da3d68e2fe0071b380a10c19757356ab5",
        "size": 521,
    },
    "GBT-H.json": {
        "file_name": "GBT-H.json",
        "sha3_256": "8bbbbbdee2722751708dd66a7ed12fa54a08bbf0dcfaefca2b87f305ca591f32",
        "size": 54763,
    },
    "GBT-V.json": {
        "file_name": "GBT-V.json",
        "sha3_256": "32e4457c8b0edbeeec9445465ec40106603ad50003e1af98994c02020df1c59f",
        "size": 517,
    },
    "GBTpc-EUC-H.json": {
        "file_name": "GBTpc-EUC-H.json",
        "sha3_256": "7f7faa903850fc471948e284853a81ee2f4a32693e14131f3ab1fbc490c5695b",
        "size": 54820,
    },
    "GBTpc-EUC-V.json": {
        "file_name": "GBTpc-EUC-V.json",
        "sha3_256": "3cf85a97171567e08d0112b71ca4a0aef68c52918b7c635669ef7e25e1bcb818",
        "size": 523,
    },
    "GBpc-EUC-H.json": {
        "file_name": "GBpc-EUC-H.json",
        "sha3_256": "38332ce5be0b82e4010fbd05ceac92e9f05a784ccacf6a4f004cd8da734c47de",
        "size": 2196,
    },
    "GBpc-EUC-V.json": {
        "file_name": "GBpc-EUC-V.json",
        "sha3_256": "5a0b4e7db0aedd6b27f84b191791b527da3ea27ea1ca42460086cb0d294418bf",
        "size": 522,
    },
    "H.json": {
        "file_name": "H.json",
        "sha3_256": "5ee11fcc99897b769fd62238967954e957bb8079353abba815792aab6f3e329c",
        "size": 2868,
    },
    "HKdla-B5-H.json": {
        "file_name": "HKdla-B5-H.json",
        "sha3_256": "8f24808486e1d5363a66981021f3f8b136f1ec6231d48bda76344e1f7f1695aa",
        "size": 25384,
    },
    "HKdla-B5-V.json": {
        "file_name": "HKdla-B5-V.json",
        "sha3_256": "1e686a7f69d6b7a3c05a4be9e7e396cf81498ef48299341616e76805c1092733",
        "size": 340,
    },
    "HKdlb-B5-H.json": {
        "file_name": "HKdlb-B5-H.json",
        "sha3_256": "0ccae437017107059630d56c7e0e2d6f086d5fb512c9e60b1bd48c4a04b6652d",
        "size": 22501,
    },
    "HKdlb-B5-V.json": {
        "file_name": "HKdlb-B5-V.json",
        "sha3_256": "dad584337fd6e5e6ab5e1e30dc9b8cc1013985a04a159b3c108c4dfb5c10fb55",
        "size": 340,
    },
    "HKgccs-B5-H.json": {
        "file_name": "HKgccs-B5-H.json",
        "sha3_256": "f7da0854c355c51957de6e71ffa33fbc69414d52dcfc5a5cb50c8f8c6c6bd9c6",
        "size": 13642,
    },
    "HKgccs-B5-V.json": {
        "file_name": "HKgccs-B5-V.json",
        "sha3_256": "d7f89dc24162b624bc4d682484da315a4d39eaf9a8f63c1392e06d2aa46f015a",
        "size": 341,
    },
    "HKm314-B5-H.json": {
        "file_name": "HKm314-B5-H.json",
        "sha3_256": "febd4cb78048e012478df9fc91aa23e946304d63c5f7c64ea8e16277b64a359b",
        "size": 13405,
    },
    "HKm314-B5-V.json": {
        "file_name": "HKm314-B5-V.json",
        "sha3_256": "d310bbf5a975fe8e1f8bb4523b0db8e792043578f0c2a12735bbc24fc4a3721f",
        "size": 341,
    },
    "HKm471-B5-H.json": {
        "file_name": "HKm471-B5-H.json",
        "sha3_256": "fdb1368b1a6f4df20ab87e2a1045a579088645828d1168e39d6aa5b52c09bd8e",
        "size": 17079,
    },
    "HKm471-B5-V.json": {
        "file_name": "HKm471-B5-V.json",
        "sha3_256": "34c40c1bb1409942f12f66f1bcbc2be73406b4c5e626ea7a4ab7f73160ba2a88",
        "size": 341,
    },
    "HKscs-B5-H.json": {
        "file_name": "HKscs-B5-H.json",
        "sha3_256": "63fe2b09c05c8ef70fb937aad49698d4154e1d7bb75f94344fea4db522b87a88",
        "size": 25722,
    },
    "HKscs-B5-V.json": {
        "file_name": "HKscs-B5-V.json",
        "sha3_256": "14c864025ffca616fc173458162efe190bdace4700e2a7ad4869c66476534223",
        "size": 365,
    },
    "Hankaku.json": {
        "file_name": "Hankaku.json",
        "sha3_256": "befe81a2bbe191bcb8e0ff23706a51cb6a41a60f6bc508d5c0c19040c14afc06",
        "size": 238,
    },
    "Hiragana.json": {
        "file_name": "Hiragana.json",
        "sha3_256": "0e8ce0a48ec8c05f4c65d23ada539c4a2a236fcb7dd46e20874acd9362394525",
        "size": 200,
    },
    "Identity-H.json": {
        "file_name": "Identity-H.json",
        "sha3_256": "77cc630138b29b5acd4ab216cb1d173bb3e7b994ab932a4f3d8a9121be91fbab",
        "size": 6404,
    },
    "Identity-V.json": {
        "file_name": "Identity-V.json",
        "sha3_256": "067a8d390f2d99dfa94ff19009925e5815c8b54b65b39314a244cbbace494679",
        "size": 62,
    },
    "KSC-EUC-H.json": {
        "file_name": "KSC-EUC-H.json",
        "sha3_256": "79fb3c0bd9d2ce6b80da98d6f1ef4fd2776dfc3fb78c5ee4d6ee3a06aebc9fd0",
        "size": 11234,
    },
    "KSC-EUC-V.json": {
        "file_name": "KSC-EUC-V.json",
        "sha3_256": "a541a285c966105a92dba6939401ac8aaeb057e5200bdbf8c874ceecb9f37b01",
        "size": 441,
    },
    "KSC-H.json": {
        "file_name": "KSC-H.json",
        "sha3_256": "a0a20bce98ffe98036aa748d46c2921e17247827a22298edb59c778b8b776f24",
        "size": 11214,
    },
    "KSC-Johab-H.json": {
        "file_name": "KSC-Johab-H.json",
        "sha3_256": "3d7cd1473ddcf7c3bfb80c7eadf45a365389759b1df1f53e0bd5f31e31125e96",
        "size": 100922,
    },
    "KSC-Johab-V.json": {
        "file_name": "KSC-Johab-V.json",
        "sha3_256": "2f7cf1d05bd82d65e488fc3297aefc1c1f48f2c6972b01304c4be5f260fae86e",
        "size": 443,
    },
    "KSC-V.json": {
        "file_name": "KSC-V.json",
        "sha3_256": "f6f09bab60f802d61c22368ca8650cefa08851c2039c5825e37404c7047eb496",
        "size": 437,
    },
    "KSCms-UHC-H.json": {
        "file_name": "KSCms-UHC-H.json",
        "sha3_256": "6df55fd679239f3a6642c7690e89a85525fa6a8a3cf748aef247b2d06fdc1aca",
        "size": 16419,
    },
    "KSCms-UHC-HW-H.json": {
        "file_name": "KSCms-UHC-HW-H.json",
        "sha3_256": "a05183c5d7b6b6f62d11f8175e5749d5ad2913d469403905c8f01a403d715583",
        "size": 16422,
    },
    "KSCms-UHC-HW-V.json": {
        "file_name": "KSCms-UHC-HW-V.json",
        "sha3_256": "e2586795b094fade7e385ff1ce5570232edc791c456acf4c6e1c11bc501f82a4",
        "size": 446,
    },
    "KSCms-UHC-V.json": {
        "file_name": "KSCms-UHC-V.json",
        "sha3_256": "c09dc49c1afea5a5dc01bd6ac672d2af83b4821d74de7df71d4da3233513cefb",
        "size": 443,
    },
    "KSCpc-EUC-H.json": {
        "file_name": "KSCpc-EUC-H.json",
        "sha3_256": "b43448cb510c7f952a6affd0950db58063719f7499309c64f78fea6b2778fa11",
        "size": 12226,
    },
    "KSCpc-EUC-V.json": {
        "file_name": "KSCpc-EUC-V.json",
        "sha3_256": "1f4889c2e7278085738257e8097382ef5ac40b543b71751b75b155b056a46db2",
        "size": 443,
    },
    "Katakana.json": {
        "file_name": "Katakana.json",
        "sha3_256": "524b659bd0acc0fb4baa7633c3250683d6b3ba1685caadc9739240ccdbfd2ce2",
        "size": 86,
    },
    "NWP-H.json": {
        "file_name": "NWP-H.json",
        "sha3_256": "6c067655436fe89fb21a26e258973313bfe7cd5fbab3a2857b00ea92cc82c25d",
        "size": 18143,
    },
    "NWP-V.json": {
        "file_name": "NWP-V.json",
        "sha3_256": "b494038c72c63c6917ab3ed3f83a8b6bf21c65ba9ea47a4887833fffcc434763",
        "size": 1205,
    },
    "RKSJ-H.json": {
        "file_name": "RKSJ-H.json",
        "sha3_256": "eff868636f960b80d6923b77eb59d76acf6d7297bc74e1b7f3a13ff92a71c1cb",
        "size": 2953,
    },
    "RKSJ-V.json": {
        "file_name": "RKSJ-V.json",
        "sha3_256": "f3827bc17eb1172a5713d2d5c83a9b60f965894e3f2cb8dcb731b6f151abaa10",
        "size": 702,
    },
    "Roman.json": {
        "file_name": "Roman.json",
        "sha3_256": "620ab6ac0f4b487f19d44397b49612db57d164ddbff8e7d52fb5fd7e969e0cb9",
        "size": 67,
    },
    "UniAKR-UTF16-H.json": {
        "file_name": "UniAKR-UTF16-H.json",
        "sha3_256": "1204af593c62e5d10ace0db3b5ca0caecc80240f1c866bf1585fad405c204a54",
        "size": 232741,
    },
    "UniAKR-UTF32-H.json": {
        "file_name": "UniAKR-UTF32-H.json",
        "sha3_256": "cbbebc4b9b018109612dcfc0798f5c164d739a8b202017580301e0f27f76c35d",
        "size": 296773,
    },
    "UniAKR-UTF8-H.json": {
        "file_name": "UniAKR-UTF8-H.json",
        "sha3_256": "e08da06fc02a877abb02205fe0db3b61566d9ac41511a735ef2f12b5741d069a",
        "size": 266575,
    },
    "UniCNS-UCS2-H.json": {
        "file_name": "UniCNS-UCS2-H.json",
        "sha3_256": "48a0840498b90cf597c05ad2f63e26aaea778a49171f821d4b87b94424d7e640",
        "size": 400654,
    },
    "UniCNS-UCS2-V.json": {
        "file_name": "UniCNS-UCS2-V.json",
        "sha3_256": "014f9d86baea5fd13e460dd3735eab98dbbacf126922826ef0be9d7c8c605418",
        "size": 360,
    },
    "UniCNS-UTF16-H.json": {
        "file_name": "UniCNS-UTF16-H.json",
        "sha3_256": "c67980ebfb0d525365d0b5421548cc64ce9fb89afca1a0f6d04972f1e39b7f9c",
        "size": 320254,
    },
    "UniCNS-UTF16-V.json": {
        "file_name": "UniCNS-UTF16-V.json",
        "sha3_256": "98bd35d76997c0f3c443f130d44e814997cb0277183b7bf6571f92206d9a85a0",
        "size": 311,
    },
    "UniCNS-UTF32-H.json": {
        "file_name": "UniCNS-UTF32-H.json",
        "sha3_256": "6ab73cc531843f9bef915a949a0b79de1df288bb7ed6026db782ac446ed36c94",
        "size": 391690,
    },
    "UniCNS-UTF32-V.json": {
        "file_name": "UniCNS-UTF32-V.json",
        "sha3_256": "d94f8c3d7fe834d34f746b9404a4bb5dd8479353e3b9f95b308642a8be793a44",
        "size": 391,
    },
    "UniCNS-UTF8-H.json": {
        "file_name": "UniCNS-UTF8-H.json",
        "sha3_256": "3666cbe4d00de4038120c98472137857c93d44735c3a5def8c4ac7f84a59aa72",
        "size": 357287,
    },
    "UniCNS-UTF8-V.json": {
        "file_name": "UniCNS-UTF8-V.json",
        "sha3_256": "e410ed491c0e2f31ba30cfd60eb4e21c40d3ee82e2be1c06c7adb8772b175f10",
        "size": 350,
    },
    "UniGB-UCS2-H.json": {
        "file_name": "UniGB-UCS2-H.json",
        "sha3_256": "42a8e01b690cf2cd6b137c1eb94e7668899f0041b6e43b921252fe453486a96e",
        "size": 336533,
    },
    "UniGB-UCS2-V.json": {
        "file_name": "UniGB-UCS2-V.json",
        "sha3_256": "0a0aaf21f823546faf0971b7926724cc95b53b3da3f42a22ec0526ca8de1b237",
        "size": 617,
    },
    "UniGB-UTF16-H.json": {
        "file_name": "UniGB-UTF16-H.json",
        "sha3_256": "c306f093839fffe81e0c8597a24be508a64aa2a9c3e9b9eee858d55059530c0d",
        "size": 251806,
    },
    "UniGB-UTF16-V.json": {
        "file_name": "UniGB-UTF16-V.json",
        "sha3_256": "bd283b8c7e145e340db39868ec1a3b0a08d89acc2bfac672d41008a8195c7bb3",
        "size": 456,
    },
    "UniGB-UTF32-H.json": {
        "file_name": "UniGB-UTF32-H.json",
        "sha3_256": "a01a6a8b4b715f27c7e1866894240b0e1fd61a4eaca1c91df80c1f256ad06f72",
        "size": 319766,
    },
    "UniGB-UTF32-V.json": {
        "file_name": "UniGB-UTF32-V.json",
        "sha3_256": "8b31bba8b852a2c6c1f6d92aea633285e2f75237fbe87ecadff9f9312a0bfaa9",
        "size": 572,
    },
    "UniGB-UTF8-H.json": {
        "file_name": "UniGB-UTF8-H.json",
        "sha3_256": "87f7a6b0360d0f9bd0658cb7a67587e86c604be44292214622d972d85a474dbf",
        "size": 290481,
    },
    "UniGB-UTF8-V.json": {
        "file_name": "UniGB-UTF8-V.json",
        "sha3_256": "1378adf3ecd0bfbdb11dabbf2118cbb968a03aa2215780b77b07459e3b1df6e7",
        "size": 513,
    },
    "UniJIS-UCS2-H.json": {
        "file_name": "UniJIS-UCS2-H.json",
        "sha3_256": "a73e449136b46240ef86c9fb2b614e7d290b814130e9beb4b987c52fd7eda575",
        "size": 205924,
    },
    "UniJIS-UCS2-HW-H.json": {
        "file_name": "UniJIS-UCS2-HW-H.json",
        "sha3_256": "e58ec4fd06677ecfcef12d25f6456b7f80da706b2ac6ef915239e0b780b775a0",
        "size": 154,
    },
    "UniJIS-UCS2-HW-V.json": {
        "file_name": "UniJIS-UCS2-HW-V.json",
        "sha3_256": "bc3c81dbd6329d83cd71743a6985ed0cf516b0aa97a1c58c3cc3940e280b1e8e",
        "size": 4868,
    },
    "UniJIS-UCS2-V.json": {
        "file_name": "UniJIS-UCS2-V.json",
        "sha3_256": "276712ac66416538e859ad28e9f5b685fbc71e5d7d91e905a3489f03667ae4bc",
        "size": 4775,
    },
    "UniJIS-UTF16-H.json": {
        "file_name": "UniJIS-UTF16-H.json",
        "sha3_256": "afc923e268f22dcf09e0871ce0060c7588aa1304d4b26e781a261c14566f7642",
        "size": 238042,
    },
    "UniJIS-UTF16-V.json": {
        "file_name": "UniJIS-UTF16-V.json",
        "sha3_256": "0a044ab7015485c3b0f7f9e4d883a1d9e9f1d04235b13e2a17687e878ce3e9f0",
        "size": 3951,
    },
    "UniJIS-UTF32-H.json": {
        "file_name": "UniJIS-UTF32-H.json",
        "sha3_256": "1c27e2e595d659073e37e5ee22a9b39abe30af1483de33e1078ed174abdc723c",
        "size": 295294,
    },
    "UniJIS-UTF32-V.json": {
        "file_name": "UniJIS-UTF32-V.json",
        "sha3_256": "aa7a475ce5f85f79d73e17355c08e6aee21a949b596f2efe359913489a22117f",
        "size": 4983,
    },
    "UniJIS-UTF8-H.json": {
        "file_name": "UniJIS-UTF8-H.json",
        "sha3_256": "d91079b3f1671a7f4ace8b8f89478558f43f7782e666064ce1b53af563a87306",
        "size": 266367,
    },
    "UniJIS-UTF8-V.json": {
        "file_name": "UniJIS-UTF8-V.json",
        "sha3_256": "d0c8c94f7d54dafa40876ce7eb28845d8ac00b688cf4bac255694cb2f086d109",
        "size": 4483,
    },
    "UniJIS2004-UTF16-H.json": {
        "file_name": "UniJIS2004-UTF16-H.json",
        "sha3_256": "336660e87fc57ad166258d22f09690fcebb546840faee1e1b3f6cad3556bcf80",
        "size": 238119,
    },
    "UniJIS2004-UTF16-V.json": {
        "file_name": "UniJIS2004-UTF16-V.json",
        "sha3_256": "f6619a74b62f9986e9a74620b28e726b927dde5cd6184742f368ef4d686fe55c",
        "size": 3955,
    },
    "UniJIS2004-UTF32-H.json": {
        "file_name": "UniJIS2004-UTF32-H.json",
        "sha3_256": "2512690db880e0663f8208d22acda8daa98f1240ff14a038bf02e57c4908afb5",
        "size": 295371,
    },
    "UniJIS2004-UTF32-V.json": {
        "file_name": "UniJIS2004-UTF32-V.json",
        "sha3_256": "da1728a91845f1654457eaf0f15b75d1ace5cbf75486bca8523bd5edf20a8010",
        "size": 4987,
    },
    "UniJIS2004-UTF8-H.json": {
        "file_name": "UniJIS2004-UTF8-H.json",
        "sha3_256": "af36b0255a1ed15966670703ba8a48987a1cf7e43f5c94a4e86a41e5ee26b940",
        "size": 266444,
    },
    "UniJIS2004-UTF8-V.json": {
        "file_name": "UniJIS2004-UTF8-V.json",
        "sha3_256": "28bebdf1581c45f2e9b38caa2ff643abd561321bab45febb0f90d802d2290faa",
        "size": 4487,
    },
    "UniJISPro-UCS2-HW-V.json": {
        "file_name": "UniJISPro-UCS2-HW-V.json",
        "sha3_256": "21fd353a062b6c415389d6fde11718488f765ca31fd4ca481050c89633568009",
        "size": 4994,
    },
    "UniJISPro-UCS2-V.json": {
        "file_name": "UniJISPro-UCS2-V.json",
        "sha3_256": "8daa155869a35f3f629abb042790c59eb5cff342b83573c2ae4c87b3e865dc27",
        "size": 4901,
    },
    "UniJISPro-UTF8-V.json": {
        "file_name": "UniJISPro-UTF8-V.json",
        "sha3_256": "19b9a6d908f9fb7413d778c9cc912072314864225c38a3f5c345936fabcea650",
        "size": 5726,
    },
    "UniJISX0213-UTF32-H.json": {
        "file_name": "UniJISX0213-UTF32-H.json",
        "sha3_256": "e6a07453703f5070bf567c9d67aa20bc4b404bd311413fed45d9ba8c297a91d9",
        "size": 295246,
    },
    "UniJISX0213-UTF32-V.json": {
        "file_name": "UniJISX0213-UTF32-V.json",
        "sha3_256": "5f2dd4ff8045b2308a707e3d4ffb73e1ba7f5a1c1fdb43b17c5a322109897b9c",
        "size": 4908,
    },
    "UniJISX02132004-UTF32-H.json": {
        "file_name": "UniJISX02132004-UTF32-H.json",
        "sha3_256": "81427dc73cf9392c0c3e8eeeb1dedbc797b123059714bfcdcd1ecffec9f341e3",
        "size": 295323,
    },
    "UniJISX02132004-UTF32-V.json": {
        "file_name": "UniJISX02132004-UTF32-V.json",
        "sha3_256": "c0721298f3449f0c6f48ada1200ebcadbfc4020b10333871f6c0eea0be9f13ac",
        "size": 4912,
    },
    "UniKS-UCS2-H.json": {
        "file_name": "UniKS-UCS2-H.json",
        "sha3_256": "3a1c10535982d06dde447764f8e3dd82c6c87bec6c4272eaf449f67db6d50ab8",
        "size": 202706,
    },
    "UniKS-UCS2-V.json": {
        "file_name": "UniKS-UCS2-V.json",
        "sha3_256": "b915820ff4639f837e4d3b7e5a7c0810c26af1dcf3df9e56ed9a0a69e3cdba9d",
        "size": 492,
    },
    "UniKS-UTF16-H.json": {
        "file_name": "UniKS-UTF16-H.json",
        "sha3_256": "820f534efffcef15f0d3f270c078774febee31b451a1387b27f7225da321c12f",
        "size": 153894,
    },
    "UniKS-UTF16-V.json": {
        "file_name": "UniKS-UTF16-V.json",
        "sha3_256": "2b5be7641990cf79754a12309c6069c01b636cfc3308bc4dc8075da59c2d8d6b",
        "size": 403,
    },
    "UniKS-UTF32-H.json": {
        "file_name": "UniKS-UTF32-H.json",
        "sha3_256": "541515ed8ff15170b38fbe6587ff6c54f6fc75aeede9da110133dc335e4ddf0e",
        "size": 195998,
    },
    "UniKS-UTF32-V.json": {
        "file_name": "UniKS-UTF32-V.json",
        "sha3_256": "940e977d3927c8480c65dc4ad6be4f365f65b8d76707758a7696d40e2b3583ea",
        "size": 503,
    },
    "UniKS-UTF8-H.json": {
        "file_name": "UniKS-UTF8-H.json",
        "sha3_256": "81b5c336c1a20dee2e9592c6615a46cdd906edd242717c1807609b5687576252",
        "size": 177154,
    },
    "UniKS-UTF8-V.json": {
        "file_name": "UniKS-UTF8-V.json",
        "sha3_256": "9a282e8eee884f801a5518cc52ff240ee8635553661dd0ee7df952adbad7462a",
        "size": 452,
    },
    "V.json": {
        "file_name": "V.json",
        "sha3_256": "616f263e53079846a66efc861524a15c0a411e823c37fe08e62bad835745cbba",
        "size": 697,
    },
    "WP-Symbol.json": {
        "file_name": "WP-Symbol.json",
        "sha3_256": "533dfe497eab1f095039b6344217fc0ff6b1f7cdf9b406bb19c30b945fe78c21",
        "size": 588,
    },
}


# Set of the "font_name" field of every entry in EMBEDDING_FONT_METADATA.
# Built at import time, before __cleanup_unused_font_metadata() (called further
# down in this module) prunes the table, so it reflects the table as declared.
FONT_NAMES = {v["font_name"] for v in EMBEDDING_FONT_METADATA.values()}

# Font family registered under the "CN" key of ALL_FONT_FAMILY: maps each role
# ("script", "normal", "fallback", "base") to an ordered list of font file names.
CN_FONT_FAMILY = {
    # Handwriting / script fonts
    "script": [
        "LXGWWenKaiGB-Regular.1.520.ttf",
    ],
    # Body-text fonts
    "normal": [
        "SourceHanSerifCN-Bold.ttf",
        "SourceHanSerifCN-Regular.ttf",
        "SourceHanSansCN-Bold.ttf",
        "SourceHanSansCN-Regular.ttf",
    ],
    # Fallback fonts
    "fallback": [
        "GoNotoKurrent-Regular.ttf",
        "GoNotoKurrent-Bold.ttf",
    ],
    "base": ["SourceHanSansCN-Regular.ttf"],
}

# Font family registered under the "HK" key of ALL_FONT_FAMILY: maps each role
# ("script", "normal", "fallback", "base") to an ordered list of font file names.
# NOTE(review): "base" points at the CN Sans face rather than an HK one —
# confirm this is intentional.
HK_FONT_FAMILY = {
    "script": ["LXGWWenKaiTC-Regular.1.520.ttf"],
    "normal": [
        "SourceHanSerifHK-Bold.ttf",
        "SourceHanSerifHK-Regular.ttf",
        "SourceHanSansHK-Bold.ttf",
        "SourceHanSansHK-Regular.ttf",
    ],
    "fallback": [
        "GoNotoKurrent-Regular.ttf",
        "GoNotoKurrent-Bold.ttf",
    ],
    "base": ["SourceHanSansCN-Regular.ttf"],
}

# Font family registered under the "TW" key of ALL_FONT_FAMILY: maps each role
# ("script", "normal", "fallback", "base") to an ordered list of font file names.
# NOTE(review): "base" points at the CN Sans face rather than a TW one —
# confirm this is intentional.
TW_FONT_FAMILY = {
    "script": ["LXGWWenKaiTC-Regular.1.520.ttf"],
    "normal": [
        "SourceHanSerifTW-Bold.ttf",
        "SourceHanSerifTW-Regular.ttf",
        "SourceHanSansTW-Bold.ttf",
        "SourceHanSansTW-Regular.ttf",
    ],
    "fallback": [
        "GoNotoKurrent-Regular.ttf",
        "GoNotoKurrent-Bold.ttf",
    ],
    "base": ["SourceHanSansCN-Regular.ttf"],
}

# Font family registered under the "KR" key of ALL_FONT_FAMILY: maps each role
# ("script", "normal", "fallback", "base") to an ordered list of font file names.
# NOTE(review): "base" points at the CN Sans face rather than a KR one —
# confirm this is intentional.
KR_FONT_FAMILY = {
    "script": ["MaruBuri-Regular.ttf"],
    "normal": [
        "SourceHanSerifKR-Bold.ttf",
        "SourceHanSerifKR-Regular.ttf",
        "SourceHanSansKR-Bold.ttf",
        "SourceHanSansKR-Regular.ttf",
    ],
    "fallback": [
        "GoNotoKurrent-Regular.ttf",
        "GoNotoKurrent-Bold.ttf",
    ],
    "base": ["SourceHanSansCN-Regular.ttf"],
}

# Font family registered under both the "JP" and "JA" keys of ALL_FONT_FAMILY:
# maps each role ("script", "normal", "fallback", "base") to an ordered list of
# font file names.
# NOTE(review): "base" points at the CN Sans face rather than a JP one —
# confirm this is intentional.
JP_FONT_FAMILY = {
    "script": ["KleeOne-Regular.ttf"],
    "normal": [
        "SourceHanSerifJP-Bold.ttf",
        "SourceHanSerifJP-Regular.ttf",
        "SourceHanSansJP-Bold.ttf",
        "SourceHanSansJP-Regular.ttf",
    ],
    "fallback": [
        "GoNotoKurrent-Regular.ttf",
        "GoNotoKurrent-Bold.ttf",
    ],
    "base": ["SourceHanSansCN-Regular.ttf"],
}

# Font family registered under the "EN" key of ALL_FONT_FAMILY: maps each role
# ("script", "normal", "fallback", "base") to an ordered list of font file names.
# get_font_family() also returns this family when no language marker matches.
EN_FONT_FAMILY = {
    "script": [
        "NotoSans-Italic.ttf",
        "NotoSans-BoldItalic.ttf",
        "NotoSerif-Italic.ttf",
        "NotoSerif-BoldItalic.ttf",
    ],
    "normal": [
        "NotoSerif-Regular.ttf",
        "NotoSerif-Bold.ttf",
        "NotoSans-Regular.ttf",
        "NotoSans-Bold.ttf",
    ],
    "fallback": [
        "GoNotoKurrent-Regular.ttf",
        "GoNotoKurrent-Bold.ttf",
    ],
    "base": [
        "NotoSans-Regular.ttf",
    ],
}

# Registry of font families keyed by upper-case language/region code.
# "JP" and "JA" are two keys for the very same dict object, so any in-place
# change made through one key is visible through the other.
ALL_FONT_FAMILY = {
    "CN": CN_FONT_FAMILY,
    "TW": TW_FONT_FAMILY,
    "HK": HK_FONT_FAMILY,
    "KR": KR_FONT_FAMILY,
    "JP": JP_FONT_FAMILY,
    "EN": EN_FONT_FAMILY,
    "JA": JP_FONT_FAMILY,
}


def __add_fallback_to_font_family():
    """Append to every family the fonts that the other families list.

    For each role of a family, fonts found under the same role in every other
    registered family are appended in registry order. A font that the family
    already lists under any role is never added a second time. The family
    dicts are modified in place.
    """
    for own_lang, own_family in ALL_FONT_FAMILY.items():
        # Everything this family already references, regardless of role.
        seen = set(itertools.chain.from_iterable(own_family.values()))

        for other_lang, other_family in ALL_FONT_FAMILY.items():
            if own_lang == other_lang:
                continue
            for role, own_fonts in own_family.items():
                for candidate in other_family[role]:
                    if candidate in seen:
                        continue
                    own_fonts.append(candidate)
                    seen.add(candidate)


def __cleanup_unused_font_metadata():
    """Drop EMBEDDING_FONT_METADATA entries that no font family references."""
    in_use = {
        font
        for family in ALL_FONT_FAMILY.values()
        for fonts in family.values()
        for font in fonts
    }

    # Snapshot the stale keys first so the dict is not resized while iterated.
    stale = [name for name in EMBEDDING_FONT_METADATA if name not in in_use]
    for name in stale:
        del EMBEDDING_FONT_METADATA[name]


# Import-time side effects. Order matters: the families are first extended with
# each other's fonts, and only then is metadata that no family references
# removed, so the cleanup sees the extended lists.
__add_fallback_to_font_family()
__cleanup_unused_font_metadata()


def get_font_family(lang_code: str):
    """Return the font family dict matching ``lang_code``.

    The code is upper-cased and searched for region/language markers as
    substrings, in a fixed priority order; the first hit wins. When nothing
    matches, the English family is used. The chosen family is validated with
    ``verify_font_family`` before it is returned.
    """
    code = lang_code.upper()
    # Priority-ordered (markers, family) pairs; earlier rows shadow later ones.
    priority = (
        (("KR",), KR_FONT_FAMILY),
        (("JP", "JA"), JP_FONT_FAMILY),
        (("HK",), HK_FONT_FAMILY),
        (("TW",), TW_FONT_FAMILY),
        (("EN",), EN_FONT_FAMILY),
        (("CN",), CN_FONT_FAMILY),
    )
    chosen = EN_FONT_FAMILY
    for markers, family in priority:
        if any(marker in code for marker in markers):
            chosen = family
            break
    verify_font_family(chosen)
    return chosen


def verify_font_family(font_family: str | dict):
    if isinstance(font_family, str):
        font_family = ALL_FONT_FAMILY[font_family]
    for k in font_family:
        if k not in ["script", "normal", "fallback", "base"]:
            raise ValueError(f"Invalid font family: {font_family}")
        for font_file_name in font_family[k]:
            if font_file_name not in EMBEDDING_FONT_METADATA:
                raise ValueError(f"Invalid font file: {font_file_name}")


# When executed as a script, validate every family registered in
# ALL_FONT_FAMILY; verify_font_family raises ValueError on the first problem.
if __name__ == "__main__":
    for k in ALL_FONT_FAMILY:
        verify_font_family(k)


================================================
FILE: babeldoc/asynchronize/__init__.py
================================================
import asyncio
import time


class Args:
    """Positional and keyword arguments captured from one callback invocation."""

    def __init__(self, args, kwargs):
        self.args = args
        self.kwargs = kwargs


class AsyncCallback:
    """Bridge callbacks fired from a worker thread into an async iterator.

    Producers call ``step_callback`` / ``finished_callback`` (typically from a
    thread other than the event loop's); a coroutine consumes the captured
    ``Args`` objects with ``async for``. Iteration ends once the final item
    queued by ``finished_callback`` has been consumed and the queue is empty.
    """

    def __init__(self):
        self.queue = asyncio.Queue()
        # True once the final item is in the queue; written on the loop thread.
        self.finished = False
        # NOTE(review): captures whatever loop asyncio.get_event_loop() returns
        # at construction time; presumably instances are created from within
        # the consuming loop — confirm at call sites.
        self.loop = asyncio.get_event_loop()
        # True once finished_callback has scheduled the final item; this keeps
        # repeated finished_callback calls idempotent before the loop runs.
        self._finish_requested = False

    def step_callback(self, *args, **kwargs):
        """Queue one intermediate item; iteration continues afterwards."""
        item = Args(args, kwargs)

        # We have to use the threadsafe call so that it wakes up the event loop, in case it's sleeping:
        # https://stackoverflow.com/a/49912853/2148718
        self.loop.call_soon_threadsafe(self.queue.put_nowait, item)

        # Add a small delay to release the GIL, ensuring the event loop has time to process messages
        time.sleep(0.01)

    def _put_final(self, item):
        """Queue the last item and mark the stream finished (loop thread only).

        Both steps happen inside one event-loop callback, so the consumer can
        never observe the final item without also observing ``finished``.
        Setting the flag from the producer thread after the hand-off would
        let a fast consumer drain the item, re-enter ``__anext__`` with
        ``finished`` still False, and wait forever on an empty queue.
        """
        self.queue.put_nowait(item)
        self.finished = True

    def finished_callback(self, *args, **kwargs):
        """Queue the final item; later calls are ignored."""
        if self.finished or self._finish_requested:
            return
        self.loop.call_soon_threadsafe(self._put_final, Args(args, kwargs))
        self._finish_requested = True

        # Same short pause as step_callback, giving the loop time to run.
        time.sleep(0.01)

    def __await__(self):
        # Awaiting the object directly resolves to the next queued item.
        return self.queue.get().__await__()

    def __aiter__(self):
        # Since this implements __anext__, this can return itself
        return self

    async def __anext__(self):
        # Stop only when the stream is finished AND nothing is left in the
        # queue, so items queued before the end are still delivered.
        if self.finished and self.queue.empty():
            raise StopAsyncIteration

        return await self.queue.get()


================================================
FILE: babeldoc/babeldoc_exception/BabelDOCException.py
================================================
class ScannedPDFError(Exception):
    """Exception that requires exactly one ``message`` argument.

    NOTE(review): presumably raised for scanned PDF input; the raise sites
    are outside this file — confirm.
    """

    def __init__(self, message):
        Exception.__init__(self, message)


class ExtractTextError(Exception):
    """Exception that requires exactly one ``message`` argument.

    NOTE(review): presumably raised when text extraction fails; the raise
    sites are outside this file — confirm.
    """

    def __init__(self, message):
        Exception.__init__(self, message)


class InputFileGeneratedByBabelDOCError(Exception):
    """Exception that requires exactly one ``message`` argument.

    NOTE(review): presumably raised when the input file was itself produced
    by BabelDOC; the raise sites are outside this file — confirm.
    """

    def __init__(self, message):
        Exception.__init__(self, message)


class ContentFilterError(Exception):
    """Exception that also exposes its text as the ``message`` attribute.

    NOTE(review): presumably raised when content is rejected by a filter;
    the raise sites are outside this file — confirm.
    """

    def __init__(self, message):
        self.message = message
        Exception.__init__(self, message)


================================================
FILE: babeldoc/babeldoc_exception/__init__.py
================================================


================================================
FILE: babeldoc/const.py
================================================
import itertools
import multiprocessing as mp
import os
import shutil
import subprocess
import threading
from pathlib import Path

# Package version string; also used for the WATERMARK_VERSION fallback below.
__version__ = "0.5.23"

# Root of the per-user cache tree: <home>/.cache/babeldoc.
CACHE_FOLDER = Path.home() / ".cache" / "babeldoc"


def get_cache_file_path(filename: str, sub_folder: str | None = None) -> Path:
    """Return the path of ``filename`` inside the cache folder.

    When ``sub_folder`` is given, leading and trailing slashes are stripped
    from it and the resulting directory under CACHE_FOLDER is created if
    missing. Without ``sub_folder`` nothing is created on disk.
    """
    if sub_folder is None:
        return CACHE_FOLDER / filename

    target_dir = CACHE_FOLDER / sub_folder.strip("/")
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir / filename


# Derive WATERMARK_VERSION from `git describe --always` when this module lives
# in a source checkout; otherwise fall back to "v<__version__>".
try:
    git_path = shutil.which("git")
    if git_path is None:
        raise FileNotFoundError("git executable not found")
    two_parent = Path(__file__).resolve().parent.parent
    md_ = two_parent / "docs" / "README.md"
    # An installed copy (grandparent directory named "site-packages") or a tree
    # without docs/README.md is treated as "not a checkout": git is not called.
    if two_parent.name == "site-packages" or not md_.exists():
        raise FileNotFoundError("not in git repo")
    WATERMARK_VERSION = (
        subprocess.check_output(  # noqa: S603
            [git_path, "describe", "--always"],
            cwd=Path(__file__).resolve().parent,
        )
        .strip()
        .decode()
    )
except (OSError, FileNotFoundError, subprocess.CalledProcessError):
    # Any failure above, including the deliberate raises, selects the fallback.
    WATERMARK_VERSION = f"v{__version__}"

# Import-time side effects: create <cache>/tiktoken and export its path through
# the TIKTOKEN_CACHE_DIR environment variable for this process.
TIKTOKEN_CACHE_FOLDER = CACHE_FOLDER / "tiktoken"
TIKTOKEN_CACHE_FOLDER.mkdir(parents=True, exist_ok=True)
os.environ["TIKTOKEN_CACHE_DIR"] = str(TIKTOKEN_CACHE_FOLDER)


# Lazily created multiprocessing pool shared by this module (None until built).
_process_pool = None
# Guards creation and teardown of _process_pool.
_process_pool_lock = threading.Lock()
# Feature switch; while False, get_process_pool()/close_process_pool() do nothing.
_ENABLE_PROCESS_POOL = False


def enable_process_pool():
    """Turn on the shared process pool.

    Development and testing ONLY API.
    """
    global _ENABLE_PROCESS_POOL
    _ENABLE_PROCESS_POOL = True


# macos & windows use spawn mode
# linux use forkserver mode


def get_process_pool():
    """Return the shared process pool, creating it on first use.

    Returns None when the pool feature is disabled, or when no pool exists
    yet and the caller is not running in the process named "MainProcess".
    """
    global _process_pool
    if not _ENABLE_PROCESS_POOL:
        return None

    with _process_pool_lock:
        if _process_pool is not None:
            return _process_pool
        # Only the main process is allowed to create the pool.
        if mp.current_process().name != "MainProcess":
            return None
        _process_pool = mp.Pool()
        return _process_pool


def close_process_pool():
    """Close and join the shared process pool, if one was created.

    Does nothing when the pool feature is disabled. Always returns None.
    """
    global _process_pool
    if not _ENABLE_PROCESS_POOL:
        return None

    with _process_pool_lock:
        if not _process_pool:
            return None
        _process_pool.close()
        _process_pool.join()
        # Forget the pool so a later get_process_pool() can build a new one.
        _process_pool = None


def batched(iterable, n, *, strict=False):
    """Yield successive tuples of up to ``n`` items from ``iterable``.

    batched('ABCDEFG', 3) → ABC DEF G

    This is a generator, so the ``n < 1`` check and the ``strict`` check both
    raise ValueError lazily, while the result is being iterated. With
    ``strict=True`` a final batch shorter than ``n`` raises instead of being
    yielded.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    source = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(source, n))
        if not chunk:
            return
        if strict and len(chunk) != n:
            raise ValueError("batched(): incomplete batch")
        yield chunk


================================================
FILE: babeldoc/docvision/README.md
================================================


================================================
FILE: babeldoc/docvision/__init__.py
================================================


================================================
FILE: babeldoc/docvision/base_doclayout.py
================================================
import abc
import logging
from collections.abc import Generator

import pymupdf

from babeldoc.format.pdf.document_il.il_version_1 import Page

logger = logging.getLogger(__name__)


class YoloResult:
    """Detection result: a list of boxes ordered by confidence, plus class names.

    Either ``boxes`` (ready-made box objects exposing ``conf``) or
    ``boxes_data`` (rows that ``YoloBox(data=...)`` accepts) must be given.
    A list passed as ``boxes`` is kept as-is and sorted in place, highest
    confidence first.
    """

    def __init__(self, names, boxes=None, boxes_data=None):
        if boxes is None:
            assert boxes_data is not None
            boxes = [YoloBox(data=row) for row in boxes_data]
        self.boxes = boxes
        self.boxes.sort(key=lambda box: box.conf, reverse=True)
        self.names = names


class YoloBox:
    """One detection: ``xyxy`` coordinates, ``conf`` score and ``cls`` id.

    Built either from a flat ``data`` row (first four values are the box, the
    second-to-last is the confidence, the last is the class) or from the
    three explicit fields. ``data`` wins when both forms are supplied.
    """

    def __init__(self, data=None, xyxy=None, conf=None, cls=None):
        if data is None:
            # Without a data row all three explicit fields are mandatory.
            assert not (xyxy is None or conf is None or cls is None)
        else:
            xyxy, conf, cls = data[:4], data[-2], data[-1]
        self.xyxy = xyxy
        self.conf = conf
        self.cls = cls


class DocLayoutModel(abc.ABC):
    """Abstract interface of a document layout detector."""

    @staticmethod
    def load_onnx():
        """Build the ONNX-backed model via ``OnnxModel.from_pretrained()``."""
        logger.info("Loading ONNX model...")
        # Imported here rather than at module level; doclayout itself imports
        # this module, so a top-level import would be circular.
        from babeldoc.docvision.doclayout import OnnxModel

        return OnnxModel.from_pretrained()

    @staticmethod
    def load_available():
        """Return the default model; currently always the ONNX one."""
        return DocLayoutModel.load_onnx()

    @property
    @abc.abstractmethod
    def stride(self) -> int:
        """Stride of the model input."""

    @abc.abstractmethod
    def handle_document(
        self,
        pages: list[Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ) -> Generator[tuple[Page, YoloResult], None, None]:
        """Yield a ``(page, layout result)`` pair for the pages of a document."""


================================================
FILE: babeldoc/docvision/doclayout.py
================================================
import ast
import logging
import platform
import re
import threading
from collections.abc import Generator

import cv2
import numpy as np

from babeldoc.docvision.base_doclayout import DocLayoutModel
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img

try:
    import onnx
    import onnxruntime
except ImportError as e:
    if "DLL load failed" in str(e):
        raise OSError(
            "Microsoft Visual C++ Redistributable is not installed. "
            "Download it at https://aka.ms/vs/17/release/vc_redist.x64.exe"
        ) from e
    raise
import pymupdf

import babeldoc.format.pdf.document_il.il_version_1
from babeldoc.assets.assets import get_doclayout_onnx_model_path

# from huggingface_hub import hf_hub_download

logger = logging.getLogger(__name__)


# Detect the operating system type
os_name = platform.system()


class OnnxModel(DocLayoutModel):
    """Layout detector backed by a local ONNX model run through onnxruntime."""

    def __init__(self, model_path: str):
        """Load the model file and create a CPU-only inference session.

        The model's ``stride`` and ``names`` are read from its ONNX metadata
        properties, which store them as Python literals.
        """
        self.model_path = model_path

        model = onnx.load(model_path)
        metadata = {d.key: d.value for d in model.metadata_props}
        self._stride = ast.literal_eval(metadata["stride"])
        self._names = ast.literal_eval(metadata["names"])
        providers = []

        available_providers = onnxruntime.get_available_providers()
        for provider in available_providers:
            # disable dml|cuda|
            # directml/cuda may encounter problems under special circumstances
            # Only providers whose name starts with "cpu" (any case) are kept.
            if re.match(r"cpu", provider, re.IGNORECASE):
                logger.info(f"Available Provider: {provider}")
                providers.append(provider)
        self.model = onnxruntime.InferenceSession(
            model.SerializeToString(),
            providers=providers,
        )
        # Serialises page rendering in handle_document.
        self.lock = threading.Lock()

    @staticmethod
    def from_pretrained():
        """Create the model from the path given by get_doclayout_onnx_model_path()."""
        pth = get_doclayout_onnx_model_path()
        return OnnxModel(pth)

    @property
    def stride(self):
        """Stride of the model input, as stored in the model metadata."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize the image to fit inside ``new_shape`` and pad it to a stride multiple.

        The image is scaled, keeping its aspect ratio, so that it fits within
        ``new_shape``. Each axis is then padded with grey (114) only by the
        remainder of the leftover space modulo ``self.stride``. The output can
        therefore be smaller than ``new_shape``; its sides are multiples of
        the stride whenever ``new_shape`` itself is.

        Parameters:
        - image: Input image as an array of shape (height, width, channels)
        - new_shape: Target size (integer or (height, width) tuple)

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image,
            (resized_w, resized_h),
            interpolation=cv2.INTER_LINEAR,
        )

        # Calculate padding size and align to stride multiple
        pad_w = (new_w - resized_w) % self.stride
        pad_h = (new_h - resized_h) % self.stride
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding
        image = cv2.copyMakeBorder(
            image,
            top,
            bottom,
            left,
            right,
            cv2.BORDER_CONSTANT,
            value=(114, 114, 114),
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescale xyxy boxes from the model-input image back to the original image.

        The first four columns of ``boxes`` are overwritten in place and the
        same array is returned.

        Args:
            img1_shape (tuple): Shape (height, width) of the image the boxes
                currently refer to.
            boxes: Array whose last axis starts with (x1, y1, x2, y2); callers
                in this class pass NumPy arrays.
            img0_shape (tuple): Shape (height, width) of the target image.

        Returns:
            The rescaled boxes, in the format of (x1, y1, x2, y2).
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict(self, image, imgsz=800, batch_size=16, **kwargs):
        """
        Predict the layout of document pages.

        Args:
            image: A single image or a list of images of document pages.
            imgsz: Accepted for interface compatibility; not used. Images are
                always resized against a fixed target size of 1024.
            batch_size: Accepted for interface compatibility; not used. Images
                are always processed one at a time.
            **kwargs: Additional arguments (ignored).

        Returns:
            A list of YoloResult objects, one for each input image.
        """
        # Handle single image input
        if isinstance(image, np.ndarray) and len(image.shape) == 3:
            image = [image]

        total_images = len(image)
        results = []
        # Forced to 1: padded sizes depend on each image's aspect ratio, so
        # differently shaped pages could not be stacked into one batch.
        batch_size = 1

        # Process images in batches
        for i in range(0, total_images, batch_size):
            batch_images = image[i : i + batch_size]
            batch_size_actual = len(batch_images)

            target_imgsz = 1024

            # Preprocess batch
            processed_batch = []
            orig_shapes = []
            for img in batch_images:
                orig_h, orig_w = img.shape[:2]
                orig_shapes.append((orig_h, orig_w))

                pix = self.resize_and_pad_image(img, new_shape=target_imgsz)
                pix = np.transpose(pix, (2, 0, 1))  # CHW
                pix = pix.astype(np.float32) / 255.0  # Normalize to [0, 1]
                processed_batch.append(pix)

            # Stack batch
            batch_input = np.stack(processed_batch, axis=0)  # BCHW
            new_h, new_w = batch_input.shape[2:]

            # Run inference
            batch_preds = self.model.run(None, {"images": batch_input})[0]

            # Process each prediction in the batch
            for j in range(batch_size_actual):
                preds = batch_preds[j]
                # Keep detections whose confidence column exceeds 0.25.
                preds = preds[preds[..., 4] > 0.25]
                if len(preds) > 0:
                    preds[..., :4] = self.scale_boxes(
                        (new_h, new_w),
                        preds[..., :4],
                        orig_shapes[j],
                    )
                results.append(YoloResult(boxes_data=preds, names=self._names))

        return results

    def handle_document(
        self,
        pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ) -> Generator[
        tuple[babeldoc.format.pdf.document_il.il_version_1.Page, YoloResult], None, None
    ]:
        """Render each page, run layout detection, and yield (page, result).

        Cancellation is checked before every page. Only rendering is done
        under ``self.lock``; inference runs outside it. ``save_debug_image``
        is called with the image, the result and the 1-based page number.
        """
        for page in pages:
            translate_config.raise_if_cancelled()
            with self.lock:
                # pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
                pix = get_no_rotation_img(mupdf_doc[page.page_number])
            # View the 3-channel samples as (H, W, 3) and reverse the channel order.
            image = np.frombuffer(pix.samples, np.uint8).reshape(
                pix.height,
                pix.width,
                3,
            )[:, :, ::-1]
            predict_result = self.predict(image)[0]
            save_debug_image(
                image,
                predict_result,
                page.page_number + 1,
            )
            yield page, predict_result


================================================
FILE: babeldoc/docvision/rpc_doclayout.py
================================================
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import httpx
import msgpack
import numpy as np
import pymupdf
from tenacity import retry
from tenacity import retry_if_exception_type
from tenacity import stop_after_attempt
from tenacity import wait_exponential

import babeldoc
from babeldoc.docvision.base_doclayout import DocLayoutModel
from babeldoc.docvision.base_doclayout import YoloBox
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img

logger = logging.getLogger(__name__)


def encode_image(image) -> bytes:
    """Return ``image`` as JPEG-encoded bytes.

    Args:
        image: Either a file path (str), read with ``cv2.imread``, or an
            already loaded image array.

    Raises:
        FileNotFoundError: if a path is given and does not exist.
        ValueError: if the file exists but OpenCV cannot read it.
    """
    if not isinstance(image, str):
        pixels = image
    elif not Path(image).exists():
        raise FileNotFoundError(f"Image file not found: {image}")
    else:
        pixels = cv2.imread(image)
        if pixels is None:
            raise ValueError(f"Failed to read image: {image}")

    # The channel swap is applied to path and array inputs alike.
    swapped = cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)

    _ok, jpeg_buffer = cv2.imencode(".jpg", swapped)
    return jpeg_buffer.tobytes()


@retry(
    stop=stop_after_attempt(3),  # give up after 3 attempts
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # exponential backoff, starting at 1 second, capped at 10 seconds
    # Which exceptions trigger a retry. Exception already covers
    # httpx.HTTPError, so in effect every exception is retried.
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {retry_state.next_action.sleep} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    imgsz: int = 1024,
):
    """
    Predict document layout using the MOSEC service

    Args:
        image: A file path (str), an image array, or a list of either
        host: Service host URL
        imgsz: Image size for model input, forwarded to the service

    Returns:
        The msgpack-decoded response body of the service

    Raises:
        Exception: if the service answers with a status other than 200; any
            error from encoding, sending or decoding also propagates once the
            retry attempts are exhausted.
    """
    # Prepare request data
    if not isinstance(image, list):
        image = [image]
    image_data = [encode_image(single_image) for single_image in image]
    data = {
        "image": image_data,
        "imgsz": imgsz,
    }

    # Pack data using msgpack
    packed_data = msgpack.packb(data, use_bin_type=True)
    # logger.debug(f"Packed data size: {len(packed_data)} bytes")

    # Send request
    # logger.debug(f"Sending request to {host}/inference")
    # Raw bytes go through `content=`; httpx reserves `data=` for form data.
    response = httpx.post(
        f"{host}/inference",
        content=packed_data,
        headers={
            "Content-Type": "application/msgpack",
            "Accept": "application/msgpack",
        },
        timeout=300,
        follow_redirects=True,
    )

    # logger.debug(f"Response status: {response.status_code}")
    # logger.debug(f"Response headers: {response.headers}")

    if response.status_code == 200:
        try:
            result = msgpack.unpackb(response.content, raw=False)
            return result
        except Exception as e:
            logger.exception(f"Failed to unpack response: {e!s}")
            raise
    else:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.content}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )


class ResultContainer:
    """Mutable holder for one YoloResult, initialised with an empty result."""

    def __init__(self):
        no_boxes = np.array([])
        self.result = YoloResult(boxes_data=no_boxes, names=[])


class RpcDocLayoutModel(DocLayoutModel):
    """DocLayoutModel implementation that uses RPC service."""

    def __init__(self, host: str = "http://localhost:8000"):
        """Initialize RPC model with host address."""
        self.host = host
        self._stride = 32  # Default stride value
        self._names = ["text", "title", "list", "table", "figure"]
        # Serialises page rendering in predict_page.
        self.lock = threading.Lock()

    @property
    def stride(self) -> int:
        """Stride of the model input."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize the image to fit inside ``new_shape`` and pad it to exactly that size.

        The image is scaled, keeping its aspect ratio, and the remaining space
        is filled with grey (114) borders split evenly between both sides.

        Parameters:
        - image: Input image as an array of shape (height, width, channels)
        - new_shape: Target size (integer or (height, width) tuple)

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Calculate padding size
        pad_h = new_h - resized_h
        pad_w = new_w - resized_w
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding
        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescale xyxy boxes from the model-input image back to the original image.

        Args:
            img1_shape (tuple): Shape (height, width) of the image the boxes
                currently refer to.
            boxes: Array of (x1, y1, x2, y2) values; callers in this class
                pass NumPy arrays.
            img0_shape (tuple): Shape (height, width) of the target image.

        Returns:
            A new array with the rescaled boxes, in the format of (x1, y1, x2, y2).
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes = (boxes - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict_image(
        self,
        image,
        host: str | None = None,
        result_container: ResultContainer | None = None,
        imgsz: int = 1024,
    ) -> YoloResult:
        """Predict the layout of one page image using the RPC service.

        Args:
            image: Page image as an array of shape (height, width, channels).
            host: Accepted for interface compatibility; ``self.host`` is used.
            result_container: Optional holder that also receives the result.
            imgsz: Accepted for interface compatibility; a fixed size of 800
                is used for resizing and for the request.

        Returns:
            The YoloResult stored in the container. It stays the container's
            initial empty result when the service returns no predictions.
        """
        if result_container is None:
            result_container = ResultContainer()
        target_imgsz = (800, 800)
        orig_h, orig_w = image.shape[:2]
        if image.shape[0] != target_imgsz[0] or image.shape[1] != target_imgsz[1]:
            image = self.resize_and_pad_image(image, new_shape=target_imgsz)
        preds = predict_layout([image], host=self.host, imgsz=800)

        if len(preds) > 0:
            for pred in preds:
                boxes = [
                    YoloBox(
                        None,
                        self.scale_boxes(
                            (800, 800), np.array(x["xyxy"]), (orig_h, orig_w)
                        ),
                        np.array(x["conf"]),
                        x["cls"],
                    )
                    for x in pred["boxes"]
                ]
                result_container.result = YoloResult(
                    boxes=boxes,
                    names={int(k): v for k, v in pred["names"].items()},
                )
        return result_container.result

    def predict(self, image, imgsz=1024, **kwargs) -> list[YoloResult]:
        """Predict the layout of document pages using RPC service.

        Each image is sent in its own worker thread. An image whose request
        fails keeps an empty result; the failure is logged, not raised.
        An empty input list yields an empty result list.
        """
        # Handle single image input
        if isinstance(image, np.ndarray) and len(image.shape) == 3:
            image = [image]

        # ThreadPoolExecutor rejects max_workers=0, so handle "no images" here.
        if len(image) == 0:
            return []

        result_containers = [ResultContainer() for _ in image]
        with ThreadPoolExecutor(max_workers=len(image)) as predict_thread:
            futures = [
                predict_thread.submit(
                    self.predict_image, img, self.host, result_container, 800
                )
                for img, result_container in zip(
                    image, result_containers, strict=True
                )
            ]
        # Leaving the with-block has waited for every future to complete.
        for future in futures:
            error = future.exception()
            if error is not None:
                logger.error(
                    "Layout prediction failed for one image", exc_info=error
                )
        result = [result_container.result for result_container in result_containers]
        return result

    def predict_page(
        self, page, mupdf_doc: pymupdf.Document, translate_config, save_debug_image
    ):
        """Render one page, predict its layout, and return (page, result).

        Cancellation is checked first. Only rendering runs under
        ``self.lock``. ``save_debug_image`` receives the image, the result and
        the 1-based page number.
        """
        translate_config.raise_if_cancelled()
        with self.lock:
            # pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
            pix = get_no_rotation_img(mupdf_doc[page.page_number])
        # View the 3-channel samples as (H, W, 3) and reverse the channel order.
        image = np.frombuffer(pix.samples, np.uint8).reshape(
            pix.height,
            pix.width,
            3,
        )[:, :, ::-1]
        predict_result = self.predict_image(image, self.host, None, 800)
        save_debug_image(image, predict_result, page.page_number + 1)
        return page, predict_result

    def handle_document(
        self,
        pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ):
        """Yield (page, result) for every page, in page order.

        Pages are processed concurrently by up to 16 worker threads.
        """
        with ThreadPoolExecutor(max_workers=16) as executor:
            yield from executor.map(
                lambda page: self.predict_page(
                    page, mupdf_doc, translate_config, save_debug_image
                ),
                pages,
            )

    @staticmethod
    def from_host(host: str) -> "RpcDocLayoutModel":
        """Create RpcDocLayoutModel from host address."""
        return RpcDocLayoutModel(host=host)


# Manual smoke test: send one image to the default service host and print the
# raw prediction returned by predict_layout.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    # Test the service
    try:
        # Use a default test image if example/1.png doesn't exist
        image_path = "example/1.png"
        if not Path(image_path).exists():
            print(f"Warning: {image_path} not found.")
            print("Please provide the path to a test image:")
            image_path = input("> ")

        logger.info(f"Processing image: {image_path}")
        result = predict_layout(image_path)
        print("Prediction results:")
        print(result)
    except Exception as e:
        # Script boundary: report any failure instead of showing a traceback.
        print(f"Error: {e!s}")


================================================
FILE: babeldoc/docvision/rpc_doclayout2.py
================================================
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import httpx
import msgpack
import numpy as np
import pymupdf
from tenacity import retry
from tenacity import retry_if_exception_type
from tenacity import stop_after_attempt
from tenacity import wait_exponential

import babeldoc
from babeldoc.docvision.base_doclayout import DocLayoutModel
from babeldoc.docvision.base_doclayout import YoloBox
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Resolution (dots per inch) at which pages are rendered before prediction;
# predicted box coordinates are later converted from this resolution to 72.
DPI = 150


def encode_image(image) -> bytes:
    """Encode an image as JPEG bytes.

    Args:
        image: Either a file path (``str`` or ``pathlib.Path``), which is
            loaded with ``cv2.imread``, or an already-loaded image array.

    Returns:
        The JPEG-encoded image.

    Raises:
        FileNotFoundError: If a path is given and the file does not exist.
        ValueError: If the file cannot be read as an image or JPEG encoding
            reports failure.
    """
    if isinstance(image, (str, Path)):
        if not Path(image).exists():
            raise FileNotFoundError(f"Image file not found: {image}")
        # cv2.imread expects a plain string path.
        img = cv2.imread(str(image))
        if img is None:
            raise ValueError(f"Failed to read image: {image}")
    else:
        img = image

    # NOTE(review): the red/blue channels are swapped for every input,
    # including files loaded by cv2.imread (BGR per the OpenCV docs) —
    # confirm this is what the service expects.
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    # logger.debug(f"Image shape: {img.shape}")
    success, buffer = cv2.imencode(".jpg", img)
    if not success:
        raise ValueError("Failed to encode image as JPEG")
    # logger.debug(f"Encoded image size: {len(encoded)} bytes")
    return buffer.tobytes()


@retry(
    stop=stop_after_attempt(3),  # Give up after 3 attempts
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # Exponential backoff: starts at 1 second, capped at 10 seconds
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),  # Exception types that trigger a retry
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
):
    """
    Predict document layout using the MOSEC service

    The image(s) are JPEG-encoded with ``encode_image``, packed with msgpack
    and POSTed to ``{host}/inference``. Failures are subject to the retry
    policy of the decorator above.

    Args:
        image: A file path or image array accepted by ``encode_image``, or a
            list of them
        host: Service host URL
        _imgsz: Unused by this function

    Returns:
        When the service replies with a dict: a one-element list holding that
        dict, with boxes scoring below 0.7 removed and every remaining box
        given ``xyxy``, ``conf``, ``cls_id`` and ``cls`` keys. A ``names``
        mapping (class id -> label) is added when the reply has none. Any
        other reply payload is returned unchanged.

    Raises:
        Exception: If the service answers with a non-200 status, or if the
            response cannot be unpacked/processed.
    """
    # Prepare request data
    images = image if isinstance(image, list) else [image]
    image_data = [encode_image(item) for item in images]

    # Pack data using msgpack
    packed_data = msgpack.packb({"image": image_data}, use_bin_type=True)
    # logger.debug(f"Packed data size: {len(packed_data)} bytes")

    # Send request (raw bytes go through ``content=``; ``data=`` is meant for
    # form fields in httpx)
    response = httpx.post(
        f"{host}/inference",
        content=packed_data,
        headers={
            "Content-Type": "application/msgpack",
            "Accept": "application/msgpack",
        },
        timeout=480,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.content}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        result = msgpack.unpackb(response.content, raw=False)
        if isinstance(result, dict):
            names = {}  # class id -> label
            id_lookup = {}  # label -> class id
            useful_result = []
            for box in result["boxes"]:
                if box["score"] < 0.7:
                    continue

                label = box["label"]
                cls_id = id_lookup.get(label)
                if cls_id is None:
                    # First time this label is seen: allocate the next
                    # 1-based id so equal labels share one class id.
                    cls_id = len(id_lookup) + 1
                    id_lookup[label] = cls_id
                    names[cls_id] = label

                box["xyxy"] = box["coordinate"]
                box["conf"] = box["score"]
                box["cls_id"] = cls_id
                box["cls"] = cls_id
                useful_result.append(box)
            if "names" not in result:
                result["names"] = names
            result["boxes"] = useful_result
            result = [result]
        return result
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise


class ResultContainer:
    """Mutable holder for one prediction result, initially an empty one."""

    def __init__(self):
        # Start from a result built with no boxes and no class names.
        no_boxes = np.array([])
        self.result = YoloResult(boxes_data=no_boxes, names=[])


class RpcDocLayoutModel(DocLayoutModel):
    """DocLayoutModel implementation that uses RPC service.

    Images are sent to the remote service through ``predict_layout`` and the
    returned boxes are wrapped in ``YoloBox`` / ``YoloResult`` objects.
    """

    def __init__(self, host: str = "http://localhost:8000"):
        """Initialize RPC model with host address."""
        self.host = host
        self._stride = 32  # Default stride value
        self._names = ["text", "title", "list", "table", "figure"]
        # Held by predict_page while a page is being rendered.
        self.lock = threading.Lock()

    @property
    def stride(self) -> int:
        """Stride of the model input."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize the image to fit inside the specified size, keeping its aspect
        ratio, then pad it with a constant gray border to exactly that size.

        Parameters:
        - image: Input image (height and width are its first two axes)
        - new_shape: Target size (integer or (height, width) tuple)

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Calculate padding size, split as evenly as possible between sides
        pad_h = new_h - resized_h
        pad_w = new_w - resized_w
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding
        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes (np.ndarray): the bounding box coordinates, in the format of (x1, y1, x2, y2)
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes (np.ndarray): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes = (boxes - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict_image(
        self,
        image,
        host: str | None = None,
        result_container: ResultContainer | None = None,
        imgsz: int = 1024,
    ) -> YoloResult:
        """Predict the layout of a single page image using RPC service.

        Args:
            image: Page image array; its first two axes are taken as
                (height, width). Box coordinates are converted on the
                assumption that it was rendered at ``DPI``.
            host: Ignored; ``self.host`` is always used.
            result_container: Optional container that receives the result; a
                new one is created when omitted.
            imgsz: Ignored.

        Returns:
            The ``YoloResult`` stored in the container. This is the
            container's initial result when the service returned no
            predictions.
        """
        if result_container is None:
            result_container = ResultContainer()

        # The image is sent at its native size, so the shape the boxes refer
        # to is simply the image shape.
        orig_h, orig_w = image.shape[:2]
        target_imgsz = (orig_h, orig_w)
        preds = predict_layout(image, host=self.host)

        # Size of the same page at 72 per inch; boxes are rescaled to it.
        page_shape = (orig_h / DPI * 72, orig_w / DPI * 72)
        for pred in preds:
            boxes = [
                YoloBox(
                    None,
                    self.scale_boxes(target_imgsz, np.array(x["xyxy"]), page_shape),
                    np.array(x["conf"]),
                    x["cls"],
                )
                for x in pred["boxes"]
            ]
            result_container.result = YoloResult(
                boxes=boxes,
                names={int(k): v for k, v in pred["names"].items()},
            )
        return result_container.result

    def predict(self, image, imgsz=1024, **kwargs) -> list[YoloResult]:
        """Predict the layout of document pages using RPC service.

        Args:
            image: A single 3-dimensional image array or a sequence of images.
            imgsz: Unused by this method.
            **kwargs: Unused by this method.

        Returns:
            One ``YoloResult`` per input image, in input order. An image whose
            prediction failed keeps the empty default result; the failure is
            logged.
        """
        # Handle single image input
        if isinstance(image, np.ndarray) and len(image.shape) == 3:
            image = [image]

        # ThreadPoolExecutor rejects max_workers=0, so handle empty input here.
        if len(image) == 0:
            return []

        result_containers = [ResultContainer() for _ in image]
        with ThreadPoolExecutor(max_workers=len(image)) as executor:
            futures = [
                executor.submit(
                    self.predict_image, img, self.host, result_container, 800
                )
                for img, result_container in zip(
                    image, result_containers, strict=True
                )
            ]

        # Stay best-effort, but do not hide worker failures.
        for index, future in enumerate(futures):
            error = future.exception()
            if error is not None:
                logger.error(
                    "Layout prediction failed for image %d", index, exc_info=error
                )
        return [result_container.result for result_container in result_containers]

    def predict_page(
        self, page, mupdf_doc: pymupdf.Document, translate_config, save_debug_image
    ):
        """Render one page, predict its layout and report a debug image.

        Args:
            page: Page object; ``page.page_number`` indexes ``mupdf_doc``.
            mupdf_doc: Document the page is rendered from.
            translate_config: Its ``raise_if_cancelled()`` is called first.
            save_debug_image: Called with the image, the prediction result and
                the 1-based page number.

        Returns:
            Tuple of ``(page, predict_result)``.
        """
        translate_config.raise_if_cancelled()
        with self.lock:
            # pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
            pix = get_no_rotation_img(mupdf_doc[page.page_number], dpi=DPI)
        # View the raw samples as (height, width, 3) and reverse the channel
        # axis (presumably RGB -> BGR; depends on get_no_rotation_img).
        image = np.frombuffer(pix.samples, np.uint8).reshape(
            pix.height,
            pix.width,
            3,
        )[:, :, ::-1]
        predict_result = self.predict_image(image, self.host, None, 800)
        save_debug_image(image, predict_result, page.page_number + 1)
        return page, predict_result

    def handle_document(
        self,
        pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ):
        """Yield ``predict_page`` results for every page, in page order.

        The pages are processed by a thread pool; the same ``mupdf_doc``,
        ``translate_config`` and ``save_debug_image`` are passed to each call.
        """
        with ThreadPoolExecutor(max_workers=16) as executor:
            yield from executor.map(
                self.predict_page,
                pages,
                (mupdf_doc for _ in range(len(pages))),
                (translate_config for _ in range(len(pages))),
                (save_debug_image for _ in range(len(pages))),
            )

    @staticmethod
    def from_host(host: str) -> "RpcDocLayoutModel":
        """Create RpcDocLayoutModel from host address."""
        return RpcDocLayoutModel(host=host)


if __name__ == "__main__":
    # Manual smoke test: send one image to the service and print the reply.
    logging.basicConfig(level=logging.DEBUG)
    try:
        # Ask for a path on stdin when the default sample image is absent.
        image_path = "example/1.png"
        sample_available = Path(image_path).exists()
        if not sample_available:
            print(
                f"Warning: {image_path} not found.",
                "Please provide the path to a test image:",
                sep="\n",
            )
            image_path = input("> ")

        logger.info(f"Processing image: {image_path}")
        result = predict_layout(image_path)
        print("Prediction results:", result, sep="\n")
    except Exception as error:
        print(f"Error: {error!s}")


================================================
FILE: babeldoc/docvision/rpc_doclayout3.py
================================================
import json
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import httpx
import numpy as np
import pymupdf
from tenacity import retry
from tenacity import retry_if_exception_type
from tenacity import stop_after_attempt
from tenacity import wait_exponential

import babeldoc
from babeldoc.docvision.base_doclayout import DocLayoutModel
from babeldoc.docvision.base_doclayout import YoloBox
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Resolution (dots per inch) at which pages are rendered before prediction;
# predicted box coordinates are later converted from this resolution to 72.
DPI = 150


def encode_image(image) -> bytes:
    """Encode an image as JPEG bytes.

    Args:
        image: Either a file path (``str`` or ``pathlib.Path``), which is
            loaded with ``cv2.imread``, or an already-loaded image array.

    Returns:
        The JPEG-encoded image.

    Raises:
        FileNotFoundError: If a path is given and the file does not exist.
        ValueError: If the file cannot be read as an image or JPEG encoding
            reports failure.
    """
    if isinstance(image, (str, Path)):
        if not Path(image).exists():
            raise FileNotFoundError(f"Image file not found: {image}")
        # cv2.imread expects a plain string path.
        img = cv2.imread(str(image))
        if img is None:
            raise ValueError(f"Failed to read image: {image}")
    else:
        img = image

    # NOTE(review): the red/blue channels are swapped for every input,
    # including files loaded by cv2.imread (BGR per the OpenCV docs) —
    # confirm this is what the service expects.
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    # logger.debug(f"Image shape: {img.shape}")
    success, buffer = cv2.imencode(".jpg", img)
    if not success:
        raise ValueError("Failed to encode image as JPEG")
    # logger.debug(f"Encoded image size: {len(encoded)} bytes")
    return buffer.tobytes()


@retry(
    stop=stop_after_attempt(3),  # Give up after 3 attempts
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # Exponential backoff: starts at 1 second, capped at 10 seconds
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),  # Exception types that trigger a retry
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
):
    """
    Predict document layout using the remote analysis service

    The image is JPEG-encoded with ``encode_image`` and uploaded as a
    multipart file to ``{host}/analyze``; the reply is parsed as JSON.
    Failures are subject to the retry policy of the decorator above.

    Args:
        image: A file path or image array accepted by ``encode_image``
        host: Service host URL
        _imgsz: Unused by this function

    Returns:
        When the service replies with a JSON object: a one-element list
        holding that object, with boxes whose ``ocr_match_score`` is below
        0.7 removed and every remaining box given ``xyxy``, ``conf``,
        ``cls_id`` and ``cls`` keys. A ``names`` mapping (class id -> label)
        is added when the reply has none. Any other reply payload is returned
        unchanged.

    Raises:
        Exception: If the service answers with a non-200 status, or if the
            response cannot be parsed/processed.
    """
    # Prepare request data
    image_data = encode_image(image)

    # Send request
    response = httpx.post(
        f"{host}/analyze?min_sim=0.7&early_stop=0.99&timeout=1800",
        files={"file": ("image.jpg", image_data, "image/jpeg")},
        headers={
            "Accept": "application/json",
        },
        timeout=1800,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.content}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        result = json.loads(response.text)
        if isinstance(result, dict):
            names = {}  # class id -> label
            id_lookup = {}  # label -> class id
            useful_result = []
            for box in result["boxes"]:
                if box["ocr_match_score"] < 0.7:
                    continue

                label = box["label"]
                cls_id = id_lookup.get(label)
                if cls_id is None:
                    # First time this label is seen: allocate the next
                    # 1-based id so equal labels share one class id.
                    cls_id = len(id_lookup) + 1
                    id_lookup[label] = cls_id
                    names[cls_id] = label

                box["xyxy"] = box["coords"]
                box["conf"] = box["ocr_match_score"]
                box["cls_id"] = cls_id
                box["cls"] = cls_id
                useful_result.append(box)
            if "names" not in result:
                result["names"] = names
            result["boxes"] = useful_result
            result = [result]
        return result
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise


class ResultContainer:
    """Mutable holder for one prediction result, initially an empty one."""

    def __init__(self):
        # Start from a result built with no boxes and no class names.
        no_boxes = np.array([])
        self.result = YoloResult(boxes_data=no_boxes, names=[])


class RpcDocLayoutModel(DocLayoutModel):
    """DocLayoutModel implementation that uses RPC service.

    Images are sent to the remote service through ``predict_layout`` and the
    returned boxes are wrapped in ``YoloBox`` / ``YoloResult`` objects.
    """

    def __init__(self, host: str = "http://localhost:8000"):
        """Initialize RPC model with host address."""
        self.host = host
        self._stride = 32  # Default stride value
        self._names = ["text", "title", "list", "table", "figure"]
        # Held by predict_page while a page is being rendered.
        self.lock = threading.Lock()

    @property
    def stride(self) -> int:
        """Stride of the model input."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize the image to fit inside the specified size, keeping its aspect
        ratio, then pad it with a constant gray border to exactly that size.

        Parameters:
        - image: Input image (height and width are its first two axes)
        - new_shape: Target size (integer or (height, width) tuple)

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Calculate padding size, split as evenly as possible between sides
        pad_h = new_h - resized_h
        pad_w = new_w - resized_w
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding
        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes (np.ndarray): the bounding box coordinates, in the format of (x1, y1, x2, y2)
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes (np.ndarray): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes = (boxes - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict_image(
        self,
        image,
        host: str | None = None,
        result_container: ResultContainer | None = None,
        imgsz: int = 1024,
    ) -> YoloResult:
        """Predict the layout of a single page image using RPC service.

        Args:
            image: Page image array; its first two axes are taken as
                (height, width). Box coordinates are converted on the
                assumption that it was rendered at ``DPI``.
            host: Ignored; ``self.host`` is always used.
            result_container: Optional container that receives the result; a
                new one is created when omitted.
            imgsz: Ignored.

        Returns:
            The ``YoloResult`` stored in the container. This is the
            container's initial result when the service returned no
            predictions.
        """
        if result_container is None:
            result_container = ResultContainer()

        # The image is sent at its native size, so the shape the boxes refer
        # to is simply the image shape.
        orig_h, orig_w = image.shape[:2]
        target_imgsz = (orig_h, orig_w)
        preds = predict_layout(image, host=self.host)

        # Size of the same page at 72 per inch; boxes are rescaled to it.
        page_shape = (orig_h / DPI * 72, orig_w / DPI * 72)
        for pred in preds:
            boxes = [
                YoloBox(
                    None,
                    self.scale_boxes(target_imgsz, np.array(x["xyxy"]), page_shape),
                    np.array(x["conf"]),
                    x["cls"],
                )
                for x in pred["boxes"]
            ]
            result_container.result = YoloResult(
                boxes=boxes,
                names={int(k): v for k, v in pred["names"].items()},
            )
        return result_container.result

    def predict(self, image, imgsz=1024, **kwargs) -> list[YoloResult]:
        """Predict the layout of document pages using RPC service.

        Args:
            image: A single 3-dimensional image array or a sequence of images.
            imgsz: Unused by this method.
            **kwargs: Unused by this method.

        Returns:
            One ``YoloResult`` per input image, in input order. An image whose
            prediction failed keeps the empty default result; the failure is
            logged.
        """
        # Handle single image input
        if isinstance(image, np.ndarray) and len(image.shape) == 3:
            image = [image]

        # ThreadPoolExecutor rejects max_workers=0, so handle empty input here.
        if len(image) == 0:
            return []

        result_containers = [ResultContainer() for _ in image]
        with ThreadPoolExecutor(max_workers=len(image)) as executor:
            futures = [
                executor.submit(
                    self.predict_image, img, self.host, result_container, 800
                )
                for img, result_container in zip(
                    image, result_containers, strict=True
                )
            ]

        # Stay best-effort, but do not hide worker failures.
        for index, future in enumerate(futures):
            error = future.exception()
            if error is not None:
                logger.error(
                    "Layout prediction failed for image %d", index, exc_info=error
                )
        return [result_container.result for result_container in result_containers]

    def predict_page(
        self, page, mupdf_doc: pymupdf.Document, translate_config, save_debug_image
    ):
        """Render one page, predict its layout and report a debug image.

        Args:
            page: Page object; ``page.page_number`` indexes ``mupdf_doc``.
            mupdf_doc: Document the page is rendered from.
            translate_config: Its ``raise_if_cancelled()`` is called first.
            save_debug_image: Called with the image, the prediction result and
                the 1-based page number.

        Returns:
            Tuple of ``(page, predict_result)``.
        """
        translate_config.raise_if_cancelled()
        with self.lock:
            # pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
            pix = get_no_rotation_img(mupdf_doc[page.page_number], dpi=DPI)
        # View the raw samples as (height, width, 3) and reverse the channel
        # axis (presumably RGB -> BGR; depends on get_no_rotation_img).
        image = np.frombuffer(pix.samples, np.uint8).reshape(
            pix.height,
            pix.width,
            3,
        )[:, :, ::-1]
        predict_result = self.predict_image(image, self.host, None, 800)
        save_debug_image(image, predict_result, page.page_number + 1)
        return page, predict_result

    def handle_document(
        self,
        pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ):
        """Yield ``predict_page`` results for every page, in page order.

        The pages are processed by a thread pool; the same ``mupdf_doc``,
        ``translate_config`` and ``save_debug_image`` are passed to each call.
        """
        with ThreadPoolExecutor(max_workers=4) as executor:
            yield from executor.map(
                self.predict_page,
                pages,
                (mupdf_doc for _ in range(len(pages))),
                (translate_config for _ in range(len(pages))),
                (save_debug_image for _ in range(len(pages))),
            )

    @staticmethod
    def from_host(host: str) -> "RpcDocLayoutModel":
        """Create RpcDocLayoutModel from host address."""
        return RpcDocLayoutModel(host=host)


if __name__ == "__main__":
    # Manual smoke test: send one image to the service and print the reply.
    logging.basicConfig(level=logging.DEBUG)
    try:
        # Ask for a path on stdin when the default sample image is absent.
        image_path = "example/1.png"
        sample_available = Path(image_path).exists()
        if not sample_available:
            print(
                f"Warning: {image_path} not found.",
                "Please provide the path to a test image:",
                sep="\n",
            )
            image_path = input("> ")

        logger.info(f"Processing image: {image_path}")
        result = predict_layout(image_path)
        print("Prediction results:", result, sep="\n")
    except Exception as error:
        print(f"Error: {error!s}")


================================================
FILE: babeldoc/docvision/rpc_doclayout4.py
================================================
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import httpx
import msgpack
import numpy as np
import pymupdf
from tenacity import retry
from tenacity import retry_if_exception_type
from tenacity import stop_after_attempt
from tenacity import wait_exponential

import babeldoc
from babeldoc.docvision.base_doclayout import DocLayoutModel
from babeldoc.docvision.base_doclayout import YoloBox
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Resolution (dots per inch) at which pages are rendered before prediction;
# predicted box coordinates are later converted from this resolution to 72.
DPI = 150


def encode_image(image) -> bytes:
    """Encode an image as JPEG bytes.

    Args:
        image: Either a file path (``str`` or ``pathlib.Path``), which is
            loaded with ``cv2.imread``, or an already-loaded image array.

    Returns:
        The JPEG-encoded image.

    Raises:
        FileNotFoundError: If a path is given and the file does not exist.
        ValueError: If the file cannot be read as an image or JPEG encoding
            reports failure.
    """
    if isinstance(image, (str, Path)):
        if not Path(image).exists():
            raise FileNotFoundError(f"Image file not found: {image}")
        # cv2.imread expects a plain string path.
        img = cv2.imread(str(image))
        if img is None:
            raise ValueError(f"Failed to read image: {image}")
    else:
        img = image

    # NOTE(review): the red/blue channels are swapped for every input,
    # including files loaded by cv2.imread (BGR per the OpenCV docs) —
    # confirm this is what the service expects.
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    # logger.debug(f"Image shape: {img.shape}")
    success, buffer = cv2.imencode(".jpg", img)
    if not success:
        raise ValueError("Failed to encode image as JPEG")
    # logger.debug(f"Encoded image size: {len(encoded)} bytes")
    return buffer.tobytes()


@retry(
    stop=stop_after_attempt(3),  # Give up after 3 attempts
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # Exponential backoff: starts at 1 second, capped at 10 seconds
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),  # Exception types that trigger a retry
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
):
    """
    Predict document layout using the MOSEC service

    The image(s) are JPEG-encoded with ``encode_image``, packed with msgpack
    and POSTed to ``{host}/inference``. Failures are subject to the retry
    policy of the decorator above.

    Args:
        image: A file path or image array accepted by ``encode_image``, or a
            list of them
        host: Service host URL
        _imgsz: Unused by this function

    Returns:
        When the service replies with a dict: a one-element list holding that
        dict, with boxes scoring below 0.7 removed and every remaining box
        given ``xyxy``, ``conf``, ``cls_id`` and ``cls`` keys. A ``names``
        mapping (class id -> label) is added when the reply has none. Any
        other reply payload is returned unchanged.

    Raises:
        Exception: If the service answers with a non-200 status, or if the
            response cannot be unpacked/processed.
    """
    # Prepare request data
    images = image if isinstance(image, list) else [image]
    image_data = [encode_image(item) for item in images]

    # Pack data using msgpack
    packed_data = msgpack.packb({"image": image_data}, use_bin_type=True)
    # logger.debug(f"Packed data size: {len(packed_data)} bytes")

    # Send request (raw bytes go through ``content=``; ``data=`` is meant for
    # form fields in httpx)
    response = httpx.post(
        f"{host}/inference",
        content=packed_data,
        headers={
            "Content-Type": "application/msgpack",
            "Accept": "application/msgpack",
        },
        timeout=480,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.content}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        result = msgpack.unpackb(response.content, raw=False)
        if isinstance(result, dict):
            names = {}  # class id -> label
            id_lookup = {}  # label -> class id
            useful_result = []
            for box in result["boxes"]:
                if box["score"] < 0.7:
                    continue

                label = box["label"]
                cls_id = id_lookup.get(label)
                if cls_id is None:
                    # First time this label is seen: allocate the next
                    # 1-based id so equal labels share one class id.
                    cls_id = len(id_lookup) + 1
                    id_lookup[label] = cls_id
                    names[cls_id] = label

                box["xyxy"] = box["coordinate"]
                box["conf"] = box["score"]
                box["cls_id"] = cls_id
                box["cls"] = cls_id
                useful_result.append(box)
            if "names" not in result:
                result["names"] = names
            result["boxes"] = useful_result
            result = [result]
        return result
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise


class ResultContainer:
    """Mutable holder for one prediction result, initially an empty one."""

    def __init__(self):
        # Start from a result built with no boxes and no class names.
        no_boxes = np.array([])
        self.result = YoloResult(boxes_data=no_boxes, names=[])


class RpcDocLayoutModel(DocLayoutModel):
    """DocLayoutModel implementation that uses RPC service.

    Images are sent to the remote service through ``predict_layout`` and the
    returned boxes are wrapped in ``YoloBox`` / ``YoloResult`` objects.
    """

    def __init__(self, host: str = "http://localhost:8000"):
        """Initialize RPC model with host address."""
        self.host = host
        self._stride = 32  # Default stride value
        self._names = ["text", "title", "list", "table", "figure"]
        # Held by predict_page while a page is being rendered.
        self.lock = threading.Lock()

    @property
    def stride(self) -> int:
        """Stride of the model input."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize the image to fit inside the specified size, keeping its aspect
        ratio, then pad it with a constant gray border to exactly that size.

        Parameters:
        - image: Input image (height and width are its first two axes)
        - new_shape: Target size (integer or (height, width) tuple)

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Calculate padding size, split as evenly as possible between sides
        pad_h = new_h - resized_h
        pad_w = new_w - resized_w
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding
        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes (np.ndarray): the bounding box coordinates, in the format of (x1, y1, x2, y2)
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes (np.ndarray): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes = (boxes - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict_image(
        self,
        image,
        host: str | None = None,
        result_container: ResultContainer | None = None,
        imgsz: int = 1024,
    ) -> YoloResult:
        """Predict the layout of a single page image using RPC service.

        Args:
            image: Page image array; its first two axes are taken as
                (height, width). Box coordinates are converted on the
                assumption that it was rendered at ``DPI``.
            host: Ignored; ``self.host`` is always used.
            result_container: Optional container that receives the result; a
                new one is created when omitted.
            imgsz: Ignored.

        Returns:
            The ``YoloResult`` stored in the container. This is the
            container's initial result when the service returned no
            predictions.
        """
        if result_container is None:
            result_container = ResultContainer()

        # The image is sent at its native size, so the shape the boxes refer
        # to is simply the image shape.
        orig_h, orig_w = image.shape[:2]
        target_imgsz = (orig_h, orig_w)
        preds = predict_layout(image, host=self.host)

        # Size of the same page at 72 per inch; boxes are rescaled to it.
        page_shape = (orig_h / DPI * 72, orig_w / DPI * 72)
        for pred in preds:
            boxes = [
                YoloBox(
                    None,
                    self.scale_boxes(target_imgsz, np.array(x["xyxy"]), page_shape),
                    np.array(x["conf"]),
                    x["cls"],
                )
                for x in pred["boxes"]
            ]
            result_container.result = YoloResult(
                boxes=boxes,
                names={int(k): v for k, v in pred["names"].items()},
            )
        return result_container.result

    def predict(self, image, imgsz=1024, **kwargs) -> list[YoloResult]:
        """Predict the layout of document pages using RPC service.

        Args:
            image: A single 3-dimensional image array or a sequence of images.
            imgsz: Unused by this method.
            **kwargs: Unused by this method.

        Returns:
            One ``YoloResult`` per input image, in input order. An image whose
            prediction failed keeps the empty default result; the failure is
            logged.
        """
        # Handle single image input
        if isinstance(image, np.ndarray) and len(image.shape) == 3:
            image = [image]

        # ThreadPoolExecutor rejects max_workers=0, so handle empty input here.
        if len(image) == 0:
            return []

        result_containers = [ResultContainer() for _ in image]
        with ThreadPoolExecutor(max_workers=len(image)) as executor:
            futures = [
                executor.submit(
                    self.predict_image, img, self.host, result_container, 800
                )
                for img, result_container in zip(
                    image, result_containers, strict=True
                )
            ]

        # Stay best-effort, but do not hide worker failures.
        for index, future in enumerate(futures):
            error = future.exception()
            if error is not None:
                logger.error(
                    "Layout prediction failed for image %d", index, exc_info=error
                )
        return [result_container.result for result_container in result_containers]

    def predict_page(
        self, page, mupdf_doc: pymupdf.Document, translate_config, save_debug_image
    ):
        """Render one page, predict its layout and report a debug image.

        Args:
            page: Page object; ``page.page_number`` indexes ``mupdf_doc``.
            mupdf_doc: Document the page is rendered from.
            translate_config: Its ``raise_if_cancelled()`` is called first.
            save_debug_image: Called with the image, the prediction result and
                the 1-based page number.

        Returns:
            Tuple of ``(page, predict_result)``.
        """
        translate_config.raise_if_cancelled()
        with self.lock:
            # pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
            pix = get_no_rotation_img(mupdf_doc[page.page_number], dpi=DPI)
        # View the raw samples as (height, width, 3) and reverse the channel
        # axis (presumably RGB -> BGR; depends on get_no_rotation_img).
        image = np.frombuffer(pix.samples, np.uint8).reshape(
            pix.height,
            pix.width,
            3,
        )[:, :, ::-1]
        predict_result = self.predict_image(image, self.host, None, 800)
        save_debug_image(image, predict_result, page.page_number + 1)
        return page, predict_result

    def handle_document(
        self,
        pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ):
        """Yield ``predict_page`` results for every page, in page order.

        The pages are processed by a single-worker thread pool; the same
        ``mupdf_doc``, ``translate_config`` and ``save_debug_image`` are
        passed to each call.
        """
        with ThreadPoolExecutor(max_workers=1) as executor:
            yield from executor.map(
                self.predict_page,
                pages,
                (mupdf_doc for _ in range(len(pages))),
                (translate_config for _ in range(len(pages))),
                (save_debug_image for _ in range(len(pages))),
            )

    @staticmethod
    def from_host(host: str) -> "RpcDocLayoutModel":
        """Create RpcDocLayoutModel from host address."""
        return RpcDocLayoutModel(host=host)


if __name__ == "__main__":
    # Manual smoke test: send one image to the service and print the reply.
    logging.basicConfig(level=logging.DEBUG)
    try:
        # Ask for a path on stdin when the default sample image is absent.
        image_path = "example/1.png"
        sample_available = Path(image_path).exists()
        if not sample_available:
            print(
                f"Warning: {image_path} not found.",
                "Please provide the path to a test image:",
                sep="\n",
            )
            image_path = input("> ")

        logger.info(f"Processing image: {image_path}")
        result = predict_layout(image_path)
        print("Prediction results:", result, sep="\n")
    except Exception as error:
        print(f"Error: {error!s}")


================================================
FILE: babeldoc/docvision/rpc_doclayout5.py
================================================
import json
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import httpx
import numpy as np
import pymupdf
from tenacity import retry
from tenacity import retry_if_exception_type
from tenacity import stop_after_attempt
from tenacity import wait_exponential

import babeldoc
from babeldoc.docvision.base_doclayout import DocLayoutModel
from babeldoc.docvision.base_doclayout import YoloBox
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Resolution (dots per inch) at which PDF pages are rasterized before being
# sent to the layout service; results are converted back with a factor of
# 72 / DPI.
DPI = 150


def encode_image(image) -> bytes:
    """Return the JPEG-encoded bytes of an image.

    Args:
        image: Either a file path (str), which is loaded with ``cv2.imread``,
            or an already-loaded numpy array, which is used as is.

    Returns:
        The JPEG byte string produced by ``cv2.imencode``.

    Raises:
        FileNotFoundError: If a path is given and it does not exist.
        ValueError: If a path is given and OpenCV cannot read it.
    """
    if not isinstance(image, str):
        pixels = image
    else:
        if not Path(image).exists():
            raise FileNotFoundError(f"Image file not found: {image}")
        pixels = cv2.imread(image)
        if pixels is None:
            raise ValueError(f"Failed to read image: {image}")

    # The channel order is swapped before encoding, for both input kinds.
    swapped = cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)
    _ok, jpeg_buffer = cv2.imencode(".jpg", swapped)
    return jpeg_buffer.tobytes()


@retry(
    stop=stop_after_attempt(3),  # give up after the 3rd attempt
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # exponential backoff, starting at 1 second and capped at 10 seconds
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),  # Exception already covers httpx.HTTPError
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
):
    """Send one image to the layout service and normalize its JSON answer.

    The image is JPEG-encoded and uploaded as a multipart file to
    ``{host}/analyze_hybrid``. When the service answers with a JSON object,
    every entry of its ``"clusters"`` list is augmented in place with the keys
    ``"xyxy"`` (copy of ``"box"``), ``"conf"`` (always 1), ``"cls_id"`` and
    ``"cls"`` (both the class id assigned to the entry's ``"label"``).

    Class ids are assigned per distinct label, starting at 1 in order of first
    appearance, so boxes sharing a label share an id.

    Args:
        image: Either a file path (str) or a numpy array.
        host: Base URL of the service.
        _imgsz: Unused; kept for signature compatibility.

    Returns:
        A one-element list holding the response object, extended with
        ``"boxes"`` and (if the service did not send one) ``"names"`` mapping
        class id to label. A JSON answer that is not an object is returned
        unchanged.

    Raises:
        Exception: On a non-200 status, or re-raised when the body cannot be
            processed. The ``@retry`` decorator retries on any exception, so
            callers see the error only after the final attempt.
    """
    image_data = encode_image(image)

    response = httpx.post(
        f"{host}/analyze_hybrid?min_sim=0.7&early_stop=0.99&timeout=1800",
        files={"file": ("image.jpg", image_data, "image/jpeg")},
        headers={
            "Accept": "application/json",
        },
        timeout=1800,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.text}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        result = json.loads(response.text)
        if isinstance(result, dict):
            names = {}  # class id -> label
            id_lookup = {}  # label -> class id
            useful_result = []
            for box in result["clusters"]:
                box["xyxy"] = box["box"]
                box["conf"] = 1
                label = box["label"]
                # Look the label up in id_lookup (keyed by label); `names` is
                # keyed by id, so testing the label against it never matches.
                cls_id = id_lookup.get(label)
                if cls_id is None:
                    cls_id = len(id_lookup) + 1
                    id_lookup[label] = cls_id
                    names[cls_id] = label
                box["cls_id"] = cls_id
                box["cls"] = cls_id
                useful_result.append(box)
            if "names" not in result:
                result["names"] = names
            result["boxes"] = useful_result
            result = [result]
        return result
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise


class ResultContainer:
    """Mutable holder for one prediction, initialized to an empty YoloResult."""

    def __init__(self):
        no_boxes = np.array([])
        self.result = YoloResult(boxes_data=no_boxes, names=[])


class RpcDocLayoutModel(DocLayoutModel):
    """DocLayoutModel implementation that delegates prediction to an RPC service."""

    def __init__(self, host: str = "http://localhost:8000"):
        """Initialize the model.

        Args:
            host: Base URL of the layout service.
        """
        self.host = host
        self._stride = 32  # Default stride value
        self._names = ["text", "title", "list", "table", "figure"]
        # Serializes page rendering in predict_page.
        self.lock = threading.Lock()

    @property
    def stride(self) -> int:
        """Stride of the model input."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """Letterbox an image into ``new_shape``.

        The image is scaled with its aspect ratio preserved so that it fits
        inside ``new_shape`` and is then centered on a gray (114, 114, 114)
        canvas of exactly that size.

        Args:
            image: Input image array; the first two axes are (height, width).
            new_shape: Target size, either an int (square) or a
                (height, width) tuple.

        Returns:
            The resized and padded image.
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Largest uniform scale at which the image still fits the target.
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Split the leftover space between the two sides; the odd pixel, if
        # any, goes to the bottom/right.
        pad_h = new_h - resized_h
        pad_w = new_w - resized_w
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """Map xyxy boxes from a letterboxed image back to another image's frame.

        Args:
            img1_shape: (height, width) of the image the boxes refer to.
            boxes: Numpy array of box coordinates in (x1, y1, x2, y2) order.
            img0_shape: (height, width) of the target image.

        Returns:
            The boxes with the letterbox padding removed and divided by the
            scale factor between the two shapes.
        """
        # Uniform scale that maps img0 into img1.
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Padding that centers the scaled img0 inside img1.
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        boxes = (boxes - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict_image(
        self,
        image,
        host: str | None = None,
        result_container: ResultContainer | None = None,
        imgsz: int = 1024,
    ) -> YoloResult:
        """Predict the layout of one page image through the RPC service.

        Args:
            image: Page image array rendered at ``DPI``.
            host: Currently unused; the request always goes to ``self.host``.
            result_container: Optional holder that also receives the result,
                so a caller running this in a worker thread can read it later.
            imgsz: Currently unused.

        Returns:
            The YoloResult stored in ``result_container``. Box coordinates
            are scaled from pixels at ``DPI`` to a 72-per-inch frame.
        """
        if result_container is None:
            result_container = ResultContainer()

        # The image is sent at its native size, so the box coordinates are
        # in the pixel frame of `image` itself.
        orig_h, orig_w = image.shape[:2]
        target_imgsz = (orig_h, orig_w)

        preds = predict_layout(image, host=self.host)

        pdf_h, pdf_w = orig_h / DPI * 72, orig_w / DPI * 72
        for pred in preds:
            boxes = [
                YoloBox(
                    None,
                    self.scale_boxes(target_imgsz, np.array(x["xyxy"]), (pdf_h, pdf_w)),
                    np.array(x["conf"]),
                    x["cls"],
                )
                for x in pred["boxes"]
            ]
            result_container.result = YoloResult(
                boxes=boxes,
                names={int(k): v for k, v in pred["names"].items()},
            )
        return result_container.result

    def predict(self, image, imgsz=1024, **kwargs) -> list[YoloResult]:
        """Predict the layout of one image or a batch of images.

        Args:
            image: A single 3-D image array or a sequence of images.
            imgsz: Unused.
            **kwargs: Unused.

        Returns:
            One YoloResult per input image, in input order. An image whose
            prediction failed keeps the empty default result; the failure is
            logged rather than raised. An empty batch yields an empty list.
        """
        # Handle single image input
        if isinstance(image, np.ndarray) and len(image.shape) == 3:
            image = [image]

        # ThreadPoolExecutor rejects max_workers=0, so bail out on empty input.
        if len(image) == 0:
            return []

        result_containers = [ResultContainer() for _ in image]
        with ThreadPoolExecutor(max_workers=len(image)) as pool:
            futures = [
                pool.submit(self.predict_image, img, self.host, result_container, 800)
                for img, result_container in zip(image, result_containers, strict=True)
            ]

        # Leaving the `with` block waits for every task, so all futures are done.
        for index, future in enumerate(futures):
            error = future.exception()
            if error is not None:
                logger.error(
                    "Layout prediction failed for image %d", index, exc_info=error
                )

        return [result_container.result for result_container in result_containers]

    def predict_page(
        self, page, mupdf_doc: pymupdf.Document, translate_config, save_debug_image
    ):
        """Render one page, predict its layout, and emit a debug image.

        Args:
            page: Page object; ``page.page_number`` indexes ``mupdf_doc``.
            mupdf_doc: The opened document to render from.
            translate_config: Provides ``raise_if_cancelled()``.
            save_debug_image: Callback called with the rendered image, the
                prediction, and the 1-based page number.

        Returns:
            A ``(page, prediction)`` tuple.
        """
        translate_config.raise_if_cancelled()
        # Rendering is serialized through the lock.
        with self.lock:
            pix = get_no_rotation_img(mupdf_doc[page.page_number], dpi=DPI)
        # View the raw samples as (height, width, 3) and reverse the channel axis.
        image = np.frombuffer(pix.samples, np.uint8).reshape(
            pix.height,
            pix.width,
            3,
        )[:, :, ::-1]
        predict_result = self.predict_image(image, self.host, None, 800)
        save_debug_image(image, predict_result, page.page_number + 1)
        return page, predict_result

    def handle_document(
        self,
        pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ):
        """Lazily yield ``(page, prediction)`` for every page, in input order.

        Args:
            pages: Pages to process.
            mupdf_doc: Document handed to every ``predict_page`` call.
            translate_config: Configuration handed to ``predict_page``.
            save_debug_image: Callback handed to ``predict_page``.
        """
        # A single worker keeps the pages strictly sequential.
        with ThreadPoolExecutor(max_workers=1) as executor:
            yield from executor.map(
                self.predict_page,
                pages,
                (mupdf_doc for _ in range(len(pages))),
                (translate_config for _ in range(len(pages))),
                (save_debug_image for _ in range(len(pages))),
            )

    @staticmethod
    def from_host(host: str) -> "RpcDocLayoutModel":
        """Create RpcDocLayoutModel from host address."""
        return RpcDocLayoutModel(host=host)


if __name__ == "__main__":
    # Manual smoke test: send one image to the service and print its answer.
    logging.basicConfig(level=logging.DEBUG)
    sample_image = "example/1.png"
    try:
        if Path(sample_image).exists():
            chosen_image = sample_image
        else:
            # The sample image is absent, so ask for a path interactively.
            print(f"Warning: {sample_image} not found.")
            print("Please provide the path to a test image:")
            chosen_image = input("> ")

        logger.info(f"Processing image: {chosen_image}")
        layout = predict_layout(chosen_image)
        print("Prediction results:")
        print(layout)
    except Exception as err:
        # Report any failure as a one-line message instead of a traceback.
        print(f"Error: {err!s}")


================================================
FILE: babeldoc/docvision/rpc_doclayout6.py
================================================
import base64
import json
import logging
import threading
import unicodedata
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import httpx
import msgpack
import numpy as np
import pymupdf
from tenacity import retry
from tenacity import retry_if_exception_type
from tenacity import stop_after_attempt
from tenacity import wait_exponential

import babeldoc
from babeldoc.docvision.base_doclayout import DocLayoutModel
from babeldoc.docvision.base_doclayout import YoloBox
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il.utils.extract_char import (
    convert_page_to_char_boxes,
)
from babeldoc.format.pdf.document_il.utils.extract_char import (
    process_page_chars_to_lines,
)
from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper
from babeldoc.format.pdf.document_il.utils.layout_helper import SPACE_REGEX
from babeldoc.format.pdf.document_il.utils.mupdf_helper import (
    get_no_rotation_img_multiprocess,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Resolution (dots per inch) at which PDF pages are rasterized; coordinates
# are converted between this pixel frame and a 72-per-inch frame with the
# factor DPI / 72.
DPI = 150


def encode_image(image) -> bytes:
    """Return the JPEG-encoded bytes of an image.

    Args:
        image: Either a file path (str), which is loaded with ``cv2.imread``,
            or an already-loaded numpy array, which is used as is.

    Returns:
        The JPEG byte string produced by ``cv2.imencode``.

    Raises:
        FileNotFoundError: If a path is given and it does not exist.
        ValueError: If a path is given and OpenCV cannot read it.
    """
    if not isinstance(image, str):
        pixels = image
    else:
        if not Path(image).exists():
            raise FileNotFoundError(f"Image file not found: {image}")
        pixels = cv2.imread(image)
        if pixels is None:
            raise ValueError(f"Failed to read image: {image}")

    # The channel order is swapped before encoding, for both input kinds.
    swapped = cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)
    _ok, jpeg_buffer = cv2.imencode(".jpg", swapped)
    return jpeg_buffer.tobytes()


def clip_num(num: float, min_value: float, max_value: float) -> float:
    """Clamp ``num`` into the range [``min_value``, ``max_value``].

    The lower bound is tested first, so it wins if the bounds are inverted.
    """
    if num < min_value:
        return min_value
    return max_value if num > max_value else num


@retry(
    stop=stop_after_attempt(5),  # give up after the 5th attempt
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # exponential backoff, starting at 1 second and capped at 10 seconds
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),  # Exception already covers httpx.HTTPError
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed VLM, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/5)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
    lines=None,
    font_mapper: FontMapper | None = None,
):
    """Predict document layout from a page image plus its text lines (RPC service).

    Each line is reduced to a pixel-space bounding box and its filtered text,
    and the set is posted as JSON, together with the base64 JPEG of the image,
    to ``{host}/inference``. When the service answers with a JSON object,
    every entry of its ``"clusters"`` list is augmented in place with
    ``"xyxy"``, ``"conf"`` (always 1), ``"cls_id"`` and ``"cls"``.

    Class ids are assigned per distinct label, starting at 1 in order of first
    appearance, so boxes sharing a label share an id.

    Args:
        image: Page image as a numpy array rendered at ``DPI``. A str path is
            accepted by the encoder but only works when no line is converted,
            because the conversion reads ``image.shape``.
        host: Base URL of the service.
        _imgsz: Unused; kept for signature compatibility.
        lines: Text lines exposing ``text`` and ``chars``; defaults to none.
        font_mapper: Passed to ``filter_text`` to drop unsupported characters.
            Must not be None when a line with text reaches the filtering step.

    Returns:
        None when no line yields a usable entry (no request is sent).
        Otherwise a one-element list holding the response object, extended
        with ``"boxes"`` and (if absent) ``"names"``; a JSON answer that is
        not an object is returned unchanged.

    Raises:
        Exception: On a non-200 status, or re-raised when the body cannot be
            processed. The ``@retry`` decorator retries on any exception, so
            callers see the error only after the final attempt.
    """

    if lines is None:
        lines = []

    image_data = encode_image(image)

    def convert_line(line):
        """Return ``{"box", "text"}`` for a line, or None if it should be skipped."""
        if not line.text:
            return None
        boxes = [c[0] for c in line.chars]
        min_x = min(b.x for b in boxes)
        max_x = max(b.x2 for b in boxes)
        min_y = min(b.y for b in boxes)
        max_y = max(b.y2 for b in boxes)

        image_height, image_width = image.shape[:2]

        # Scale from the 72-per-inch frame to image pixels.
        min_x = min_x / 72 * DPI
        max_x = max_x / 72 * DPI
        min_y = min_y / 72 * DPI
        max_y = max_y / 72 * DPI

        # Flip the vertical axis so y is measured from the top of the image.
        min_y, max_y = image_height - max_y, image_height - min_y

        # Skip boxes covering less than one square pixel.
        box_volume = (max_x - min_x) * (max_y - min_y)
        if box_volume < 1:
            return None

        min_x = clip_num(min_x, 0, image_width - 1)
        max_x = clip_num(max_x, 0, image_width - 1)
        min_y = clip_num(min_y, 0, image_height - 1)
        max_y = clip_num(max_y, 0, image_height - 1)

        filtered_text = filter_text(line.text, font_mapper)
        if not filtered_text:
            return None

        return {"box": [min_x, min_y, max_x, max_y], "text": filtered_text}

    formatted_results = [
        converted
        for converted in (convert_line(line) for line in lines)
        if converted is not None
    ]
    if not formatted_results:
        return None

    image_b64 = base64.b64encode(image_data).decode("utf-8")

    request_data = {
        "image": image_b64,
        "ocr_results": formatted_results,
        "image_size": list(image.shape[:2])[::-1],  # (width, height)
    }

    response = httpx.post(
        f"{host}/inference",
        json=request_data,
        headers={"Accept": "application/json", "Content-Type": "application/json"},
        timeout=30,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.text}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        result = json.loads(response.text)
        if isinstance(result, dict):
            names = {}  # class id -> label
            id_lookup = {}  # label -> class id
            useful_result = []
            for box in result["clusters"]:
                box["xyxy"] = box["box"]
                box["conf"] = 1
                label = box["label"]
                # Look the label up in id_lookup (keyed by label); `names` is
                # keyed by id, so testing the label against it never matches.
                cls_id = id_lookup.get(label)
                if cls_id is None:
                    cls_id = len(id_lookup) + 1
                    id_lookup[label] = cls_id
                    names[cls_id] = label
                box["cls_id"] = cls_id
                box["cls"] = cls_id
                useful_result.append(box)
            if "names" not in result:
                result["names"] = names
            result["boxes"] = useful_result
            result = [result]
        return result
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise


@retry(
    stop=stop_after_attempt(5),  # give up after the 5th attempt
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # exponential backoff, starting at 1 second and capped at 10 seconds
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),  # Exception already covers httpx.HTTPError
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed PADDLE, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/5)"
    ),
)
def predict_layout2(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
):
    """Predict document layout through the msgpack ``/inference`` endpoint.

    The image (or each image of a list) is JPEG-encoded, packed with msgpack
    and posted to ``{host}/inference``. When the unpacked answer is a dict,
    entries of its ``"boxes"`` list with a ``"score"`` below 0.7 are dropped
    and the remaining ones are augmented in place with ``"xyxy"`` (copy of
    ``"coordinate"``), ``"conf"`` (copy of ``"score"``), ``"cls_id"`` and
    ``"cls"``.

    Class ids are assigned per distinct label, starting at 1 in order of first
    appearance, so boxes sharing a label share an id.

    Args:
        image: A file path (str), a numpy array, or a list of either.
        host: Base URL of the service.
        _imgsz: Unused; kept for signature compatibility.

    Returns:
        A one-element list holding the response dict with ``"boxes"`` replaced
        by the kept entries and ``"names"`` added if absent. An answer that is
        not a dict is returned unchanged.

    Raises:
        Exception: On a non-200 status, or re-raised when the body cannot be
            processed. The ``@retry`` decorator retries on any exception, so
            callers see the error only after the final attempt.
    """
    images = image if isinstance(image, list) else [image]
    data = {
        "image": [encode_image(img) for img in images],
    }

    packed_data = msgpack.packb(data, use_bin_type=True)

    response = httpx.post(
        f"{host}/inference",
        data=packed_data,
        headers={
            "Content-Type": "application/msgpack",
            "Accept": "application/msgpack",
        },
        timeout=30,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.content}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        result = msgpack.unpackb(response.content, raw=False)
        if isinstance(result, dict):
            names = {}  # class id -> label
            id_lookup = {}  # label -> class id
            useful_result = []
            for box in result["boxes"]:
                # Low-confidence detections are discarded before ids are assigned.
                if box["score"] < 0.7:
                    continue

                box["xyxy"] = box["coordinate"]
                box["conf"] = box["score"]
                label = box["label"]
                # Look the label up in id_lookup (keyed by label); `names` is
                # keyed by id, so testing the label against it never matches.
                cls_id = id_lookup.get(label)
                if cls_id is None:
                    cls_id = len(id_lookup) + 1
                    id_lookup[label] = cls_id
                    names[cls_id] = label
                box["cls_id"] = cls_id
                box["cls"] = cls_id
                useful_result.append(box)
            if "names" not in result:
                result["names"] = names
            result["boxes"] = useful_result
            result = [result]
        return result
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise


class ResultContainer:
    """Mutable holder for one prediction, initialized to an empty YoloResult."""

    def __init__(self):
        no_boxes = np.array([])
        self.result = YoloResult(boxes_data=no_boxes, names=[])


def filter_text(txt: str, font_mapper: FontMapper):
    """Normalize text and drop characters the font mapper does not have.

    The text is NFKC-normalized, reduced to the characters for which
    ``font_mapper.has_char`` is truthy, then every ``SPACE_REGEX`` match is
    replaced by a single space and the result is stripped.
    """
    normalized = unicodedata.normalize("NFKC", txt)
    supported = "".join(ch for ch in normalized if font_mapper.has_char(ch))
    return SPACE_REGEX.sub(" ", supported).strip()


class RpcDocLayoutModel(DocLayoutModel):
    """DocLayoutModel that fuses the predictions of two RPC layout services.

    Service 1 (``host1``) receives the page image together with its text
    lines; service 2 (``host2``) receives the image alone. Their boxes are
    merged into a single YoloResult.
    """

    def __init__(self, host: str = "http://localhost:8000;http://localhost:8001"):
        """Initialize RPC model with host address.

        Args:
            host: Two RPC service hosts separated by ';', e.g. "host1;host2".

        Raises:
            ValueError: If ``host`` has no ';' or either side of it is empty.
        """
        invalid_host_message = "RpcDocLayoutModel host must be two hosts separated by ';' (e.g. 'http://h1;http://h2')"
        if ";" not in host:
            raise ValueError(invalid_host_message)

        host1, host2 = (h.strip() for h in host.split(";", 1))
        # An empty side could only fail later, at request time; reject it now.
        if not host1 or not host2:
            raise ValueError(invalid_host_message)
        self.host1, self.host2 = host1, host2

        # keep the raw host string for logging/debugging purposes
        self.host = host

        self._stride = 32  # Default stride value
        self._names = ["text", "title", "list", "table", "figure"]
        self.lock = threading.Lock()
        # Set by init_font_mapper; needed before pages with text are predicted.
        self.font_mapper = None

    def init_font_mapper(self, translation_config):
        """Create the FontMapper used to filter line text for service 1."""
        self.font_mapper = FontMapper(translation_config)

    @property
    def stride(self) -> int:
        """Stride of the model input."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """Letterbox an image into ``new_shape``.

        The image is scaled with its aspect ratio preserved so that it fits
        inside ``new_shape`` and is then centered on a gray (114, 114, 114)
        canvas of exactly that size.

        Args:
            image: Input image array; the first two axes are (height, width).
            new_shape: Target size, either an int (square) or a
                (height, width) tuple.

        Returns:
            The resized and padded image.
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Largest uniform scale at which the image still fits the target.
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Split the leftover space between the two sides; the odd pixel, if
        # any, goes to the bottom/right.
        pad_h = new_h - resized_h
        pad_w = new_w - resized_w
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """Map xyxy boxes from a letterboxed image back to another image's frame.

        Args:
            img1_shape: (height, width) of the image the boxes refer to.
            boxes: Numpy array of box coordinates in (x1, y1, x2, y2) order.
            img0_shape: (height, width) of the target image.

        Returns:
            The boxes with the letterbox padding removed and divided by the
            scale factor between the two shapes.
        """
        # Uniform scale that maps img0 into img1.
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Padding that centers the scaled img0 inside img1.
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        boxes = (boxes - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def calculate_iou(self, box1, box2):
        """Calculate IoU between two boxes in xyxy format.

        Returns 0.0 when the boxes do not overlap or the union is not positive.
        """
        x1_1, y1_1, x2_1, y2_1 = box1
        x1_2, y1_2, x2_2, y2_2 = box2

        # Corners of the intersection rectangle.
        x1_inter = max(x1_1, x1_2)
        y1_inter = max(y1_1, y1_2)
        x2_inter = min(x2_1, x2_2)
        y2_inter = min(y2_1, y2_2)

        if x2_inter <= x1_inter or y2_inter <= y1_inter:
            return 0.0

        intersection = (x2_inter - x1_inter) * (y2_inter - y1_inter)

        area1 = (x2_1 - x1_1) * (y2_1 - y1_1)
        area2 = (x2_2 - x1_2) * (y2_2 - y1_2)
        union = area1 + area2 - intersection

        return intersection / union if union > 0 else 0.0

    def is_subset(self, inner_box, outer_box):
        """Check if inner_box lies entirely within outer_box (edges may touch)."""
        x1_inner, y1_inner, x2_inner, y2_inner = inner_box
        x1_outer, y1_outer, x2_outer, y2_outer = outer_box

        return (
            x1_inner >= x1_outer
            and y1_inner >= y1_outer
            and x2_inner <= x2_outer
            and y2_inner <= y2_outer
        )

    def expand_box_to_contain(self, box_to_expand, box_to_contain):
        """Return the smallest xyxy box (as a list) covering both input boxes."""
        x1_expand, y1_expand, x2_expand, y2_expand = box_to_expand
        x1_contain, y1_contain, x2_contain, y2_contain = box_to_contain

        return [
            min(x1_expand, x1_contain),
            min(y1_expand, y1_contain),
            max(x2_expand, x2_contain),
            max(y2_expand, y2_contain),
        ]

    def post_process_boxes(self, merged_boxes: list[YoloBox], names: dict[int, str]):
        """Grow "text" boxes over nearly identical "paragraph_hybrid" boxes.

        A box whose label contains "text" is expanded to cover every box whose
        label contains "paragraph_hybrid" that overlaps it with IoU > 0.95
        and is not already inside it. ``merged_boxes`` is modified in place.

        Args:
            merged_boxes: Boxes from both services.
            names: Mapping from class id to label.
        """
        for i, text_box in enumerate(merged_boxes):
            text_label = names.get(text_box.cls, "")
            if "text" not in text_label:
                continue

            for j, para_box in enumerate(merged_boxes):
                if i == j:
                    continue

                para_label = names.get(para_box.cls, "")
                if "paragraph_hybrid" not in para_label:
                    continue

                iou = self.calculate_iou(text_box.xyxy, para_box.xyxy)

                # Check if IoU > 0.95 and paragraph is not subset of text
                if iou > 0.95 and not self.is_subset(para_box.xyxy, text_box.xyxy):
                    expanded_box = self.expand_box_to_contain(
                        text_box.xyxy, para_box.xyxy
                    )
                    # Continue from the expanded box so that a later match
                    # adds to this expansion instead of discarding it.
                    text_box = YoloBox(
                        None,
                        np.array(expanded_box),
                        text_box.conf,
                        text_box.cls,
                    )
                    merged_boxes[i] = text_box

    def predict_image(
        self,
        image,
        imgsz: int = 1024,
        lines=None,
    ) -> YoloResult:
        """Predict the layout of a single page and fuse results from two RPC services.

        Args:
            image: Page image array rendered at ``DPI``.
            imgsz: Forwarded to both request functions.
            lines: Text lines of the page; service 1 is only queried when
                this is non-empty.

        Returns:
            A YoloResult whose boxes are scaled to a 72-per-inch frame.
            Service-1 classes get ids offset by 1000 and a "_hybrid" label
            suffix; service-2 classes get ids offset by 2000.
        """
        # The image is sent at its native size, so the box coordinates are
        # in the pixel frame of `image` itself.
        orig_h, orig_w = image.shape[:2]
        target_imgsz = (orig_h, orig_w)

        # Parallel calls to both services; exceptions propagate if either fails
        with ThreadPoolExecutor(max_workers=2) as ex:
            future1 = None
            if lines:
                future1 = ex.submit(
                    predict_layout,
                    image,
                    self.host1,
                    imgsz,
                    lines,
                    self.font_mapper,
                )
            future2 = ex.submit(predict_layout2, image, self.host2, imgsz)

            # .result() re-raises any exception raised in the worker thread.
            preds1 = future1.result() if future1 is not None else None
            preds2 = future2.result()

        # Page size in the 72-per-inch frame.
        pdf_h, pdf_w = orig_h / DPI * 72, orig_w / DPI * 72

        merged_boxes: list[YoloBox] = []
        names: dict[int, str] = {}

        def _process_preds(preds, id_offset: int, label_suffix: str | None):
            """Append the boxes of one service to merged_boxes and names."""
            for pred in preds or []:
                for box in pred["boxes"]:
                    scaled_xyxy = self.scale_boxes(
                        target_imgsz, np.array(box["xyxy"]), (pdf_h, pdf_w)
                    )

                    new_cls_id = box["cls"] + id_offset

                    # derive label – fall back gracefully if missing
                    label = pred["names"].get(box["cls"], str(box["cls"]))
                    if label_suffix:
                        label = f"{label}{label_suffix}"

                    names[new_cls_id] = label

                    merged_boxes.append(
                        YoloBox(
                            None,
                            scaled_xyxy,
                            np.array(box.get("conf", box.get("score", 1.0))),
                            new_cls_id,
                        )
                    )

        # service-1: +1000 id, add "_hybrid" suffix
        if preds1:
            _process_preds(preds1, 1000, "_hybrid")

        # service-2: +2000 id, label unchanged
        _process_preds(preds2, 2000, None)

        # Sort boxes by confidence, highest first.
        merged_boxes.sort(key=lambda b: b.conf, reverse=True)

        self.post_process_boxes(merged_boxes, names)

        return YoloResult(boxes=merged_boxes, names=names)

    def predict(self, image, imgsz=1024, **kwargs) -> list[YoloResult]:  # type: ignore[override]
        """Predict the layout for one or multiple images.

        Images are processed one after another, without text lines, so only
        service 2 is queried.
        """
        # Normalize to list
        if isinstance(image, np.ndarray) and len(image.shape) == 3:
            image = [image]

        return [self.predict_image(img, imgsz) for img in image]

    def predict_page(self, page, pdf_bytes: Path, translate_config, save_debug_image):
        """Render one page, extract its text lines, and predict its layout.

        Args:
            page: Page object; ``page.page_number`` selects the page to render.
            pdf_bytes: Despite the name, a ``Path`` to the PDF file on disk.
            translate_config: Provides ``raise_if_cancelled()``.
            save_debug_image: Callback called with the rendered image, the
                prediction, and the 1-based page number.

        Returns:
            A ``(page, prediction)`` tuple.
        """
        translate_config.raise_if_cancelled()
        image = get_no_rotation_img_multiprocess(
            pdf_bytes.as_posix(), page.page_number, dpi=DPI
        )
        char_boxes = convert_page_to_char_boxes(page)
        lines = process_page_chars_to_lines(char_boxes)
        predict_result = self.predict_image(image, 800, lines)
        save_debug_image(image, predict_result, page.page_number + 1)
        return page, predict_result

    def handle_document(  # type: ignore[override]
        self,
        pages: list["babeldoc.format.pdf.document_il.il_version_1.Page"],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ):
        """Lazily yield ``(page, prediction)`` for every page, in input order.

        The document is first saved to a working file so that each
        ``predict_page`` call can render from that file path. Up to 32 pages
        are processed concurrently.
        """
        layout_temp_path = translate_config.get_working_file_path("layout.temp.pdf")
        mupdf_doc.save(layout_temp_path.as_posix())
        with ThreadPoolExecutor(max_workers=32) as executor:
            yield from executor.map(
                self.predict_page,
                pages,
                (layout_temp_path for _ in range(len(pages))),
                (translate_config for _ in range(len(pages))),
                (save_debug_image for _ in range(len(pages))),
            )

    @staticmethod
    def from_host(host: str) -> "RpcDocLayoutModel":
        """Create RpcDocLayoutModel from host address."""
        return RpcDocLayoutModel(host=host)


if __name__ == "__main__":
    # Manual smoke test: call predict_layout on one image and print the result.
    # NOTE(review): no text lines are passed here, and predict_layout returns
    # None without sending a request in that case - confirm this is intended.
    logging.basicConfig(level=logging.DEBUG)
    sample_image = "example/1.png"
    try:
        if Path(sample_image).exists():
            chosen_image = sample_image
        else:
            # The sample image is absent, so ask for a path interactively.
            print(f"Warning: {sample_image} not found.")
            print("Please provide the path to a test image:")
            chosen_image = input("> ")

        logger.info(f"Processing image: {chosen_image}")
        layout = predict_layout(chosen_image)
        print("Prediction results:")
        print(layout)
    except Exception as err:
        # Report any failure as a one-line message instead of a traceback.
        print(f"Error: {err!s}")


================================================
FILE: babeldoc/docvision/rpc_doclayout7.py
================================================
import base64
import json
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import httpx
import numpy as np
import pymupdf
from tenacity import retry
from tenacity import retry_if_exception_type
from tenacity import stop_after_attempt
from tenacity import wait_exponential

import babeldoc
from babeldoc.docvision.base_doclayout import DocLayoutModel
from babeldoc.docvision.base_doclayout import YoloBox
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.format.pdf.document_il.utils.extract_char import (
    convert_page_to_char_boxes,
)
from babeldoc.format.pdf.document_il.utils.extract_char import (
    process_page_chars_to_lines,
)
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Resolution (dots per inch) at which pages are rendered in this module; also
# the factor used to convert between PDF units (1/72 inch) and image pixels.
DPI = 150


def encode_image(image) -> bytes:
    """Encode an image as JPEG bytes.

    Args:
        image: Either a file path (``str`` or ``pathlib.Path``) or an
            already-decoded image array accepted by ``cv2.cvtColor``.

    Returns:
        The JPEG-encoded image.

    Raises:
        FileNotFoundError: If *image* is a path that does not exist.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    if isinstance(image, (str, Path)):
        if not Path(image).exists():
            raise FileNotFoundError(f"Image file not found: {image}")
        # cv2.imread reports failure by returning None rather than raising.
        img = cv2.imread(str(image))
        if img is None:
            raise ValueError(f"Failed to read image: {image}")
    else:
        img = image

    # Swap the first and third channels before encoding.
    # NOTE(review): this swap is also applied to cv2.imread output, which
    # OpenCV documents as already being BGR - confirm this is intended.
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    return cv2.imencode(".jpg", img)[1].tobytes()


@retry(
    stop=stop_after_attempt(3),  # give up after 3 attempts
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # exponential backoff, starting at 1 second and capped at 10 seconds
    # Exception types that trigger a retry. NOTE: Exception already covers
    # httpx.HTTPError, so every failure (including bad input) is retried.
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
    lines: list[babeldoc.format.pdf.document_il.utils.extract_char.Line] | None = None,
):
    """Predict document layout by POSTing to ``{host}/inference``.

    Args:
        image: Either a file path (str) or an image array. A path is decoded
            with OpenCV so that its dimensions are available.
        host: Base URL of the layout service.
        _imgsz: Unused; kept so existing positional callers keep working.
        lines: Text lines of the page. Each line's character boxes are merged
            into one bounding box, converted from PDF units to pixels at
            ``DPI`` and flipped vertically before being sent as
            ``ocr_results``. ``None`` is treated as "no lines".

    Returns:
        If the service replies with a JSON object, a one-element list holding
        that object, augmented with ``boxes`` (its ``clusters`` with ``xyxy``,
        ``conf``, ``cls_id`` and ``cls`` added) and, when absent, ``names``
        (class id -> label). Any other JSON payload is returned unchanged.

    Raises:
        Exception: If the service answers with a non-200 status, or if the
            response cannot be decoded / lacks the expected keys.
    """
    image_data = encode_image(image)

    if isinstance(image, str):
        # encode_image has already validated the path; decode it again here
        # only because the image dimensions are needed below.
        image = cv2.imread(image)
    image_height, image_width = image.shape[:2]

    def convert_line(line: babeldoc.format.pdf.document_il.utils.extract_char.Line):
        """Return the line's bounding box in image pixels plus its text."""
        boxes = [c[0] for c in line.chars]
        min_x = min(b.x for b in boxes) / 72 * DPI
        max_x = max(b.x2 for b in boxes) / 72 * DPI
        min_y = min(b.y for b in boxes) / 72 * DPI
        max_y = max(b.y2 for b in boxes) / 72 * DPI

        # Flip vertically: the y axis of the line boxes runs opposite to the
        # image's row axis.
        min_y, max_y = image_height - max_y, image_height - min_y

        return {"box": [min_x, min_y, max_x, max_y], "text": line.text}

    formatted_results = [convert_line(line) for line in (lines or [])]

    request_data = {
        "image": base64.b64encode(image_data).decode("utf-8"),
        "ocr_results": formatted_results,
        "image_size": [image_width, image_height],  # (width, height)
    }

    response = httpx.post(
        f"{host}/inference",
        json=request_data,
        headers={"Accept": "application/json", "Content-Type": "application/json"},
        timeout=1800,
        follow_redirects=True,
    )

    if response.status_code != 200:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.text}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )

    try:
        result = json.loads(response.text)
        if isinstance(result, dict):
            # Assign one class id per distinct label, numbered from 1 in order
            # of first appearance, so boxes sharing a label share an id.
            label_to_id = {}
            names = {}
            useful_result = []
            for box in result["clusters"]:
                label = box["label"]
                cls_id = label_to_id.setdefault(label, len(label_to_id) + 1)
                names[cls_id] = label
                box["xyxy"] = box["box"]
                box["conf"] = 1
                box["cls_id"] = cls_id
                box["cls"] = cls_id
                useful_result.append(box)
            if "names" not in result:
                result["names"] = names
            result["boxes"] = useful_result
            result = [result]
        return result
    except Exception as e:
        logger.exception(f"Failed to unpack response: {e!s}")
        raise


class ResultContainer:
    """Mutable holder for a ``YoloResult``; starts out with an empty result."""

    def __init__(self):
        no_boxes = np.array([])
        self.result = YoloResult(boxes_data=no_boxes, names=[])


class RpcDocLayoutModel(DocLayoutModel):
    """DocLayoutModel implementation that delegates prediction to an RPC service."""

    def __init__(self, host: str = "http://localhost:8000"):
        """Initialize the model.

        Args:
            host: Base URL of the layout service.
        """
        self.host = host
        self._stride = 32  # Default stride value
        self._names = ["text", "title", "list", "table", "figure"]
        # Held while a page of the shared mupdf document is rendered
        # (see predict_page).
        self.lock = threading.Lock()

    @property
    def stride(self) -> int:
        """Stride of the model input."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """Resize an image to fit ``new_shape`` and pad it to exactly that size.

        The aspect ratio is preserved; the remaining area is filled with a
        constant gray border (114, 114, 114), split evenly between both sides.

        Parameters:
        - image: Input image array (height and width are its first two axes)
        - new_shape: Target size (integer or (height, width) tuple)

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Largest scale at which both dimensions still fit the target.
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Distribute the leftover space around the resized image.
        pad_h = new_h - resized_h
        pad_w = new_w - resized_w
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """Rescale xyxy boxes from one image shape to another.

        Args:
            img1_shape (tuple): Shape the boxes are currently expressed in,
                as (height, width).
            boxes: Array of box coordinates (x1, y1, x2, y2); it must support
                broadcasting arithmetic with a 4-element list.
            img0_shape (tuple): Shape of the target image, as (height, width).

        Returns:
            A new array with the padding offset removed and the coordinates
            divided by the scaling gain.
        """
        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        return (boxes - [pad_x, pad_y, pad_x, pad_y]) / gain

    def predict_image(
        self,
        image,
        host: str | None = None,
        result_container: ResultContainer | None = None,
        imgsz: int = 1024,
        page: il_version_1.Page | None = None,
    ) -> YoloResult:
        """Predict the layout of one rendered page using the RPC service.

        Args:
            image: Page image rendered at ``DPI``; sent at its native size.
            host: Accepted but not used; requests always go to ``self.host``.
            result_container: Optional container that receives the result; a
                fresh one is created when omitted.
            imgsz: Accepted but not used.
            page: Page whose characters are grouped into lines and sent along
                with the image.

        Returns:
            The ``YoloResult`` built from the service reply, with boxes scaled
            from image pixels to PDF units, or the container's existing result
            when the service returns no predictions.
        """
        if result_container is None:
            result_container = ResultContainer()

        img_h, img_w = image.shape[:2]
        image_shape = (img_h, img_w)

        char_boxes = convert_page_to_char_boxes(page)
        lines = process_page_chars_to_lines(char_boxes)

        preds = predict_layout(image, host=self.host, lines=lines)

        # Page size in PDF units (1/72 inch). Scaling from image_shape to this
        # shape gives a gain of effectively DPI / 72 with zero padding, i.e. a
        # plain pixel -> PDF unit conversion.
        page_shape = (img_h / DPI * 72, img_w / DPI * 72)
        for pred in preds:
            boxes = [
                YoloBox(
                    None,
                    self.scale_boxes(image_shape, np.array(x["xyxy"]), page_shape),
                    np.array(x["conf"]),
                    x["cls"],
                )
                for x in pred["boxes"]
            ]
            result_container.result = YoloResult(
                boxes=boxes,
                names={int(k): v for k, v in pred["names"].items()},
            )
        return result_container.result

    def predict_page(
        self, page, mupdf_doc: pymupdf.Document, translate_config, save_debug_image
    ):
        """Render one page, predict its layout and emit a debug image.

        Args:
            page: Page to process; ``page.page_number`` indexes ``mupdf_doc``.
            mupdf_doc: Open document the page is rendered from.
            translate_config: Checked for cancellation before any work is done.
            save_debug_image: Callback invoked with the image, the prediction
                and the 1-based page number.

        Returns:
            A ``(page, predict_result)`` tuple.
        """
        translate_config.raise_if_cancelled()
        # Only the rendering step is performed under the lock.
        with self.lock:
            pix = get_no_rotation_img(mupdf_doc[page.page_number], dpi=DPI)
        # Interpret the samples as height x width x 3 and reverse the channel
        # order.
        image = np.frombuffer(pix.samples, np.uint8).reshape(
            pix.height,
            pix.width,
            3,
        )[:, :, ::-1]
        predict_result = self.predict_image(image, self.host, None, 800, page)
        save_debug_image(image, predict_result, page.page_number + 1)
        return page, predict_result

    def handle_document(
        self,
        pages: list[il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ):
        """Run ``predict_page`` for every page on a single worker thread.

        Args:
            pages: Pages to process.
            mupdf_doc: Open document the pages are rendered from.
            translate_config: Forwarded to every ``predict_page`` call.
            save_debug_image: Forwarded to every ``predict_page`` call.

        Yields:
            ``(page, predict_result)`` tuples in page order.
        """
        # executor.map zips its iterables, so the constant arguments are
        # repeated once per page.
        page_count = len(pages)
        with ThreadPoolExecutor(max_workers=1) as executor:
            yield from executor.map(
                self.predict_page,
                pages,
                [mupdf_doc] * page_count,
                [translate_config] * page_count,
                [save_debug_image] * page_count,
            )

    @staticmethod
    def from_host(host: str) -> "RpcDocLayoutModel":
        """Create RpcDocLayoutModel from host address."""
        return RpcDocLayoutModel(host=host)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    # Manual smoke test: send one image to the service and print the reply.
    try:
        # Ask for a path when the default sample image is missing.
        image_path = "example/1.png"
        if not Path(image_path).exists():
            print(f"Warning: {image_path} not found.")
            print("Please provide the path to a test image:")
            image_path = input("> ")

        logger.info(f"Processing image: {image_path}")
        # predict_layout reads image.shape and iterates over lines, so pass a
        # decoded array and an explicit (empty) line list, not a bare path.
        test_image = cv2.imread(image_path)
        if test_image is None:
            raise ValueError(f"Failed to read image: {image_path}")
        result = predict_layout(test_image, lines=[])
        print("Prediction results:")
        print(result)
    except Exception as e:
        # Any failure is reported on stdout instead of a traceback.
        print(f"Error: {e!s}")


================================================
FILE: babeldoc/docvision/table_detection/rapidocr.py
================================================
import logging
import re
import threading
from collections.abc import Generator

import cv2
import numpy as np
from babeldoc.assets.assets import get_table_detection_rapidocr_model_path
from babeldoc.docvision.base_doclayout import YoloBox
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img
from rapidocr_onnxruntime import RapidOCR

try:
    import onnxruntime
except ImportError as e:
    if "DLL load failed" in str(e):
        raise OSError(
            "Microsoft Visual C++ Redistributable is not installed. "
            "Download it at https://aka.ms/vs/17/release/vc_redist.x64.exe"
        ) from e
    raise
import babeldoc.format.pdf.document_il.il_version_1
import pymupdf

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def convert_to_yolo_result(predictions):
    """
    Build a YoloResult from RapidOCR-style predictions.

    Two prediction shapes are recognised; anything else is skipped:

    * a list of at least five items whose first item is a list, read as
      four corner points ``[x, y]`` followed by extra data - the box is the
      axis-aligned extent of the four corners;
    * a list or numpy array of at least four items, whose first four items
      are used as ``[x1, y1, x2, y2]`` directly.

    Args:
        predictions (list): Predictions in either of the shapes above.

    Returns:
        YoloResult: One box per recognised prediction, each with confidence
        1.0 and class "text".
    """
    converted = []

    for pred in predictions:
        is_corner_format = (
            isinstance(pred, list) and len(pred) >= 5 and isinstance(pred[0], list)
        )
        if is_corner_format:
            # Collapse the four corners to (min x, min y, max x, max y).
            corners = np.array(pred[:4])
            xs, ys = corners[:, 0], corners[:, 1]
            xyxy = [xs.min(), ys.min(), xs.max(), ys.max()]
        elif isinstance(pred, list | np.ndarray) and len(pred) >= 4:
            # Already flat: take the leading four coordinates.
            flat = pred.tolist() if isinstance(pred, np.ndarray) else pred
            xyxy = flat[:4]
        else:
            continue

        converted.append(YoloBox(xyxy=xyxy, conf=1.0, cls="text"))

    return YoloResult(names=["text"], boxes=converted)


def create_yolo_result_from_nested_coords(nested_coords: np.ndarray, names: dict):
    """Wrap rows of ``[x1, y1, x2, y2]`` coordinates in a YoloResult.

    Args:
        nested_coords: Array whose rows are boxes; rows that do not have
            exactly four entries are ignored.
        names: Class-id to class-name mapping stored on the result.

    Returns:
        YoloResult whose boxes all have confidence 1.0 and class id 0.
    """
    boxes = [
        YoloBox(
            xyxy=[float(value) for value in row],
            conf=np.array(1.0),
            cls=0,
        )
        for row in nested_coords.tolist()
        if len(row) == 4
    ]
    return YoloResult(names=names, boxes=boxes)


class RapidOCRModel:
    """Text-box detector built on RapidOCR, used to find text inside tables."""

    def __init__(self):
        """Load the RapidOCR detection model.

        CUDA is enabled when onnxruntime lists a CUDA provider; DirectML is
        always disabled, even when available.
        """
        self.use_cuda = False
        self.use_dml = False
        available_providers = onnxruntime.get_available_providers()
        for provider in available_providers:
            if re.match(r"dml", provider, re.IGNORECASE):
                self.use_dml = True
            elif re.match(r"cuda", provider, re.IGNORECASE):
                self.use_cuda = True
        self.use_dml = False  # force disable directml
        self.model = RapidOCR(
            det_model_path=get_table_detection_rapidocr_model_path(),
            det_use_cuda=self.use_cuda,
            det_use_dml=False,
        )
        self.names = {0: "table_text"}
        # Held while a page of the shared mupdf document is rendered.
        self.lock = threading.Lock()

    @property
    def stride(self):
        """Alignment (in pixels) used when padding resized images."""
        return 32

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize an image to fit ``new_shape`` and pad it up to stride alignment.

        The aspect ratio is preserved. Each dimension is then padded by
        ``(target - resized) % stride`` pixels of gray (114, 114, 114), split
        between both sides, so the output is a multiple of the stride whenever
        the target size is.

        Parameters:
        - image: Input image array (height and width are its first two axes)
        - new_shape: Target size (integer or (height, width) tuple)

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Largest scale at which both dimensions still fit the target.
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        image = cv2.resize(
            image,
            (resized_w, resized_h),
            interpolation=cv2.INTER_LINEAR,
        )

        # Calculate padding size and align to stride multiple
        pad_w = (new_w - resized_w) % self.stride
        pad_h = (new_h - resized_h) % self.stride
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        image = cv2.copyMakeBorder(
            image,
            top,
            bottom,
            left,
            right,
            cv2.BORDER_CONSTANT,
            value=(114, 114, 114),
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescale xyxy boxes from one image shape to another, in place.

        Args:
            img1_shape (tuple): Shape the boxes are currently expressed in,
                as (height, width).
            boxes: Array whose last axis starts with (x1, y1, x2, y2); it is
                modified in place.
            img0_shape (tuple): Shape of the target image, as (height, width).

        Returns:
            The same ``boxes`` array, with the padding offset removed and the
            coordinates divided by the scaling gain.
        """
        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict(self, image, imgsz=800, batch_size=16, **kwargs):
        """
        Detect text boxes in a single page image.

        Args:
            image: A single image as a 3-dimensional numpy array.
            imgsz: Accepted but not used; the image is always resized to fit
                1024 x 1024.
            batch_size: Accepted but not used.
            **kwargs: Accepted but not used.

        Returns:
            A YoloResult containing the detected boxes in the coordinate
            system of the input image; empty when nothing is detected.
        """
        assert isinstance(image, np.ndarray) and len(image.shape) == 3

        target_imgsz = 1024
        orig_shape = (image.shape[0], image.shape[1])

        input_ = self.resize_and_pad_image(image, new_shape=target_imgsz)
        new_h, new_w = input_.shape[:2]

        # Detection only: classification and recognition are switched off.
        preds = self.model(input_, use_det=True, use_cls=False, use_rec=False)

        # The first element of the model's return value holds the detections;
        # treat a missing (None) or empty value as "no text on this page".
        detections = preds[0] if len(preds) > 0 else None
        if detections is None or len(detections) == 0:
            return YoloResult(names=self.names, boxes=[])

        # Keep corner points 0 and 2 of every detection and flatten them to
        # [x1, y1, x2, y2] rows.
        preds_np = np.array(detections)[:, [0, 2], :].reshape([-1, 4])
        preds_np[..., :4] = self.scale_boxes(
            (new_h, new_w),
            preds_np[..., :4],
            orig_shape,
        )

        return create_yolo_result_from_nested_coords(preds_np, self.names)

    def handle_document(
        self,
        pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ) -> Generator[
        tuple[babeldoc.format.pdf.document_il.il_version_1.Page, YoloResult], None, None
    ]:
        """Detect text boxes that lie inside table layouts, page by page.

        Args:
            pages: Pages to process; each page's ``page_layout`` entries with
                class name "table" define the table regions.
            mupdf_doc: Open document the pages are rendered from.
            translate_config: Checked for cancellation before every page.
            save_debug_image: Callback invoked with the image, the filtered
                result and the 1-based page number.

        Yields:
            ``(page, YoloResult)`` tuples holding only the detections that
            overlap a table region.
        """
        for page in pages:
            translate_config.raise_if_cancelled()
            # Only the rendering step is performed under the lock.
            with self.lock:
                pix = get_no_rotation_img(mupdf_doc[page.page_number])
            # Interpret the samples as height x width x 3 and reverse the
            # channel order.
            image = np.frombuffer(pix.samples, np.uint8).reshape(
                pix.height,
                pix.width,
                3,
            )[:, :, ::-1]
            img_height, img_width = image.shape[:2]

            table_boxes = [
                layout.box
                for layout in page.page_layout
                if layout.class_name == "table"
            ]

            ok_boxes = []
            # Without any table region every detection would be discarded, so
            # inference is skipped entirely for such pages.
            if table_boxes:
                predict_result = self.predict(image)
                for box in predict_result.boxes:
                    # Convert the box coordinates to float for proper comparison
                    box_xyxy = [float(coord) for coord in box.xyxy]
                    if any(
                        self._is_box_in_table(
                            box_xyxy, table_box, page, img_width, img_height
                        )
                        for table_box in table_boxes
                    ):
                        ok_boxes.append(box)

            yolo_result = YoloResult(names=self.names, boxes=ok_boxes)
            save_debug_image(
                image,
                yolo_result,
                page.page_number + 1,
            )
            yield page, yolo_result

    def _is_box_in_table(self, box_xyxy, table_box, page, img_width, img_height):
        """
        Check whether a detected box lies mostly inside a table box.

        Args:
            box_xyxy (list): Detected box in image coordinates [x1, y1, x2, y2]
            table_box (Box): Table box with ``x``, ``y``, ``x2``, ``y2``
                attributes, in PDF coordinates
            page: Unused
            img_width: Unused
            img_height: Height of the image, used to flip the y axis

        Returns:
            bool: True if more than half of the detected box's area overlaps
            the table box
        """
        # Convert the table box to image coordinates: x is used as is, y is
        # flipped against the image height.
        # NOTE(review): no scaling is applied, which presumes the image has one
        # pixel per PDF unit - confirm against get_no_rotation_img's default.
        table_img_x1 = table_box.x
        table_img_y1 = img_height - table_box.y2
        table_img_x2 = table_box.x2
        table_img_y2 = img_height - table_box.y

        x_overlap = max(
            0, min(box_xyxy[2], table_img_x2) - max(box_xyxy[0], table_img_x1)
        )
        y_overlap = max(
            0, min(box_xyxy[3], table_img_y2) - max(box_xyxy[1], table_img_y1)
        )
        overlap_area = x_overlap * y_overlap

        box_area = (box_xyxy[2] - box_xyxy[0]) * (box_xyxy[3] - box_xyxy[1])

        # Degenerate (zero or negative area) boxes never count as inside.
        return box_area > 0 and overlap_area / box_area > 0.5


================================================
FILE: babeldoc/format/__init__.py
================================================


================================================
FILE: babeldoc/format/pdf/__init__.py
================================================


================================================
FILE: babeldoc/format/pdf/babelpdf/base14.py
================================================
from .encoding import get_type1_encoding
from .win_core import win_core

base14_bbox = {
    "Courier-BoldOblique": {
        ".notdef": (0, 0, 0, 0),
        "exclam": (216, -15, 495, 572),
        "quotedbl": (212, 277, 584, 562),
        "numbersign": (88, -45, 640, 651),
        "dollar": (87, -126, 629, 666),
        "percent": (102, -15, 624, 616),
        "ampersand": (62, -15, 594, 543),
        "quoteright": (230, 277, 542, 562),
        "parenleft": (266, -102, 592, 616),
        "parenright": (117, -102, 443, 616),
        "asterisk": (179, 219, 597, 601),
        "plus": (114, 39, 596, 478),
        "comma": (99, -111, 430, 174),
        "hyphen": (143, 203, 567, 313),
        "period": (207, -15, 426, 171),
        "slash": (91, -77, 626, 626),
        "zero": (137, -15, 591, 616),
        "one": (93, 0, 561, 616),
        "two": (61, 0, 593, 616),
        "three": (72, -15, 570, 616),
        "four": (82, 0, 558, 616),
        "five": (77, -15, 621, 601),
        "six": (136, -15, 652, 616),
        "seven": (147, 0, 622, 601),
        "eight": (116, -15, 603, 616),
        "nine": (76, -15, 591, 616),
        "colon": (206, -15, 479, 425),
        "semicolon": (99, -111, 480, 425),
        "less": (121, 15, 612, 501),
        "equal": (96, 118, 614, 398),
        "greater": (97, 15, 589, 501),
        "question": (183, -14, 591, 580),
        "at": (67, -15, 641, 616),
        "A": (-9, 0, 631, 562),
        "B": (30, 0, 628, 562),
        "C": (75, -18, 674, 580),
        "D": (30, 0, 663, 562),
        "E": (25, 0, 669, 562),
        "F": (39, 0, 683, 562),
        "G": (75, -18, 674, 580),
        "H": (20, 0, 699, 562),
        "I": (77, 0, 642, 562),
        "J": (59, -18, 720, 562),
        "K": (21, 0, 691, 562),
        "L": (39, 0, 635, 562),
        "M": (-2, 0, 721, 562),
        "N": (8, -12, 729, 562),
        "O": (75, -18, 645, 580),
        "P": (48, 0, 642, 562),
        "Q": (84, -138, 635, 580),
        "R": (24, 0, 617, 562),
        "S": (54, -22, 672, 582),
        "T": (86, 0, 678, 562),
        "U": (101, -18, 715, 562),
        "V": (84, 0, 732, 562),
        "W": (84, 0, 737, 562),
        "X": (12, 0, 689, 562),
        "Y": (109, 0, 708, 562),
        "Z": (62, 0, 636, 562),
        "bracketleft": (223, -102, 606, 616),
        "backslash": (223, -77, 496, 626),
        "bracketright": (103, -102, 486, 616),
        "asciicircum": (171, 250, 555, 616),
        "underscore": (-27, -125, 584, -75),
        "quoteleft": (297, 277, 487, 562),
        "a": (62, -15, 592, 454),
        "b": (13, -15, 635, 626),
        "c": (82, -15, 631, 459),
        "d": (61, -15, 644, 626),
        "e": (82, -15, 604, 454),
        "f": (83, 0, 677, 626),
        "g": (41, -146, 673, 454),
        "h": (18, 0, 614, 626),
        "i": (77, 0, 545, 658),
        "j": (37, -146, 580, 658),
        "k": (33, 0, 642, 626),
        "l": (77, 0, 545, 626),
        "m": (-22, 0, 648, 454),
        "n": (18, 0, 614, 454),
        "o": (72, -15, 622, 454),
        "p": (-31, -142, 621, 454),
        "q": (61, -142, 684, 454),
        "r": (47, 0, 654, 454),
        "s": (67, -17, 607, 459),
        "t": (118, -15, 566, 562),
        "u": (70, -15, 591, 439),
        "v": (70, 0, 694, 439),
        "w": (53, 0, 711, 439),
        "x": (6, 0, 670, 439),
        "y": (-20, -142, 694, 439),
        "z": (81, 0, 613, 439),
        "braceleft": (204, -102, 595, 616),
        "bar": (202, -250, 504, 750),
        "braceright": (114, -102, 506, 616),
        "asciitilde": (120, 153, 589, 356),
        "exclamdown": (197, -146, 476, 449),
        "cent": (122, -49, 604, 614),
        "sterling": (107, -28, 650, 611),
        "fraction": (22, -60, 707, 661),
        "yen": (98, 0, 709, 562),
        "florin": (-56, -131, 701, 616),
        "section": (74, -70, 619, 580),
        "currency": (77, 49, 643, 517),
        "quotesingle": (304, 277, 492, 562),
        "quotedblleft": (190, 277, 594, 562),
        "guillemotleft": (63, 70, 638, 446),
        "guilsinglleft": (196, 70, 544, 446),
        "guilsinglright": (166, 70, 514, 446),
        "fi": (12, 0, 643, 626),
        "fl": (12, 0, 643, 626),
        "endash": (108, 203, 602, 313),
        "dagger": (176, -70, 586, 580),
        "daggerdbl": (122, -70, 586, 580),
        "periodcentered": (250, 165, 460, 351),
        "paragraph": (61, -70, 699, 580),
        "bullet": (197, 132, 523, 430),
        "quotesinglbase": (145, -142, 457, 143),
        "quotedblbase": (35, -142, 559, 143),
        "quotedblright": (120, 277, 644, 562),
        "guillemotright": (72, 70, 647, 446),
        "ellipsis": (36, -15, 586, 116),
        "perthousand": (-44, -15, 742, 616),
        "questiondown": (102, -146, 509, 449),
        "grave": (272, 508, 503, 661),
        "acute": (313, 508, 608, 661),
        "circumflex": (212, 483, 606, 657),
        "tilde": (200, 493, 642, 636),
        "macron": (195, 505, 636, 585),
        "breve": (217, 468, 651, 631),
        "dotaccent": (347, 485, 489, 625),
        "dieresis": (245, 485, 591, 625),
        "ring": (319, 481, 527, 678),
        "cedilla": (169, -206, 366, 0),
        "hungarumlaut": (172, 488, 728, 661),
        "ogonek": (144, -199, 350, 0),
        "caron": (238, 493, 632, 667),
        "emdash": (33, 203, 677, 313),
        "AE": (-29, 0, 707, 562),
        "ordfeminine": (189, 196, 526, 580),
        "Lslash": (39, 0, 635, 562),
        "Oslash": (48, -22, 672, 584),
        "OE": (27, 0, 700, 562),
        "ordmasculine": (189, 196, 542, 580),
        "ae": (22, -15, 651, 454),
        "dotlessi": (77, 0, 545, 439),
        "lslash": (77, 0, 578, 626),
        "oslash": (55, -24, 637, 463),
        "oe": (19, -15, 661, 454),
        "germandbls": (22, -15, 628, 626),
        "Scedilla": (54, -206, 672, 582),
        "multiply": (105, 39, 606, 478),
        "logicalnot": (135, 103, 617, 413),
        "format": (-26, -146, 243, 601),
        "tab": (19, 0, 641, 562),
        "overscore": (123, 579, 734, 629),
        "IJ": (-8, -18, 741, 562),
        "trademark": (86, 230, 868, 562),
        "onequarter": (14, -60, 706, 661),
        "mu": (50, -142, 591, 439),
        "minus": (114, 203, 596, 313),
        "brokenbar": (218, -175, 488, 675),
        "arrowleft": (40, 143, 708, 455),
        "LL": (-45, 0, 694, 562),
        "arrowright": (20, 143, 688, 455),
        "thorn": (-31, -142, 621, 626),
        "lira": (107, -28, 650, 611),
        "arrowboth": (40, 143, 688, 455),
        "indent": (99, 45, 579, 372),
        "threesuperior": (193, 222, 525, 616),
        "onehalf": (23, -60, 715, 661),
        "graybox": (76, 0, 652, 599),
        "Idot": (77, 0, 642, 748),
        "ll": (1, 0, 653, 626),
        "Thorn": (48, 0, 619, 562),
        "Ccedilla": (75, -206, 674, 580),
        "notegraphic": (91, -15, 619, 572),
        "arrowup": (244, 3, 556, 626),
        "down": (168, -15, 496, 439),
        "plusminus": (76, 24, 614, 515),
        "threequarters": (8, -60, 698, 661),
        "scedilla": (67, -206, 607, 459),
        "ij": (6, -146, 714, 658),
        "eth": (94, -27, 661, 626),
        "merge": (168, -15, 533, 487),
        "twosuperior": (192, 230, 540, 616),
        "arrowdown": (174, -15, 486, 608),
        "left": (109, 44, 589, 371),
        "return": (79, 0, 700, 562),
        "Eth": (30, 0, 663, 562),
        "up": (196, 0, 523, 447),
        "divide": (114, 16, 596, 500),
        "prescription": (24, -15, 632, 562),
        "square": (19, 0, 700, 562),
        "stop": (19, 0, 700, 562),
        "degree": (174, 243, 569, 616),
        "ccedilla": (82, -206, 631, 459),
        "onesuperior": (213, 230, 514, 616),
        "largebullet": (307, 229, 413, 333),
        "center": (103, 14, 623, 580),
        "registered": (54, -18, 666, 580),
        "copyright": (54, -18, 666, 580),
        "dectab": (8, 0, 615, 320),
        "space": (0, 0, 0, 0),
        "Aacute": (-9, 0, 665, 784),
        "Acircumflex": (-9, 0, 631, 780),
        "Adieresis": (-9, 0, 631, 748),
        "Agrave": (-9, 0, 631, 784),
        "Aring": (-9, 0, 631, 801),
        "Atilde": (-9, 0, 638, 759),
        "Eacute": (25, 0, 669, 784),
        "Ecircumflex": (25, 0, 669, 780),
        "Edieresis": (25, 0, 669, 748),
        "Egrave": (25, 0, 669, 784),
        "Gcaron": (75, -18, 674, 790),
        "Iacute": (77, 0, 642, 784),
        "Icircumflex": (77, 0, 642, 780),
        "Idieresis": (77, 0, 642, 748),
        "Igrave": (77, 0, 642, 784),
        "Ntilde": (8, -12, 729, 759),
        "Oacute": (75, -18, 645, 784),
        "Ocircumflex": (75, -18, 645, 780),
        "Odieresis": (75, -18, 645, 748),
        "Ograve": (75, -18, 645, 784),
        "Otilde": (75, -18, 668, 759),
        "Scaron": (54, -22, 672, 790),
        "Uacute": (101, -18, 715, 784),
        "Ucircumflex": (101, -18, 715, 780),
        "Udieresis": (101, -18, 715, 748),
        "Ugrave": (101, -18, 715, 784),
        "Yacute": (109, 0, 708, 784),
        "Ydieresis": (109, 0, 708, 748),
        "Zcaron": (62, 0, 659, 790),
        "aacute": (62, -15, 608, 661),
        "acircumflex": (62, -15, 592, 657),
        "adieresis": (62, -15, 592, 625),
        "agrave": (62, -15, 592, 661),
        "aring": (62, -15, 592, 678),
        "atilde": (62, -15, 642, 636),
        "eacute": (82, -15, 608, 661),
        "ecircumflex": (82, -15, 606, 657),
        "edieresis": (82, -15, 604, 625),
        "egrave": (82, -15, 604, 661),
        "gcaron": (41, -146, 673, 667),
        "iacute": (77, 0, 608, 661),
        "icircumflex": (77, 0, 566, 657),
        "idieresis": (77, 0, 551, 625),
        "igrave": (77, 0, 545, 661),
        "ntilde": (18, 0, 642, 636),
        "oacute": (72, -15, 622, 661),
        "ocircumflex": (72, -15, 622, 657),
        "odieresis": (72, -15, 622, 625),
        "ograve": (72, -15, 622, 661),
        "otilde": (72, -15, 642, 636),
        "scaron": (67, -17, 632, 667),
        "uacute": (70, -15, 608, 661),
        "ucircumflex": (70, -15, 591, 657),
        "udieresis": (70, -15, 591, 625),
        "ugrave": (70, -15, 591, 661),
        "yacute": (-20, -142, 694, 661),
        "ydieresis": (-20, -142, 694, 625),
        "zcaron": (81, 0, 632, 667),
    },
    "Courier-Bold": {
        ".notdef": (0, 0, 0, 0),
        "exclam": (202, -15, 398, 572),
        "quotedbl": (135, 277, 465, 562),
        "numbersign": (56, -45, 544, 651),
        "dollar": (82, -126, 519, 666),
        "percent": (5, -15, 595, 616),
        "ampersand": (36, -15, 546, 543),
        "quoteright": (171, 277, 423, 562),
        "parenleft": (219, -102, 461, 616),
        "parenright": (139, -102, 381, 616),
        "asterisk": (91, 219, 509, 601),
        "plus": (71, 39, 529, 478),
        "comma": (123, -111, 393, 174),
        "hyphen": (100, 203, 500, 313),
        "period": (192, -15, 408, 171),
        "slash": (98, -77, 502, 626),
        "zero": (87, -15, 513, 616),
        "one": (81, 0, 539, 616),
        "two": (61, 0, 499, 616),
        "three": (63, -15, 501, 616),
        "four": (53, 0, 507, 616),
        "five": (70, -15, 521, 601),
        "six": (90, -15, 521, 616),
        "seven": (55, 0, 494, 601),
        "eight": (83, -15, 517, 616),
        "nine": (79, -15, 510, 616),
        "colon": (191, -15, 407, 425),
        "semicolon": (123, -111, 408, 425),
        "less": (66, 15, 523, 501),
        "equal": (71, 118, 529, 398),
        "greater": (77, 15, 534, 501),
        "question": (98, -14, 501, 580),
        "at": (16, -15, 584, 616),
        "A": (-9, 0, 609, 562),
        "B": (30, 0, 573, 562),
        "C": (22, -18, 560, 580),
        "D": (30, 0, 594, 562),
        "E": (25, 0, 560, 562),
        "F": (39, 0, 570, 562),
        "G": (22, -18, 594, 580),
        "H": (20, 0, 580, 562),
        "I": (77, 0, 523, 562),
        "J": (37, -18, 601, 562),
        "K": (21, 0, 599, 562),
        "L": (39, 0, 578, 562),
        "M": (-2, 0, 602, 562),
        "N": (8, -12, 610, 562),
        "O": (22, -18, 578, 580),
        "P": (48, 0, 559, 562),
        "Q": (32, -138, 578, 580),
        "R": (24, 0, 599, 562),
        "S": (47, -22, 553, 582),
        "T": (21, 0, 579, 562),
        "U": (4, -18, 596, 562),
        "V": (-13, 0, 613, 562),
        "W": (-18, 0, 618, 562),
        "X": (12, 0, 588, 562),
        "Y": (12, 0, 589, 562),
        "Z": (62, 0, 539, 562),
        "bracketleft": (245, -102, 475, 616),
        "backslash": (99, -77, 503, 626),
        "bracketright": (125, -102, 355, 616),
        "asciicircum": (108, 250, 492, 616),
        "underscore": (0, -125, 600, -75),
        "quoteleft": (178, 277, 428, 562),
        "a": (35, -15, 570, 454),
        "b": (0, -15, 584, 626),
        "c": (40, -15, 545, 459),
        "d": (20, -15, 591, 626),
        "e": (40, -15, 563, 454),
        "f": (83, 0, 547, 626),
        "g": (30, -146, 580, 454),
        "h": (5, 0, 592, 626),
        "i": (77, 0, 523, 658),
        "j": (63, -146, 440, 658),
        "k": (20, 0, 585, 626),
        "l": (77, 0, 523, 626),
        "m": (-22, 0, 626, 454),
        "n": (18, 0, 592, 454),
        "o": (30, -15, 570, 454),
        "p": (-1, -142, 570, 454),
        "q": (20, -142, 591, 454),
        "r": (47, 0, 580, 454),
        "s": (68, -17, 535, 459),
        "t": (47, -15, 532, 562),
        "u": (-1, -15, 569, 439),
        "v": (-1, 0, 601, 439),
        "w": (-18, 0, 618, 439),
        "x": (6, 0, 594, 439),
        "y": (-4, -142, 601, 439),
        "z": (81, 0, 520, 439),
        "braceleft": (160, -102, 464, 616),
        "bar": (255, -250, 345, 750),
        "braceright": (136, -102, 440, 616),
        "asciitilde": (71, 153, 530, 356),
        "exclamdown": (202, -146, 398, 449),
        "cent": (66, -49, 518, 614),
        "sterling": (72, -28, 558, 611),
        "fraction": (25, -60, 576, 661),
        "yen": (10, 0, 590, 562),
        "florin": (-30, -131, 572, 616),
        "section": (83, -70, 517, 580),
        "currency": (54, 49, 546, 517),
        "quotesingle": (227, 277, 373, 562),
        "quotedblleft": (71, 277, 535, 562),
        "guillemotleft": (8, 70, 553, 446),
        "guilsinglleft": (141, 70, 459, 446),
        "guilsinglright": (141, 70, 459, 446),
        "fi": (12, 0, 593, 626),
        "fl": (12, 0, 593, 626),
        "endash": (65, 203, 535, 313),
        "dagger": (106, -70, 494, 580),
        "daggerdbl": (106, -70, 494, 580),
        "periodcentered": (196, 165, 404, 351),
        "paragraph": (6, -70, 576, 580),
        "bullet": (140, 132, 460, 430),
        "quotesinglbase": (175, -142, 427, 143),
        "quotedblbase": (65, -142, 529, 143),
        "quotedblright": (61, 277, 525, 562),
        "guillemotright": (47, 70, 592, 446),
        "ellipsis": (26, -15, 574, 116),
        "perthousand": (-113, -15, 713, 616),
        "questiondown": (99, -146, 502, 449),
        "grave": (132, 508, 395, 661),
        "acute": (205, 508, 468, 661),
        "circumflex": (103, 483, 497, 657),
        "tilde": (89, 493, 512, 636),
        "macron": (88, 505, 512, 585),
        "breve": (83, 468, 517, 631),
        "dotaccent": (230, 485, 370, 625),
        "dieresis": (128, 485, 472, 625),
        "ring": (198, 481, 402, 678),
        "cedilla": (205, -206, 387, 0),
        "hungarumlaut": (68, 488, 588, 661),
        "ogonek": (169, -199, 367, 0),
        "caron": (103, 493, 497, 667),
        "emdash": (-10, 203, 610, 313),
        "AE": (-29, 0, 602, 562),
        "ordfeminine": (147, 196, 453, 580),
        "Lslash": (39, 0, 578, 562),
        "Oslash": (22, -22, 578, 584),
        "OE": (-25, 0, 595, 562),
        "ordmasculine": (147, 196, 453, 580),
        "ae": (-4, -15, 601, 454),
        "dotlessi": (77, 0, 523, 439),
        "lslash": (77, 0, 523, 626),
        "oslash": (30, -24, 570, 463),
        "oe": (-18, -15, 611, 454),
        "germandbls": (22, -15, 596, 626),
        "Scedilla": (47, -206, 553, 582),
        "multiply": (81, 39, 520, 478),
        "logicalnot": (71, 103, 529, 413),
        "format": (5, -146, 115, 601),
        "tab": (19, 0, 581, 562),
        "overscore": (0, 579, 600, 629),
        "IJ": (-8, -18, 622, 562),
        "trademark": (-9, 230, 749, 562),
        "onequarter": (-56, -60, 656, 661),
        "mu": (-1, -142, 569, 439),
        "minus": (71, 203, 529, 313),
        "brokenbar": (255, -175, 345, 675),
        "arrowleft": (-24, 143, 634, 455),
        "LL": (-45, 0, 645, 562),
        "arrowright": (-34, 143, 624, 455),
        "thorn": (-14, -142, 570, 626),
        "lira": (72, -28, 558, 611),
        "arrowboth": (-24, 143, 624, 455),
        "indent": (65, 45, 535, 372),
        "threesuperior": (138, 222, 433, 616),
        "onehalf": (-47, -60, 648, 661),
        "graybox": (76, 0, 525, 599),
        "Idot": (77, 0, 523, 748),
        "ll": (-12, 0, 600, 626),
        "Thorn": (48, 0, 557, 562),
        "Ccedilla": (22, -206, 560, 580),
        "notegraphic": (77, -15, 523, 572),
        "arrowup": (144, 3, 456, 626),
        "down": (137, -15, 464, 439),
        "plusminus": (71, 24, 529, 515),
        "threequarters": (-47, -60, 648, 661),
        "scedilla": (68, -206, 535, 459),
        "ij": (6, -146, 574, 658),
        "eth": (58, -27, 543, 626),
        "merge": (137, -15, 464, 487),
        "twosuperior": (143, 230, 436, 616),
        "arrowdown": (144, -15, 456, 608),
        "left": (65, 44, 535, 371),
        "return": (19, 0, 581, 562),
        "Eth": (30, 0, 594, 562),
        "up": (136, 0, 463, 447),
        "divide": (71, 16, 529, 500),
        "prescription": (24, -15, 599, 562),
        "square": (19, 0, 581, 562),
        "stop": (19, 0, 581, 562),
        "degree": (86, 243, 474, 616),
        "ccedilla": (40, -206, 545, 459),
        "onesuperior": (153, 230, 447, 616),
        "largebullet": (248, 229, 352, 333),
        "center": (40, 14, 560, 580),
        "registered": (0, -18, 600, 580),
        "copyright": (0, -18, 600, 580),
        "dectab": (8, 0, 592, 320),
        "space": (0, 0, 0, 0),
        "Aacute": (-9, 0, 609, 784),
        "Acircumflex": (-9, 0, 609, 780),
        "Adieresis": (-9, 0, 609, 748),
        "Agrave": (-9, 0, 609, 784),
        "Aring": (-9, 0, 609, 801),
        "Atilde": (-9, 0, 609, 759),
        "Eacute": (25, 0, 560, 784),
        "Ecircumflex": (25, 0, 560, 780),
        "Edieresis": (25, 0, 560, 748),
        "Egrave": (25, 0, 560, 784),
        "Gcaron": (22, -18, 594, 790),
        "Iacute": (77, 0, 523, 784),
        "Icircumflex": (77, 0, 523, 780),
        "Idieresis": (77, 0, 523, 748),
        "Igrave": (77, 0, 523, 784),
        "Ntilde": (8, -12, 610, 759),
        "Oacute": (22, -18, 578, 784),
        "Ocircumflex": (22, -18, 578, 780),
        "Odieresis": (22, -18, 578, 748),
        "Ograve": (22, -18, 578, 784),
        "Otilde": (22, -18, 578, 759),
        "Scaron": (47, -22, 553, 790),
        "Uacute": (4, -18, 596, 784),
        "Ucircumflex": (4, -18, 596, 780),
        "Udieresis": (4, -18, 596, 748),
        "Ugrave": (4, -18, 596, 784),
        "Yacute": (12, 0, 589, 784),
        "Ydieresis": (12, 0, 589, 748),
        "Zcaron": (62, 0, 539, 790),
        "aacute": (35, -15, 570, 661),
        "acircumflex": (35, -15, 570, 657),
        "adieresis": (35, -15, 570, 625),
        "agrave": (35, -15, 570, 661),
        "aring": (35, -15, 570, 678),
        "atilde": (35, -15, 570, 636),
        "eacute": (40, -15, 563, 661),
        "ecircumflex": (40, -15, 563, 657),
        "edieresis": (40, -15, 563, 625),
        "egrave": (40, -15, 563, 661),
        "gcaron": (30, -146, 580, 667),
        "iacute": (77, 0, 523, 661),
        "icircumflex": (63, 0, 523, 657),
        "idieresis": (77, 0, 523, 625),
        "igrave": (77, 0, 523, 661),
        "ntilde": (18, 0, 592, 636),
        "oacute": (30, -15, 570, 661),
        "ocircumflex": (30, -15, 570, 657),
        "odieresis": (30, -15, 570, 625),
        "ograve": (30, -15, 570, 661),
        "otilde": (30, -15, 570, 636),
        "scaron": (68, -17, 535, 667),
        "uacute": (-1, -15, 569, 661),
        "ucircumflex": (-1, -15, 569, 657),
        "udieresis": (-1, -15, 569, 625),
        "ugrave": (-1, -15, 569, 661),
        "yacute": (-4, -142, 601, 661),
        "ydieresis": (-4, -142, 601, 625),
        "zcaron": (81, 0, 520, 667),
    },
    "Courier": {
        ".notdef": (0, 0, 0, 0),
        "exclam": (236, -15, 364, 572),
        "quotedbl": (187, 328, 413, 562),
        "numbersign": (93, -32, 507, 639),
        "dollar": (105, -126, 496, 662),
        "percent": (81, -15, 518, 622),
        "ampersand": (63, -15, 538, 543),
        "quoteright": (213, 328, 376, 562),
        "parenleft": (269, -108, 440, 622),
        "parenright": (160, -108, 331, 622),
        "asterisk": (116, 257, 484, 607),
        "plus": (80, 44, 520, 470),
        "comma": (181, -112, 344, 122),
        "hyphen": (103, 231, 497, 285),
        "period": (229, -15, 371, 109),
        "slash": (125, -80, 475, 629),
        "zero": (106, -15, 494, 622),
        "one": (96, 0, 505, 622),
        "two": (70, 0, 471, 622),
        "three": (75, -15, 466, 622),
        "four": (78, 0, 500, 622),
        "five": (92, -15, 497, 607),
        "six": (111, -15, 497, 622),
        "seven": (82, 0, 483, 607),
        "eight": (102, -15, 498, 622),
        "nine": (96, -15, 489, 622),
        "colon": (229, -15, 371, 385),
        "semicolon": (181, -112, 371, 385),
        "less": (41, 42, 519, 472),
        "equal": (80, 138, 520, 376),
        "greater": (66, 42, 544, 472),
        "question": (129, -15, 492, 572),
        "at": (77, -15, 533, 622),
        "A": (3, 0, 597, 562),
        "B": (43, 0, 559, 562),
        "C": (41, -18, 540, 580),
        "D": (43, 0, 574, 562),
        "E": (53, 0, 550, 562),
        "F": (53, 0, 545, 562),
        "G": (31, -18, 575, 580),
        "H": (32, 0, 568, 562),
        "I": (96, 0, 504, 562),
        "J": (34, -18, 566, 562),
        "K": (38, 0, 582, 562),
        "L": (47, 0, 554, 562),
        "M": (4, 0, 596, 562),
        "N": (7, -13, 593, 562),
        "O": (43, -18, 557, 580),
        "P": (79, 0, 558, 562),
        "Q": (43, -138, 557, 580),
        "R": (38, 0, 588, 562),
        "S": (72, -20, 529, 580),
        "T": (38, 0, 563, 562),
        "U": (17, -18, 583, 562),
        "V": (-4, -13, 604, 562),
        "W": (-3, -13, 603, 562),
        "X": (23, 0, 577, 562),
        "Y": (24, 0, 576, 562),
        "Z": (86, 0, 514, 562),
        "bracketleft": (269, -108, 442, 622),
        "backslash": (118, -80, 482, 629),
        "bracketright": (158, -108, 331, 622),
        "asciicircum": (94, 354, 506, 622),
        "underscore": (0, -125, 600, -75),
        "quoteleft": (224, 328, 387, 562),
        "a": (53, -15, 559, 441),
        "b": (14, -15, 575, 629),
        "c": (66, -15, 529, 441),
        "d": (45, -15, 591, 629),
        "e": (66, -15, 548, 441),
        "f": (114, 0, 531, 629),
        "g": (45, -157, 566, 441),
        "h": (18, 0, 582, 629),
        "i": (95, 0, 505, 657),
        "j": (82, -157, 410, 657),
        "k": (43, 0, 580, 629),
        "l": (95, 0, 505, 629),
        "m": (-5, 0, 605, 441),
        "n": (26, 0, 575, 441),
        "o": (62, -15, 538, 441),
        "p": (9, -157, 555, 441),
        "q": (45, -157, 591, 441),
        "r": (60, 0, 559, 441),
        "s": (80, -15, 513, 441),
        "t": (87, -15, 530, 561),
        "u": (21, -15, 562, 426),
        "v": (10, -10, 590, 426),
        "w": (-4, -10, 604, 426),
        "x": (20, 0, 580, 426),
        "y": (7, -157, 592, 426),
        "z": (99, 0, 502, 426),
        "braceleft": (182, -108, 437, 622),
        "bar": (275, -250, 326, 750),
        "braceright": (163, -108, 418, 622),
        "asciitilde": (63, 197, 540, 320),
        "exclamdown": (236, -157, 364, 430),
        "cent": (96, -49, 500, 614),
        "sterling": (84, -21, 521, 611),
        "fraction": (92, -57, 509, 665),
        "yen": (26, 0, 574, 562),
        "florin": (4, -143, 539, 622),
        "section": (113, -78, 488, 580),
        "currency": (73, 58, 527, 506),
        "quotesingle": (259, 328, 341, 562),
        "quotedblleft": (143, 328, 471, 562),
        "guillemotleft": (37, 70, 563, 446),
        "guilsinglleft": (149, 70, 451, 446),
        "guilsinglright": (149, 70, 451, 446),
        "fi": (3, 0, 597, 629),
        "fl": (3, 0, 597, 629),
        "endash": (75, 231, 525, 285),
        "dagger": (141, -78, 459, 580),
        "daggerdbl": (141, -78, 459, 580),
        "periodcentered": (222, 189, 378, 327),
        "paragraph": (50, -78, 511, 562),
        "bullet": (172, 130, 428, 383),
        "quotesinglbase": (213, -134, 376, 100),
        "quotedblbase": (143, -134, 457, 100),
        "quotedblright": (143, 328, 457, 562),
        "guillemotright": (37, 70, 563, 446),
        "ellipsis": (37, -15, 563, 111),
        "perthousand": (3, -15, 600, 622),
        "questiondown": (108, -157, 471, 430),
        "grave": (151, 497, 378, 672),
        "acute": (242, 497, 469, 672),
        "circumflex": (124, 477, 476, 654),
        "tilde": (105, 489, 503, 606),
        "macron": (120, 525, 480, 565),
        "breve": (153, 501, 447, 609),
        "dotaccent": (249, 477, 352, 580),
        "dieresis": (148, 492, 453, 595),
        "ring": (218, 463, 382, 627),
        "cedilla": (224, -151, 362, 10),
        "hungarumlaut": (133, 497, 540, 672),
        "ogonek": (227, -151, 370, 0),
        "caron": (124, 492, 476, 669),
        "emdash": (0, 231, 600, 285),
        "AE": (3, 0, 550, 562),
        "ordfeminine": (156, 249, 442, 580),
        "Lslash": (47, 0, 554, 562),
        "Oslash": (43, -80, 557, 629),
        "OE": (7, 0, 567, 562),
        "ordmasculine": (157, 249, 443, 580),
        "ae": (19, -15, 570, 441),
        "dotlessi": (95, 0, 505, 426),
        "lslash": (95, 0, 505, 629),
        "oslash": (62, -80, 538, 506),
        "oe": (19, -15, 559, 441),
        "germandbls": (48, -15, 588, 629),
        "Scedilla": (72, -151, 529, 580),
        "multiply": (87, 43, 515, 470),
        "logicalnot": (87, 108, 513, 369),
        "format": (5, -157, 56, 607),
        "tab": (19, 0, 581, 562),
        "overscore": (0, 579, 600, 629),
        "IJ": (32, -18, 583, 562),
        "trademark": (-23, 263, 623, 562),
        "onequarter": (0, -57, 600, 665),
        "mu": (21, -157, 562, 426),
        "minus": (80, 232, 520, 283),
        "brokenbar": (275, -175, 326, 675),
        "arrowleft": (-24, 115, 624, 483),
        "LL": (8, 0, 592, 562),
        "arrowright": (-24, 115, 624, 483),
        "thorn": (-6, -157, 555, 629),
        "lira": (73, -21, 521, 611),
        "arrowboth": (-28, 115, 628, 483),
        "indent": (70, 68, 530, 348),
        "threesuperior": (155, 240, 406, 622),
        "onehalf": (0, -57, 611, 665),
        "graybox": (76, 0, 525, 599),
        "Idot": (96, 0, 504, 716),
        "ll": (18, 0, 567, 629),
        "Thorn": (79, 0, 538, 562),
        "Ccedilla": (41, -151, 540, 580),
        "notegraphic": (136, -15, 464, 572),
        "arrowup": (116, 0, 484, 623),
        "down": (160, -15, 440, 426),
        "plusminus": (87, 44, 513, 558),
        "threequarters": (8, -56, 593, 666),
        "scedilla": (80, -151, 513, 441),
        "ij": (37, -157, 490, 657),
        "eth": (62, -15, 538, 629),
        "merge": (160, -15, 440, 436),
        "twosuperior": (177, 249, 424, 622),
        "arrowdown": (116, -15, 484, 608),
        "left": (70, 68, 530, 348),
        "return": (19, 0, 581, 562),
        "Eth": (30, 0, 574, 562),
        "up": (160, 0, 440, 437),
        "divide": (87, 48, 513, 467),
        "prescription": (27, -15, 577, 562),
        "square": (19, 0, 581, 562),
        "stop": (19, 0, 581, 562),
        "degree": (123, 269, 477, 622),
        "ccedilla": (66, -151, 529, 441),
        "onesuperior": (172, 249, 428, 622),
        "largebullet": (261, 220, 339, 297),
        "center": (40, 14, 560, 580),
        "registered": (0, -18, 600, 580),
        "copyright": (0, -18, 600, 580),
        "dectab": (18, 0, 582, 227),
        "space": (0, 0, 0, 0),
        "Aacute": (3, 0, 597, 793),
        "Acircumflex": (3, 0, 597, 775),
        "Adieresis": (3, 0, 597, 731),
        "Agrave": (3, 0, 597, 793),
        "Aring": (3, 0, 597, 753),
        "Atilde": (3, 0, 597, 732),
        "Eacute": (53, 0, 550, 793),
        "Ecircumflex": (53, 0, 550, 775),
        "Edieresis": (53, 0, 550, 731),
        "Egrave": (53, 0, 550, 793),
        "Gcaron": (31, -18, 575, 805),
        "Iacute": (96, 0, 504, 793),
        "Icircumflex": (96, 0, 504, 775),
        "Idieresis": (96, 0, 504, 731),
        "Igrave": (96, 0, 504, 793),
        "Ntilde": (7, -13, 593, 732),
        "Oacute": (43, -18, 557, 793),
        "Ocircumflex": (43, -18, 557, 775),
        "Odieresis": (43, -18, 557, 731),
        "Ograve": (43, -18, 557, 793),
        "Otilde": (43, -18, 557, 732),
        "Scaron": (72, -20, 529, 805),
        "Uacute": (17, -18, 583, 793),
        "Ucircumflex": (17, -18, 583, 775),
        "Udieresis": (17, -18, 583, 731),
        "Ugrave": (17, -18, 583, 793),
        "Yacute": (24, 0, 576, 793),
        "Ydieresis": (24, 0, 576, 731),
        "Zcaron": (86, 0, 514, 805),
        "aacute": (53, -15, 559, 672),
        "acircumflex": (53, -15, 559, 654),
        "adieresis": (53, -15, 559, 595),
        "agrave": (53, -15, 559, 672),
        "aring": (53, -15, 559, 627),
        "atilde": (53, -15, 559, 606),
        "eacute": (66, -15, 548, 672),
        "ecircumflex": (66, -15, 548, 654),
        "edieresis": (66, -15, 548, 595),
        "egrave": (66, -15, 548, 672),
        "gcaron": (45, -157, 566, 669),
        "iacute": (95, 0, 505, 672),
        "icircumflex": (94, 0, 505, 654),
        "idieresis": (95, 0, 505, 595),
        "igrave": (95, 0, 505, 672),
        "ntilde": (26, 0, 575, 606),
        "oacute": (62, -15, 538, 672),
        "ocircumflex": (62, -15, 538, 654),
        "odieresis": (62, -15, 538, 595),
        "ograve": (62, -15, 538, 672),
        "otilde": (62, -15, 538, 606),
        "scaron": (80, -15, 513, 669),
        "uacute": (21, -15, 562, 672),
        "ucircumflex": (21, -15, 562, 654),
        "udieresis": (21, -15, 562, 595),
        "ugrave": (21, -15, 562, 672),
        "yacute": (7, -157, 592, 672),
        "ydieresis": (7, -157, 592, 595),
        "zcaron": (99, 0, 502, 669),
    },
    "Courier-Oblique": {
        ".notdef": (0, 0, 0, 0),
        "exclam": (244, -15, 464, 572),
        "quotedbl": (273, 328, 532, 562),
        "numbersign": (133, -32, 596, 639),
        "dollar": (108, -126, 596, 662),
        "percent": (134, -15, 599, 622),
        "ampersand": (87, -15, 580, 543),
        "quoteright": (283, 328, 495, 562),
        "parenleft": (314, -108, 572, 622),
        "parenright": (137, -108, 396, 622),
        "asterisk": (212, 257, 580, 607),
        "plus": (129, 44, 580, 470),
        "comma": (157, -112, 370, 122),
        "hyphen": (152, 231, 558, 285),
        "period": (238, -15, 382, 109),
        "slash": (112, -80, 604, 629),
        "zero": (155, -15, 574, 622),
        "one": (98, 0, 515, 622),
        "two": (70, 0, 568, 622),
        "three": (82, -15, 537, 622),
        "four": (108, 0, 541, 622),
        "five": (99, -15, 589, 607),
        "six": (155, -15, 629, 622),
        "seven": (182, 0, 612, 607),
        "eight": (133, -15, 588, 622),
        "nine": (93, -15, 574, 622),
        "colon": (238, -15, 441, 385),
        "semicolon": (157, -112, 441, 385),
        "less": (96, 42, 610, 472),
        "equal": (109, 138, 600, 376),
        "greater": (85, 42, 599, 472),
        "question": (222, -15, 583, 572),
        "at": (127, -15, 582, 622),
        "A": (3, 0, 607, 562),
        "B": (43, 0, 615, 562),
        "C": (94, -18, 655, 580),
        "D": (43, 0, 645, 562),
        "E": (53, 0, 660, 562),
        "F": (53, 0, 660, 562),
        "G": (84, -18, 645, 580),
        "H": (32, 0, 687, 562),
        "I": (96, 0, 623, 562),
        "J": (52, -18, 685, 562),
        "K": (38, 0, 671, 562),
        "L": (47, 0, 607, 562),
        "M": (4, 0, 715, 562),
        "N": (7, -13, 712, 562),
        "O": (95, -18, 625, 580),
        "P": (79, 0, 643, 562),
        "Q": (95, -138, 625, 580),
        "R": (38, 0, 598, 562),
        "S": (76, -20, 650, 580),
        "T": (108, 0, 665, 562),
        "U": (125, -18, 702, 562),
        "V": (105, -13, 723, 562),
        "W": (106, -13, 722, 562),
        "X": (23, 0, 675, 562),
        "Y": (133, 0, 695, 562),
        "Z": (86, 0, 610, 562),
        "bracketleft": (246, -108, 574, 622),
        "backslash": (249, -80, 468, 629),
        "bracketright": (135, -108, 463, 622),
        "asciicircum": (175, 354, 587, 622),
        "underscore": (-27, -125, 584, -75),
        "quoteleft": (343, 328, 457, 562),
        "a": (77, -15, 569, 441),
        "b": (29, -15, 625, 629),
        "c": (106, -15, 608, 441),
        "d": (86, -15, 640, 629),
        "e": (107, -15, 597, 441),
        "f": (114, 0, 662, 629),
        "g": (61, -157, 657, 441),
        "h": (33, 0, 592, 629),
        "i": (95, 0, 515, 657),
        "j": (52, -157, 550, 657),
        "k": (58, 0, 633, 629),
        "l": (95, 0, 515, 629),
        "m": (-5, 0, 615, 441),
        "n": (26, 0, 585, 441),
        "o": (102, -15, 588, 441),
        "p": (-24, -157, 605, 441),
        "q": (86, -157, 682, 441),
        "r": (60, 0, 636, 441),
        "s": (78, -15, 584, 441),
        "t": (167, -15, 561, 561),
        "u": (101, -15, 572, 426),
        "v": (90, -10, 681, 426),
        "w": (76, -10, 695, 426),
        "x": (20, 0, 655, 426),
        "y": (-4, -157, 683, 426),
        "z": (99, 0, 593, 426),
        "braceleft": (233, -108, 569, 622),
        "bar": (222, -250, 485, 750),
        "braceright": (140, -108, 477, 622),
        "asciitilde": (116, 197, 600, 320),
        "exclamdown": (225, -157, 445, 430),
        "cent": (152, -49, 588, 614),
        "sterling": (124, -21, 621, 611),
        "fraction": (84, -57, 646, 665),
        "yen": (120, 0, 693, 562),
        "florin": (-26, -143, 671, 622),
        "section": (104, -78, 590, 580),
        "currency": (94, 58, 628, 506),
        "quotesingle": (345, 328, 460, 562),
        "quotedblleft": (262, 328, 541, 562),
        "guillemotleft": (92, 70, 652, 446),
        "guilsinglleft": (204, 70, 540, 446),
        "guilsinglright": (170, 70, 506, 446),
        "fi": (3, 0, 619, 629),
        "fl": (3, 0, 619, 629),
        "endash": (124, 231, 586, 285),
        "dagger": (217, -78, 546, 580),
        "daggerdbl": (163, -78, 546, 580),
        "periodcentered": (276, 189, 434, 327),
        "paragraph": (100, -78, 630, 562),
        "bullet": (225, 130, 485, 383),
        "quotesinglbase": (185, -134, 397, 100),
        "quotedblbase": (115, -134, 478, 100),
        "quotedblright": (213, 328, 576, 562),
        "guillemotright": (58, 70, 618, 446),
        "ellipsis": (46, -15, 574, 111),
        "perthousand": (59, -15, 626, 622),
        "questiondown": (106, -157, 466, 430),
        "grave": (294, 497, 484, 672),
        "acute": (348, 497, 612, 672),
        "circumflex": (229, 477, 581, 654),
        "tilde": (212, 489, 629, 606),
        "macron": (232, 525, 600, 565),
        "breve": (279, 501, 576, 609),
        "dotaccent": (360, 477, 465, 580),
        "dieresis": (263, 492, 570, 595),
        "ring": (333, 463, 499, 627),
        "cedilla": (197, -151, 344, 10),
        "hungarumlaut": (239, 497, 683, 672),
        "ogonek": (207, -151, 348, 0),
        "caron": (262, 492, 614, 669),
        "emdash": (49, 231, 661, 285),
        "AE": (3, 0, 655, 562),
        "ordfeminine": (209, 249, 512, 580),
        "Lslash": (47, 0, 607, 562),
        "Oslash": (95, -80, 625, 629),
        "OE": (60, 0, 672, 562),
        "ordmasculine": (210, 249, 534, 580),
        "ae": (42, -15, 626, 441),
        "dotlessi": (95, 0, 515, 426),
        "lslash": (95, 0, 583, 629),
        "oslash": (102, -80, 588, 506),
        "oe": (55, -15, 615, 441),
        "germandbls": (48, -15, 617, 629),
        "Scedilla": (76, -151, 650, 580),
        "multiply": (103, 43, 607, 470),
        "logicalnot": (155, 108, 591, 369),
        "format": (-28, -157, 185, 607),
        "tab": (19, 0, 641, 562),
        "overscore": (123, 579, 734, 629),
        "IJ": (32, -18, 702, 562),
        "trademark": (75, 263, 742, 562),
        "onequarter": (65, -57, 674, 665),
        "mu": (72, -157, 572, 426),
        "minus": (129, 232, 580, 283),
        "brokenbar": (238, -175, 469, 675),
        "arrowleft": (40, 115, 693, 483),
        "LL": (8, 0, 647, 562),
        "arrowright": (34, 115, 688, 483),
        "thorn": (-24, -157, 605, 629),
        "lira": (118, -21, 621, 611),
        "arrowboth": (36, 115, 692, 483),
        "indent": (108, 68, 574, 348),
        "threesuperior": (213, 240, 500, 622),
        "onehalf": (65, -57, 669, 665),
        "graybox": (76, 0, 652, 599),
        "Idot": (96, 0, 623, 716),
        "ll": (33, 0, 616, 629),
        "Thorn": (79, 0, 605, 562),
        "Ccedilla": (94, -151, 658, 580),
        "notegraphic": (144, -15, 564, 572),
        "arrowup": (209, 0, 577, 623),
        "down": (187, -15, 467, 426),
        "plusminus": (96, 44, 594, 558),
        "threequarters": (73, -56, 659, 666),
        "scedilla": (78, -151, 584, 441),
        "ij": (37, -157, 630, 657),
        "eth": (102, -15, 639, 629),
        "merge": (187, -15, 503, 436),
        "twosuperior": (230, 249, 534, 622),
        "arrowdown": (152, -15, 520, 608),
        "left": (114, 68, 580, 348),
        "return": (79, 0, 700, 562),
        "Eth": (43, 0, 645, 562),
        "up": (223, 0, 503, 437),
        "divide": (136, 48, 573, 467),
        "prescription": (27, -15, 617, 562),
        "square": (19, 0, 700, 562),
        "stop": (19, 0, 700, 562),
        "degree": (214, 269, 575, 622),
        "ccedilla": (106, -151, 614, 441),
        "onesuperior": (231, 249, 491, 622),
        "largebullet": (316, 220, 394, 297),
        "center": (103, 14, 623, 580),
        "registered": (54, -18, 666, 580),
        "copyright": (54, -18, 666, 580),
        "dectab": (18, 0, 593, 227),
        "space": (0, 0, 0, 0),
        "Aacute": (3, 0, 658, 793),
        "Acircumflex": (3, 0, 607, 775),
        "Adieresis": (3, 0, 607, 731),
        "Agrave": (3, 0, 607, 793),
        "Aring": (3, 0, 607, 753),
        "Atilde": (3, 0, 656, 732),
        "Eacute": (53, 0, 668, 793),
        "Ecircumflex": (53, 0, 660, 775),
        "Edieresis": (53, 0, 660, 731),
        "Egrave": (53, 0, 660, 793),
        "Gcaron": (84, -18, 645, 805),
        "Iacute": (96, 0, 638, 793),
        "Icircumflex": (96, 0, 623, 775),
        "Idieresis": (96, 0, 623, 731),
        "Igrave": (96, 0, 623, 793),
        "Ntilde": (7, -13, 712, 732),
        "Oacute": (95, -18, 638, 793),
        "Ocircumflex": (95, -18, 625, 775),
        "Odieresis": (95, -18, 625, 731),
        "Ograve": (95, -18, 625, 793),
        "Otilde": (95, -18, 656, 732),
        "Scaron": (76, -20, 673, 805),
        "Uacute": (125, -18, 702, 793),
        "Ucircumflex": (125, -18, 702, 775),
        "Udieresis": (125, -18, 702, 731),
        "Ugrave": (125, -18, 702, 793),
        "Yacute": (133, 0, 695, 793),
        "Ydieresis": (133, 0, 695, 731),
        "Zcaron": (86, 0, 643, 805),
        "aacute": (77, -15, 612, 672),
        "acircumflex": (77, -15, 581, 654),
        "adieresis": (77, -15, 570, 595),
        "agrave": (77, -15, 569, 672),
        "aring": (77, -15, 569, 627),
        "atilde": (77, -15, 629, 606),
        "eacute": (107, -15, 612, 672),
        "ecircumflex": (107, -15, 597, 654),
        "edieresis": (107, -15, 597, 595),
        "egrave": (107, -15, 597, 672),
        "gcaron": (61, -157, 657, 669),
        "iacute": (95, 0, 612, 672),
        "icircumflex": (95, 0, 551, 654),
        "idieresis": (95, 0, 540, 595),
        "igrave": (95, 0, 515, 672),
        "ntilde": (26, 0, 629, 606),
        "oacute": (102, -15, 612, 672),
        "ocircumflex": (102, -15, 588, 654),
        "odieresis": (102, -15, 588, 595),
        "ograve": (102, -15, 588, 672),
        "otilde": (102, -15, 629, 606),
        "scaron": (78, -15, 614, 669),
        "uacute": (101, -15, 602, 672),
        "ucircumflex": (101, -15, 572, 654),
        "udieresis": (101, -15, 572, 595),
        "ugrave": (101, -15, 572, 672),
        "yacute": (-4, -157, 683, 672),
        "ydieresis": (-4, -157, 683, 595),
        "zcaron": (99, 0, 624, 669),
    },
    "Helvetica-BoldOblique": {
        ".notdef": (0, 0, 0, 0),
        "exclam": (94, 0, 397, 718),
        "quotedbl": (193, 447, 529, 718),
        "numbersign": (60, 0, 644, 698),
        "dollar": (67, -115, 621, 775),
        "percent": (137, -19, 900, 710),
        "ampersand": (89, -19, 732, 718),
        "quoteright": (167, 445, 362, 718),
        "parenleft": (76, -208, 470, 734),
        "parenright": (-25, -208, 368, 734),
        "asterisk": (146, 387, 481, 718),
        "plus": (82, 0, 610, 506),
        "comma": (28, -168, 245, 146),
        "hyphen": (73, 215, 379, 345),
        "period": (64, 0, 245, 146),
        "slash": (-37, -19, 468, 737),
        "zero": (87, -19, 617, 710),
        "one": (173, 0, 529, 710),
        "two": (26, 0, 619, 710),
        "three": (66, -19, 608, 710),
        "four": (60, 0, 598, 710),
        "five": (64, -19, 636, 698),
        "six": (86, -19, 619, 710),
        "seven": (125, 0, 676, 698),
        "eight": (70, -19, 615, 710),
        "nine": (78, -19, 615, 710),
        "colon": (92, 0, 351, 512),
        "semicolon": (56, -168, 351, 512),
        "less": (82, -8, 655, 514),
        "equal": (58, 87, 633, 419),
        "greater": (36, -8, 609, 514),
        "question": (165, 0, 670, 727),
        "at": (186, -19, 953, 737),
        "A": (20, 0, 702, 718),
        "B": (76, 0, 763, 718),
        "C": (107, -19, 788, 737),
        "D": (76, 0, 777, 718),
        "E": (76, 0, 757, 718),
        "F": (76, 0, 740, 718),
        "G": (108, -19, 816, 737),
        "H": (71, 0, 804, 718),
        "I": (64, 0, 367, 718),
        "J": (60, -18, 637, 718),
        "K": (87, 0, 858, 718),
        "L": (76, 0, 611, 718),
        "M": (69, 0, 918, 718),
        "N": (69, 0, 807, 718),
        "O": (108, -19, 823, 737),
        "P": (76, 0, 737, 718),
        "Q": (108, -52, 823, 737),
        "R": (76, 0, 778, 718),
        "S": (81, -19, 717, 737),
        "T": (140, 0, 751, 718),
        "U": (116, -19, 804, 718),
        "V": (172, 0, 801, 718),
        "W": (169, 0, 1082, 718),
        "X": (14, 0, 791, 718),
        "Y": (168, 0, 806, 718),
        "Z": (25, 0, 737, 718),
        "bracketleft": (21, -196, 462, 722),
        "backslash": (124, -19, 307, 737),
        "bracketright": (-18, -196, 423, 722),
        "asciicircum": (131, 323, 591, 698),
        "underscore": (-27, -125, 540, -75),
        "quoteleft": (165, 454, 361, 727),
        "a": (55, -14, 582, 546),
        "b": (61, -14, 645, 718),
        "c": (79, -14, 599, 546),
        "d": (83, -14, 704, 718),
        "e": (71, -14, 592, 546),
        "f": (87, 0, 469, 727),
        "g": (39, -217, 666, 546),
        "h": (65, 0, 629, 718),
        "i": (69, 0, 363, 725),
        "j": (-42, -214, 363, 725),
        "k": (69, 0, 670, 718),
        "l": (69, 0, 362, 718),
        "m": (64, 0, 909, 546),
        "n": (65, 0, 629, 546),
        "o": (83, -14, 643, 546),
        "p": (18, -207, 645, 546),
        "q": (81, -207, 665, 546),
        "r": (64, 0, 489, 546),
        "s": (63, -14, 584, 546),
        "t": (101, -6, 422, 676),
        "u": (99, -14, 658, 532),
        "v": (126, 0, 656, 532),
        "w": (123, 0, 882, 532),
        "x": (15, 0, 648, 532),
        "y": (42, -214, 652, 532),
        "z": (20, 0, 583, 532),
        "braceleft": (94, -196, 518, 722),
        "bar": (80, -19, 353, 737),
        "braceright": (-18, -196, 407, 722),
        "asciitilde": (115, 163, 577, 343),
        "exclamdown": (50, -186, 353, 532),
        "cent": (79, -118, 599, 628),
        "sterling": (50, -16, 635, 718),
        "fraction": (-174, -19, 487, 710),
        "yen": (60, 0, 713, 698),
        "florin": (-50, -210, 669, 737),
        "section": (61, -184, 598, 727),
        "currency": (27, 76, 680, 636),
        "quotesingle": (165, 447, 321, 718),
        "quotedblleft": (160, 454, 588, 727),
        "guillemotleft": (135, 76, 571, 484),
        "guilsinglleft": (130, 76, 353, 484),
        "guilsinglright": (99, 76, 322, 484),
        "fi": (87, 0, 696, 727),
        "fl": (87, 0, 695, 727),
        "endash": (48, 227, 627, 333),
        "dagger": (118, -171, 626, 718),
        "daggerdbl": (46, -171, 628, 718),
        "periodcentered": (111, 172, 275, 334),
        "paragraph": (99, -191, 688, 700),
        "bullet": (84, 194, 420, 524),
        "quotesinglbase": (41, -146, 236, 127),
        "quotedblbase": (36, -146, 463, 127),
        "quotedblright": (162, 445, 589, 718),
        "guillemotright": (104, 76, 540, 484),
        "ellipsis": (92, 0, 939, 146),
        "perthousand": (76, -19, 1038, 710),
        "questiondown": (54, -195, 559, 532),
        "grave": (136, 604, 353, 750),
        "acute": (236, 604, 515, 750),
        "circumflex": (118, 604, 471, 750),
        "tilde": (113, 610, 507, 737),
        "macron": (122, 604, 483, 678),
        "breve": (156, 604, 494, 750),
        "dotaccent": (235, 614, 385, 729),
        "dieresis": (137, 614, 482, 729),
        "ring": (200, 568, 420, 776),
        "cedilla": (-37, -228, 219, 0),
        "hungarumlaut": (137, 604, 645, 750),
        "ogonek": (41, -228, 264, 0),
        "caron": (149, 604, 502, 750),
        "emdash": (48, 227, 1071, 333),
        "AE": (5, 0, 1100, 718),
        "ordfeminine": (92, 276, 464, 737),
        "Lslash": (34, 0, 611, 718),
        "Oslash": (35, -27, 894, 745),
        "OE": (99, -19, 1114, 737),
        "ordmasculine": (92, 276, 484, 737),
        "ae": (56, -14, 922, 546),
        "dotlessi": (69, 0, 322, 532),
        "lslash": (40, 0, 407, 718),
        "oslash": (22, -29, 701, 560),
        "oe": (83, -14, 976, 546),
        "germandbls": (69, -14, 657, 731),
        "onesuperior": (148, 283, 388, 710),
        "logicalnot": (105, 108, 633, 419),
        "mu": (22, -207, 658, 532),
        "trademark": (179, 306, 1109, 718),
        "Eth": (62, 0, 777, 718),
        "onehalf": (132, -19, 858, 710),
        "plusminus": (40, 0, 625, 506),
        "Thorn": (76, 0, 715, 718),
        "onequarter": (132, -19, 806, 710),
        "divide": (82, -42, 610, 548),
        "brokenbar": (80, -19, 353, 737),
        "degree": (175, 426, 467, 712),
        "thorn": (18, -208, 645, 718),
        "threequarters": (100, -19, 839, 710),
        "twosuperior": (69, 283, 448, 710),
        "registered": (56, -19, 834, 737),
        "minus": (82, 197, 610, 309),
        "eth": (82, -14, 670, 737),
        "multiply": (57, 1, 635, 505),
        "threesuperior": (92, 271, 440, 710),
        "copyright": (57, -19, 835, 737),
        "space": (0, 0, 0, 0),
        "Aacute": (20, 0, 750, 936),
        "Acircumflex": (20, 0, 706, 936),
        "Adieresis": (20, 0, 716, 915),
        "Agrave": (20, 0, 702, 936),
        "Aring": (20, 0, 702, 962),
        "Atilde": (20, 0, 741, 923),
        "Ccedilla": (107, -228, 788, 737),
        "Eacute": (76, 0, 757, 936),
        "Ecircumflex": (76, 0, 757, 936),
        "Edieresis": (76, 0, 757, 915),
        "Egrave": (76, 0, 757, 936),
        "Iacute": (64, 0, 528, 936),
        "Icircumflex": (64, 0, 484, 936),
        "Idieresis": (64, 0, 494, 915),
        "Igrave": (64, 0, 367, 936),
        "Ntilde": (69, 0, 807, 923),
        "Oacute": (108, -19, 823, 936),
        "Ocircumflex": (108, -19, 823, 936),
        "Odieresis": (108, -19, 823, 915),
        "Ograve": (108, -19, 823, 936),
        "Otilde": (108, -19, 823, 923),
        "Scaron": (81, -19, 717, 936),
        "Uacute": (116, -19, 804, 936),
        "Ucircumflex": (116, -19, 804, 936),
        "Udieresis": (116, -19, 804, 915),
        "Ugrave": (116, -19, 804, 936),
        "Yacute": (168, 0, 806, 936),
        "Ydieresis": (168, 0, 806, 915),
        "Zcaron": (25, 0, 737, 936),
        "aacute": (55, -14, 627, 750),
        "acircumflex": (55, -14, 583, 750),
        "adieresis": (55, -14, 594, 729),
        "agrave": (55, -14, 582, 750),
        "aring": (55, -14, 582, 776),
        "atilde": (55, -14, 619, 737),
        "ccedilla": (79, -228, 599, 546),
        "eacute": (71, -14, 627, 750),
        "ecircumflex": (71, -14, 592, 750),
        "edieresis": (71, -14, 594, 729),
        "egrave": (71, -14, 592, 750),
        "iacute": (69, 0, 488, 750),
        "icircumflex": (69, 0, 444, 750),
        "idieresis": (69, 0, 455, 729),
        "igrave": (69, 0, 326, 750),
        "ntilde": (65, 0, 646, 737),
        "oacute": (83, -14, 654, 750),
        "ocircumflex": (83, -14, 643, 750),
        "odieresis": (83, -14, 643, 729),
        "ograve": (83, -14, 643, 750),
        "otilde": (83, -14, 646, 737),
        "scaron": (63, -14, 614, 750),
        "uacute": (99, -14, 658, 750),
        "ucircumflex": (99, -14, 658, 750),
        "udieresis": (99, -14, 658, 729),
        "ugrave": (99, -14, 658, 750),
        "yacute": (42, -214, 652, 750),
        "ydieresis": (42, -214, 652, 729),
        "zcaron": (20, 0, 586, 750),
    },
    "Helvetica-Bold": {
        ".notdef": (0, 0, 0, 0),
        "exclam": (90, 0, 244, 718),
        "quotedbl": (98, 447, 376, 718),
        "numbersign": (18, 0, 538, 698),
        "dollar": (30, -115, 523, 775),
        "percent": (28, -19, 861, 710),
        "ampersand": (54, -19, 701, 718),
        "quoteright": (69, 445, 209, 718),
        "parenleft": (35, -208, 314, 734),
        "parenright": (19, -208, 298, 734),
        "asterisk": (27, 387, 362, 718),
        "plus": (40, 0, 544, 506),
        "comma": (64, -168, 214, 146),
        "hyphen": (27, 215, 306, 345),
        "period": (64, 0, 214, 146),
        "slash": (-33, -19, 311, 737),
        "zero": (32, -19, 524, 710),
        "one": (69, 0, 378, 710),
        "two": (26, 0, 511, 710),
        "three": (27, -19, 516, 710),
        "four": (27, 0, 526, 710),
        "five": (27, -19, 516, 698),
        "six": (31, -19, 520, 710),
        "seven": (25, 0, 528, 698),
        "eight": (32, -19, 524, 710),
        "nine": (30, -19, 522, 710),
        "colon": (92, 0, 242, 512),
        "semicolon": (92, -168, 242, 512),
        "less": (38, -8, 546, 514),
        "equal": (40, 87, 544, 419),
        "greater": (38, -8, 546, 514),
        "question": (60, 0, 556, 727),
        "at": (118, -19, 856, 737),
        "A": (20, 0, 702, 718),
        "B": (76, 0, 669, 718),
        "C": (44, -19, 684, 737),
        "D": (76, 0, 685, 718),
        "E": (76, 0, 621, 718),
        "F": (76, 0, 587, 718),
        "G": (44, -19, 713, 737),
        "H": (71, 0, 651, 718),
        "I": (64, 0, 214, 718),
        "J": (22, -18, 484, 718),
        "K": (87, 0, 722, 718),
        "L": (76, 0, 583, 718),
        "M": (69, 0, 765, 718),
        "N": (69, 0, 654, 718),
        "O": (44, -19, 734, 737),
        "P": (76, 0, 627, 718),
        "Q": (44, -52, 737, 737),
        "R": (76, 0, 677, 718),
        "S": (39, -19, 629, 737),
        "T": (14, 0, 598, 718),
        "U": (72, -19, 651, 718),
        "V": (19, 0, 648, 718),
        "W": (16, 0, 929, 718),
        "X": (14, 0, 653, 718),
        "Y": (15, 0, 653, 718),
        "Z": (25, 0, 586, 718),
        "bracketleft": (63, -196, 309, 722),
        "backslash": (-33, -19, 311, 737),
        "bracketright": (24, -196, 270, 722),
        "asciicircum": (62, 323, 522, 698),
        "underscore": (0, -125, 556, -75),
        "quoteleft": (69, 454, 209, 727),
        "a": (29, -14, 527, 546),
        "b": (61, -14, 578, 718),
        "c": (34, -14, 524, 546),
        "d": (34, -14, 551, 718),
        "e": (23, -14, 528, 546),
        "f": (10, 0, 318, 727),
        "g": (40, -217, 553, 546),
        "h": (65, 0, 546, 718),
        "i": (69, 0, 209, 725),
        "j": (3, -214, 209, 725),
        "k": (69, 0, 562, 718),
        "l": (69, 0, 209, 718),
        "m": (64, 0, 826, 546),
        "n": (65, 0, 546, 546),
        "o": (34, -14, 578, 546),
        "p": (62, -207, 578, 546),
        "q": (34, -207, 552, 546),
        "r": (64, 0, 373, 546),
        "s": (30, -14, 519, 546),
        "t": (10, -6, 309, 676),
        "u": (66, -14, 545, 532),
        "v": (13, 0, 543, 532),
        "w": (10, 0, 769, 532),
        "x": (15, 0, 541, 532),
        "y": (10, -214, 539, 532),
        "z": (20, 0, 480, 532),
        "braceleft": (48, -196, 365, 722),
        "bar": (84, -19, 196, 737),
        "braceright": (24, -196, 341, 722),
        "asciitilde": (61, 163, 523, 343),
        "exclamdown": (90, -186, 244, 532),
        "cent": (34, -118, 524, 628),
        "sterling": (28, -16, 541, 718),
        "fraction": (-170, -19, 336, 710),
        "yen": (-9, 0, 565, 698),
        "florin": (-10, -210, 516, 737),
        "section": (34, -184, 522, 727),
        "currency": (-3, 76, 559, 636),
        "quotesingle": (70, 447, 168, 718),
        "quotedblleft": (64, 454, 436, 727),
        "guillemotleft": (88, 76, 468, 484),
        "guilsinglleft": (83, 76, 250, 484),
        "guilsinglright": (83, 76, 250, 484),
        "fi": (10, 0, 542, 727),
        "fl": (10, 0, 542, 727),
        "endash": (0, 227, 556, 333),
        "dagger": (36, -171, 520, 718),
        "daggerdbl": (36, -171, 520, 718),
        "periodcentered": (58, 172, 220, 334),
        "paragraph": (-8, -191, 539, 700),
        "bullet": (10, 194, 340, 524),
        "quotesinglbase": (69, -146, 209, 127),
        "quotedblbase": (64, -146, 436, 127),
        "quotedblright": (64, 445, 436, 718),
        "guillemotright": (88, 76, 468, 484),
        "ellipsis": (92, 0, 908, 146),
        "perthousand": (-3, -19, 1003, 710),
        "questiondown": (55, -195, 551, 532),
        "grave": (-23, 604, 225, 750),
        "acute": (108, 604, 356, 750),
        "circumflex": (-10, 604, 343, 750),
        "tilde": (-17, 610, 350, 737),
        "macron": (-6, 604, 339, 678),
        "breve": (-2, 604, 335, 750),
        "dotaccent": (104, 614, 230, 729),
        "dieresis": (6, 614, 327, 729),
        "ring": (59, 568, 275, 776),
        "cedilla": (6, -228, 245, 0),
        "hungarumlaut": (9, 604, 486, 750),
        "ogonek": (71, -228, 304, 0),
        "caron": (-10, 604, 343, 750),
        "emdash": (0, 227, 1000, 333),
        "AE": (5, 0, 954, 718),
        "ordfeminine": (22, 276, 347, 737),
        "Lslash": (-20, 0, 583, 718),
        "Oslash": (33, -27, 744, 745),
        "OE": (37, -19, 961, 737),
        "ordmasculine": (6, 276, 360, 737),
        "ae": (29, -14, 858, 546),
        "dotlessi": (69, 0, 209, 532),
        "lslash": (-18, 0, 296, 718),
        "oslash": (22, -29, 589, 560),
        "oe": (34, -14, 912, 546),
        "germandbls": (69, -14, 579, 731),
        "onesuperior": (26, 283, 237, 710),
        "logicalnot": (40, 108, 544, 419),
        "mu": (66, -207, 545, 532),
        "trademark": (44, 306, 956, 718),
        "Eth": (-5, 0, 685, 718),
        "onehalf": (26, -19, 794, 710),
        "plusminus": (40, 0, 544, 506),
        "Thorn": (76, 0, 627, 718),
        "onequarter": (26, -19, 766, 710),
        "divide": (40, -42, 544, 548),
        "brokenbar": (84, -19, 196, 737),
        "degree": (57, 426, 343, 712),
        "thorn": (62, -208, 578, 718),
        "threequarters": (16, -19, 799, 710),
        "twosuperior": (9, 283, 324, 710),
        "registered": (-11, -19, 748, 737),
        "minus": (40, 197, 544, 309),
        "eth": (34, -14, 578, 737),
        "multiply": (40, 1, 545, 505),
        "threesuperior": (8, 271, 326, 710),
        "copyright": (-11, -19, 749, 737),
        "space": (0, 0, 0, 0),
        "Aacute": (20, 0, 702, 936),
        "Acircumflex": (20, 0, 702, 936),
        "Adieresis": (20, 0, 702, 915),
        "Agrave": (20, 0, 702, 936),
        "Aring": (20, 0, 702, 962),
        "Atilde": (20, 0, 702, 923),
        "Ccedilla": (44, -228, 684, 737),
        "Eacute": (76, 0, 621, 936),
        "Ecircumflex": (76, 0, 621, 936),
        "Edieresis": (76, 0, 621, 915),
        "Egrave": (76, 0, 621, 936),
        "Iacute": (64, 0, 329, 936),
        "Icircumflex": (-37, 0, 316, 936),
        "Idieresis": (-21, 0, 300, 915),
        "Igrave": (-50, 0, 214, 936),
        "Ntilde": (69, 0, 654, 923),
        "Oacute": (44, -19, 734, 936),
        "Ocircumflex": (44, -19, 734, 936),
        "Odieresis": (44, -19, 734, 915),
        "Ograve": (44, -19, 734, 936),
        "Otilde": (44, -19, 734, 923),
        "Scaron": (39, -19, 629, 936),
        "Uacute": (72, -19, 651, 936),
        "Ucircumflex": (72, -19, 651, 936),
        "Udieresis": (72, -19, 651, 915),
        "Ugrave": (72, -19, 651, 936),
        "Yacute": (15, 0, 653, 936),
        "Ydieresis": (15, 0, 653, 915),
        "Zcaron": (25, 0, 586, 936),
        "aacute": (29, -14, 527, 750),
        "acircumflex": (29, -14, 527, 750),
        "adieresis": (29, -14, 527, 729),
        "agrave": (29, -14, 527, 750),
        "aring": (29, -14, 527, 776),
        "atilde": (29, -14, 527, 737),
        "ccedilla": (34, -228, 524, 546),
        "eacute": (23, -14, 528, 750),
        "ecircumflex": (23, -14, 528, 750),
        "edieresis": (23, -14, 528, 729),
        "egrave": (23, -14, 528, 750),
        "iacute": (69, 0, 329, 750),
        "icircumflex": (-37, 0, 316, 750),
        "idieresis": (-21, 0, 300, 729),
        "igrave": (-50, 0, 209, 750),
        "ntilde": (65, 0, 546, 737),
        "oacute": (34, -14, 578, 750),
        "ocircumflex": (34, -14, 578, 750),
        "odieresis": (34, -14, 578, 729),
        "ograve": (34, -14, 578, 750),
        "otilde": (34, -14, 578, 737),
        "scaron": (30, -14, 519, 750),
        "uacute": (66, -14, 545, 750),
        "ucircumflex": (66, -14, 545, 750),
        "udieresis": (66, -14, 545, 729),
        "ugrave": (66, -14, 545, 750),
        "yacute": (10, -214, 539, 750),
        "ydieresis": (10, -214, 539, 729),
        "zcaron": (20, 0, 480, 750),
    },
    "Helvetica-Oblique": {
        ".notdef": (0, 0, 0, 0),
        "exclam": (90, 0, 340, 718),
        "quotedbl": (168, 463, 438, 718),
        "numbersign": (73, 0, 631, 688),
        "dollar": (69, -115, 617, 775),
        "percent": (147, -19, 888, 703),
        "ampersand": (78, -15, 647, 718),
        "quoteright": (151, 463, 310, 718),
        "parenleft": (108, -207, 454, 733),
        "parenright": (-9, -207, 336, 733),
        "asterisk": (165, 431, 475, 718),
        "plus": (85, 0, 606, 505),
        "comma": (56, -147, 214, 106),
        "hyphen": (93, 232, 357, 322),
        "period": (87, 0, 214, 106),
        "slash": (-21, -19, 452, 737),
        "zero": (94, -19, 607, 703),
        "one": (207, 0, 508, 703),
        "two": (26, 0, 617, 703),
        "three": (75, -19, 609, 703),
        "four": (61, 0, 576, 703),
        "five": (68, -19, 621, 688),
        "six": (91, -19, 615, 703),
        "seven": (137, 0, 669, 688),
        "eight": (74, -19, 606, 703),
        "nine": (83, -19, 608, 703),
        "colon": (87, 0, 301, 516),
        "semicolon": (56, -147, 301, 516),
        "less": (94, 11, 641, 495),
        "equal": (63, 115, 628, 390),
        "greater": (50, 11, 597, 495),
        "question": (161, 0, 610, 727),
        "at": (215, -19, 964, 737),
        "A": (14, 0, 654, 718),
        "B": (74, 0, 711, 718),
        "C": (108, -19, 781, 737),
        "D": (81, 0, 763, 718),
        "E": (86, 0, 762, 718),
        "F": (86, 0, 736, 718),
        "G": (111, -19, 798, 737),
        "H": (77, 0, 799, 718),
        "I": (91, 0, 341, 718),
        "J": (47, -19, 581, 718),
        "K": (76, 0, 808, 718),
        "L": (76, 0, 555, 718),
        "M": (73, 0, 914, 718),
        "N": (76, 0, 799, 718),
        "O": (105, -19, 825, 737),
        "P": (86, 0, 736, 718),
        "Q": (105, -56, 825, 737),
        "R": (88, 0, 773, 718),
        "S": (90, -19, 712, 737),
        "T": (148, 0, 750, 718),
        "U": (124, -19, 797, 718),
        "V": (173, 0, 800, 718),
        "W": (169, 0, 1081, 718),
        "X": (19, 0, 790, 718),
        "Y": (167, 0, 806, 718),
        "Z": (23, 0, 741, 718),
        "bracketleft": (21, -196, 403, 722),
        "backslash": (140, -19, 291, 737),
        "bracketright": (-14, -196, 368, 722),
        "asciicircum": (42, 264, 539, 688),
        "underscore": (-27, -125, 540, -75),
        "quoteleft": (165, 470, 323, 725),
        "a": (62, -15, 558, 538),
        "b": (58, -15, 584, 718),
        "c": (75, -15, 553, 538),
        "d": (84, -15, 652, 718),
        "e": (85, -15, 578, 538),
        "f": (86, 0, 416, 728),
        "g": (42, -220, 610, 538),
        "h": (65, 0, 572, 718),
        "i": (67, 0, 308, 718),
        "j": (-60, -210, 308, 718),
        "k": (67, 0, 600, 718),
        "l": (67, 0, 308, 718),
        "m": (65, 0, 851, 538),
        "n": (65, 0, 572, 538),
        "o": (84, -14, 584, 538),
        "p": (14, -207, 584, 538),
        "q": (84, -207, 605, 538),
        "r": (77, 0, 446, 538),
        "s": (64, -15, 529, 538),
        "t": (103, -7, 368, 669),
        "u": (95, -15, 600, 523),
        "v": (119, 0, 603, 523),
        "w": (125, 0, 820, 523),
        "x": (11, 0, 594, 523),
        "y": (15, -214, 600, 523),
        "z": (31, 0, 571, 523),
        "braceleft": (92, -196, 445, 722),
        "bar": (90, -19, 324, 737),
        "braceright": (0, -196, 354, 722),
        "asciitilde": (111, 180, 580, 326),
        "exclamdown": (77, -195, 326, 523),
        "cent": (96, -115, 583, 623),
        "sterling": (49, -16, 633, 718),
        "fraction": (-170, -19, 482, 703),
        "yen": (81, 0, 699, 688),
        "florin": (-52, -207, 654, 737),
        "section": (77, -191, 583, 737),
        "currency": (60, 99, 646, 603),
        "quotesingle": (157, 463, 285, 718),
        "quotedblleft": (138, 470, 461, 725),
        "guillemotleft": (146, 108, 554, 446),
        "guilsinglleft": (137, 108, 340, 446),
        "guilsinglright": (111, 108, 314, 446),
        "fi": (86, 0, 587, 728),
        "fl": (86, 0, 585, 728),
        "endash": (51, 240, 623, 313),
        "dagger": (135, -159, 622, 718),
        "daggerdbl": (52, -159, 623, 718),
        "periodcentered": (130, 190, 257, 315),
        "paragraph": (126, -173, 650, 718),
        "bullet": (91, 202, 412, 517),
        "quotesinglbase": (21, -149, 180, 106),
        "quotedblbase": (-6, -149, 318, 106),
        "quotedblright": (124, 463, 448, 718),
        "guillemotright": (120, 108, 528, 446),
        "ellipsis": (115, 0, 908, 106),
        "perthousand": (88, -19, 1029, 703),
        "questiondown": (85, -201, 534, 525),
        "grave": (170, 593, 337, 734),
        "acute": (248, 593, 475, 734),
        "circumflex": (147, 593, 438, 734),
        "tilde": (125, 606, 490, 722),
        "macron": (143, 627, 468, 684),
        "breve": (167, 595, 476, 731),
        "dotaccent": (249, 604, 362, 706),
        "dieresis": (168, 604, 443, 706),
        "ring": (214, 572, 402, 756),
        "cedilla": (2, -225, 232, 0),
        "hungarumlaut": (157, 593, 565, 734),
        "ogonek": (44, -225, 249, 0),
        "caron": (177, 593, 468, 734),
        "emdash": (51, 240, 1067, 313),
        "AE": (8, 0, 1097, 718),
        "ordfeminine": (100, 304, 448, 737),
        "Lslash": (41, 0, 555, 718),
        "Oslash": (43, -19, 890, 737),
        "OE": (99, -19, 1116, 737),
        "ordmasculine": (100, 304, 467, 737),
        "ae": (62, -15, 909, 538),
        "dotlessi": (95, 0, 294, 523),
        "lslash": (41, 0, 347, 718),
        "oslash": (29, -22, 647, 545),
        "oe": (84, -15, 964, 538),
        "germandbls": (67, -15, 657, 728),
        "onesuperior": (166, 281, 371, 703),
        "logicalnot": (106, 108, 628, 390),
        "mu": (24, -207, 600, 523),
        "trademark": (186, 306, 1056, 718),
        "Eth": (69, 0, 763, 718),
        "onehalf": (114, -19, 838, 703),
        "plusminus": (39, 0, 618, 506),
        "Thorn": (86, 0, 711, 718),
        "onequarter": (150, -19, 802, 703),
        "divide": (85, -19, 606, 524),
        "brokenbar": (90, -19, 324, 737),
        "degree": (169, 411, 467, 703),
        "thorn": (14, -207, 584, 718),
        "threequarters": (130, -19, 861, 703),
        "twosuperior": (64, 281, 448, 703),
        "registered": (55, -19, 837, 737),
        "minus": (85, 216, 606, 289),
        "eth": (82, -15, 617, 737),
        "multiply": (50, 0, 642, 506),
        "threesuperior": (90, 270, 436, 703),
        "copyright": (55, -19, 837, 737),
        "space": (0, 0, 0, 0),
        "Aacute": (14, 0, 683, 929),
        "Acircumflex": (14, 0, 654, 929),
        "Adieresis": (14, 0, 654, 901),
        "Agrave": (14, 0, 654, 929),
        "Aring": (14, 0, 654, 931),
        "Atilde": (14, 0, 699, 917),
        "Ccedilla": (108, -225, 781, 737),
        "Eacute": (86, 0, 762, 929),
        "Ecircumflex": (86, 0, 762, 929),
        "Edieresis": (86, 0, 762, 901),
        "Egrave": (86, 0, 762, 929),
        "Iacute": (91, 0, 489, 929),
        "Icircumflex": (91, 0, 452, 929),
        "Idieresis": (91, 0, 458, 901),
        "Igrave": (91, 0, 351, 929),
        "Ntilde": (76, 0, 799, 917),
        "Oacute": (105, -19, 825, 929),
        "Ocircumflex": (105, -19, 825, 929),
        "Odieresis": (105, -19, 825, 901),
        "Ograve": (105, -19, 825, 929),
        "Otilde": (105, -19, 825, 917),
        "Scaron": (90, -19, 712, 929),
        "Uacute": (124, -19, 797, 929),
        "Ucircumflex": (124, -19, 797, 929),
        "Udieresis": (124, -19, 797, 901),
        "Ugrave": (124, -19, 797, 929),
        "Yacute": (167, 0, 806, 929),
        "Ydieresis": (167, 0, 806, 901),
        "Zcaron": (23, 0, 741, 929),
        "aacute": (62, -15, 587, 734),
        "acircumflex": (62, -15, 558, 734),
        "adieresis": (62, -15, 558, 706),
        "agrave": (62, -15, 558, 734),
        "aring": (62, -15, 558, 756),
        "atilde": (62, -15, 592, 722),
        "ccedilla": (75, -225, 553, 538),
        "eacute": (85, -15, 587, 734),
        "ecircumflex": (85, -15, 578, 734),
        "edieresis": (85, -15, 578, 706),
        "egrave": (85, -15, 578, 734),
        "iacute": (95, 0, 448, 734),
        "icircumflex": (95, 0, 411, 734),
        "idieresis": (95, 0, 416, 706),
        "igrave": (95, 0, 310, 734),
        "ntilde": (65, 0, 592, 722),
        "oacute": (84, -14, 587, 734),
        "ocircumflex": (84, -14, 584, 734),
        "odieresis": (84, -14, 584, 706),
        "ograve": (84, -14, 584, 734),
        "otilde": (84, -14, 602, 722),
        "scaron": (64, -15, 552, 734),
        "uacute": (95, -15, 600, 734),
        "ucircumflex": (95, -15, 600, 734),
        "udieresis": (95, -15, 600, 706),
        "ugrave": (95, -15, 600, 734),
        "yacute": (15, -214, 600, 734),
        "ydieresis": (15, -214, 600, 706),
        "zcaron": (31, 0, 571, 734),
    },
    "Helvetica": {
        ".notdef": (0, 0, 0, 0),
        "exclam": (90, 0, 187, 718),
        "quotedbl": (70, 463, 285, 718),
        "numbersign": (28, 0, 529, 688),
        "dollar": (32, -115, 520, 775),
        "percent": (39, -19, 850, 703),
        "ampersand": (44, -15, 645, 718),
        "quoteright": (53, 463, 157, 718),
        "parenleft": (68, -207, 299, 733),
        "parenright": (34, -207, 265, 733),
        "asterisk": (39, 431, 349, 718),
        "plus": (39, 0, 545, 505),
        "comma": (87, -147, 191, 106),
        "hyphen": (44, 232, 289, 322),
        "period": (87, 0, 191, 106),
        "slash": (-17, -19, 295, 737),
        "zero": (37, -19, 519, 703),
        "one": (101, 0, 359, 703),
        "two": (26, 0, 507, 703),
        "three": (34, -19, 522, 703),
        "four": (25, 0, 523, 703),
        "five": (32, -19, 514, 688),
        "six": (38, -19, 518, 703),
        "seven": (37, 0, 523, 688),
        "eight": (38, -19, 517, 703),
        "nine": (42, -19, 514, 703),
        "colon": (87, 0, 191, 516),
        "semicolon": (87, -147, 191, 516),
        "less": (48, 11, 536, 495),
        "equal": (39, 115, 545, 390),
        "greater": (48, 11, 536, 495),
        "question": (56, 0, 492, 727),
        "at": (147, -19, 868, 737),
        "A": (14, 0, 654, 718),
        "B": (74, 0, 627, 718),
        "C": (44, -19, 681, 737),
        "D": (81, 0, 674, 718),
        "E": (86, 0, 616, 718),
        "F": (86, 0, 583, 718),
        "G": (48, -19, 704, 737),
        "H": (77, 0, 646, 718),
        "I": (91, 0, 188, 718),
        "J": (17, -19, 428, 718),
        "K": (76, 0, 663, 718),
        "L": (76, 0, 537, 718),
        "M": (73, 0, 761, 718),
        "N": (76, 0, 646, 718),
        "O": (39, -19, 739, 737),
        "P": (86, 0, 622, 718),
        "Q": (39, -56, 739, 737),
        "R": (88, 0, 684, 718),
        "S": (49, -19, 620, 737),
        "T": (14, 0, 597, 718),
        "U": (79, -19, 644, 718),
        "V": (20, 0, 647, 718),
        "W": (16, 0, 928, 718),
        "X": (19, 0, 648, 718),
        "Y": (14, 0, 653, 718),
        "Z": (23, 0, 588, 718),
        "bracketleft": (63, -196, 250, 722),
        "backslash": (-17, -19, 295, 737),
        "bracketright": (28, -196, 215, 722),
        "asciicircum": (-14, 264, 483, 688),
        "underscore": (0, -125, 556, -75),
        "quoteleft": (65, 470, 169, 725),
        "a": (36, -15, 530, 538),
        "b": (58, -15, 517, 718),
        "c": (30, -15, 477, 538),
        "d": (35, -15, 499, 718),
        "e": (40, -15, 516, 538),
        "f": (14, 0, 262, 728),
        "g": (40, -220, 499, 538),
        "h": (65, 0, 491, 718),
        "i": (67, 0, 155, 718),
        "j": (-16, -210, 155, 718),
        "k": (67, 0, 501, 718),
        "l": (67, 0, 155, 718),
        "m": (65, 0, 769, 538),
        "n": (65, 0, 491, 538),
        "o": (35, -14, 521, 538),
        "p": (58, -207, 517, 538),
        "q": (35, -207, 494, 538),
        "r": (77, 0, 332, 538),
        "s": (32, -15, 464, 538),
        "t": (14, -7, 257, 669),
        "u": (68, -15, 489, 523),
        "v": (8, 0, 492, 523),
        "w": (14, 0, 709, 523),
        "x": (11, 0, 490, 523),
        "y": (11, -214, 489, 523),
        "z": (31, 0, 469, 523),
        "braceleft": (42, -196, 292, 722),
        "bar": (94, -19, 167, 737),
        "braceright": (42, -196, 292, 722),
        "asciitilde": (61, 180, 523, 326),
        "exclamdown": (118, -195, 215, 523),
        "cent": (51, -115, 513, 623),
        "sterling": (33, -16, 539, 718),
        "fraction": (-166, -19, 333, 703),
        "yen": (3, 0, 553, 688),
        "florin": (-11, -207, 501, 737),
        "section": (43, -191, 512, 737),
        "currency": (28, 99, 528, 603),
        "quotesingle": (59, 463, 132, 718),
        "quotedblleft": (38, 470, 307, 725),
        "guillemotleft": (97, 108, 459, 446),
        "guilsinglleft": (88, 108, 245, 446),
        "guilsinglright": (88, 108, 245, 446),
        "fi": (14, 0, 434, 728),
        "fl": (14, 0, 432, 728),
        "endash": (0, 240, 556, 313),
        "dagger": (43, -159, 514, 718),
        "daggerdbl": (43, -159, 514, 718),
        "periodcentered": (77, 190, 202, 315),
        "paragraph": (18, -173, 497, 718),
        "bullet": (18, 202, 333, 517),
        "quotesinglbase": (53, -149, 157, 106),
        "quotedblbase": (26, -149, 295, 106),
        "quotedblright": (26, 463, 295, 718),
        "guillemotright": (97, 108, 459, 446),
        "ellipsis": (115, 0, 885, 106),
        "perthousand": (7, -19, 994, 703),
        "questiondown": (91, -201, 527, 525),
        "grave": (14, 593, 211, 734),
        "acute": (122, 593, 319, 734),
        "circumflex": (21, 593, 312, 734),
        "tilde": (-4, 606, 337, 722),
        "macron": (10, 627, 323, 684),
        "breve": (13, 595, 321, 731),
        "dotaccent": (121, 604, 212, 706),
        "dieresis": (40, 604, 293, 706),
        "ring": (75, 572, 259, 756),
        "cedilla": (45, -225, 259, 0),
        "hungarumlaut": (31, 593, 409, 734),
        "ogonek": (73, -225, 287, 0),
        "caron": (21, 593, 312, 734),
        "emdash": (0, 240, 1000, 313),
        "AE": (8, 0, 951, 718),
        "ordfeminine": (24, 304, 346, 737),
        "Lslash": (-20, 0, 537, 718),
        "Oslash": (39, -19, 740, 737),
        "OE": (36, -19, 965, 737),
        "ordmasculine": (25, 304, 341, 737),
        "ae": (36, -15, 847, 538),
        "dotlessi": (95, 0, 183, 523),
        "lslash": (-20, 0, 242, 718),
        "oslash": (28, -22, 537, 545),
        "oe": (35, -15, 902, 538),
        "germandbls": (67, -15, 571, 728),
        "onesuperior": (43, 281, 222, 703),
        "logicalnot": (39, 108, 545, 390),
        "mu": (68, -207, 489, 523),
        "trademark": (46, 306, 903, 718),
        "Eth": (0, 0, 674, 718),
        "onehalf": (43, -19, 773, 703),
        "plusminus": (39, 0, 545, 506),
        "Thorn": (86, 0, 622, 718),
        "onequarter": (73, -19, 756, 703),
        "divide": (39, -19, 545, 524),
        "brokenbar": (94, -19, 167, 737),
        "degree": (54, 411, 346, 703),
        "thorn": (58, -207, 517, 718),
        "threequarters": (45, -19, 810, 703),
        "twosuperior": (4, 281, 323, 703),
        "registered": (-14, -19, 752, 737),
        "minus": (39, 216, 545, 289),
        "eth": (35, -15, 522, 737),
        "multiply": (39, 0, 545, 506),
        "threesuperior": (5, 270, 325, 703),
        "copyright": (-14, -19, 752, 737),
        "space": (0, 0, 0, 0),
        "Aacute": (14, 0, 654, 929),
        "Acircumflex": (14, 0, 654, 929),
        "Adieresis": (14, 0, 654, 901),
        "Agrave": (14, 0, 654, 929),
        "Aring": (14, 0, 654, 931),
        "Atilde": (14, 0, 654, 917),
        "Ccedilla": (44, -225, 681, 737),
        "Eacute": (86, 0, 616, 929),
        "Ecircumflex": (86, 0, 616, 929),
        "Edieresis": (86, 0, 616, 901),
        "Egrave": (86, 0, 616, 929),
        "Iacute": (91, 0, 292, 929),
        "Icircumflex": (-6, 0, 285, 929),
        "Idieresis": (13, 0, 266, 901),
        "Igrave": (-13, 0, 188, 929),
        "Ntilde": (76, 0, 646, 917),
        "Oacute": (39, -19, 739, 929),
        "Ocircumflex": (39, -19, 739, 929),
        "Odieresis": (39, -19, 739, 901),
        "Ograve": (39, -19, 739, 929),
        "Otilde": (39, -19, 739, 917),
        "Scaron": (49, -19, 620, 929),
        "Uacute": (79, -19, 644, 929),
        "Ucircumflex": (79, -19, 644, 929),
        "Udieresis": (79, -19, 644, 901),
        "Ugrave": (79, -19, 644, 929),
        "Yacute": (14, 0, 653, 929),
        "Ydieresis": (14, 0, 653, 901),
        "Zcaron": (23, 0, 588, 929),
        "aacute": (36, -15, 530, 734),
        "acircumflex": (36, -15, 530, 734),
        "adieresis": (36, -15, 530, 706),
        "agrave": (36, -15, 530, 734),
        "aring": (36, -15, 530, 756),
        "atilde": (36, -15, 530, 722),
        "ccedilla": (30, -225, 477, 538),
        "eacute": (40, -15, 516, 734),
        "ecircumflex": (40, -15, 516, 734),
        "edieresis": (40, -15, 516, 706),
        "egrave": (40, -15, 516, 734),
        "iacute": (95, 0, 292, 734),
        "icircumflex": (-6, 0, 285, 734),
        "idieresis": (13, 0, 266, 706),
        "igrave": (-13, 0, 184, 734),
        "ntilde": (65, 0, 491, 722),
        "oacute": (35, -14, 521, 734),
        "ocircumflex": (35, -14, 521, 734),
        "odieresis": (35, -14, 521, 706),
        "ograve": (35, -14, 521, 734),
        "otilde": (35, -14, 521, 722),
        "scaron": (32, -15, 464, 734),
        "uacute": (68, -15, 489, 734),
        "ucircumflex": (68, -15, 489, 734),
        "udieresis": (68, -15, 489, 706),
        "ugrave": (68, -15, 489, 734),
        "yacute": (11, -214, 489, 734),
        "ydieresis": (11, -214, 489, 706),
        "zcaron": (31, 0, 469, 734),
    },
    "Symbol": {
        ".notdef": (0, 0, 0, 0),
        "exclam": (128, -17, 240, 672),
        "universal": (31, 0, 681, 705),
        "numbersign": (20, -16, 481, 673),
        "existential": (25, 0, 478, 707),
        "percent": (64, -35, 771, 655),
        "ampersand": (42, -17, 750, 661),
        "suchthat": (48, -17, 414, 499),
        "parenleft": (53, -191, 300, 673),
        "parenright": (30, -191, 277, 673),
        "asteriskmath": (65, 134, 427, 551),
        "plus": (10, 0, 539, 533),
        "comma": (56, -152, 194, 104),
        "minus": (11, 233, 535, 288),
        "period": (69, -17, 181, 95),
        "slash": (0, -18, 254, 646),
        "zero": (24, -17, 470, 685),
        "one": (117, 0, 390, 673),
        "two": (25, 0, 475, 685),
        "three": (39, -17, 435, 685),
        "four": (16, 0, 469, 685),
        "five": (29, -17, 443, 685),
        "six": (36, -17, 467, 685),
        "seven": (24, -16, 448, 673),
        "eight": (55, -17, 440, 684),
        "nine": (32, -18, 459, 684),
        "colon": (81, -17, 193, 460),
        "semicolon": (83, -152, 221, 460),
        "less": (26, 0, 523, 522),
        "equal": (11, 141, 537, 390),
        "greater": (26, 0, 523, 522),
        "question": (71, -17, 411, 686),
        "congruent": (11, 0, 537, 475),
        "Alpha": (4, 0, 684, 673),
        "Beta": (29, 0, 592, 673),
        "Chi": (-9, 0, 704, 673),
        "Delta": (6, 0, 608, 688),
        "Epsilon": (32, 0, 617, 673),
        "Phi": (26, 0, 741, 673),
        "Gamma": (24, 0, 609, 673),
        "Eta": (39, 0, 729, 673),
        "Iota": (32, 0, 316, 673),
        "theta1": (18, -17, 623, 689),
        "Kappa": (35, 0, 722, 673),
        "Lambda": (6, 0, 680, 688),
        "Mu": (28, 0, 887, 673),
        "Nu": (29, -8, 720, 673),
        "Omicron": (41, -17, 715, 685),
        "Pi": (25, 0, 745, 673),
        "Theta": (41, -17, 715, 685),
        "Rho": (28, 0, 562, 673),
        "Sigma": (5, 0, 589, 673),
        "Tau": (33, 0, 607, 673),
        "Upsilon": (-8, 0, 694, 673),
        "sigma1": (40, -233, 436, 500),
        "Omega": (34, 0, 736, 688),
        "Xi": (40, 0, 599, 673),
        "Psi": (15, 0, 781, 684),
        "Zeta": (44, 0, 636, 673),
        "bracketleft": (86, -155, 299, 674),
        "therefore": (163, 0, 701, 478),
        "bracketright": (33, -155, 246, 674),
        "perpendicular": (15, 0, 652, 674),
        "underscore": (-2, -252, 502, -206),
        "radicalex": (480, 881, 1090, 917),
        "alpha": (41, -18, 622, 500),
        "beta": (61, -223, 515, 740),
        "chi": (12, -231, 522, 499),
        "delta": (40, -18, 481, 739),
        "epsilon": (22, -19, 427, 501),
        "phi": (28, -224, 490, 671),
        "gamma": (6, -225, 484, 498),
        "eta": (0, -202, 527, 513),
        "iota": (0, -17, 301, 503),
        "phi1": (37, -224, 587, 499),
        "kappa": (33, 0, 558, 501),
        "lambda": (24, -17, 548, 739),
        "mu": (33, -223, 567, 500),
        "nu": (-9, -16, 474, 507),
        "omicron": (35, -18, 501, 498),
        "pi": (10, -19, 530, 487),
        "theta": (43, -17, 485, 690),
        "rho": (50, -230, 490, 498),
        "sigma": (31, -21, 588, 500),
        "tau": (10, -18, 418, 500),
        "upsilon": (7, -18, 535, 507),
        "omega1": (12, -17, 671, 583),
        "omega": (43, -17, 683, 500),
        "xi": (28, -224, 469, 765),
        "psi": (12, -228, 701, 500),
        "zeta": (60, -225, 467, 756),
        "braceleft": (58, -183, 397, 673),
        "bar": (65, -177, 135, 673),
        "braceright": (79, -183, 418, 673),
        "similar": (17, 203, 529, 307),
        "Upsilon1": (-1, 0, 610, 685),
        "minute": (27, 459, 228, 734),
        "lessequal": (29, 0, 526, 639),
        "fraction": (-180, -12, 340, 677),
        "infinity": (26, 125, 688, 404),
        "florin": (2, -193, 494, 686),
        "club": (86, -26, 660, 533),
        "diamond": (142, -36, 600, 550),
        "heart": (117, -33, 631, 532),
        "spade": (114, -36, 628, 548),
        "arrowboth": (24, -15, 1024, 511),
        "arrowleft": (32, -15, 942, 511),
        "arrowup": (45, 0, 571, 910),
        "arrowright": (49, -15, 959, 511),
        "arrowdown": (45, -22, 571, 888),
        "degree": (50, 385, 350, 685),
        "plusminus": (10, 0, 539, 645),
        "second": (20, 459, 413, 736),
        "greaterequal": (29, 0, 526, 639),
        "multiply": (17, 8, 533, 524),
        "proportional": (27, 124, 639, 404),
        "partialdiff": (27, -20, 462, 745),
        "bullet": (50, 113, 410, 473),
        "divide": (10, 71, 536, 456),
        "notequal": (15, -25, 540, 549),
        "equivalence": (14, 82, 538, 443),
        "approxequal": (14, 135, 527, 394),
        "ellipsis": (111, -17, 889, 95),
        "arrowvertex": (280, -120, 336, 1010),
        "arrowhorizex": (-60, 220, 1050, 276),
        "carriagereturn": (15, -16, 602, 629),
        "aleph": (175, -18, 661, 658),
        "Ifraktur": (10, -53, 578, 740),
        "Rfraktur": (26, -15, 759, 733),
        "weierstrass": (159, -211, 870, 573),
        "circlemultiply": (43, -17, 733, 673),
        "circleplus": (43, -15, 733, 675),
        "emptyset": (39, -24, 781, 719),
        "intersection": (40, 0, 732, 509),
        "union": (40, -17, 732, 492),
        "propersuperset": (20, 0, 673, 470),
        "reflexsuperset": (20, -125, 673, 470),
        "notsubset": (36, -70, 690, 540),
        "propersubset": (37, 0, 690, 470),
        "reflexsubset": (37, -125, 690, 470),
        "element": (45, 0, 505, 468),
        "notelement": (45, -58, 505, 555),
        "angle": (26, 0, 738, 673),
        "gradient": (36, -19, 681, 718),
        "registerserif": (50, -17, 740, 673),
        "copyrightserif": (51, -15, 741, 675),
        "trademarkserif": (18, 293, 855, 673),
        "product": (25, -101, 803, 751),
        "radical": (10, -38, 515, 917),
        "dotmath": (69, 210, 169, 310),
        "logicalnot": (15, 0, 680, 288),
        "logicaland": (23, 0, 583, 454),
        "logicalor": (30, 0, 578, 477),
        "arrowdblboth": (27, -20, 1023, 510),
        "arrowdblleft": (30, -15, 939, 513),
        "arrowdblup": (39, 2, 567, 911),
        "arrowdblright": (45, -20, 954, 508),
        "arrowdbldown": (44, -19, 572, 890),
        "lozenge": (18, 0, 466, 745),
        "angleleft": (25, -198, 306, 746),
        "registersans": (50, -20, 740, 670),
        "copyrightsans": (49, -15, 739, 675),
        "trademarksans": (5, 293, 725, 673),
        "summation": (14, -108, 695, 752),
        "parenlefttp": (40, -293, 436, 926),
        "parenleftex": (40, -85, 92, 925),
        "parenleftbt": (40, -293, 436, 926),
        "bracketlefttp": (0, -80, 341, 926),
        "bracketleftex": (0, -79, 55, 925),
        "bracketleftbt": (0, -80, 340, 926),
        "bracelefttp": (201, -75, 439, 926),
        "braceleftmid": (14, -85, 255, 935),
        "braceleftbt": (201, -70, 439, 926),
        "braceex": (201, -80, 255, 935),
        "angleright": (21, -198, 302, 746),
        "integral": (2, -107, 290, 915),
        "integraltp": (332, -83, 715, 921),
        "integralex": (332, -88, 415, 975),
        "integralbt": (39, -81, 415, 921),
        "parenrighttp": (54, -293, 450, 926),
        "parenrightex": (398, -85, 450, 925),
        "parenrightbt": (54, -293, 450, 926),
        "bracketrighttp": (22, -80, 360, 926),
        "bracketrightex": (305, -79, 360, 925),
        "bracketrightbt": (20, -80, 360, 926),
        "bracerighttp": (17, -75, 255, 926),
        "bracerightmid": (201, -85, 442, 935),
        "bracerightbt": (17, -70, 255, 926),
        "apple": (56, -2, 733, 808),
        "space": (0, 0, 0, 0),
    },
    "Times-BoldItalic": {
        ".notdef": (0, 0, 0, 0),
        "exclam": (67, -13, 370, 684),
        "quotedbl": (136, 398, 536, 685),
        "numbersign": (-33, 0, 533, 700),
        "dollar": (-20, -100, 497, 733),
        "percent": (39, -10, 793, 692),
        "ampersand": (5, -19, 699, 682),
        "quoteright": (98, 369, 302, 685),
        "parenleft": (28, -179, 344, 685),
        "parenright": (-44, -179, 271, 685),
        "asterisk": (65, 249, 456, 685),
        "plus": (33, 0, 537, 506),
        "comma": (-60, -182, 144, 134),
        "hyphen": (2, 166, 271, 282),
        "period": (-9, -13, 139, 135),
        "slash": (-64, -18, 342, 685),
        "zero": (17, -14, 477, 683),
        "one": (5, 0, 419, 683),
        "two": (-27, 0, 446, 683),
        "three": (-15, -13, 450, 683),
        "four": (-15, 0, 503, 683),
        "five": (-11, -13, 487, 669),
        "six": (23, -15, 509, 679),
        "seven": (52, 0, 525, 669),
        "eight": (3, -13, 476, 683),
        "nine": (-12, -10, 475, 683),
        "colon": (23, -13, 264, 459),
        "semicolon": (-25, -183, 264, 459),
        "less": (31, -8, 539, 514),
        "equal": (33, 107, 537, 399),
        "greater": (31, -8, 539, 514),
        "question": (79, -13, 470, 684),
        "at": (63, -18, 770, 685),
        "A": (-67, 0, 593, 683),
        "B": (-24, 0, 624, 669),
        "C": (32, -18, 677, 685),
        "D": (-46, 0, 685, 669),
        "E": (-27, 0, 653, 669),
        "F": (-13, 0, 660, 669),
        "G": (21, -18, 706, 685),
        "H": (-24, 0, 799, 669),
        "I": (-32, 0, 406, 669),
        "J": (-46, -99, 524, 669),
        "K": (-21, 0, 702, 669),
        "L": (-22, 0, 590, 669),
        "M": (-29, -12, 917, 669),
        "N": (-27, -15, 748, 669),
        "O": (27, -18, 691, 685),
        "P": (-27, 0, 613, 669),
        "Q": (27, -208, 691, 685),
        "R": (-29, 0, 623, 669),
        "S": (2, -18, 526, 685),
        "T": (50, 0, 650, 669),
        "U": (67, -18, 744, 669),
        "V": (65, -18, 715, 669),
        "W": (65, -18, 940, 669),
        "X": (-24, 0, 694, 669),
        "Y": (73, 0, 659, 669),
        "Z": (-11, 0, 590, 669),
        "bracketleft": (-37, -159, 362, 674),
        "backslash": (-1, -18, 279, 685),
        "bracketright": (-56, -157, 343, 674),
        "asciicircum": (67, 304, 503, 669),
        "underscore": (0, -125, 500, -75),
        "quoteleft": (128, 369, 332, 685),
        "a": (-21, -14, 455, 462),
        "b": (-14, -13, 444, 699),
        "c": (-5, -13, 392, 462),
        "d": (-21, -13, 517, 699),
        "e": (5, -13, 398, 462),
        "f": (-169, -205, 446, 698),
        "g": (-52, -203, 478, 462),
        "h": (-13, -9, 498, 699),
        "i": (2, -9, 263, 684),
        "j": (-189, -207, 279, 684),
        "k": (-23, -8, 483, 699),
        "l": (2, -9, 290, 699),
        "m": (-14, -9, 722, 462),
        "n": (-6, -9, 493, 462),
        "o": (-3, -13, 441, 462),
        "p": (-120, -205, 446, 462),
        "q": (1, -205, 471, 462),
        "r": (-21, 0, 389, 462),
        "s": (-19, -13, 333, 462),
        "t": (-11, -9, 281, 594),
        "u": (15, -9, 492, 462),
        "v": (16, -13, 401, 462),
        "w": (16, -13, 614, 462),
        "x": (-46, -13, 469, 462),
        "y": (-94, -205, 392, 462),
        "z": (-43, -78, 368, 449),
        "braceleft": (5, -187, 436, 686),
        "bar": (66, -18, 154, 685),
        "braceright": (-129, -187, 302, 686),
        "asciitilde": (54, 173, 516, 333),
        "exclamdown": (19, -205, 322, 492),
        "cent": (42, -143, 439, 576),
        "sterling": (-32, -12, 510, 683),
        "fraction": (-169, -14, 324, 683),
        "yen": (33, 0, 628, 669),
        "florin": (-87, -156, 537, 707),
        "section": (36, -143, 459, 685),
        "currency": (-26, 34, 526, 586),
        "quotesingle": (128, 398, 268, 685),
        "quotedblleft": (53, 369, 513, 685),
        "guillemotleft": (12, 32, 468, 415),
        "guilsinglleft": (32, 32, 303, 415),
        "guilsinglright": (10, 32, 281, 415),
        "fi": (-188, -205, 514, 703),
        "fl": (-186, -205, 553, 704),
        "endash": (-40, 178, 477, 269),
        "dagger": (91, -145, 494, 685),
        "daggerdbl": (10, -139, 493, 685),
        "periodcentered": (51, 257, 199, 405),
        "paragraph": (-57, -193, 562, 669),
        "bullet": (0, 175, 350, 525),
        "quotesinglbase": (-5, -182, 199, 134),
        "quotedblbase": (-57, -182, 403, 134),
        "quotedblright": (53, 369, 513, 685),
        "guillemotright": (12, 32, 468, 415),
        "ellipsis": (40, -13, 852, 135),
        "perthousand": (7, -29, 996, 706),
        "questiondown": (30, -205, 421, 492),
        "grave": (85, 516, 297, 697),
        "acute": (139, 516, 379, 697),
        "circumflex": (40, 516, 367, 690),
        "tilde": (48, 536, 407, 655),
        "macron": (51, 553, 393, 623),
        "breve": (71, 516, 387, 678),
        "dotaccent": (163, 525, 293, 655),
        "dieresis": (55, 525, 397, 655),
        "ring": (127, 516, 340, 729),
        "cedilla": (-80, -218, 156, 5),
        "hungarumlaut": (69, 516, 498, 697),
        "ogonek": (-40, -173, 189, 44),
        "caron": (79, 516, 411, 690),
        "emdash": (-40, 178, 977, 269),
        "AE": (-64, 0, 918, 669),
        "ordfeminine": (16, 399, 330, 685),
        "Lslash": (-22, 0, 590, 669),
        "Oslash": (27, -125, 691, 764),
        "OE": (23, -8, 946, 677),
        "ordmasculine": (56, 400, 347, 685),
        "ae": (-5, -13, 673, 462),
        "dotlessi": (2, -9, 238, 462),
        "lslash": (-13, -9, 301, 699),
        "oslash": (-3, -119, 441, 560),
        "oe": (6, -13, 674, 462),
        "germandbls": (-200, -200, 473, 705),
        "onesuperior": (30, 274, 301, 683),
        "logicalnot": (51, 108, 555, 399),
        "mu": (-60, -207, 516, 449),
        "trademark": (32, 263, 968, 669),
        "Eth": (-31, 0, 700, 669),
        "onehalf": (-9, -14, 723, 683),
        "plusminus": (33, 0, 537, 506),
        "Thorn": (-27, 0, 573, 669),
        "onequarter": (7, -14, 721, 683),
        "divide": (33, -29, 537, 535),
        "brokenbar": (66, -18, 154, 685),
        "degree": (83, 397, 369, 683),
        "thorn": (-120, -205, 446, 699),
        "threequarters": (7, -14, 726, 683),
        "twosuperior": (2, 274, 313, 683),
        "registered": (30, -18, 718, 685),
        "minus": (51, 209, 555, 297),
        "eth": (-3, -13, 454, 699),
        "multiply": (48, 16, 522, 490),
        "threesuperior": (17, 265, 321, 683),
        "copyright": (30, -18, 718, 685),
        "space": (0, 0, 0, 0),
        "Aacute": (-67, 0, 593, 904),
        "Acircumflex": (-67, 0, 593, 897),
        "Adieresis": (-67, 0, 593, 862),
        "Agrave": (-67, 0, 593, 904),
        "Aring": (-67, 0, 593, 921),
        "Atilde": (-67, 0, 593, 862),
        "Ccedilla": (32, -218, 677, 685),
        "Eacute": (-27, 0, 653, 904),
        "Ecircumflex": (-27, 0, 653, 897),
        "Edieresis": (-27, 0, 653, 862),
        "Egrave": (-27, 0, 653, 904),
        "Iacute": (-32, 0, 412, 904),
        "Icircumflex": (-32, 0, 420, 897),
        "Idieresis": (-32, 0, 445, 862),
        "Igrave": (-32, 0, 406, 904),
        "Ntilde": (-27, -15, 748, 862),
        "Oacute": (27, -18, 691, 904),
        "Ocircumflex": (27, -18, 691, 897),
        "Odieresis": (27, -18, 691, 862),
        "Ograve": (27, -18, 691, 904),
        "Otilde": (27, -18, 691, 862),
        "Scaron": (2, -18, 526, 897),
        "Uacute": (67, -18, 744, 904),
        "Ucircumflex": (67, -18, 744, 897),
        "Udieresis": (67, -18, 744, 862),
        "Ugrave": (67, -18, 744, 904),
        "Yacute": (73, 0, 659, 904),
        "Ydieresis": (73, 0, 659, 862),
        "Zcaron": (-11, 0, 590, 897),
        "aacute": (-21, -14, 463, 697),
        "acircumflex": (-21, -14, 455, 690),
        "adieresis": (-21, -14, 471, 655),
        "agrave": (-21, -14, 455, 697),
        "aring": (-21, -14, 455, 729),
        "atilde": (-21, -14, 491, 655),
        "ccedilla": (-24, -218, 392, 462),
        "eacute": (5, -13, 435, 697),
        "ecircumflex": (5, -13, 423, 690),
        "edieresis": (5, -13, 443, 655),
        "egrave": (5, -13, 398, 697),
        "iacute": (2, -9, 352, 697),
        "icircumflex": (-2, -9, 325, 690),
        "idieresis": (2, -9, 360, 655),
        "igrave": (2, -9, 260, 697),
        "ntilde": (-6, -9, 504, 655),
        "oacute": (-3, -13, 463, 697),
        "ocircumflex": (-3, -13, 451, 690),
        "odieresis": (-3, -13, 466, 655),
        "ograve": (-3, -13, 441, 697),
        "otilde": (-3, -13, 491, 655),
        "scaron": (-19, -13, 439, 690),
        "uacute": (15, -9, 492, 697),
        "ucircumflex": (15, -9, 492, 690),
        "udieresis": (15, -9, 494, 655),
        "ugrave": (15, -9, 492, 697),
        "yacute": (-94, -205, 435, 697),
        "ydieresis": (-94, -205, 438, 655),
        "zcaron": (-43, -78, 424, 690),
    },
    "Times-Bold": {
        ".notdef": (0, 0, 0, 0),
        "exclam": (81, -13, 251, 691),
        "quotedbl": (83, 404, 472, 691),
        "numbersign": (4, 0, 496, 700),
        "dollar": (29, -99, 472, 750),
        "percent": (124, -14, 877, 692),
        "ampersand": (62, -16, 787, 691),
        "quoteright": (79, 356, 263, 691),
        "parenleft": (46, -168, 306, 694),
        "parenright": (27, -168, 287, 694),
        "asterisk": (56, 255, 447, 691),
        "plus": (33, 0, 537, 506),
        "comma": (39, -180, 223, 155),
        "hyphen": (44, 171, 287, 287),
        "period": (41, -13, 210, 156),
        "slash": (-24, -19, 302, 691),
        "zero": (24, -13, 476, 688),
        "one": (65, 0, 442, 688),
        "two": (17, 0, 478, 688),
        "three": (16, -14, 468, 688),
        "four": (19, 0, 475, 688),
        "five": (22, -8, 470, 676),
        "six": (28, -13, 475, 688),
        "seven": (17, 0, 477, 676),
        "eight": (28, -13, 472, 688),
        "nine": (26, -13, 473, 688),
        "colon": (82, -13, 251, 472),
        "semicolon": (82, -180, 266, 472),
        "less": (31, -8, 539, 514),
        "equal": (33, 107, 537, 399),
        "greater": (31, -8, 539, 514),
        "question": (57, -13, 445, 689),
        "at": (108, -19, 822, 691),
        "A": (9, 0, 689, 690),
        "B": (16, 0, 619, 676),
        "C": (49, -19, 687, 691),
        "D": (14, 0, 690, 676),
        "E": (16, 0, 641, 676),
        "F": (16, 0, 583, 676),
        "G": (37, -19, 755, 691),
        "H": (21, 0, 759, 676),
        "I": (20, 0, 370, 676),
        "J": (3, -96, 479, 676),
        "K": (30, 0, 769, 676),
        "L": (19, 0, 638, 676),
        "M": (14, 0, 921, 676),
        "N": (16, -18, 701, 676),
        "O": (35, -19, 743, 691),
        "P": (16, 0, 600, 676),
        "Q": (35, -176, 743, 691),
        "R": (26, 0, 715, 676),
        "S": (35, -19, 513, 692),
        "T": (31, 0, 636, 676),
        "U": (16, -19, 701, 676),
        "V": (16, -18, 701, 676),
        "W": (19, -15, 981, 676),
        "X": (16, 0, 699, 676),
        "Y": (15, 0, 699, 676),
        "Z": (28, 0, 634, 676),
        "bracketleft": (67, -149, 301, 678),
        "backslash": (-25, -19, 303, 691),
        "bracketright": (32, -149, 266, 678),
        "asciicircum": (73, 311, 509, 676),
        "underscore": (0, -125, 500, -75),
        "quoteleft": (70, 356, 254, 691),
        "a": (25, -14, 488, 473),
        "b": (17, -14, 521, 676),
        "c": (25, -14, 430, 473),
        "d": (25, -14, 534, 676),
        "e": (25, -14, 426, 473),
        "f": (14, 0, 389, 691),
        "g": (28, -206, 483, 473),
        "h": (16, 0, 534, 676),
        "i": (16, 0, 255, 691),
        "j": (-57, -203, 263, 691),
        "k": (22, 0, 543, 676),
        "l": (16, 0, 255, 676),
        "m": (16, 0, 814, 473),
        "n": (21, 0, 539, 473),
        "o": (25, -14, 476, 473),
        "p": (19, -205, 524, 473),
        "q": (34, -205, 536, 473),
        "r": (29, 0, 434, 473),
        "s": (25, -14, 361, 473),
        "t": (20, -12, 332, 630),
        "u": (16, -14, 537, 461),
        "v": (21, -14, 485, 461),
        "w": (23, -14, 707, 461),
        "x": (12, 0, 484, 461),
        "y": (16, -205, 480, 461),
        "z": (21, 0, 420, 461),
        "braceleft": (22, -175, 340, 698),
        "bar": (66, -19, 154, 691),
        "braceright": (54, -175, 372, 698),
        "asciitilde": (29, 173, 491, 333),
        "exclamdown": (82, -203, 252, 501),
        "cent": (53, -140, 458, 588),
        "sterling": (21, -14, 477, 684),
        "fraction": (-168, -12, 329, 688),
        "yen": (-64, 0, 547, 676),
        "florin": (0, -155, 498, 706),
        "section": (57, -132, 443, 691),
        "currency": (-26, 61, 526, 613),
        "quotesingle": (75, 404, 204, 691),
        "quotedblleft": (32, 356, 486, 691),
        "guillemotleft": (23, 36, 473, 415),
        "guilsinglleft": (51, 36, 305, 415),
        "guilsinglright": (28, 36, 282, 415),
        "fi": (14, 0, 536, 691),
        "fl": (14, 0, 536, 691),
        "endash": (0, 181, 500, 271),
        "dagger": (47, -134, 453, 691),
        "daggerdbl": (45, -132, 456, 691),
        "periodcentered": (41, 248, 210, 417),
        "paragraph": (0, -186, 519, 676),
        "bullet": (35, 198, 315, 478),
        "quotesinglbase": (79, -180, 263, 155),
        "quotedblbase": (14, -180, 468, 155),
        "quotedblright": (14, 356, 468, 691),
        "guillemotright": (27, 36, 477, 415),
        "ellipsis": (82, -13, 917, 156),
        "perthousand": (7, -29, 995, 706),
        "questiondown": (55, -201, 443, 501),
        "grave": (8, 528, 246, 713),
        "acute": (86, 528, 324, 713),
        "circumflex": (-2, 528, 335, 704),
        "tilde": (-16, 547, 349, 674),
        "macron": (1, 565, 331, 637),
        "breve": (15, 528, 318, 691),
        "dotaccent": (103, 537, 230, 667),
        "dieresis": (-2, 537, 335, 667),
        "ring": (60, 527, 273, 740),
        "cedilla": (68, -218, 294, 0),
        "hungarumlaut": (-13, 528, 425, 713),
        "ogonek": (90, -173, 319, 44),
        "caron": (-2, 528, 335, 704),
        "emdash": (0, 181, 1000, 271),
        "AE": (4, 0, 951, 676),
        "ordfeminine": (-1, 397, 301, 688),
        "Lslash": (19, 0, 638, 676),
        "Oslash": (35, -74, 743, 737),
        "OE": (22, -5, 981, 684),
        "ordmasculine": (18, 397, 312, 688),
        "ae": (33, -14, 693, 473),
        "dotlessi": (16, 0, 255, 461),
        "lslash": (-22, 0, 303, 676),
        "oslash": (25, -92, 476, 549),
        "oe": (22, -14, 696, 473),
        "germandbls": (19, -12, 517, 691),
        "onesuperior": (28, 275, 273, 688),
        "logicalnot": (33, 108, 537, 399),
        "mu": (33, -206, 536, 461),
        "trademark": (24, 271, 977, 676),
        "Eth": (6, 0, 690, 676),
        "onehalf": (-7, -12, 775, 688),
        "plusminus": (33, 0, 537, 506),
        "Thorn": (16, 0, 600, 676),
        "onequarter": (28, -12, 743, 688),
        "divide": (33, -31, 537, 537),
        "brokenbar": (66, -19, 154, 691),
        "degree": (57, 402, 343, 688),
        "thorn": (19, -205, 524, 676),
        "threequarters": (23, -12, 733, 688),
        "twosuperior": (0, 275, 300, 688),
        "registered": (26, -19, 721, 691),
        "minus": (33, 209, 537, 297),
        "eth": (25, -14, 476, 691),
        "multiply": (48, 16, 522, 490),
        "threesuperior": (3, 268, 297, 688),
        "copyright": (26, -19, 721, 691),
        "space": (0, 0, 0, 0),
        "Aacute": (9, 0, 689, 923),
        "Acircumflex": (9, 0, 689, 914),
        "Adieresis": (9, 0, 689, 877),
        "Agrave": (9, 0, 689, 923),
        "Aring": (9, 0, 689, 935),
        "Atilde": (9, 0, 689, 884),
        "Ccedilla": (49, -218, 687, 691),
        "Eacute": (16, 0, 641, 923),
        "Ecircumflex": (16, 0, 641, 914),
        "Edieresis": (16, 0, 641, 877),
        "Egrave": (16, 0, 641, 923),
        "Iacute": (20, 0, 370, 923),
        "Icircumflex": (20, 0, 370, 914),
        "Idieresis": (20, 0, 370, 877),
        "Igrave": (20, 0, 370, 923),
        "Ntilde": (16, -18, 701, 884),
        "Oacute": (35, -19, 743, 923),
        "Ocircumflex": (35, -19, 743, 914),
        "Odieresis": (35, -19, 743, 877),
        "Ograve": (35, -19, 743, 923),
        "Otilde": (35, -19, 743, 884),
        "Scaron": (35, -19, 513, 914),
        "Uacute": (16, -19, 701, 923),
        "Ucircumflex": (16, -19, 701, 914),
        "Udieresis": (16, -19, 701, 877),
        "Ugrave": (16, -19, 701, 923),
        "Yacute": (15, 0, 699, 928),
        "Ydieresis": (15, 0, 699, 877),
        "Zcaron": (28, 0, 634, 914),
        "aacute": (25, -14, 488, 713),
        "acircumflex": (25, -14, 488, 704),
        "adieresis": (25, -14, 488, 667),
        "agrave": (25, -14, 488, 713),
        "aring": (25, -14, 488, 740),
        "atilde": (25, -14, 488, 674),
        "ccedilla": (25, -218, 430, 473),
        "eacute": (25, -14, 426, 713),
        "ecircumflex": (25, -14, 426, 704),
        "edieresis": (25, -14, 426, 667),
        "egrave": (25, -14, 426, 713),
        "iacute": (16, 0, 290, 713),
        "icircumflex": (-36, 0, 301, 704),
        "idieresis": (-36, 0, 301, 667),
        "igrave": (-26, 0, 255, 713),
        "ntilde": (21, 0, 539, 674),
        "oacute": (25, -14, 476, 713),
        "ocircumflex": (25, -14, 476, 704),
        "odieresis": (25, -14, 476, 667),
        "ograve": (25, -14, 476, 713),
        "otilde": (25, -14, 476, 674),
        "scaron": (25, -14, 363, 704),
        "uacute": (16, -14, 537, 713),
        "ucircumflex": (16, -14, 537, 704),
        "udieresis": (16, -14, 537, 667),
        "ugrave": (16, -14, 537, 713),
        "yacute": (16, -205, 480, 713),
        "ydieresis": (16, -205, 480, 667),
        "zcaron": (21, 0, 420, 704),
    },
    "Times-Italic": {
        ".notdef": (0, 0, 0, 0),
        "exclam": (39, -11, 302, 667),
        "quotedbl": (144, 421, 432, 666),
        "numbersign": (2, 0, 540, 676),
        "dollar": (31, -89, 497, 731),
        "percent": (79, -13, 790, 676),
        "ampersand": (76, -18, 723, 666),
        "quoteright": (151, 436, 290, 666),
        "parenleft": (42, -181, 315, 669),
        "parenright": (16, -180, 289, 669),
        "asterisk": (128, 255, 492, 666),
        "plus": (86, 0, 590, 506),
        "comma": (-4, -129, 135, 101),
        "hyphen": (49, 192, 282, 255),
        "period": (27, -11, 138, 100),
        "slash": (-65, -18, 386, 666),
        "zero": (32, -7, 497, 676),
        "one": (49, 0, 409, 676),
        "two": (12, 0, 452, 676),
        "three": (15, -7, 465, 676),
        "four": (1, 0, 479, 676),
        "five": (15, -7, 491, 666),
        "six": (30, -7, 521, 686),
        "seven": (75, -8, 537, 666),
        "eight": (30, -7, 493, 676),
        "nine": (23, -17, 492, 676),
        "colon": (50, -11, 261, 441),
        "semicolon": (27, -129, 261, 441),
        "less": (84, -8, 592, 514),
        "equal": (86, 120, 590, 386),
        "greater": (84, -8, 592, 514),
        "question": (132, -12, 472, 664),
        "at": (118, -18, 806, 666),
        "A": (-51, 0, 564, 668),
        "B": (-8, 0, 588, 653),
        "C": (66, -18, 689, 666),
        "D": (-8, 0, 700, 653),
        "E": (-1, 0, 634, 653),
        "F": (8, 0, 645, 653),
        "G": (52, -18, 722, 666),
        "H": (-8, 0, 767, 653),
        "I": (-8, 0, 384, 653),
        "J": (-6, -18, 491, 653),
        "K": (7, 0, 722, 653),
        "L": (-8, 0, 559, 653),
        "M": (-18, 0, 873, 653),
        "N": (-20, -15, 727, 653),
        "O": (60, -18, 699, 666),
        "P": (0, 0, 605, 653),
        "Q": (59, -182, 699, 666),
        "R": (-13, 0, 588, 653),
        "S": (17, -18, 508, 667),
        "T": (59, 0, 633, 653),
        "U": (102, -18, 765, 653),
        "V": (76, -18, 688, 653),
        "W": (71, -18, 906, 653),
        "X": (-29, 0, 655, 653),
        "Y": (78, 0, 633, 653),
        "Z": (-6, 0, 606, 653),
        "bracketleft": (21, -153, 391, 663),
        "backslash": (-41, -18, 319, 666),
        "bracketright": (12, -153, 382, 663),
        "asciicircum": (0, 301, 422, 666),
        "underscore": (0, -125, 500, -75),
        "quoteleft": (171, 436, 310, 666),
        "a": (17, -11, 476, 441),
        "b": (23, -11, 473, 683),
        "c": (30, -11, 425, 441),
        "d": (15, -13, 527, 683),
        "e": (31, -11, 412, 441),
        "f": (-147, -207, 424, 678),
        "g": (8, -206, 472, 441),
        "h": (19, -9, 478, 683),
        "i": (49, -11, 264, 654),
        "j": (-124, -207, 276, 654),
        "k": (14, -11, 461, 683),
        "l": (41, -11, 279, 683),
        "m": (12, -9, 704, 441),
        "n": (14, -9, 474, 441),
        "o": (27, -11, 468, 441),
        "p": (-75, -205, 469, 441),
        "q": (25, -209, 483, 441),
        "r": (45, 0, 412, 441),
        "s": (16, -13, 366, 442),
        "t": (37, -11, 296, 546),
        "u": (42, -11, 475, 441),
        "v": (21, -18, 426, 441),
        "w": (16, -18, 648, 441),
        "x": (-27, -11, 447, 441),
        "y": (-24, -206, 426, 441),
        "z": (-2, -81, 380, 428),
        "braceleft": (51, -177, 407, 687),
        "bar": (105, -18, 171, 666),
        "braceright": (-7, -177, 349, 687),
        "asciitilde": (40, 183, 502, 323),
        "exclamdown": (59, -205, 322, 473),
        "cent": (77, -143, 472, 560),
        "sterling": (10, -6, 517, 670),
        "fraction": (-169, -10, 337, 676),
        "yen": (27, 0, 603, 653),
        "florin": (25, -182, 507, 682),
        "section": (53, -162, 461, 666),
        "currency": (-22, 53, 522, 597),
        "quotesingle": (132, 421, 241, 666),
        "quotedblleft": (166, 436, 514, 666),
        "guillemotleft": (53, 37, 445, 403),
        "guilsinglleft": (51, 37, 281, 403),
        "guilsinglright": (52, 37, 282, 403),
        "fi": (-141, -207, 481, 681),
        "fl": (-141, -204, 517, 682),
        "endash": (-6, 197, 505, 243),
        "dagger": (101, -159, 488, 666),
        "daggerdbl": (22, -143, 491, 666),
        "periodcentered": (70, 199, 181, 310),
        "paragraph": (55, -123, 616, 653),
        "bullet": (40, 191, 310, 461),
        "quotesinglbase": (44, -129, 183, 101),
        "quotedblbase": (57, -129, 405, 101),
        "quotedblright": (151, 436, 499, 666),
        "guillemotright": (55, 37, 447, 403),
        "ellipsis": (57, -11, 762, 100),
        "perthousand": (25, -19, 1010, 706),
        "questiondown": (28, -205, 368, 471),
        "grave": (121, 492, 311, 664),
        "acute": (180, 494, 403, 664),
        "circumflex": (91, 492, 385, 661),
        "tilde": (100, 517, 427, 624),
        "macron": (99, 532, 411, 583),
        "breve": (117, 492, 418, 650),
        "dotaccent": (207, 508, 305, 606),
        "dieresis": (107, 508, 405, 606),
        "ring": (155, 492, 355, 691),
        "cedilla": (-30, -217, 182, 0),
        "hungarumlaut": (93, 494, 486, 664),
        "ogonek": (-20, -169, 200, 40),
        "caron": (121, 492, 426, 661),
        "emdash": (-6, 197, 894, 243),
        "AE": (-27, 0, 911, 653),
        "ordfeminine": (42, 406, 352, 676),
        "Lslash": (-8, 0, 559, 653),
        "Oslash": (60, -105, 699, 722),
        "OE": (49, -8, 964, 666),
        "ordmasculine": (67, 406, 362, 676),
        "ae": (23, -11, 640, 441),
        "dotlessi": (49, -11, 235, 441),
        "lslash": (37, -11, 307, 683),
        "oslash": (28, -135, 469, 554),
        "oe": (20, -12, 646, 441),
        "germandbls": (-168, -207, 493, 679),
        "onesuperior": (43, 271, 283, 676),
        "logicalnot": (86, 108, 590, 386),
        "mu": (-30, -209, 497, 428),
        "trademark": (30, 247, 957, 653),
        "Eth": (-8, 0, 700, 653),
        "onehalf": (34, -10, 749, 676),
        "plusminus": (86, 0, 590, 506),
        "Thorn": (0, 0, 569, 653),
        "onequarter": (33, -10, 736, 676),
        "divide": (86, -11, 590, 517),
        "brokenbar": (105, -18, 171, 666),
        "degree": (101, 390, 387, 676),
        "thorn": (-75, -205, 469, 683),
        "threequarters": (23, -10, 736, 676),
        "twosuperior": (33, 271, 324, 676),
        "registered": (41, -18, 719, 666),
        "minus": (86, 220, 590, 286),
        "eth": (27, -11, 482, 683),
        "multiply": (93, 8, 582, 497),
        "threesuperior": (43, 268, 339, 676),
        "copyright": (41, -18, 719, 666),
        "space": (0, 0, 0, 0),
        "Aacute": (-51, 0, 564, 876),
        "Acircumflex": (-51, 0, 564, 873),
        "Adieresis": (-51, 0, 564, 818),
        "Agrave": (-51, 0, 564, 876),
        "Aring": (-51, 0, 564, 883),
        "Atilde": (-51, 0, 566, 836),
        "Ccedilla": (66, -217, 689, 666),
        "Eacute": (-1, 0, 634, 876),
        "Ecircumflex": (-1, 0, 634, 873),
        "Edieresis": (-1, 0, 634, 818),
        "Egrave": (-1, 0, 634, 876),
        "Iacute": (-8, 0, 413, 876),
        "Icircumflex": (-8, 0, 425, 873),
        "Idieresis": (-8, 0, 435, 818),
        "Igrave": (-8, 0, 384, 876),
        "Ntilde": (-20, -15, 727, 836),
        "Oacute": (60, -18, 699, 876),
        "Ocircumflex": (60, -18, 699, 873),
        "Odieresis": (60, -18, 699, 818),
        "Ograve": (60, -18, 699, 876),
        "Otilde": (60, -18, 699, 836),
        "Scaron": (17, -18, 520, 873),
        "Uacute": (102, -18, 765, 876),
        "Ucircumflex": (102, -18, 765, 873),
        "Udieresis": (102, -18, 765, 818),
        "Ugrave": (102, -18, 765, 876),
        "Yacute": (78, 0, 633, 876),
        "Ydieresis": (78, 0, 633, 818),
        "Zcaron": (-6, 0, 606, 873),
        "aacute": (17, -11, 487, 664),
        "acircumflex": (17, -11, 476, 661),
        "adieresis": (17, -11, 489, 606),
        "agrave": (17, -11, 476, 664),
        "aring": (17, -11, 476, 691),
        "atilde": (17, -11, 511, 624),
        "ccedilla": (26, -217, 425, 441),
        "eacute": (31, -11, 459, 664),
        "ecircumflex": (31, -11, 441, 661),
        "edieresis": (31, -11, 451, 606),
        "egrave": (31, -11, 412, 664),
        "iacute": (49, -11, 356, 664),
        "icircumflex": (34, -11, 328, 661),
        "idieresis": (49, -11, 353, 606),
        "igrave": (49, -11, 284, 664),
        "ntilde": (14, -9, 476, 624),
        "oacute": (27, -11, 487, 664),
        "ocircumflex": (27, -11, 468, 661),
        "odieresis": (27, -11, 489, 606),
        "ograve": (27, -11, 468, 664),
        "otilde": (27, -11, 496, 624),
        "scaron": (16, -13, 454, 661),
        "uacute": (42, -11, 477, 664),
        "ucircumflex": (42, -11, 475, 661),
        "udieresis": (42, -11, 479, 606),
        "ugrave": (42, -11, 475, 664),
        "yacute": (-24, -206, 459, 664),
        "ydieresis": (-24, -206, 441, 606),
        "zcaron": (-2, -81, 434, 661),
    },
    "Times-Roman": {
        ".notdef": (0, 0, 0, 0),
        "exclam": (130, -9, 238, 676),
        "quotedbl": (77, 431, 331, 676),
        "numbersign": (5, 0, 496, 662),
        "dollar": (44, -87, 457, 727),
        "percent": (61, -13, 772, 676),
        "ampersand": (42, -13, 750, 676),
        "quoteright": (79, 433, 218, 676),
        "parenleft": (48, -177, 304, 676),
        "parenright": (29, -177, 285, 676),
        "asterisk": (69, 265, 432, 676),
        "plus": (30, 0, 534, 506),
        "comma": (56, -141, 195, 102),
        "hyphen": (39, 194, 285, 257),
        "period": (70, -11, 181, 100),
        "slash": (-9, -14, 287, 676),
        "zero": (24, -14, 476, 676),
        "one": (111, 0, 394, 676),
        "two": (30, 0, 475, 676),
        "three": (43, -14, 431, 676),
        "four": (12, 0, 472, 676),
        "five": (32, -14, 438, 688),
        "six": (34, -14, 468, 684),
        "seven": (20, -8, 449, 662),
        "eight": (56, -14, 445, 676),
        "nine": (30, -22, 459, 676),
        "colon": (81, -11, 192, 459),
        "semicolon": (80, -141, 219, 459),
        "less": (28, -8, 536, 514),
        "equal": (30, 120, 534, 386),
        "greater": (28, -8, 536, 514),
        "question": (68, -8, 414, 676),
        "at": (116, -14, 809, 676),
        "A": (15, 0, 706, 674),
        "B": (17, 0, 593, 662),
        "C": (28, -14, 633, 676),
        "D": (16, 0, 685, 662),
        "E": (12, 0, 597, 662),
        "F": (12, 0, 546, 662),
        "G": (32, -14, 709, 676),
        "H": (19, 0, 702, 662),
        "I": (18, 0, 315, 662),
        "J": (10, -14, 370, 662),
        "K": (34, 0, 723, 662),
        "L": (12, 0, 598, 662),
        "M": (12, 0, 863, 662),
        "N": (12, -11, 707, 662),
        "O": (34, -14, 688, 676),
        "P": (16, 0, 542, 662),
        "Q": (34, -178, 701, 676),
        "R": (17, 0, 659, 662),
        "S": (42, -14, 491, 676),
        "T": (17, 0, 593, 662),
        "U": (14, -14, 705, 662),
        "V": (16, -11, 697, 662),
        "W": (5, -11, 932, 662),
        "X": (10, 0, 704, 662),
        "Y": (22, 0, 703, 662),
        "Z": (9, 0, 597, 662),
        "bracketleft": (88, -156, 299, 662),
        "backslash": (-9, -14, 287, 676),
        "bracketright": (34, -156, 245, 662),
        "asciicircum": (24, 297, 446, 662),
        "underscore": (0, -125, 500, -75),
        "quoteleft": (115, 433, 254, 676),
        "a": (37, -10, 442, 460),
        "b": (3, -10, 468, 683),
        "c": (25, -10, 412, 460),
        "d": (27, -10, 491, 683),
        "e": (25, -10, 424, 460),
        "f": (20, 0, 383, 683),
        "g": (28, -218, 470, 460),
        "h": (9, 0, 487, 683),
        "i": (16, 0, 253, 683),
        "j": (-70, -218, 194, 683),
        "k": (7, 0, 505, 683),
        "l": (19, 0, 257, 683),
        "m": (16, 0, 775, 460),
        "n": (16, 0, 485, 460),
        "o": (29, -10, 470, 460),
        "p": (5, -217, 470, 460),
        "q": (24, -217, 488, 460),
        "r": (5, 0, 335, 460),
        "s": (51, -10, 348, 460),
        "t": (13, -10, 279, 579),
        "u": (9, -10, 479, 450),
        "v": (19, -14, 477, 450),
        "w": (21, -14, 694, 450),
        "x": (17, 0, 479, 450),
        "y": (14, -218, 475, 450),
        "z": (27, 0, 418, 450),
        "braceleft": (100, -181, 350, 680),
        "bar": (67, -14, 133, 676),
        "braceright": (130, -181, 380, 680),
        "asciitilde": (40, 183, 502, 323),
        "exclamdown": (97, -218, 205, 467),
        "cent": (53, -138, 448, 579),
        "sterling": (12, -8, 490, 676),
        "fraction": (-168, -14, 331, 676),
        "yen": (-53, 0, 512, 662),
        "florin": (7, -189, 490, 676),
        "section": (70, -148, 426, 676),
        "currency": (-22, 58, 522, 602),
        "quotesingle": (48, 431, 133, 676),
        "quotedblleft": (43, 433, 414, 676),
        "guillemotleft": (42, 33, 456, 416),
        "guilsinglleft": (63, 33, 285, 416),
        "guilsinglright": (48, 33, 270, 416),
        "fi": (31, 0, 521, 683),
        "fl": (32, 0, 521, 683),
        "endash": (0, 201, 500, 250),
        "dagger": (59, -149, 442, 676),
        "daggerdbl": (58, -153, 442, 676),
        "periodcentered": (70, 199, 181, 310),
        "paragraph": (-22, -154, 450, 662),
        "bullet": (40, 196, 310, 466),
        "quotesinglbase": (79, -141, 218, 102),
        "quotedblbase": (45, -141, 416, 102),
        "quotedblright": (30, 433, 401, 676),
        "guillemotright": (44, 33, 458, 416),
        "ellipsis": (111, -11, 888, 100),
        "perthousand": (7, -19, 994, 706),
        "questiondown": (30, -218, 376, 466),
        "grave": (19, 507, 242, 678),
        "acute": (93, 507, 317, 678),
        "circumflex": (11, 507, 322, 674),
        "tilde": (1, 532, 331, 638),
        "macron": (11, 547, 322, 601),
        "breve": (26, 507, 307, 664),
        "dotaccent": (118, 523, 216, 623),
        "dieresis": (18, 523, 315, 623),
        "ring": (67, 512, 266, 711),
        "cedilla": (52, -215, 261, 0),
        "hungarumlaut": (-3, 507, 377, 678),
        "ogonek": (64, -165, 249, 0),
        "caron": (11, 507, 322, 674),
        "emdash": (0, 201, 1000, 250),
        "AE": (0, 0, 863, 662),
        "ordfeminine": (4, 394, 270, 676),
        "Lslash": (12, 0, 598, 662),
        "Oslash": (34, -80, 688, 734),
        "OE": (30, -6, 885, 668),
        "ordmasculine": (6, 394, 304, 676),
        "ae": (38, -10, 632, 460),
        "dotlessi": (16, 0, 253, 460),
        "lslash": (19, 0, 259, 683),
        "oslash": (29, -112, 470, 551),
        "oe": (30, -10, 690, 460),
        "germandbls": (12, -9, 468, 683),
        "onesuperior": (57, 270, 248, 676),
        "logicalnot": (30, 108, 534, 386),
        "mu": (36, -218, 512, 450),
        "trademark": (30, 256, 957, 662),
        "Eth": (16, 0, 685, 662),
        "onehalf": (31, -14, 746, 676),
        "plusminus": (30, 0, 534, 506),
        "Thorn": (16, 0, 542, 662),
        "onequarter": (37, -14, 718, 676),
        "divide": (30, -10, 534, 516),
        "brokenbar": (67, -14, 133, 676),
        "degree": (57, 390, 343, 676),
        "thorn": (5, -217, 470, 683),
        "threequarters": (15, -14, 718, 676),
        "twosuperior": (1, 270, 296, 676),
        "registered": (38, -14, 722, 676),
        "minus": (30, 220, 534, 286),
        "eth": (29, -10, 471, 686),
        "multiply": (38, 8, 527, 497),
        "threesuperior": (15, 262, 291, 676),
        "copyright": (38, -14, 722, 676),
        "space": (0, 0, 0, 0),
        "Aacute": (15, 0, 706, 890),
        "Acircumflex": (15, 0, 706, 886),
        "Adieresis": (15, 0, 706, 835),
        "Agrave": (15, 0, 706, 890),
        "Aring": (15, 0, 706, 898),
        "Atilde": (15, 0, 706, 850),
        "Ccedilla": (28, -215, 633, 676),
        "Eacute": (12, 0, 597, 890),
        "Ecircumflex": (12, 0, 597, 886),
        "Edieresis": (12, 0, 597, 835),
        "Egrave": (12, 0, 597, 890),
        "Iacute": (18, 0, 317, 890),
        "Icircumflex": (11, 0, 322, 886),
        "Idieresis": (18, 0, 315, 835),
        "Igrave": (18, 0, 315, 890),
        "Ntilde": (12, -11, 707, 850),
        "Oacute": (34, -14, 688, 890),
        "Ocircumflex": (34, -14, 688, 886),
        "Odieresis": (34, -14, 688, 835),
        "Ograve": (34, -14, 688, 890),
        "Otilde": (34, -14, 688, 850),
        "Scaron": (42, -14, 491, 886),
        "Uacute": (14, -14, 705, 890),
        "Ucircumflex": (14, -14, 705, 886),
        "Udieresis": (14, -14, 705, 835),
        "Ugrave": (14, -14, 705, 890),
        "Yacute": (22, 0, 703, 890),
        "Ydieresis": (22, 0, 703, 835),
        "Zcaron": (9, 0, 597, 886),
        "aacute": (37, -10, 442, 678),
        "acircumflex": (37, -10, 442, 674),
        "adieresis": (37, -10, 442, 623),
        "agrave": (37, -10, 442, 678),
        "aring": (37, -10, 442, 711),
        "atilde": (37, -10, 442, 638),
        "ccedilla": (25, -215, 412, 460),
        "eacute": (25, -10, 424, 678),
        "ecircumflex": (25, -10, 424, 674),
        "edieresis": (25, -10, 424, 623),
        "egrave": (25, -10, 424, 678),
        "iacute": (16, 0, 290, 678),
        "icircumflex": (-16, 0, 295, 674),
        "idieresis": (-9, 0, 288, 623),
        "igrave": (-8, 0, 253, 678),
        "ntilde": (16, 0, 485, 638),
        "oacute": (29, -10, 470, 678),
        "ocircumflex": (29, -10, 470, 674),
        "odieresis": (29, -10, 470, 623),
        "ograve": (29, -10, 470, 678),
        "otilde": (29, -10, 470, 638),
        "scaron": (39, -10, 350, 674),
        "uacute": (9, -10, 479, 678),
        "ucircumflex": (9, -10, 479, 674),
        "udieresis": (9, -10, 479, 623),
        "ugrave": (9, -10, 479, 678),
        "yacute": (14, -218, 475, 678),
        "ydieresis": (14, -218, 475, 623),
        "zcaron": (27, 0, 418, 674),
    },
    "ZapfDingbats": {
        ".notdef": (0, 0, 0, 0),
        "a1": (35, 72, 939, 621),
        "a2": (35, 81, 927, 611),
        "a202": (35, 72, 939, 621),
        "a3": (35, 0, 945, 692),
        "a4": (34, 139, 685, 566),
        "a5": (35, -14, 755, 705),
        "a119": (35, -14, 755, 705),
        "a118": (35, -13, 761, 705),
        "a117": (35, 138, 655, 553),
        "a11": (35, 123, 925, 568),
        "a12": (35, 134, 904, 559),
        "a13": (29, -11, 516, 705),
        "a14": (34, 59, 820, 632),
        "a15": (35, 50, 876, 642),
        "a16": (35, 139, 899, 550),
        "a105": (35, 50, 876, 642),
        "a17": (35, 139, 909, 553),
        "a18": (35, 104, 938, 587),
        "a19": (34, -13, 721, 705),
        "a20": (36, -14, 811, 705),
        "a21": (35, 0, 727, 692),
        "a22": (35, 0, 727, 692),
        "a23": (-1, -68, 571, 661),
        "a24": (36, -13, 642, 705),
        "a25": (35, 0, 728, 692),
        "a26": (35, 0, 726, 692),
        "a27": (35, 0, 725, 692),
        "a28": (35, 0, 720, 692),
        "a6": (35, 0, 460, 692),
        "a7": (35, 0, 517, 692),
        "a8": (35, 0, 503, 692),
        "a9": (35, 96, 542, 596),
        "a10": (35, -14, 657, 705),
        "a29": (35, -14, 751, 705),
        "a30": (35, -14, 752, 705),
        "a31": (35, -14, 753, 705),
        "a32": (35, -14, 756, 705),
        "a33": (35, -13, 759, 705),
        "a34": (35, -13, 759, 705),
        "a35": (35, -14, 782, 705),
        "a36": (35, -14, 787, 705),
        "a37": (35, -14, 754, 705),
        "a38": (35, -14, 807, 705),
        "a39": (35, -14, 789, 705),
        "a40": (35, -14, 798, 705),
        "a41": (35, -13, 782, 705),
        "a42": (35, -14, 796, 705),
        "a43": (35, -14, 888, 705),
        "a44": (35, 0, 710, 692),
        "a45": (35, 0, 688, 692),
        "a46": (35, 0, 714, 692),
        "a47": (34, -14, 756, 705),
        "a48": (35, -14, 758, 705),
        "a49": (35, -14, 661, 706),
        "a50": (35, -6, 741, 699),
        "a51": (35, -7, 734, 699),
        "a52": (35, -14, 757, 705),
        "a53": (35, 0, 725, 692),
        "a54": (35, -13, 672, 704),
        "a55": (35, -14, 672, 705),
        "a56": (35, -14, 647, 705),
        "a57": (35, -14, 666, 705),
        "a58": (35, -14, 791, 705),
        "a59": (35, -14, 780, 705),
        "a60": (35, -14, 754, 705),
        "a61": (35, -14, 754, 705),
        "a62": (34, -14, 673, 705),
        "a63": (36, 0, 651, 692),
        "a64": (35, 1, 661, 690),
        "a65": (35, 0, 655, 692),
        "a66": (34, -14, 751, 705),
        "a67": (35, -14, 752, 705),
        "a68": (35, -14, 678, 705),
        "a69": (35, -14, 756, 705),
        "a70": (36, -14, 751, 705),
        "a71": (35, -14, 757, 705),
        "a72": (35, -14, 838, 705),
        "a73": (35, 0, 726, 692),
        "a74": (35, 0, 727, 692),
        "a203": (35, 0, 727, 692),
        "a75": (35, 0, 725, 692),
        "a204": (35, 0, 725, 692),
        "a76": (35, 0, 858, 705),
        "a77": (35, -14, 858, 692),
        "a78": (35, -14, 754, 705),
        "a79": (35, -14, 749, 705),
        "a81": (35, -14, 403, 705),
        "a82": (35, 0, 104, 692),
        "a83": (35, 0, 242, 692),
        "a84": (35, 0, 380, 692),
        "a97": (35, 263, 357, 705),
        "a98": (34, 263, 357, 705),
        "a99": (35, 263, 633, 705),
        "a100": (36, 263, 634, 705),
        "a101": (35, -143, 697, 806),
        "a102": (56, -14, 488, 706),
        "a103": (34, -14, 508, 705),
        "a104": (35, 40, 875, 651),
        "a106": (35, -14, 633, 705),
        "a107": (35, -14, 726, 705),
        "a108": (0, 121, 758, 569),
        "a112": (35, 0, 741, 705),
        "a111": (34, -14, 560, 705),
        "a110": (35, -14, 659, 705),
        "a109": (34, 0, 591, 705),
        "a120": (35, -14, 754, 705),
        "a121": (35, -14, 754, 705),
        "a122": (35, -14, 754, 705),
        "a123": (35, -14, 754, 705),
        "a124": (35, -14, 754, 705),
        "a125": (35, -14, 754, 705),
        "a126": (35, -14, 754, 705),
        "a127": (35, -14, 754, 705),
        "a128": (35, -14, 754, 705),
        "a129": (35, -14, 754, 705),
        "a130": (35, -14, 754, 705),
        "a131": (35, -14, 754, 705),
        "a132": (35, -14, 754, 705),
        "a133": (35, -14, 754, 705),
        "a134": (35, -14, 754, 705),
        "a135": (35, -14, 754, 705),
        "a136": (35, -14, 754, 705),
        "a137": (35, -14, 754, 705),
        "a138": (35, -14, 754, 705),
        "a139": (35, -14, 754, 705),
        "a140": (35, -14, 754, 705),
        "a141": (35, -14, 754, 705),
        "a142": (35, -14, 754, 705),
        "a143": (35, -14, 754, 705),
        "a144": (35, -14, 754, 705),
        "a145": (35, -14, 754, 705),
        "a146": (35, -14, 754, 705),
        "a147": (35, -14, 754, 705),
        "a148": (35, -14, 754, 705),
        "a149": (35, -14, 754, 705),
        "a150": (35, -14, 754, 705),
        "a151": (35, -14, 754, 705),
        "a152": (35, -14, 754, 705),
        "a153": (35, -14, 754, 705),
        "a154": (35, -14, 754, 705),
        "a155": (35, -14, 754, 705),
        "a156": (35, -14, 754, 705),
        "a157": (35, -14, 754, 705),
        "a158": (35, -14, 754, 705),
        "a159": (35, -14, 754, 705),
        "a160": (35, 58, 860, 634),
        "a161": (35, 152, 803, 540),
        "a163": (34, 152, 981, 540),
        "a164": (35, -127, 422, 820),
        "a196": (35, 94, 698, 597),
        "a165": (35, 140, 890, 552),
        "a192": (35, 94, 698, 597),
        "a166": (35, 166, 884, 526),
        "a167": (35, 32, 892, 660),
        "a168": (35, 129, 891, 562),
        "a169": (35, 128, 893, 563),
        "a170": (35, 155, 799, 537),
        "a171": (35, 93, 838, 599),
        "a172": (35, 104, 791, 588),
        "a173": (35, 98, 889, 594),
        "a162": (35, 98, 889, 594),
        "a174": (35, 0, 882, 692),
        "a175": (35, 84, 896, 608),
        "a176": (35, 84, 896, 608),
        "a177": (35, -99, 429, 791),
        "a178": (35, 71, 848, 623),
        "a179": (35, 44, 802, 648),
        "a193": (35, 44, 802, 648),
        "a180": (35, 101, 832, 591),
        "a199": (35, 101, 832, 591),
        "a181": (35, 44, 661, 648),
        "a200": (35, 44, 661, 648),
        "a182": (35, 77, 840, 619),
        "a201": (35, 73, 840, 615),
        "a183": (35, 0, 725, 692),
        "a184": (35, 160, 911, 533),
        "a197": (34, 37, 736, 655),
        "a185": (35, 207, 830, 481),
        "a194": (34, 37, 736, 655),
        "a198": (34, -19, 853, 712),
        "a186": (35, 124, 932, 568),
        "a195": (34, -19, 853, 712),
        "a187": (35, 113, 796, 579),
        "a188": (36, 118, 838, 578),
        "a189": (35, 150, 891, 542),
        "a190": (35, 76, 931, 616),
        "a191": (34, 99, 884, 593),
        "a86": (35, 0, 375, 692),
        "a85": (35, 0, 475, 692),
        "a95": (35, 0, 299, 692),
        "a205": (35, 0, 475, 692),
        "a89": (35, -14, 356, 705),
        "a87": (35, -14, 199, 705),
        "a91": (35, 0, 242, 692),
        "a90": (35, -14, 355, 705),
        "a206": (35, 0, 375, 692),
        "a94": (35, 0, 283, 692),
        "a93": (35, 0, 283, 692),
        "a92": (35, 0, 242, 692),
        "a96": (35, 0, 299, 692),
        "a88": (35, -14, 199, 705),
        "space": (0, 0, 0, 0),
    },
}

# Alternate family names mapped to the canonical family key.
# get_base14_bbox() substitutes the alias before it consults any bbox table.
base14_alias = {
    "Times New Roman": "Times-Roman",
    "Times New Roman,Bold": "Times-Bold",
    "Times New Roman,Italic": "Times-Italic",
}


def get_cached_bbox(database, family, encoding):
    """Build a 256-entry bounding-box table for one font family.

    Args:
        database: Mapping of family name to a ``{glyph name: bbox}`` mapping.
        family: Key into ``database``; a missing key raises ``KeyError``.
        encoding: Iterable of glyph names (or falsy placeholders) whose
            position is used as the index into the returned table.

    Returns:
        A list of 256 bbox tuples. Positions whose glyph name is falsy or
        unknown to the family keep the ``(0, 0, 0, 0)`` placeholder.
    """
    glyph_boxes = database[family]
    table = [(0, 0, 0, 0)] * 256
    for code, glyph_name in enumerate(encoding):
        if not glyph_name:
            continue
        box = glyph_boxes.get(glyph_name, None)
        if box:
            table[code] = box
    return table


def get_base14_bbox(family, encoding_name="WinAnsiEncoding"):
    """Return the 256-entry bbox table for ``family`` under an encoding.

    Args:
        family: Font family name; names listed in ``base14_alias`` are
            replaced by their canonical form first.
        encoding_name: Name handed to ``get_type1_encoding``.

    Returns:
        A list of 256 bbox tuples. All entries are ``(0, 0, 0, 0)`` when the
        encoding lookup yields nothing or no table knows the family.
    """
    encoding = get_type1_encoding(encoding_name)
    if not encoding:
        return [(0, 0, 0, 0)] * 256

    family = base14_alias.get(family, family)

    # win_core is checked first because it takes precedence when both tables
    # contain the family.
    for database in (win_core, base14_bbox):
        if family in database:
            return get_cached_bbox(database, family, encoding)

    return [(0, 0, 0, 0)] * 256


================================================
FILE: babeldoc/format/pdf/babelpdf/cidfont.py
================================================
import re
from io import BytesIO

import freetype


def indirect(obj):
    """Extract the leading integer from an ``("xref", "<n> ...")`` tuple.

    Returns ``None`` when ``obj`` is not a tuple or its first item is not
    ``"xref"``. Otherwise the second item is split on single spaces and its
    first token is returned as an ``int``.
    """
    if not isinstance(obj, tuple) or obj[0] != "xref":
        return None
    number_text = obj[1].split(" ")[0]
    return int(number_text)


def get_xref(doc, xref, key):
    """Resolve dictionary entry ``key`` of object ``xref`` to a target number.

    The entry is read with ``doc.xref_get_key``. Only entries whose kind is
    ``"xref"`` are resolved (via ``indirect``); anything else gives ``None``.
    """
    entry = doc.xref_get_key(xref, key)
    kind = entry[0]
    if kind != "xref":
        return None
    return indirect(entry)


def get_font_file(doc, xref):
    """Return the stream of the first font-file entry found on ``xref``.

    The keys ``FontFile``, ``FontFile2`` and ``FontFile3`` are tried in that
    order. The first one that resolves to a reference is read with
    ``doc.xref_stream``. Returns ``None`` when none of them resolves.
    """
    for key in ("FontFile", "FontFile2", "FontFile3"):
        stream_xref = get_xref(doc, xref, key)
        if stream_xref:
            return doc.xref_stream(stream_xref)
    return None


def get_font_descriptor(doc, xref):
    """Follow ``FontDescriptor`` on ``xref`` and return its font-file stream.

    Despite the name, the value returned is whatever ``get_font_file`` yields
    for the descriptor object. Returns ``None`` when the ``FontDescriptor``
    entry does not resolve to a reference.
    """
    descriptor_xref = get_xref(doc, xref, "FontDescriptor")
    if not descriptor_xref:
        return None
    return get_font_file(doc, descriptor_xref)


def get_descendant_fonts(doc, xref):
    """Return the font-file stream of the first descendant font of ``xref``.

    ``DescendantFonts`` may be an inline array or a reference to one. In the
    latter case the referenced object's text is fetched with
    ``doc.xref_object``. The first run of digits in that text is taken as
    the descendant's object number and passed to ``get_font_descriptor``.
    Returns ``None`` when no number can be found.
    """
    entry = doc.xref_get_key(xref, "DescendantFonts")
    kind = entry[0]
    if kind == "xref":
        array_text = doc.xref_object(indirect(entry))
    elif kind == "array":
        array_text = entry[1]
    else:
        array_text = ""

    first_number = re.search(r"\d+", array_text)
    if first_number is None:
        return None
    return get_font_descriptor(doc, int(first_number.group(0)))


def get_glyph_bbox(face, g):
    """Return ``(xMin, yMin, xMax, yMax)`` of glyph ``g`` in ``face``.

    The glyph is loaded with the ``FT_LOAD_NO_SCALE`` flag. A glyph whose
    outline has no contours, and any failure while loading or measuring,
    both yield ``(0, 0, 0, 0)``. This lookup is deliberately best-effort.
    """
    empty = (0, 0, 0, 0)
    try:
        face.load_glyph(g, freetype.FT_LOAD_NO_SCALE)
        glyph_outline = face.glyph.outline
        if not glyph_outline.contours:
            return empty
        box = glyph_outline.get_bbox()
        return box.xMin, box.yMin, box.xMax, box.yMax
    except Exception:
        # Best-effort: a broken glyph must not abort the whole font scan.
        return empty


def get_face_bbox(blob):
    """Measure every glyph of the font program in ``blob``.

    Args:
        blob: Font file bytes, opened through ``freetype.Face``.

    Returns:
        One ``[xMin, yMin, xMax, yMax]`` list per value in
        ``range(face.num_glyphs)``, each coordinate multiplied by
        ``1000 / units_per_EM``.
    """
    face = freetype.Face(BytesIO(blob))
    # Rescale so that one em spans 1000 units, whatever the font's own grid is.
    scale = 1000 / face.units_per_EM
    raw_boxes = [get_glyph_bbox(face, glyph_id) for glyph_id in range(face.num_glyphs)]
    return [[coord * scale for coord in box] for box in raw_boxes]


def get_cidfont_bbox(doc, xref):
    """Return per-glyph bounding boxes for a ``/Type0`` font object.

    Returns ``None`` when the object's ``Subtype`` is not ``/Type0`` or when
    no font-file stream can be found through its descendant font.
    """
    subtype = doc.xref_get_key(xref, "Subtype")[1]
    if subtype != "/Type0":
        return None
    blob = get_descendant_fonts(doc, xref)
    if not blob:
        return None
    return get_face_bbox(blob)

================================================
FILE: babeldoc/format/pdf/babelpdf/cmap.py
================================================
import re
import struct

# Matches one "begincidrange ... endcidrange" section. The named group
# "cidrange" captures the run of "<hex> <hex> number" entries inside it.
pattern_map_r = (
    r"\s+begincidrange\s*"
    r"(?P<cidrange>(<[a-fA-F0-9]+>\s*<[a-fA-F0-9]+>\s*\d+\s*)+)"
    r"\s+endcidrange\s+"
)
# Matches one "begincidchar ... endcidchar" section. The named group
# "cidchar" captures the run of "<hex> number" entries inside it.
pattern_map_c = (
    r"\s+begincidchar\s*"
    r"(?P<cidchar>(<[a-fA-F0-9]+>\s*\d+\s*)+)"
    r"\s+endcidchar\s+"
)
# Matches a single "<hex> number" entry: "pat" is the hex digits without the
# angle brackets, "val" the decimal number.
pattern_one_c = (
    r"<(?P<pat>[a-fA-F0-9]+)>"
    r"\s*"
    r"(?P<val>\d+)"
)
# Matches a single "<hex> <hex> number" entry: "pat" and "end" are the two hex
# strings without the angle brackets, "val" the decimal number.
pattern_one_r = (
    r"<(?P<pat>[a-fA-F0-9]+)>"
    r"\s*"
    r"<(?P<end>[a-fA-F0-9]+)>"
    r"\s*"
    r"(?P<val>\d+)"
)


def parse_blob_value(text):
    """Parse a hex string into ``(integer value, byte count)``.

    The byte count is ``len(text) // 2``, so an odd trailing digit is not
    counted.
    """
    value = int(text, 16)
    byte_count = len(text) // 2
    return value, byte_count


def parse_cmap_char(text, store):
    """Append every ``<hex> number`` entry found in ``text`` to ``store``.

    Each entry is appended as ``(hex_string, int_value)``. The hex string is
    kept as text, without the angle brackets.
    """
    for entry in re.finditer(pattern_one_c, text):
        code_hex = entry["pat"]
        store.append((code_hex, int(entry["val"])))


def parse_cmap_range(text, store):
    """Append every ``<hex> <hex> number`` entry found in ``text`` to ``store``.

    Each entry is appended as ``(first_hex, last_hex, int_value)``. Both hex
    strings are kept as text, without the angle brackets.
    """
    for entry in re.finditer(pattern_one_r, text):
        first_hex = entry["pat"]
        last_hex = entry["end"]
        store.append((first_hex, last_hex, int(entry["val"])))


def parse_cmap(text):
    """Extract the parent name, range entries and single entries from a CMap.

    Returns:
        ``(usecmap, cidrange, cidchar)``: the name preceding the ``usecmap``
        operator (``""`` when absent), the list built by ``parse_cmap_range``
        over every cidrange section, and the list built by
        ``parse_cmap_char`` over every cidchar section.
    """
    parent = re.search(r"/(?P<usecmap>[a-zA-Z0-9-]+)\s+usecmap\s+", text)
    usecmap = parent["usecmap"] if parent else ""

    cidrange = []
    for section in re.finditer(pattern_map_r, text):
        parse_cmap_range(section["cidrange"], cidrange)

    cidchar = []
    for section in re.finditer(pattern_map_c, text):
        parse_cmap_char(section["cidchar"], cidchar)

    return usecmap, cidrange, cidchar


# Memo for use_cmap(): normalized cmap name -> (range entries, char entries).
_CMAP_CACHE: dict[str, tuple[list, list]] = {}


def _normalize_cmap_name(name: str) -> str:
    """Return ``name`` without a trailing ``.json``, for use as a cache key."""
    return name.removesuffix(".json")


def use_cmap(name: str):
    """Load a predefined cmap by name, including everything it inherits.

    The data returned by ``get_cmap_data`` must be a dict. Its ``"u"`` entry
    names a parent cmap, which is loaded recursively and placed first. Its
    ``"r"`` and ``"c"`` entries hold this cmap's own range and char entries.
    Results are memoized in ``_CMAP_CACHE`` under the normalized name.

    Returns:
        ``(range_entries, char_entries)`` with parent entries preceding the
        cmap's own.

    Raises:
        TypeError: If the loaded data is not a dict.
    """
    key = _normalize_cmap_name(name)
    try:
        return _CMAP_CACHE[key]
    except KeyError:
        pass

    # Lazy import to avoid circular dependency at import time.
    from babeldoc.assets.assets import get_cmap_data

    data = get_cmap_data(key)
    if not isinstance(data, dict):
        raise TypeError(f"Invalid cmap data type for {key}: {type(data)!r}")

    parent_name = data.get("u") or ""
    own_r = data.get("r") or []
    own_c = data.get("c") or []

    if parent_name:
        inherited_r, inherited_c = use_cmap(parent_name)
    else:
        inherited_r, inherited_c = [], []

    # Fresh lists, so the parent's cached lists are never mutated.
    store_r: list = [*inherited_r, *own_r]
    store_c: list = [*inherited_c, *own_c]

    result = (store_r, store_c)
    _CMAP_CACHE[key] = result
    return result


def propagation(r, c):
    """Expand range and single-code entries into a byte-keyed lookup table.

    Args:
        r: Iterable of ``(first_hex, last_hex, value)`` range entries. A range
            whose two hex strings differ in byte length is skipped.
        c: Iterable of ``(code_hex, value)`` single-code entries. They are
            applied after the ranges, so they win on a clash.

    Returns:
        ``(encoding, len_list)``. ``encoding`` maps each code, as big-endian
        ``bytes`` of its declared width, to its value. ``len_list`` holds the
        distinct code widths in bytes, longest first.

    Single-code entries are keyed by bytes exactly like range entries, and
    their width is recorded in ``len_list``. Keying them by the raw hex string
    would make them unreachable for a lookup that slices byte input by the
    recorded widths.
    """
    encoding = {}
    len_set = set()
    for one_r in r:
        val_l, len_l = parse_blob_value(one_r[0])
        val_r, len_r = parse_blob_value(one_r[1])
        if len_l != len_r:
            continue
        len_set.add(len_l)
        for i, v in enumerate(range(val_l, val_r + 1)):
            # Pack to 4 bytes, then keep only the low ``len_l`` bytes.
            val_b = struct.pack(">L", v)
            fin_b = val_b[4 - len_l :]
            encoding[fin_b] = one_r[2] + i
    for one_c in c:
        code = one_c[0]
        if isinstance(code, str):
            val_c, len_c = parse_blob_value(code)
            # Convert only widths the 4-byte packing can represent; anything
            # else is stored untouched rather than under a wrong key.
            if 1 <= len_c <= 4:
                code = struct.pack(">L", val_c)[4 - len_c :]
                len_set.add(len_c)
        encoding[code] = one_c[1]
    len_list = sorted(len_set, reverse=True)
    return encoding, len_list


class CharacterMap:
    """Decode character codes through the lookup table built from a CMap.

    Attributes are set by ``__init__``:
        encoding: Table returned by ``propagation``, mapping a code to its
            value.
        len_list: Code widths to try, in the order ``propagation`` returns
            them.
    """

    def __init__(self, text):
        """Parse CMap source ``text`` and build the lookup table.

        Entries inherited through ``usecmap`` come first, followed by the
        entries declared in ``text`` itself.
        """
        cid_r = []
        cid_c = []
        usecmap, cidrange, cidchar = parse_cmap(text)
        if usecmap:
            use_r, use_c = use_cmap(usecmap)
            cid_r += use_r
            cid_c += use_c
        cid_r += cidrange
        cid_c += cidchar
        self.encoding, self.len_list = propagation(cid_r, cid_c)

    def decode_one(self, text):
        """Decode the code at the start of ``text``.

        Returns:
            ``(value, consumed)``. The first width in ``len_list`` whose
            prefix is present in ``encoding`` wins. When nothing matches the
            result is ``(0, 1)``, which skips a single unit. ``consumed`` is
            always at least 1.
        """
        for width in self.len_list:
            if width <= 0:
                # A zero-width code consumes nothing; accepting it would make
                # decode() spin forever on the same position.
                continue
            pat = text[:width]
            if pat in self.encoding:
                return self.encoding[pat], width
        return 0, 1

    def decode(self, text):
        """Decode all of ``text`` and return the list of decoded values."""
        index = 0
        size = len(text)
        gstr = []
        while index < size:
            g, step = self.decode_one(text[index:])
            gstr.append(g)
            index += step
        return gstr


================================================
FILE: babeldoc/format/pdf/babelpdf/encoding.py
================================================
adobe_standard = [
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    "space",
    "exclam",
    "quotedbl",
    "numbersign",
    "dollar",
    "percent",
    "ampersand",
    "quoteright",
    "parenleft",
    "parenright",
    "asterisk",
    "plus",
    "comma",
    "hyphen",
    "period",
    "slash",
    "zero",
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "colon",
    "semicolon",
    "less",
    "equal",
    "greater",
    "question",
    "at",
    "A",
    "B",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
    "K",
    "L",
    "M",
    "N",
    "O",
    "P",
    "Q",
    "R",
    "S",
    "T",
    "U",
    "V",
    "W",
    "X",
    "Y",
    "Z",
    "bracketleft",
    "backslash",
    "bracketright",
    "asciicircum",
    "underscore",
    "quoteleft",
    "a",
    "b",
    "c",
    "d",
    "e",
    "f",
    "g",
    "h",
    "i",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o",
    "p",
    "q",
    "r",
    "s",
    "t",
    "u",
    "v",
    "w",
    "x",
    "y",
    "z",
    "braceleft",
    "bar",
    "braceright",
    "asciitilde",
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    "exclamdown",
    "cent",
    "sterling",
    "fraction",
    "yen",
    "florin",
    "section",
    "currency",
    "quotesingle",
    "quotedblleft",
    "guillemotleft",
    "guilsinglleft",
    "guilsinglright",
    "fi",
    "fl",
    None,
    "endash",
    "dagger",
    "daggerdbl",
    "periodcentered",
    None,
    "paragraph",
    "bullet",
    "quotesinglbase",
    "quotedblbase",
    "quotedblright",
    "guillemotright",
    "ellipsis",
    "perthousand",
    None,
    "questiondown",
    None,
    "grave",
    "acute",
    "circumflex",
    "tilde",
    "macron",
    "breve",
    "dotaccent",
    "dieresis",
    None,
    "ring",
    "cedilla",
    None,
    "hungarumlaut",
    "ogonek",
    "caron",
    "emdash",
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    "AE",
    None,
    "ordfeminine",
    None,
    None,
    None,
    None,
    "Lslash",
    "Oslash",
    "OE",
    "ordmasculine",
    None,
    None,
    None,
    None,
    None,
    "ae",
    None,
    None,
    None,
    "dotlessi",
    None,
    None,
    "lslash",
    "oslash",
    "oe",
    "germandbls",
    None,
    None,
    None,
    None,
]

# Glyph-name table returned for "MacExpertEncoding". The list has 256 slots,
# presumably indexed by character code; None marks a slot with no glyph name.
mac_expert = [
    # 0x00-0x1F
    *([None] * 32),
    # 0x20-0x2F
    "space", "exclamsmall", "Hungarumlautsmall", "centoldstyle",
    "dollaroldstyle", "dollarsuperior", "ampersandsmall", "Acutesmall",
    "parenleftsuperior", "parenrightsuperior", "twodotenleader", "onedotenleader",
    "comma", "hyphen", "period", "fraction",
    # 0x30-0x3F
    "zerooldstyle", "oneoldstyle", "twooldstyle", "threeoldstyle",
    "fouroldstyle", "fiveoldstyle", "sixoldstyle", "sevenoldstyle",
    "eightoldstyle", "nineoldstyle", "colon", "semicolon",
    None, "threequartersemdash", None, "questionsmall",
    # 0x40-0x4F
    None, None, None, None,
    "Ethsmall", None, None, "onequarter",
    "onehalf", "threequarters", "oneeighth", "threeeighths",
    "fiveeighths", "seveneighths", "onethird", "twothirds",
    # 0x50-0x5F
    None, None, None, None,
    None, None, "ff", "fi",
    "fl", "ffi", "ffl", "parenleftinferior",
    None, "parenrightinferior", "Circumflexsmall", "hypheninferior",
    # 0x60-0x6F
    "Gravesmall", "Asmall", "Bsmall", "Csmall",
    "Dsmall", "Esmall", "Fsmall", "Gsmall",
    "Hsmall", "Ismall", "Jsmall", "Ksmall",
    "Lsmall", "Msmall", "Nsmall", "Osmall",
    # 0x70-0x7F
    "Psmall", "Qsmall", "Rsmall", "Ssmall",
    "Tsmall", "Usmall", "Vsmall", "Wsmall",
    "Xsmall", "Ysmall", "Zsmall", "colonmonetary",
    "onefitted", "rupiah", "Tildesmall", None,
    # 0x80-0x8F
    None, "asuperior", "centsuperior", None,
    None, None, None, "Aacutesmall",
    "Agravesmall", "Acircumflexsmall", "Adieresissmall", "Atildesmall",
    "Aringsmall", "Ccedillasmall", "Eacutesmall", "Egravesmall",
    # 0x90-0x9F
    "Ecircumflexsmall", "Edieresissmall", "Iacutesmall", "Igravesmall",
    "Icircumflexsmall", "Idieresissmall", "Ntildesmall", "Oacutesmall",
    "Ogravesmall", "Ocircumflexsmall", "Odieresissmall", "Otildesmall",
    "Uacutesmall", "Ugravesmall", "Ucircumflexsmall", "Udieresissmall",
    # 0xA0-0xAF
    None, "eightsuperior", "fourinferior", "threeinferior",
    "sixinferior", "eightinferior", "seveninferior", "Scaronsmall",
    None, "centinferior", "twoinferior", None,
    "Dieresissmall", None, "Caronsmall", "osuperior",
    # 0xB0-0xBF
    "fiveinferior", None, "commainferior", "periodinferior",
    "Yacutesmall", None, "dollarinferior", None,
    None, "Thornsmall", None, "nineinferior",
    "zeroinferior", "Zcaronsmall", "AEsmall", "Oslashsmall",
    # 0xC0-0xCF
    "questiondownsmall", "oneinferior", "Lslashsmall", None,
    None, None, None, None,
    None, "Cedillasmall", None, None,
    None, None, None, "OEsmall",
    # 0xD0-0xDF
    "figuredash", "hyphensuperior", None, None,
    None, None, "exclamdownsmall", None,
    "Ydieresissmall", None, "onesuperior", "twosuperior",
    "threesuperior", "foursuperior", "fivesuperior", "sixsuperior",
    # 0xE0-0xEF
    "sevensuperior", "ninesuperior", "zerosuperior", None,
    "esuperior", "rsuperior", "tsuperior", None,
    None, "isuperior", "ssuperior", "dsuperior",
    None, None, None, None,
    # 0xF0-0xFF
    None, "lsuperior", "Ogoneksmall", "Brevesmall",
    "Macronsmall", "bsuperior", "nsuperior", "msuperior",
    "commasuperior", "periodsuperior", "Dotaccentsmall", "Ringsmall",
    None, None, None, None,
]

# Glyph-name table returned for "MacRomanEncoding", following the
# MacRomanEncoding column of the Latin character set table in the PDF
# specification (Annex D). The list has 256 slots indexed by character code;
# None marks a code that table leaves unassigned.
mac_roman = [
    # 0x00-0x1F
    *([None] * 32),
    # 0x20-0x2F
    "space", "exclam", "quotedbl", "numbersign",
    "dollar", "percent", "ampersand", "quotesingle",
    "parenleft", "parenright", "asterisk", "plus",
    "comma", "hyphen", "period", "slash",
    # 0x30-0x3F
    "zero", "one", "two", "three",
    "four", "five", "six", "seven",
    "eight", "nine", "colon", "semicolon",
    "less", "equal", "greater", "question",
    # 0x40-0x4F
    "at", "A", "B", "C",
    "D", "E", "F", "G",
    "H", "I", "J", "K",
    "L", "M", "N", "O",
    # 0x50-0x5F
    "P", "Q", "R", "S",
    "T", "U", "V", "W",
    "X", "Y", "Z", "bracketleft",
    "backslash", "bracketright", "asciicircum", "underscore",
    # 0x60-0x6F
    "grave", "a", "b", "c",
    "d", "e", "f", "g",
    "h", "i", "j", "k",
    "l", "m", "n", "o",
    # 0x70-0x7F
    "p", "q", "r", "s",
    "t", "u", "v", "w",
    "x", "y", "z", "braceleft",
    "bar", "braceright", "asciitilde", None,
    # 0x80-0x8F
    "Adieresis", "Aring", "Ccedilla", "Eacute",
    "Ntilde", "Odieresis", "Udieresis", "aacute",
    "agrave", "acircumflex", "adieresis", "atilde",
    "aring", "ccedilla", "eacute", "egrave",
    # 0x90-0x9F
    "ecircumflex", "edieresis", "iacute", "igrave",
    "icircumflex", "idieresis", "ntilde", "oacute",
    "ograve", "ocircumflex", "odieresis", "otilde",
    "uacute", "ugrave", "ucircumflex", "udieresis",
    # 0xA0-0xAF
    "dagger", "degree", "cent", "sterling",
    "section", "bullet", "paragraph", "germandbls",
    "registered", "copyright", "trademark", "acute",
    "dieresis", None, "AE", "Oslash",
    # 0xB0-0xBF
    None, "plusminus", None, None,
    "yen", "mu", None, None,
    None, None, None, "ordfeminine",
    "ordmasculine", None, "ae", "oslash",
    # 0xC0-0xCF (0xCA is the second, non-breaking, space)
    "questiondown", "exclamdown", "logicalnot", None,
    "florin", None, None, "guillemotleft",
    "guillemotright", "ellipsis", "space", "Agrave",
    "Atilde", "Otilde", "OE", "oe",
    # 0xD0-0xDF
    "endash", "emdash", "quotedblleft", "quotedblright",
    "quoteleft", "quoteright", "divide", None,
    "ydieresis", "Ydieresis", "fraction", "currency",
    "guilsinglleft", "guilsinglright", "fi", "fl",
    # 0xE0-0xEF
    "daggerdbl", "periodcentered", "quotesinglbase", "quotedblbase",
    "perthousand", "Acircumflex", "Ecircumflex", "Aacute",
    "Edieresis", "Egrave", "Iacute", "Icircumflex",
    "Idieresis", "Igrave", "Oacute", "Ocircumflex",
    # 0xF0-0xFF
    None, "Ograve", "Uacute", "Ucircumflex",
    "Ugrave", "dotlessi", "circumflex", "tilde",
    "macron", "breve", "dotaccent", "ring",
    "cedilla", "hungarumlaut", "ogonek", "caron",
]

# Glyph-name table returned for "WinAnsiEncoding". The list has 256 slots,
# presumably indexed by character code; the first 32 slots are None and
# several slots in 0x7F-0x9F carry "bullet".
win_ansi = [
    # 0x00-0x1F
    *([None] * 32),
    # 0x20-0x2F
    "space", "exclam", "quotedbl", "numbersign",
    "dollar", "percent", "ampersand", "quotesingle",
    "parenleft", "parenright", "asterisk", "plus",
    "comma", "hyphen", "period", "slash",
    # 0x30-0x3F
    "zero", "one", "two", "three",
    "four", "five", "six", "seven",
    "eight", "nine", "colon", "semicolon",
    "less", "equal", "greater", "question",
    # 0x40-0x4F
    "at", "A", "B", "C",
    "D", "E", "F", "G",
    "H", "I", "J", "K",
    "L", "M", "N", "O",
    # 0x50-0x5F
    "P", "Q", "R", "S",
    "T", "U", "V", "W",
    "X", "Y", "Z", "bracketleft",
    "backslash", "bracketright", "asciicircum", "underscore",
    # 0x60-0x6F
    "grave", "a", "b", "c",
    "d", "e", "f", "g",
    "h", "i", "j", "k",
    "l", "m", "n", "o",
    # 0x70-0x7F
    "p", "q", "r", "s",
    "t", "u", "v", "w",
    "x", "y", "z", "braceleft",
    "bar", "braceright", "asciitilde", "bullet",
    # 0x80-0x8F
    "Euro", "bullet", "quotesinglbase", "florin",
    "quotedblbase", "ellipsis", "dagger", "daggerdbl",
    "circumflex", "perthousand", "Scaron", "guilsinglleft",
    "OE", "bullet", "Zcaron", "bullet",
    # 0x90-0x9F
    "bullet", "quoteleft", "quoteright", "quotedblleft",
    "quotedblright", "bullet", "endash", "emdash",
    "tilde", "trademark", "scaron", "guilsinglright",
    "oe", "bullet", "zcaron", "Ydieresis",
    # 0xA0-0xAF
    "space", "exclamdown", "cent", "sterling",
    "currency", "yen", "brokenbar", "section",
    "dieresis", "copyright", "ordfeminine", "guillemotleft",
    "logicalnot", "hyphen", "registered", "macron",
    # 0xB0-0xBF
    "degree", "plusminus", "twosuperior", "threesuperior",
    "acute", "mu", "paragraph", "periodcentered",
    "cedilla", "onesuperior", "ordmasculine", "guillemotright",
    "onequarter", "onehalf", "threequarters", "questiondown",
    # 0xC0-0xCF
    "Agrave", "Aacute", "Acircumflex", "Atilde",
    "Adieresis", "Aring", "AE", "Ccedilla",
    "Egrave", "Eacute", "Ecircumflex", "Edieresis",
    "Igrave", "Iacute", "Icircumflex", "Idieresis",
    # 0xD0-0xDF
    "Eth", "Ntilde", "Ograve", "Oacute",
    "Ocircumflex", "Otilde", "Odieresis", "multiply",
    "Oslash", "Ugrave", "Uacute", "Ucircumflex",
    "Udieresis", "Yacute", "Thorn", "germandbls",
    # 0xE0-0xEF
    "agrave", "aacute", "acircumflex", "atilde",
    "adieresis", "aring", "ae", "ccedilla",
    "egrave", "eacute", "ecircumflex", "edieresis",
    "igrave", "iacute", "icircumflex", "idieresis",
    # 0xF0-0xFF
    "eth", "ntilde", "ograve", "oacute",
    "ocircumflex", "otilde", "odieresis", "divide",
    "oslash", "ugrave", "uacute", "ucircumflex",
    "udieresis", "yacute", "thorn", "ydieresis",
]


def get_type1_encoding(name):
    """Return the glyph-name table registered for a predefined encoding name.

    Recognised names are "StandardEncoding", "MacRomanEncoding",
    "WinAnsiEncoding" and "MacExpertEncoding". Any other value yields None.
    The returned list is the module-level table itself, not a copy.
    """
    if name == "StandardEncoding":
        return adobe_standard
    if name == "MacRomanEncoding":
        return mac_roman
    if name == "WinAnsiEncoding":
        return win_ansi
    if name == "MacExpertEncoding":
        return mac_expert
    # Unknown names fall through without raising.
    return None


# Integer table with one entry per byte value 0-255. The values look like
# Unicode code points for a Windows-1252 style mapping; 0 is stored at
# 0x81, 0x8D, 0x8F, 0x90 and 0x9D.
WinAnsiEncoding = [
    # 0x00-0x7F map to themselves.
    *range(0x80),
    # 0x80-0x8F
    8364, 0, 8218, 402, 8222, 8230, 8224, 8225,
    710, 8240, 352, 8249, 338, 0, 381, 0,
    # 0x90-0x9F
    0, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
    732, 8482, 353, 8250, 339, 0, 382, 376,
    # 0xA0-0xFF map to themselves.
    *range(0xA0, 0x100),
]


================================================
FILE: babeldoc/format/pdf/babelpdf/type3.py
================================================
import io
import re

import pymupdf


def merge_bbox(bbox_list, factor=1):
    """Union a list of rects and return the result scaled and y-negated.

    Every rect after the first is folded into ``bbox_list[0]`` through its
    ``include_rect`` method, so the first element is modified in place.
    The four coordinates of that union are divided by ``factor`` and
    returned as ``(x0, -y1, x1, -y0)``.

    Returns None when ``bbox_list`` is empty (or otherwise falsy).
    """
    if not bbox_list:
        return None
    union = bbox_list[0]
    for other in bbox_list[1:]:
        union.include_rect(other)
    left, top, right, bottom = (coord / factor for coord in tuple(union))
    # Negate and swap the vertical pair so the result stays (low, high).
    return left, -bottom, right, -top


def get_type3_bbox(doc, obj):
    """Measure a bounding box for each character code of a Type 3 font.

    Every code in ``FirstChar..LastChar`` is drawn alone on a scratch page,
    the page is exported as SVG with text converted to paths, and the rects
    of all drawings found in that SVG are merged with ``merge_bbox``.

    Args:
        doc: open pymupdf document that owns the font. A scratch page is
            appended while measuring and removed again before returning or
            raising. The helper object allocated with ``get_new_xref`` is
            not removed.
        obj: xref number of the font dictionary.

    Returns:
        A 256-entry list indexed by character code. Codes outside
        ``FirstChar..LastChar``, and codes whose glyph produced no drawings,
        keep the placeholder ``(0, 0, 0, 0)``. Measured codes hold the tuple
        returned by ``merge_bbox``.
    """
    bbox_list = [(0, 0, 0, 0)] * 256
    first = int(doc.xref_get_key(obj, "FirstChar")[1])
    last = int(doc.xref_get_key(obj, "LastChar")[1])
    factor_text = doc.xref_get_key(obj, "FontMatrix")[1]
    factor = 1
    # NOTE(review): the first number with a decimal point in the FontMatrix
    # text is taken as the scale; this presumably expects a uniform matrix
    # such as [0.001 0 0 0.001 0 0] -- confirm against real fonts.
    if factor_m := re.search(r"(\d+)?\.\d+", factor_text):
        factor = float(factor_m.group(0))
    page = doc.new_page(width=10, height=10)
    try:
        # Expose the font on the scratch page under the resource name /T0.
        doc.xref_set_key(page.xref, "Resources", "<<>>")
        doc.xref_set_key(page.xref, "Resources/Font", f"<</T0 {obj} 0 R>>")
        text = doc.get_new_xref()
        doc.update_object(text, "<<>>")
        for x in range(first, last + 1):
            # Content stream that shows exactly one glyph: code x at size 1.
            doc.update_stream(
                text, b"1 0 0 1 0 10 cm BT /T0 1 Tf <%02X> Tj ET" % x
            )
            doc.xref_set_key(page.xref, "Contents", f"{text} 0 R")
            char_data = page.get_svg_image(text_as_path=True)
            # Re-open the SVG as a document so its paths can be read back.
            char_doc = pymupdf.Document(stream=io.BytesIO(char_data.encode("U8")))
            char_bbox = []
            for element in char_doc:
                for item in element.get_drawings():
                    char_bbox.append(item["rect"])
            if char_bbox_merged := merge_bbox(char_bbox, factor):
                bbox_list[x] = char_bbox_merged
    finally:
        # Drop the scratch page even when measuring a glyph raised, so a
        # failure never leaves an extra page in the caller's document.
        doc.delete_page(-1)
    return bbox_list


================================================
FILE: babeldoc/format/pdf/babelpdf/utils.py
================================================
from babeldoc.pdfminer.pdftypes import PDFObjRef


def guarded_bbox(bbox):
    """Return the entries of ``bbox`` as a new list with references resolved.

    An entry that is a ``PDFObjRef`` is replaced by the result of its
    ``resolve()`` call; every other entry is copied through unchanged.
    """
    # NOTE(review): resolved values are kept whatever their type -- nothing
    # here filters or coerces non-numeric entries.
    return [
        item.resolve() if isinstance(item, PDFObjRef) else item for item in bbox
    ]


================================================
FILE: babeldoc/format/pdf/babelpdf/win_core.py
================================================
win_core = {
    "Arial": {
        "space": (0, 0, 0, 0),
        "exclam": (85, 0, 194, 715),
        "quotedbl": (45, 462, 308, 715),
        "numbersign": (10, -12, 543, 728),
        "dollar": (35, -103, 509, 781),
        "percent": (58, -26, 827, 728),
        "ampersand": (42, -16, 644, 728),
        "quotesingle": (43, 462, 144, 715),
        "parenleft": (60, -210, 296, 728),
        "parenright": (60, -210, 296, 728),
        "asterisk": (31, 423, 354, 728),
        "plus": (55, 115, 528, 588),
        "comma": (83, -141, 188, 100),
        "hyphen": (31, 214, 301, 303),
        "period": (90, 0, 190, 100),
        "slash": (0, -12, 277, 728),
        "zero": (41, -12, 508, 718),
        "one": (108, 0, 372, 718),
        "two": (30, 0, 503, 718),
        "three": (41, -12, 510, 718),
        "four": (12, 0, 507, 715),
        "five": (41, -12, 516, 706),
        "six": (37, -12, 510, 718),
        "seven": (47, 0, 510, 706),
        "eight": (40, -12, 512, 718),
        "nine": (41, -12, 512, 718),
        "colon": (90, 0, 190, 518),
        "semicolon": (83, -141, 188, 518),
        "less": (54, 110, 528, 595),
        "equal": (55, 203, 528, 502),
        "greater": (54, 110, 528, 595),
        "question": (43, 0, 505, 728),
        "at": (54, -210, 979, 729),
        "A": (-1, 0, 668, 715),
        "B": (73, 0, 613, 715),
        "C": (49, -12, 682, 728),
        "D": (77, 0, 668, 715),
        "E": (79, 0, 613, 715),
        "F": (82, 0, 564, 715),
        "G": (53, -12, 715, 728),
        "H": (80, 0, 641, 715),
        "I": (93, 0, 187, 715),
        "J": (28, -12, 422, 715),
        "K": (73, 0, 665, 715),
        "L": (73, 0, 520, 715),
        "M": (74, 0, 757, 715),
        "N": (76, 0, 640, 715),
        "O": (48, -12, 732, 728),
        "P": (77, 0, 623, 715),
        "Q": (42, -55, 741, 728),
        "R": (78, 0, 709, 715),
        "S": (44, -12, 614, 728),
        "T": (23, 0, 590, 715),
        "U": (78, -12, 641, 715),
        "V": (4, 0, 659, 715),
        "W": (12, 0, 932, 715),
        "X": (4, 0, 660, 715),
        "Y": (2, 0, 659, 715),
        "Z": (20, 0, 585, 715),
        "bracketleft": (67, -198, 261, 715),
        "backslash": (0, -12, 277, 728),
        "bracketright": (19, -198, 212, 715),
        "asciicircum": (26, 336, 442, 728),
        "underscore": (-15, -198, 567, -135),
        "grave": (43, 583, 227, 719),
        "a": (36, -11, 513, 530),
        "b": (65, -11, 515, 715),
        "c": (39, -11, 490, 530),
        "d": (34, -11, 483, 715),
        "e": (36, -11, 514, 530),
        "f": (9, 0, 312, 728),
        "g": (32, -210, 489, 530),
        "h": (65, 0, 488, 715),
        "i": (66, 0, 154, 715),
        "j": (-45, -210, 153, 715),
        "k": (66, 0, 496, 715),
        "l": (63, 0, 151, 715),
        "m": (65, 0, 768, 530),
        "n": (65, 0, 487, 530),
        "o": (33, -11, 519, 530),
        "p": (65, -198, 516, 530),
        "q": (35, -198, 484, 530),
        "r": (64, 0, 346, 530),
        "s": (30, -11, 461, 530),
        "t": (17, -6, 270, 699),
        "u": (63, -11, 484, 518),
        "v": (12, 0, 488, 518),
        "w": (2, 0, 714, 518),
        "x": (7, 0, 492, 518),
        "y": (16, -210, 491, 518),
        "z": (19, 0, 478, 518),
        "braceleft": (27, -210, 310, 728),
        "bar": (91, -210, 168, 728),
        "braceright": (22, -210, 305, 728),
        "asciitilde": (42, 271, 541, 432),
        "bullet": (53, 226, 300, 474),
        "Euro": (-13, -12, 540, 728),
        "quotesinglbase": (52, -132, 154, 102),
        "florin": (22, -210, 529, 728),
        "quotedblbase": (34, -132, 288, 102),
        "ellipsis": (116, 0, 883, 100),
        "dagger": (35, -168, 514, 699),
        "daggerdbl": (35, -168, 516, 706),
        "circumflex": (12, 583, 321, 719),
        "perthousand": (18, -26, 981, 728),
        "Scaron": (44, -12, 614, 893),
        "guilsinglleft": (44, 35, 271, 480),
        "OE": (62, -12, 968, 728),
        "Zcaron": (20, 0, 585, 893),
        "quoteleft": (62, 493, 164, 728),
        "quoteright": (52, 488, 154, 723),
        "quotedblleft": (40, 493, 293, 728),
        "quotedblright": (34, 488, 288, 723),
        "endash": (-1, 223, 554, 294),
        "emdash": (0, 223, 1000, 294),
        "tilde": (3, 595, 330, 708),
        "trademark": (109, 317, 870, 715),
        "scaron": (30, -11, 461, 719),
        "guilsinglright": (44, 35, 266, 480),
        "oe": (40, -11, 906, 530),
        "zcaron": (19, 0, 478, 719),
        "Ydieresis": (2, 0, 659, 859),
        "exclamdown": (113, -197, 222, 518),
        "cent": (52, -199, 504, 715),
        "sterling": (13, -13, 528, 728),
        "currency": (36, 114, 516, 593),
        "yen": (-1, 0, 553, 715),
        "brokenbar": (91, -210, 168, 728),
        "section": (39, -210, 510, 728),
        "dieresis": (29, 620, 303, 720),
        "copyright": (1, -8, 738, 728),
        "ordfeminine": (22, 364, 350, 728),
        "guillemotleft": (65, 35, 483, 480),
        "logicalnot": (55, 207, 528, 502),
        "registered": (1, -8, 738, 728),
        "macron": (-15, 764, 567, 827),
        "degree": (62, 457, 333, 728),
        "plusminus": (38, 0, 510, 600),
        "twosuperior": (12, 357, 316, 724),
        "threesuperior": (16, 349, 315, 724),
        "acute": (108, 583, 288, 719),
        "mu": (78, -198, 497, 518),
        "paragraph": (0, -198, 540, 715),
        "periodcentered": (116, 311, 216, 411),
        "cedilla": (52, -205, 263, 11),
        "onesuperior": (52, 357, 232, 724),
        "ordmasculine": (21, 361, 342, 728),
        "guillemotright": (68, 35, 486, 480),
        "onequarter": (52, -27, 819, 728),
        "onehalf": (52, -27, 816, 728),
        "threequarters": (16, -27, 819, 728),
        "questiondown": (77, -209, 538, 518),
        "Agrave": (-1, 0, 668, 896),
        "Aacute": (-1, 0, 668, 896),
        "Acircumflex": (-1, 0, 668, 896),
        "Atilde": (-1, 0, 668, 872),
        "Adieresis": (-1, 0, 668, 859),
        "Aring": (-1, 0, 668, 869),
        "AE": (0, 0, 945, 715),
        "Ccedilla": (49, -205, 682, 728),
        "Egrave": (79, 0, 613, 896),
        "Eacute": (79, 0, 613, 896),
        "Ecircumflex": (79, 0, 613, 896),
        "Edieresis": (79, 0, 613, 859),
        "Igrave": (26, 0, 209, 896),
        "Iacute": (68, 0, 249, 896),
        "Icircumflex": (-15, 0, 293, 896),
        "Idieresis": (1, 0, 275, 859),
        "Eth": (-1, 0, 668, 715),
        "Ntilde": (76, 0, 640, 872),
        "Ograve": (48, -12, 732, 896),
        "Oacute": (48, -12, 732, 896),
        "Ocircumflex": (48, -12, 732, 896),
        "Otilde": (48, -12, 732, 872),
        "Odieresis": (48, -12, 732, 859),
        "multiply": (78, 140, 504, 566),
        "Oslash": (40, -28, 740, 742),
        "Ugrave": (78, -12, 641, 896),
        "Uacute": (78, -12, 641, 896),
        "Ucircumflex": (78, -12, 641, 896),
        "Udieresis": (78, -12, 641, 859),
        "Yacute": (2, 0, 659, 896),
        "Thorn": (77, 0, 623, 715),
        "germandbls": (74, -12, 579, 728),
        "agrave": (36, -11, 513, 719),
        "aacute": (36, -11, 513, 719),
        "acircumflex": (36, -11, 513, 719),
        "atilde": (36, -11, 513, 708),
        "adieresis": (36, -11, 513, 720),
        "aring": (36, -11, 513, 740),
        "ae": (33, -11, 848, 530),
        "ccedilla": (39, -195, 490, 530),
        "egrave": (36, -11, 514, 719),
        "eacute": (36, -11, 514, 719),
        "ecircumflex": (36, -11, 514, 719),
        "edieresis": (36, -11, 514, 720),
        "igrave": (17, 0, 200, 719),
        "iacute": (92, 0, 272, 719),
        "icircumflex": (-8, 0, 300, 719),
        "idieresis": (4, 0, 278, 720),
        "eth": (35, -12, 516, 715),
        "ntilde": (65, 0, 487, 708),
        "ograve": (33, -11, 519, 719),
        "oacute": (33, -11, 519, 719),
        "ocircumflex": (33, -11, 519, 719),
        "otilde": (33, -11, 519, 708),
        "odieresis": (33, -11, 519, 720),
        "divide": (38, 155, 510, 550),
        "oslash": (62, -38, 548, 550),
        "ugrave": (63, -11, 484, 719),
        "uacute": (63, -11, 484, 719),
        "ucircumflex": (63, -11, 484, 719),
        "udieresis": (63, -11, 484, 720),
        "yacute": (16, -210, 491, 719),
        "thorn": (65, -198, 516, 715),
        "ydieresis": (16, -210, 491, 720),
    },
    "Arial,Bold": {
        "space": (0, 0, 0, 0),
        "exclam": (89, 0, 238, 715),
        "quotedbl": (54, 461, 424, 715),
        "numbersign": (8, -12, 544, 728),
        "dollar": (34, -100, 511, 773),
        "percent": (43, -28, 842, 728),
        "ampersand": (43, -18, 706, 728),
        "quotesingle": (44, 461, 194, 715),
        "parenleft": (52, -210, 300, 728),
        "parenright": (32, -210, 281, 728),
        "asterisk": (13, 386, 367, 728),
        "plus": (41, 103, 541, 603),
        "comma": (57, -159, 205, 137),
        "hyphen": (56, 190, 325, 328),
        "period": (71, 0, 208, 137),
        "slash": (-1, -12, 278, 728),
        "zero": (41, -12, 506, 718),
        "one": (79, 0, 393, 718),
        "two": (24, 0, 505, 718),
        "three": (37, -12, 513, 718),
        "four": (18, 0, 533, 718),
        "five": (44, -12, 525, 706),
        "six": (42, -12, 520, 718),
        "seven": (42, 0, 511, 706),
        "eight": (40, -12, 511, 718),
        "nine": (31, -12, 509, 718),
        "colon": (98, 0, 235, 518),
        "semicolon": (83, -159, 231, 518),
        "less": (46, 81, 537, 625),
        "equal": (41, 181, 541, 524),
        "greater": (46, 81, 537, 624),
        "question": (51, 0, 565, 723),
        "at": (29, -210, 971, 728),
        "A": (0, 0, 718, 715),
        "B": (73, 0, 672, 715),
        "C": (47, -12, 670, 728),
        "D": (72, 0, 672, 715),
        "E": (72, 0, 617, 715),
        "F": (73, 0, 564, 715),
        "G": (47, -12, 717, 728),
        "H": (73, 0, 645, 715),
        "I": (68, 0, 212, 715),
        "J": (17, -12, 475, 715),
        "K": (74, 0, 720, 715),
        "L": (76, 0, 580, 709),
        "M": (70, 0, 762, 715),
        "N": (74, 0, 642, 715),
        "O": (43, -12, 737, 728),
        "P": (72, 0, 621, 715),
        "Q": (43, -71, 764, 728),
        "R": (73, 0, 716, 715),
        "S": (36, -12, 618, 728),
        "T": (21, 0, 590, 715),
        "U": (71, -12, 642, 715),
        "V": (0, 0, 666, 715),
        "W": (3, 0, 942, 715),
        "X": (0, 0, 665, 715),
        "Y": (-1, 0, 667, 715),
        "Z": (10, 0, 592, 715),
        "bracketleft": (71, -201, 314, 715),
        "backslash": (-1, -12, 278, 728),
        "bracketright": (18, -201, 261, 715),
        "asciicircum": (56, 337, 527, 728),
        "underscore": (-9, -197, 561, -108),
        "grave": (20, 582, 241, 728),
        "a": (35, -11, 522, 530),
        "b": (65, -11, 572, 715),
        "c": (41, -11, 530, 530),
        "d": (41, -11, 547, 715),
        "e": (31, -11, 516, 530),
        "f": (11, 0, 362, 728),
        "g": (41, -210, 546, 530),
        "h": (71, 0, 543, 715),
        "i": (71, 0, 208, 715),
        "j": (-45, -210, 206, 715),
        "k": (66, 0, 546, 715),
        "l": (71, 0, 208, 715),
        "m": (61, 0, 824, 530),
        "n": (70, 0, 543, 530),
        "o": (40, -11, 575, 530),
        "p": (67, -197, 573, 530),
        "q": (44, -197, 547, 530),
        "r": (65, 0, 401, 530),
        "s": (23, -11, 507, 530),
        "t": (15, -11, 320, 701),
        "u": (68, -11, 540, 518),
        "v": (5, 0, 543, 518),
        "w": (4, 0, 777, 518),
        "x": (5, 0, 546, 518),
        "y": (6, -210, 540, 518),
        "z": (16, 0, 479, 518),
        "braceleft": (29, -210, 363, 728),
        "bar": (85, -210, 194, 728),
        "braceright": (21, -210, 355, 728),
        "asciitilde": (32, 253, 551, 451),
        "bullet": (32, 208, 320, 497),
        "Euro": (-15, -12, 524, 728),
        "quotesinglbase": (57, -159, 205, 137),
        "florin": (-9, -210, 557, 728),
        "quotedblbase": (51, -160, 430, 137),
        "ellipsis": (98, 0, 902, 137),
        "dagger": (33, -170, 517, 707),
        "daggerdbl": (33, -170, 517, 707),
        "circumflex": (1, 583, 332, 728),
        "perthousand": (0, -28, 999, 728),
        "Scaron": (36, -12, 618, 903),
        "guilsinglleft": (36, 34, 298, 479),
        "OE": (35, -12, 969, 728),
        "Zcaron": (10, 0, 592, 903),
        "quoteleft": (74, 425, 222, 722),
        "quoteright": (57, 416, 205, 713),
        "quotedblleft": (64, 425, 441, 722),
        "quotedblright": (51, 418, 430, 715),
        "endash": (-1, 208, 554, 310),
        "emdash": (0, 208, 1000, 310),
        "tilde": (-6, 588, 331, 712),
        "trademark": (105, 315, 877, 715),
        "scaron": (23, -11, 507, 728),
        "guilsinglright": (36, 34, 298, 479),
        "oe": (42, -11, 902, 530),
        "zcaron": (16, 0, 479, 728),
        "Ydieresis": (-1, 0, 667, 874),
        "exclamdown": (95, -198, 243, 518),
        "cent": (41, -196, 530, 710),
        "sterling": (6, -12, 540, 728),
        "currency": (21, 100, 530, 610),
        "yen": (0, 0, 551, 715),
        "brokenbar": (85, -210, 194, 728),
        "section": (28, -210, 521, 728),
        "dieresis": (2, 610, 330, 728),
        "copyright": (-4, -17, 743, 730),
        "ordfeminine": (18, 362, 345, 728),
        "guillemotleft": (46, 34, 500, 479),
        "logicalnot": (41, 183, 541, 524),
        "registered": (-4, -17, 743, 730),
        "macron": (-9, 757, 561, 847),
        "degree": (41, 416, 353, 728),
        "plusminus": (24, 0, 524, 674),
        "twosuperior": (12, 354, 308, 724),
        "threesuperior": (19, 349, 312, 724),
        "acute": (91, 582, 312, 728),
        "mu": (54, -198, 525, 518),
        "paragraph": (0, -196, 551, 715),
        "periodcentered": (97, 279, 234, 416),
        "cedilla": (18, -204, 284, -5),
        "onesuperior": (44, 354, 241, 724),
        "ordmasculine": (12, 361, 351, 728),
        "guillemotright": (51, 34, 505, 479),
        "onequarter": (44, -26, 824, 724),
        "onehalf": (44, -26, 808, 724),
        "threequarters": (19, -26, 824, 724),
        "questiondown": (49, -205, 563, 518),
        "Agrave": (0, 0, 718, 902),
        "Aacute": (0, 0, 718, 902),
        "Acircumflex": (0, 0, 718, 900),
        "Atilde": (0, 0, 718, 879),
        "Adieresis": (0, 0, 718, 874),
        "Aring": (0, 0, 718, 858),
        "AE": (-41, 0, 951, 715),
        "Ccedilla": (47, -204, 670, 728),
        "Egrave": (72, 0, 617, 902),
        "Eacute": (72, 0, 617, 902),
        "Ecircumflex": (72, 0, 617, 900),
        "Edieresis": (72, 0, 617, 874),
        "Igrave": (-4, 0, 216, 902),
        "Iacute": (51, 0, 272, 902),
        "Icircumflex": (-20, 0, 310, 900),
        "Idieresis": (-21, 0, 306, 874),
        "Eth": (-1, 0, 672, 715),
        "Ntilde": (74, 0, 642, 879),
        "Ograve": (43, -12, 737, 902),
        "Oacute": (43, -12, 737, 902),
        "Ocircumflex": (43, -12, 737, 900),
        "Otilde": (43, -12, 737, 879),
        "Odieresis": (43, -12, 737, 874),
        "multiply": (53, 114, 529, 591),
        "Oslash": (30, -40, 750, 750),
        "Ugrave": (71, -12, 642, 902),
        "Uacute": (71, -12, 642, 902),
        "Ucircumflex": (71, -12, 642, 900),
        "Udieresis": (71, -12, 642, 874),
        "Yacute": (-1, 0, 667, 902),
        "Thorn": (72, 0, 621, 715),
        "germandbls": (67, -11, 575, 728),
        "agrave": (35, -11, 522, 728),
        "aacute": (35, -11, 522, 728),
        "acircumflex": (35, -11, 522, 728),
        "atilde": (35, -11, 522, 712),
        "adieresis": (35, -11, 522, 728),
        "aring": (35, -11, 522, 750),
        "ae": (42, -11, 841, 530),
        "ccedilla": (41, -204, 530, 530),
        "egrave": (31, -11, 516, 728),
        "eacute": (31, -11, 516, 728),
        "ecircumflex": (31, -11, 516, 728),
        "edieresis": (31, -11, 516, 728),
        "igrave": (-11, 0, 209, 728),
        "iacute": (61, 0, 282, 728),
        "icircumflex": (-24, 0, 305, 728),
        "idieresis": (-23, 0, 304, 728),
        "eth": (40, -12, 573, 715),
        "ntilde": (70, 0, 543, 712),
        "ograve": (40, -11, 575, 728),
        "oacute": (40, -11, 575, 728),
        "ocircumflex": (40, -11, 575, 728),
        "otilde": (40, -11, 575, 712),
        "odieresis": (40, -11, 575, 728),
        "divide": (23, 90, 524, 616),
        "oslash": (42, -35, 577, 546),
        "ugrave": (68, -11, 540, 728),
        "uacute": (68, -11, 540, 728),
        "ucircumflex": (68, -11, 540, 728),
        "udieresis": (68, -11, 540, 728),
        "yacute": (6, -210, 540, 728),
        "thorn": (67, -197, 573, 715),
        "ydieresis": (6, -210, 540, 728),
    },
    "Arial,BoldItalic": {
        "space": (0, 0, 0, 0),
        "exclam": (61, 0, 353, 715),
        "quotedbl": (151, 461, 506, 715),
        "numbersign": (47, -12, 583, 728),
        "dollar": (43, -99, 576, 770),
        "percent": (90, -30, 864, 728),
        "ampersand": (83, -16, 706, 728),
        "quotesingle": (151, 461, 329, 715),
        "parenleft": (65, -210, 435, 728),
        "parenright": (-78, -210, 291, 728),
        "asterisk": (98, 386, 452, 728),
        "plus": (80, 103, 581, 603),
        "comma": (10, -155, 212, 135),
        "hyphen": (38, 190, 338, 325),
        "period": (43, 0, 210, 135),
        "slash": (-43, -12, 408, 728),
        "zero": (64, -12, 571, 718),
        "one": (118, 0, 510, 720),
        "two": (60, 0, 570, 718),
        "three": (50, -12, 560, 718),
        "four": (27, 0, 560, 715),
        "five": (63, -12, 577, 706),
        "six": (81, -12, 575, 718),
        "seven": (103, 0, 602, 706),
        "eight": (65, -12, 566, 718),
        "nine": (63, -12, 558, 718),
        "colon": (70, 0, 316, 518),
        "semicolon": (40, -155, 319, 518),
        "less": (85, 81, 576, 625),
        "equal": (80, 181, 581, 524),
        "greater": (85, 81, 576, 624),
        "question": (123, 0, 618, 728),
        "at": (64, -210, 1006, 728),
        "A": (-11, 0, 673, 715),
        "B": (40, 0, 709, 715),
        "C": (94, -12, 745, 728),
        "D": (43, 0, 724, 715),
        "E": (41, 0, 721, 715),
        "F": (39, 0, 689, 715),
        "G": (88, -12, 785, 728),
        "H": (43, 0, 764, 715),
        "I": (34, 0, 331, 715),
        "J": (28, -12, 599, 715),
        "K": (39, 0, 801, 715),
        "L": (44, 0, 581, 715),
        "M": (40, 0, 878, 715),
        "N": (44, 0, 762, 715),
        "O": (87, -12, 784, 728),
        "P": (40, 0, 702, 715),
        "Q": (87, -95, 783, 728),
        "R": (43, 0, 741, 715),
        "S": (63, -12, 676, 728),
        "T": (120, 0, 708, 715),
        "U": (91, -12, 765, 715),
        "V": (113, 0, 793, 715),
        "W": (117, 0, 1067, 715),
        "X": (-30, 0, 783, 715),
        "Y": (114, 0, 784, 715),
        "Z": (24, 0, 667, 715),
        "bracketleft": (9, -197, 438, 715),
        "backslash": (78, -12, 287, 728),
        "bracketright": (-55, -197, 375, 715),
        "asciicircum": (104, 337, 576, 728),
        "underscore": (-9, -197, 561, -108),
        "grave": (133, 585, 331, 731),
        "a": (44, -12, 533, 530),
        "b": (36, -12, 601, 715),
        "c": (60, -12, 564, 530),
        "d": (59, -12, 668, 715),
        "e": (58, -12, 554, 530),
        "f": (53, 0, 470, 728),
        "g": (31, -210, 622, 530),
        "h": (41, 0, 590, 715),
        "i": (40, 0, 329, 715),
        "j": (-109, -210, 331, 715),
        "k": (37, 0, 614, 715),
        "l": (39, 0, 328, 715),
        "m": (35, 0, 868, 530),
        "n": (41, 0, 591, 530),
        "o": (60, -12, 599, 530),
        "p": (-5, -197, 605, 530),
        "q": (59, -197, 625, 530),
        "r": (32, 0, 474, 530),
        "s": (21, -12, 551, 530),
        "t": (75, -12, 390, 698),
        "u": (70, -12, 619, 518),
        "v": (74, 0, 618, 518),
        "w": (71, 0, 840, 518),
        "x": (-21, 0, 612, 518),
        "y": (6, -210, 620, 518),
        "z": (16, 0, 518, 518),
        "braceleft": (41, -210, 490, 728),
        "bar": (85, -210, 194, 728),
        "braceright": (-84, -210, 363, 728),
        "asciitilde": (66, 253, 585, 451),
        "bullet": (81, 208, 369, 497),
        "Euro": (26, -12, 639, 728),
        "quotesinglbase": (10, -155, 212, 135),
        "florin": (-9, -210, 557, 728),
        "quotedblbase": (3, -155, 441, 135),
        "ellipsis": (92, 0, 907, 135),
        "dagger": (84, -170, 594, 706),
        "daggerdbl": (0, -170, 599, 706),
        "circumflex": (56, 584, 391, 731),
        "perthousand": (67, -28, 1021, 728),
        "Scaron": (63, -12, 676, 905),
        "guilsinglleft": (59, 34, 378, 477),
        "OE": (68, -12, 1078, 728),
        "Zcaron": (24, 0, 667, 905),
        "quoteleft": (108, 433, 311, 724),
        "quoteright": (123, 424, 325, 715),
        "quotedblleft": (125, 433, 562, 724),
        "quotedblright": (128, 424, 566, 715),
        "endash": (-1, 208, 554, 310),
        "emdash": (0, 208, 1000, 310),
        "tilde": (92, 592, 428, 710),
        "trademark": (144, 315, 916, 715),
        "scaron": (21, -12, 551, 731),
        "guilsinglright": (9, 34, 318, 477),
        "oe": (58, -12, 943, 530),
        "zcaron": (16, 0, 527, 731),
        "Ydieresis": (114, 0, 784, 875),
        "exclamdown": (11, -197, 304, 518),
        "cent": (58, -192, 562, 713),
        "sterling": (20, -18, 610, 728),
        "currency": (65, 100, 574, 610),
        "yen": (23, 0, 666, 715),
        "brokenbar": (85, -210, 194, 728),
        "section": (21, -211, 560, 728),
        "dieresis": (84, 597, 435, 716),
        "copyright": (43, -17, 791, 730),
        "ordfeminine": (82, 362, 412, 728),
        "guillemotleft": (82, 34, 590, 477),
        "logicalnot": (80, 183, 581, 524),
        "registered": (43, -17, 791, 730),
        "macron": (68, 757, 638, 847),
        "degree": (109, 416, 421, 728),
        "plusminus": (63, 0, 563, 674),
        "twosuperior": (82, 354, 395, 724),
        "threesuperior": (76, 349, 389, 724),
        "acute": (183, 583, 435, 730),
        "mu": (-37, -200, 584, 518),
        "paragraph": (43, -196, 596, 715),
        "periodcentered": (136, 290, 303, 425),
        "cedilla": (6, -207, 267, -12),
        "onesuperior": (114, 354, 361, 725),
        "ordmasculine": (72, 362, 414, 728),
        "guillemotright": (22, 34, 531, 477),
        "onequarter": (99, -29, 839, 724),
        "onehalf": (84, -29, 835, 724),
        "threequarters": (75, -29, 851, 724),
        "questiondown": (26, -209, 521, 518),
        "Agrave": (-11, 0, 673, 905),
        "Aacute": (-11, 0, 686, 903),
        "Acircumflex": (-11, 0, 673, 905),
        "Atilde": (-11, 0, 673, 874),
        "Adieresis": (-11, 0, 680, 875),
        "Aring": (-11, -9, 673, 854),
        "AE": (-32, 0, 1059, 715),
        "Ccedilla": (94, -204, 745, 728),
        "Egrave": (41, 0, 721, 905),
        "Eacute": (41, 0, 721, 903),
        "Ecircumflex": (41, 0, 721, 905),
        "Edieresis": (41, 0, 721, 875),
        "Igrave": (34, 0, 382, 905),
        "Iacute": (34, 0, 451, 903),
        "Icircumflex": (34, 0, 426, 905),
        "Idieresis": (34, 0, 452, 875),
        "Eth": (36, 0, 725, 715),
        "Ntilde": (44, 0, 762, 874),
        "Ograve": (87, -12, 784, 905),
        "Oacute": (87, -12, 784, 903),
        "Ocircumflex": (87, -12, 784, 905),
        "Otilde": (87, -12, 784, 874),
        "Odieresis": (87, -12, 784, 875),
        "multiply": (92, 114, 568, 591),
        "Oslash": (77, -59, 786, 766),
        "Ugrave": (91, -12, 765, 905),
        "Uacute": (91, -12, 765, 903),
        "Ucircumflex": (91, -12, 765, 905),
        "Udieresis": (91, -12, 765, 875),
        "Yacute": (114, 0, 784, 903),
        "Thorn": (40, 0, 673, 715),
        "germandbls": (35, -12, 581, 728),
        "agrave": (44, -12, 533, 731),
        "aacute": (44, -12, 567, 730),
        "acircumflex": (44, -12, 533, 731),
        "atilde": (44, -12, 549, 710),
        "adieresis": (44, -12, 553, 716),
        "aring": (44, -12, 533, 753),
        "ae": (30, -12, 865, 530),
        "ccedilla": (60, -203, 564, 530),
        "egrave": (58, -12, 554, 731),
        "eacute": (58, -12, 562, 730),
        "ecircumflex": (58, -12, 554, 731),
        "edieresis": (58, -12, 554, 716),
        "igrave": (40, 0, 347, 731),
        "iacute": (40, 0, 413, 730),
        "icircumflex": (40, 0, 389, 731),
        "idieresis": (40, 0, 417, 716),
        "eth": (60, -12, 607, 715),
        "ntilde": (41, 0, 591, 710),
        "ograve": (60, -12, 599, 731),
        "oacute": (60, -12, 599, 730),
        "ocircumflex": (60, -12, 599, 731),
        "otilde": (60, -12, 599, 710),
        "odieresis": (60, -12, 599, 716),
        "divide": (63, 90, 563, 616),
        "oslash": (52, -52, 604, 571),
        "ugrave": (70, -12, 619, 731),
        "uacute": (70, -12, 619, 730),
        "ucircumflex": (70, -12, 619, 731),
        "udieresis": (70, -12, 619, 716),
        "yacute": (6, -210, 620, 730),
        "thorn": (-9, -197, 602, 715),
        "ydieresis": (6, -210, 620, 716),
    },
    "Arial,Italic": {
        "space": (0, 0, 0, 0),
        "exclam": (56, 0, 303, 715),
        "quotedbl": (135, 462, 428, 715),
        "numbersign": (46, -12, 579, 728),
        "dollar": (51, -95, 572, 763),
        "percent": (97, -26, 852, 728),
        "ampersand": (78, -17, 651, 728),
        "quotesingle": (126, 462, 258, 715),
        "parenleft": (84, -210, 413, 728),
        "parenright": (-53, -210, 275, 728),
        "asterisk": (115, 423, 437, 728),
        "plus": (89, 115, 562, 588),
        "comma": (23, -144, 175, 100),
        "hyphen": (46, 214, 334, 303),
        "period": (57, 0, 178, 100),
        "slash": (-50, -11, 410, 728),
        "zero": (70, -12, 565, 718),
        "one": (147, 0, 479, 718),
        "two": (58, 0, 562, 718),
        "three": (54, -12, 557, 718),
        "four": (45, 0, 542, 715),
        "five": (69, -12, 572, 706),
        "six": (83, -12, 567, 718),
        "seven": (121, 0, 595, 706),
        "eight": (74, -12, 564, 718),
        "nine": (67, -12, 551, 718),
        "colon": (57, 0, 265, 518),
        "semicolon": (23, -144, 262, 518),
        "less": (89, 110, 563, 595),
        "equal": (89, 203, 562, 502),
        "greater": (89, 110, 563, 595),
        "question": (126, 0, 560, 728),
        "at": (54, -210, 979, 729),
        "A": (-20, 0, 616, 715),
        "B": (43, 0, 654, 715),
        "C": (90, -12, 730, 728),
        "D": (44, 0, 711, 715),
        "E": (44, 0, 711, 715),
        "F": (45, 0, 660, 715),
        "G": (97, -12, 766, 728),
        "H": (41, 0, 753, 715),
        "I": (57, 0, 302, 715),
        "J": (33, -12, 535, 715),
        "K": (44, 0, 741, 715),
        "L": (40, 0, 524, 715),
        "M": (43, 0, 872, 715),
        "N": (48, 0, 756, 715),
        "O": (91, -12, 772, 728),
        "P": (42, 0, 697, 715),
        "Q": (92, -82, 773, 728),
        "R": (46, 0, 729, 715),
        "S": (70, -12, 671, 728),
        "T": (124, 0, 705, 715),
        "U": (96, -12, 754, 715),
        "V": (124, 0, 756, 715),
        "W": (125, 0, 1061, 715),
        "X": (-31, 0, 769, 715),
        "Y": (116, 0, 772, 715),
        "Z": (24, 0, 636, 715),
        "bracketleft": (6, -195, 391, 715),
        "backslash": (84, -11, 273, 728),
        "bracketright": (-58, -195, 329, 715),
        "asciicircum": (70, 336, 486, 728),
        "underscore": (-63, -198, 519, -135),
        "grave": (145, 581, 309, 715),
        "a": (43, -11, 526, 530),
        "b": (33, -11, 535, 715),
        "c": (56, -11, 510, 530),
        "d": (52, -11, 598, 715),
        "e": (51, -11, 531, 530),
        "f": (45, 0, 407, 728),
        "g": (25, -207, 564, 530),
        "h": (33, 0, 528, 715),
        "i": (29, 0, 267, 715),
        "j": (-121, -207, 267, 715),
        "k": (34, 0, 553, 715),
        "l": (26, 0, 264, 715),
        "m": (32, 0, 812, 530),
        "n": (33, 0, 527, 530),
        "o": (48, -11, 540, 530),
        "p": (-10, -198, 535, 530),
        "q": (51, -198, 552, 530),
        "r": (33, 0, 419, 530),
        "s": (41, -11, 501, 530),
        "t": (56, -8, 321, 707),
        "u": (62, -11, 557, 518),
        "v": (79, 0, 559, 518),
        "w": (77, 0, 776, 518),
        "x": (-1, 0, 537, 518),
        "y": (0, -210, 561, 518),
        "z": (19, 0, 512, 518),
        "braceleft": (52, -210, 445, 728),
        "bar": (91, -210, 168, 728),
        "braceright": (-83, -210, 309, 728),
        "asciitilde": (80, 271, 579, 432),
        "bullet": (53, 226, 300, 474),
        "Euro": (39, -12, 645, 728),
        "quotesinglbase": (-7, -144, 144, 100),
        "florin": (22, -210, 529, 728),
        "quotedblbase": (-19, -144, 291, 100),
        "ellipsis": (143, 0, 932, 100),
        "dagger": (90, -170, 583, 706),
        "daggerdbl": (5, -170, 588, 706),
        "circumflex": (100, 581, 387, 715),
        "perthousand": (66, -26, 1003, 728),
        "Scaron": (70, -12, 671, 894),
        "guilsinglleft": (47, 35, 313, 478),
        "OE": (80, -12, 1043, 728),
        "Zcaron": (24, 0, 636, 894),
        "quoteleft": (128, 482, 280, 728),
        "quoteright": (125, 467, 276, 712),
        "quotedblleft": (105, 482, 413, 728),
        "quotedblright": (104, 467, 417, 712),
        "endash": (-1, 223, 554, 294),
        "emdash": (0, 223, 1000, 294),
        "tilde": (93, 596, 423, 706),
        "trademark": (136, 317, 897, 715),
        "scaron": (41, -11, 503, 715),
        "guilsinglright": (16, 35, 288, 478),
        "oe": (62, -11, 918, 530),
        "zcaron": (19, 0, 512, 715),
        "Ydieresis": (116, 0, 772, 858),
        "exclamdown": (57, -197, 305, 518),
        "cent": (75, -198, 529, 725),
        "sterling": (31, -12, 607, 728),
        "currency": (80, 114, 560, 593),
        "yen": (36, 0, 666, 715),
        "brokenbar": (91, -210, 168, 728),
        "section": (30, -210, 555, 728),
        "dieresis": (115, 599, 408, 699),
        "copyright": (40, -8, 777, 728),
        "ordfeminine": (81, 359, 409, 728),
        "guillemotleft": (78, 35, 537, 478),
        "logicalnot": (89, 207, 562, 502),
        "registered": (40, -8, 777, 728),
        "macron": (88, 764, 670, 827),
        "degree": (133, 457, 404, 728),
        "plusminus": (60, 0, 533, 600),
        "twosuperior": (74, 357, 400, 724),
        "threesuperior": (82, 349, 399, 724),
        "acute": (168, 581, 372, 715),
        "mu": (5, -200, 571, 518),
        "paragraph": (69, -198, 609, 715),
        "periodcentered": (151, 307, 272, 407),
        "cedilla": (37, -207, 287, 5),
        "onesuperior": (136, 357, 354, 724),
        "ordmasculine": (69, 360, 411, 728),
        "guillemotright": (40, 35, 504, 478),
        "onequarter": (83, -29, 850, 728),
        "onehalf": (60, -29, 827, 728),
        "threequarters": (82, -29, 865, 728),
        "questiondown": (83, -209, 517, 518),
        "Agrave": (-20, 0, 616, 894),
        "Aacute": (-20, 0, 616, 894),
        "Acircumflex": (-20, 0, 616, 894),
        "Atilde": (-20, 0, 616, 867),
        "Adieresis": (-20, 0, 616, 859),
        "Aring": (-20, 0, 616, 863),
        "AE": (-40, 0, 1043, 715),
        "Ccedilla": (90, -210, 730, 728),
        "Egrave": (44, 0, 711, 894),
        "Eacute": (44, 0, 711, 894),
        "Ecircumflex": (44, 0, 711, 894),
        "Edieresis": (44, 0, 711, 858),
        "Igrave": (57, 0, 340, 894),
        "Iacute": (57, 0, 389, 894),
        "Icircumflex": (57, 0, 407, 894),
        "Idieresis": (57, 0, 413, 859),
        "Eth": (44, 0, 720, 715),
        "Ntilde": (48, 0, 756, 867),
        "Ograve": (91, -12, 772, 894),
        "Oacute": (91, -12, 772, 894),
        "Ocircumflex": (91, -12, 772, 894),
        "Otilde": (91, -12, 772, 867),
        "Odieresis": (91, -12, 772, 859),
        "multiply": (127, 140, 553, 566),
        "Oslash": (84, -50, 776, 764),
        "Ugrave": (96, -12, 754, 894),
        "Uacute": (96, -12, 754, 894),
        "Ucircumflex": (96, -12, 754, 894),
        "Udieresis": (96, -12, 754, 859),
        "Yacute": (116, 0, 772, 894),
        "Thorn": (42, 0, 666, 715),
        "germandbls": (36, -12, 567, 728),
        "agrave": (43, -11, 526, 715),
        "aacute": (43, -11, 526, 715),
        "acircumflex": (43, -11, 526, 715),
        "atilde": (43, -11, 540, 706),
        "adieresis": (43, -11, 526, 699),
        "aring": (43, -11, 526, 733),
        "ae": (42, -12, 865, 530),
        "ccedilla": (56, -198, 510, 530),
        "egrave": (51, -11, 531, 715),
        "eacute": (51, -11, 531, 715),
        "ecircumflex": (51, -11, 531, 715),
        "edieresis": (51, -11, 531, 699),
        "igrave": (61, 0, 310, 715),
        "iacute": (61, 0, 349, 715),
        "icircumflex": (61, 0, 361, 715),
        "idieresis": (61, 0, 377, 699),
        "eth": (48, -12, 545, 715),
        "ntilde": (33, 0, 532, 706),
        "ograve": (48, -11, 540, 715),
        "oacute": (48, -11, 540, 715),
        "ocircumflex": (48, -11, 540, 715),
        "otilde": (48, -11, 540, 706),
        "odieresis": (48, -11, 540, 699),
        "divide": (62, 155, 535, 550),
        "oslash": (74, -49, 583, 565),
        "ugrave": (62, -11, 557, 715),
        "uacute": (62, -11, 557, 715),
        "ucircumflex": (62, -11, 557, 715),
        "udieresis": (62, -11, 557, 699),
        "yacute": (0, -210, 561, 715),
        "thorn": (-10, -198, 535, 715),
        "ydieresis": (0, -210, 561, 699),
    },
    "ArialNarrow": {
        "space": (0, 0, 0, 0),
        "exclam": (72, 0, 161, 715),
        "quotedbl": (37, 462, 252, 715),
        "numbersign": (7, -12, 444, 728),
        "dollar": (27, -103, 416, 781),
        "percent": (45, -26, 676, 728),
        "ampersand": (35, -16, 528, 728),
        "quotesingle": (34, 462, 116, 715),
        "parenleft": (49, -210, 243, 728),
        "parenright": (29, -210, 223, 728),
        "asterisk": (24, 423, 289, 728),
        "plus": (44, 115, 432, 588),
        "comma": (69, -141, 156, 100),
        "hyphen": (25, 214, 247, 303),
        "period": (75, 0, 157, 100),
        "slash": (0, -12, 228, 728),
        "zero": (32, -12, 415, 718),
        "one": (87, 0, 304, 718),
        "two": (24, 0, 412, 718),
        "three": (33, -12, 417, 718),
        "four": (9, 0, 415, 715),
        "five": (32, -12, 421, 706),
        "six": (29, -12, 416, 718),
        "seven": (37, 0, 417, 706),
        "eight": (32, -12, 418, 718),
        "nine": (32, -12, 418, 718),
        "colon": (75, 0, 157, 518),
        "semicolon": (69, -141, 156, 518),
        "less": (43, 110, 432, 595),
        "equal": (44, 203, 432, 502),
        "greater": (43, 110, 432, 595),
        "question": (34, 0, 413, 728),
        "at": (43, -210, 801, 729),
        "A": (0, 0, 548, 715),
        "B": (60, 0, 503, 715),
        "C": (39, -12, 558, 728),
        "D": (61, 0, 546, 715),
        "E": (64, 0, 502, 715),
        "F": (68, 0, 464, 715),
        "G": (45, -12, 588, 728),
        "H": (62, 0, 523, 715),
        "I": (78, 0, 155, 715),
        "J": (21, -12, 344, 715),
        "K": (60, 0, 545, 715),
        "L": (58, 0, 425, 715),
        "M": (61, 0, 621, 715),
        "N": (61, 0, 523, 715),
        "O": (41, -12, 603, 728),
        "P": (63, 0, 511, 715),
        "Q": (37, -55, 609, 728),
        "R": (62, 0, 580, 715),
        "S": (37, -12, 504, 728),
        "T": (20, 0, 485, 715),
        "U": (62, -12, 524, 715),
        "V": (3, 0, 540, 715),
        "W": (11, 0, 766, 715),
        "X": (3, 0, 541, 715),
        "Y": (2, 0, 540, 715),
        "Z": (17, 0, 481, 715),
        "bracketleft": (57, -198, 216, 715),
        "backslash": (0, -12, 228, 728),
        "bracketright": (17, -198, 176, 715),
        "asciicircum": (21, 336, 363, 728),
        "underscore": (-5, -125, 460, -75),
        "grave": (35, 583, 186, 719),
        "a": (28, -11, 419, 530),
        "b": (52, -11, 420, 715),
        "c": (31, -11, 402, 530),
        "d": (26, -11, 395, 715),
        "e": (28, -11, 420, 530),
        "f": (9, 0, 257, 728),
        "g": (24, -210, 399, 530),
        "h": (52, 0, 398, 715),
        "i": (52, 0, 124, 715),
        "j": (-39, -210, 124, 715),
        "k": (54, 0, 406, 715),
        "l": (50, 0, 122, 715),
        "m": (53, 0, 629, 530),
        "n": (52, 0, 398, 530),
        "o": (25, -11, 424, 530),
        "p": (52, -198, 421, 530),
        "q": (27, -198, 395, 530),
        "r": (52, 0, 283, 530),
        "s": (25, -12, 378, 530),
        "t": (16, -6, 223, 699),
        "u": (51, -11, 395, 518),
        "v": (10, 0, 400, 518),
        "w": (0, 0, 584, 518),
        "x": (5, 0, 403, 518),
        "y": (13, -210, 402, 518),
        "z": (16, 0, 392, 518),
        "braceleft": (22, -210, 254, 728),
        "bar": (76, -210, 139, 728),
        "braceright": (18, -210, 250, 728),
        "asciitilde": (35, 271, 444, 432),
        "bullet": (44, 226, 247, 474),
        "Euro": (-11, -12, 443, 728),
        "quotesinglbase": (41, -132, 125, 102),
        "florin": (17, -210, 433, 728),
        "quotedblbase": (28, -132, 236, 102),
        "ellipsis": (95, 0, 724, 100),
        "dagger": (27, -168, 420, 699),
        "daggerdbl": (27, -168, 422, 706),
        "circumflex": (9, 583, 263, 719),
        "perthousand": (14, -26, 805, 728),
        "Scaron": (37, -12, 504, 901),
        "guilsinglleft": (36, 35, 222, 480),
        "OE": (51, -12, 793, 728),
        "Zcaron": (17, 0, 481, 901),
        "quoteleft": (49, 481, 133, 715),
        "quoteright": (41, 481, 125, 715),
        "quotedblleft": (33, 481, 241, 715),
        "quotedblright": (28, 481, 236, 715),
        "endash": (-2, 223, 453, 294),
        "emdash": (0, 223, 819, 294),
        "tilde": (2, 595, 270, 708),
        "trademark": (90, 317, 713, 715),
        "scaron": (25, -12, 378, 719),
        "guilsinglright": (50, 35, 235, 480),
        "oe": (34, -11, 744, 530),
        "zcaron": (16, 0, 392, 719),
        "Ydieresis": (2, 0, 540, 901),
        "exclamdown": (91, -197, 181, 518),
        "cent": (41, -199, 413, 715),
        "sterling": (9, -13, 432, 728),
        "currency": (28, 114, 421, 593),
        "yen": (-2, 0, 452, 715),
        "brokenbar": (76, -210, 139, 728),
        "section": (31, -210, 417, 728),
        "dieresis": (24, 620, 249, 720),
        "copyright": (1, -8, 606, 728),
        "ordfeminine": (20, 364, 289, 728),
        "guillemotleft": (52, 35, 395, 480),
        "logicalnot": (44, 207, 432, 502),
        "registered": (1, -8, 606, 728),
        "macron": (-5, 790, 505, 840),
        "degree": (62, 457, 333, 728),
        "plusminus": (38, 0, 510, 600),
        "twosuperior": (9, 357, 259, 724),
        "threesuperior": (12, 349, 258, 724),
        "acute": (88, 583, 236, 719),
        "mu": (78, -198, 497, 518),
        "paragraph": (2, -198, 444, 715),
        "periodcentered": (95, 311, 177, 411),
        "cedilla": (42, -205, 216, 11),
        "onesuperior": (41, 357, 189, 724),
        "ordmasculine": (18, 361, 280, 728),
        "guillemotright": (54, 35, 397, 480),
        "onequarter": (41, -27, 671, 728),
        "onehalf": (41, -27, 669, 728),
        "threequarters": (12, -27, 669, 728),
        "questiondown": (64, -209, 443, 518),
        "Agrave": (0, 0, 548, 901),
        "Aacute": (0, 0, 548, 901),
        "Acircumflex": (0, 0, 548, 901),
        "Atilde": (0, 0, 548, 878),
        "Adieresis": (0, 0, 548, 901),
        "Aring": (0, 0, 548, 921),
        "AE": (0, 0, 775, 715),
        "Ccedilla": (39, -205, 558, 728),
        "Egrave": (64, 0, 502, 901),
        "Eacute": (64, 0, 502, 901),
        "Ecircumflex": (64, 0, 502, 901),
        "Edieresis": (64, 0, 502, 901),
        "Igrave": (23, 0, 174, 901),
        "Iacute": (73, 0, 220, 901),
        "Icircumflex": (-11, 0, 241, 901),
        "Idieresis": (4, 0, 229, 901),
        "Eth": (-2, 0, 546, 715),
        "Ntilde": (61, 0, 523, 878),
        "Ograve": (41, -12, 603, 901),
        "Oacute": (41, -12, 603, 901),
        "Ocircumflex": (41, -12, 603, 901),
        "Otilde": (41, -12, 603, 878),
        "Odieresis": (41, -12, 603, 901),
        "multiply": (63, 140, 412, 566),
        "Oslash": (35, -28, 609, 742),
        "Ugrave": (62, -12, 524, 901),
        "Uacute": (62, -12, 524, 901),
        "Ucircumflex": (62, -12, 524, 901),
        "Udieresis": (62, -12, 524, 901),
        "Yacute": (2, 0, 540, 901),
        "Thorn": (63, 0, 511, 715),
        "germandbls": (62, -12, 476, 728),
        "agrave": (28, -11, 419, 719),
        "aacute": (28, -11, 419, 719),
        "acircumflex": (28, -11, 419, 719),
        "atilde": (28, -11, 419, 696),
        "adieresis": (28, -11, 419, 720),
        "aring": (28, -11, 419, 762),
        "ae": (25, -11, 694, 530),
        "ccedilla": (31, -205, 402, 530),
        "egrave": (28, -11, 420, 719),
        "eacute": (28, -11, 420, 719),
        "ecircumflex": (28, -11, 420, 719),
        "edieresis": (28, -11, 420, 720),
        "igrave": (9, 0, 160, 719),
        "iacute": (62, 0, 210, 719),
        "icircumflex": (-6, 0, 246, 719),
        "idieresis": (1, 0, 226, 720),
        "eth": (27, -12, 421, 715),
        "ntilde": (52, 0, 398, 696),
        "ograve": (25, -11, 424, 719),
        "oacute": (25, -11, 424, 719),
        "ocircumflex": (25, -11, 424, 719),
        "otilde": (25, -11, 424, 696),
        "odieresis": (25, -11, 424, 720),
        "divide": (38, 155, 510, 550),
        "oslash": (55, -38, 453, 550),
        "ugrave": (51, -11, 395, 719),
        "uacute": (51, -11, 395, 719),
        "ucircumflex": (51, -11, 395, 719),
        "udieresis": (51, -11, 395, 720),
        "yacute": (13, -210, 402, 719),
        "thorn": (52, -198, 421, 715),
        "ydieresis": (13, -210, 402, 720),
    },
    "ArialNarrow,Bold": {
        "space": (0, 0, 0, 0),
        "exclam": (73, 0, 194, 715),
        "quotedbl": (44, 461, 348, 715),
        "numbersign": (7, -12, 446, 728),
        "dollar": (28, -100, 419, 773),
        "percent": (35, -28, 690, 728),
        "ampersand": (36, -18, 579, 728),
        "quotesingle": (36, 461, 159, 715),
        "parenleft": (42, -210, 246, 728),
        "parenright": (26, -210, 230, 728),
        "asterisk": (11, 386, 301, 728),
        "plus": (33, 103, 444, 603),
        "comma": (46, -159, 168, 137),
        "hyphen": (25, 190, 247, 328),
        "period": (59, 0, 171, 137),
        "slash": (0, -12, 229, 728),
        "zero": (34, -12, 416, 718),
        "one": (64, 0, 322, 718),
        "two": (20, 0, 415, 718),
        "three": (30, -12, 420, 718),
        "four": (15, 0, 437, 718),
        "five": (36, -12, 431, 706),
        "six": (35, -12, 427, 718),
        "seven": (34, 0, 419, 706),
        "eight": (33, -12, 419, 718),
        "nine": (25, -12, 417, 718),
        "colon": (80, 0, 192, 518),
        "semicolon": (67, -159, 189, 518),
        "less": (38, 81, 440, 625),
        "equal": (33, 181, 444, 524),
        "greater": (37, 81, 440, 624),
        "question": (42, 0, 463, 723),
        "at": (24, -210, 796, 728),
        "A": (0, 0, 588, 715),
        "B": (59, 0, 551, 715),
        "C": (39, -12, 550, 728),
        "D": (59, 0, 551, 715),
        "E": (60, 0, 506, 715),
        "F": (60, 0, 462, 715),
        "G": (39, -12, 588, 728),
        "H": (60, 0, 529, 715),
        "I": (56, 0, 174, 715),
        "J": (14, -12, 390, 715),
        "K": (61, 0, 590, 715),
        "L": (62, 0, 476, 709),
        "M": (58, 0, 625, 715),
        "N": (61, 0, 526, 715),
        "O": (36, -12, 605, 728),
        "P": (59, 0, 509, 715),
        "Q": (35, -71, 626, 728),
        "R": (60, 0, 587, 715),
        "S": (29, -12, 506, 728),
        "T": (17, 0, 483, 715),
        "U": (58, -12, 526, 715),
        "V": (0, 0, 546, 715),
        "W": (2, 0, 772, 715),
        "X": (0, 0, 546, 715),
        "Y": (0, 0, 547, 715),
        "Z": (8, 0, 485, 715),
        "bracketleft": (58, -201, 257, 715),
        "backslash": (0, -12, 229, 728),
        "bracketright": (15, -201, 214, 715),
        "asciicircum": (46, 337, 433, 728),
        "underscore": (-5, -125, 462, -75),
        "grave": (17, 582, 198, 728),
        "a": (29, -11, 428, 530),
        "b": (53, -11, 469, 715),
        "c": (34, -11, 435, 530),
        "d": (33, -11, 449, 715),
        "e": (26, -11, 423, 530),
        "f": (9, 0, 296, 728),
        "g": (33, -210, 448, 530),
        "h": (58, 0, 445, 715),
        "i": (59, 0, 171, 715),
        "j": (-37, -210, 169, 715),
        "k": (55, 0, 448, 715),
        "l": (59, 0, 171, 715),
        "m": (50, 0, 675, 530),
        "n": (58, 0, 445, 530),
        "o": (32, -11, 471, 530),
        "p": (55, -197, 470, 530),
        "q": (36, -197, 449, 530),
        "r": (54, 0, 329, 530),
        "s": (19, -11, 416, 530),
        "t": (12, -11, 262, 701),
        "u": (56, -11, 442, 518),
        "v": (4, 0, 445, 518),
        "w": (3, 0, 637, 518),
        "x": (4, 0, 448, 518),
        "y": (5, -210, 442, 518),
        "z": (13, 0, 393, 518),
        "braceleft": (23, -210, 297, 728),
        "bar": (70, -210, 160, 728),
        "braceright": (18, -210, 291, 728),
        "asciitilde": (26, 253, 452, 451),
        "bullet": (26, 208, 263, 497),
        "Euro": (-13, -12, 431, 728),
        "quotesinglbase": (46, -159, 168, 137),
        "florin": (-7, -210, 457, 728),
        "quotedblbase": (44, -159, 355, 137),
        "ellipsis": (80, 0, 739, 137),
        "dagger": (27, -170, 423, 707),
        "daggerdbl": (27, -170, 423, 707),
        "circumflex": (0, 583, 271, 728),
        "perthousand": (0, -28, 819, 728),
        "Scaron": (29, -12, 506, 909),
        "guilsinglleft": (30, 34, 245, 479),
        "OE": (28, -12, 794, 728),
        "Zcaron": (8, 0, 485, 909),
        "quoteleft": (61, 418, 182, 715),
        "quoteright": (45, 418, 166, 715),
        "quotedblleft": (52, 418, 362, 715),
        "quotedblright": (41, 418, 352, 715),
        "endash": (-1, 208, 454, 310),
        "emdash": (0, 208, 819, 310),
        "tilde": (-5, 588, 271, 712),
        "trademark": (86, 315, 719, 715),
        "scaron": (19, -11, 416, 728),
        "guilsinglright": (29, 34, 244, 479),
        "oe": (35, -11, 740, 530),
        "zcaron": (13, 0, 393, 728),
        "Ydieresis": (0, 0, 547, 909),
        "exclamdown": (78, -198, 199, 518),
        "cent": (33, -196, 434, 710),
        "sterling": (5, -12, 443, 728),
        "currency": (18, 100, 435, 610),
        "yen": (0, 0, 452, 715),
        "brokenbar": (70, -210, 160, 728),
        "section": (23, -210, 427, 728),
        "dieresis": (1, 610, 270, 728),
        "copyright": (-3, -17, 609, 730),
        "ordfeminine": (15, 362, 283, 728),
        "guillemotleft": (38, 34, 410, 479),
        "logicalnot": (33, 183, 444, 524),
        "registered": (-3, -17, 609, 730),
        "macron": (-5, 790, 505, 840),
        "degree": (41, 416, 353, 728),
        "plusminus": (24, 0, 524, 674),
        "twosuperior": (9, 354, 252, 724),
        "threesuperior": (15, 349, 255, 724),
        "acute": (74, 582, 256, 728),
        "mu": (54, -198, 525, 518),
        "paragraph": (0, -196, 452, 715),
        "periodcentered": (80, 279, 192, 416),
        "cedilla": (15, -204, 233, -5),
        "onesuperior": (36, 354, 198, 724),
        "ordmasculine": (10, 361, 288, 728),
        "guillemotright": (42, 34, 414, 479),
        "onequarter": (36, -26, 675, 724),
        "onehalf": (36, -26, 663, 724),
        "threequarters": (16, -26, 676, 724),
        "questiondown": (40, -205, 462, 518),
        "Agrave": (0, 0, 588, 909),
        "Aacute": (0, 0, 588, 909),
        "Acircumflex": (0, 0, 588, 909),
        "Atilde": (0, 0, 588, 894),
        "Adieresis": (0, 0, 588, 909),
        "Aring": (0, 0, 588, 932),
        "AE": (-34, 0, 780, 715),
        "Ccedilla": (39, -210, 550, 728),
        "Egrave": (60, 0, 506, 909),
        "Eacute": (60, 0, 506, 909),
        "Ecircumflex": (60, 0, 506, 909),
        "Edieresis": (60, 0, 506, 909),
        "Igrave": (-3, 0, 177, 909),
        "Iacute": (53, 0, 235, 909),
        "Icircumflex": (-20, 0, 250, 909),
        "Idieresis": (-19, 0, 250, 909),
        "Eth": (-1, 0, 551, 715),
        "Ntilde": (61, 0, 526, 894),
        "Ograve": (36, -12, 605, 909),
        "Oacute": (36, -12, 605, 909),
        "Ocircumflex": (36, -12, 605, 909),
        "Otilde": (36, -12, 605, 894),
        "Odieresis": (36, -12, 605, 909),
        "multiply": (43, 114, 434, 591),
        "Oslash": (25, -40, 615, 750),
        "Ugrave": (58, -12, 526, 909),
        "Uacute": (58, -12, 526, 909),
        "Ucircumflex": (58, -12, 526, 909),
        "Udieresis": (58, -12, 526, 909),
        "Yacute": (0, 0, 547, 909),
        "Thorn": (59, 0, 509, 715),
        "germandbls": (55, -11, 472, 728),
        "agrave": (29, -11, 428, 728),
        "aacute": (29, -11, 428, 728),
        "acircumflex": (29, -11, 428, 728),
        "atilde": (29, -11, 428, 712),
        "adieresis": (29, -11, 428, 728),
        "aring": (29, -11, 428, 750),
        "ae": (35, -11, 690, 530),
        "ccedilla": (34, -204, 435, 530),
        "egrave": (26, -11, 423, 728),
        "eacute": (26, -11, 423, 728),
        "ecircumflex": (26, -11, 423, 728),
        "edieresis": (26, -11, 423, 728),
        "igrave": (-9, 0, 172, 728),
        "iacute": (58, 0, 240, 728),
        "icircumflex": (-20, 0, 250, 728),
        "idieresis": (-19, 0, 250, 728),
        "eth": (33, -12, 470, 715),
        "ntilde": (58, 0, 445, 712),
        "ograve": (32, -11, 471, 728),
        "oacute": (32, -11, 471, 728),
        "ocircumflex": (32, -11, 471, 728),
        "otilde": (32, -11, 471, 712),
        "odieresis": (32, -11, 471, 728),
        "divide": (23, 90, 524, 616),
        "oslash": (35, -35, 474, 546),
        "ugrave": (56, -11, 442, 728),
        "uacute": (56, -11, 442, 728),
        "ucircumflex": (56, -11, 442, 728),
        "udieresis": (56, -11, 442, 728),
        "yacute": (5, -210, 442, 728),
        "thorn": (55, -197, 470, 715),
        "ydieresis": (5, -210, 442, 728),
    },
    "ArialNarrow,BoldItalic": {
        "space": (0, 0, 0, 0),
        "exclam": (50, 0, 289, 715),
        "quotedbl": (121, 461, 447, 715),
        "numbersign": (7, -12, 446, 728),
        "dollar": (36, -99, 472, 770),
        "percent": (74, -30, 708, 728),
        "ampersand": (67, -16, 578, 728),
        "quotesingle": (124, 461, 269, 715),
        "parenleft": (53, -210, 356, 728),
        "parenright": (-64, -210, 238, 728),
        "asterisk": (78, 382, 368, 721),
        "plus": (33, 103, 444, 603),
        "comma": (8, -155, 173, 135),
        "hyphen": (31, 190, 277, 325),
        "period": (36, 0, 172, 135),
        "slash": (-35, -12, 335, 728),
        "zero": (52, -12, 468, 718),
        "one": (97, 0, 418, 720),
        "two": (49, 0, 468, 718),
        "three": (41, -12, 459, 718),
        "four": (22, 0, 458, 715),
        "five": (52, -12, 474, 706),
        "six": (66, -12, 471, 718),
        "seven": (84, 0, 494, 706),
        "eight": (54, -12, 464, 718),
        "nine": (52, -12, 457, 718),
        "colon": (57, 0, 259, 518),
        "semicolon": (33, -155, 262, 518),
        "less": (38, 81, 440, 625),
        "equal": (33, 181, 444, 524),
        "greater": (37, 81, 440, 624),
        "question": (100, 0, 506, 728),
        "at": (24, -210, 796, 728),
        "A": (-9, 0, 551, 715),
        "B": (32, 0, 582, 715),
        "C": (77, -12, 611, 728),
        "D": (35, 0, 594, 715),
        "E": (33, 0, 591, 715),
        "F": (31, 0, 565, 715),
        "G": (72, -12, 644, 728),
        "H": (35, 0, 626, 715),
        "I": (28, 0, 271, 715),
        "J": (23, -12, 492, 715),
        "K": (32, 0, 657, 715),
        "L": (37, 0, 477, 715),
        "M": (33, 0, 720, 715),
        "N": (36, 0, 625, 715),
        "O": (71, -12, 643, 728),
        "P": (33, 0, 576, 715),
        "Q": (72, -95, 643, 728),
        "R": (36, 0, 607, 715),
        "S": (51, -12, 554, 728),
        "T": (98, 0, 581, 715),
        "U": (74, -12, 626, 715),
        "V": (93, 0, 650, 715),
        "W": (96, 0, 875, 715),
        "X": (-24, 0, 642, 715),
        "Y": (94, 0, 643, 715),
        "Z": (20, 0, 547, 715),
        "bracketleft": (7, -197, 359, 715),
        "backslash": (63, -12, 235, 728),
        "bracketright": (-45, -197, 307, 715),
        "asciicircum": (46, 337, 433, 728),
        "underscore": (-5, -125, 462, -75),
        "grave": (109, 585, 271, 731),
        "a": (37, -11, 437, 530),
        "b": (29, -11, 493, 715),
        "c": (49, -11, 462, 530),
        "d": (48, -11, 548, 715),
        "e": (47, -11, 454, 530),
        "f": (43, 0, 385, 728),
        "g": (25, -210, 510, 530),
        "h": (34, 0, 484, 715),
        "i": (33, 0, 270, 715),
        "j": (-89, -210, 271, 715),
        "k": (31, 0, 503, 715),
        "l": (32, 0, 270, 715),
        "m": (29, 0, 712, 530),
        "n": (34, 0, 484, 530),
        "o": (49, -11, 491, 530),
        "p": (-4, -197, 496, 530),
        "q": (48, -197, 512, 530),
        "r": (26, 0, 388, 530),
        "s": (18, -11, 452, 530),
        "t": (61, -11, 320, 698),
        "u": (57, -11, 507, 518),
        "v": (61, 0, 506, 518),
        "w": (59, 0, 689, 518),
        "x": (-18, 0, 501, 518),
        "y": (5, -210, 509, 518),
        "z": (13, 0, 425, 518),
        "braceleft": (44, -210, 412, 728),
        "bar": (70, -210, 160, 728),
        "braceright": (-70, -210, 296, 728),
        "asciitilde": (26, 253, 452, 451),
        "bullet": (26, 208, 263, 497),
        "Euro": (21, -12, 523, 728),
        "quotesinglbase": (8, -155, 173, 135),
        "florin": (-7, -210, 457, 728),
        "quotedblbase": (2, -155, 361, 135),
        "ellipsis": (76, 0, 744, 135),
        "dagger": (69, -170, 487, 706),
        "daggerdbl": (0, -170, 491, 706),
        "circumflex": (45, 584, 320, 731),
        "perthousand": (55, -28, 837, 728),
        "Scaron": (51, -12, 554, 912),
        "guilsinglleft": (48, 34, 309, 477),
        "OE": (56, -12, 884, 728),
        "Zcaron": (20, 0, 547, 912),
        "quoteleft": (87, 424, 253, 715),
        "quoteright": (101, 424, 267, 715),
        "quotedblleft": (101, 424, 459, 715),
        "quotedblright": (104, 424, 463, 715),
        "endash": (-1, 208, 454, 310),
        "emdash": (0, 208, 819, 310),
        "tilde": (76, 592, 351, 710),
        "trademark": (86, 315, 719, 715),
        "scaron": (18, -11, 452, 731),
        "guilsinglright": (7, 34, 261, 477),
        "oe": (47, -11, 773, 530),
        "zcaron": (13, 0, 433, 731),
        "Ydieresis": (94, 0, 643, 898),
        "exclamdown": (9, -197, 250, 518),
        "cent": (48, -192, 461, 713),
        "sterling": (17, -18, 500, 728),
        "currency": (18, 100, 435, 610),
        "yen": (20, 0, 546, 715),
        "brokenbar": (70, -210, 160, 728),
        "section": (18, -211, 459, 728),
        "dieresis": (69, 597, 356, 716),
        "copyright": (-3, -17, 609, 730),
        "ordfeminine": (66, 362, 337, 728),
        "guillemotleft": (43, 34, 460, 477),
        "logicalnot": (33, 183, 444, 524),
        "registered": (-3, -17, 609, 730),
        "macron": (94, 790, 605, 840),
        "degree": (41, 416, 353, 728),
        "plusminus": (24, 0, 524, 674),
        "twosuperior": (66, 354, 324, 724),
        "threesuperior": (62, 349, 319, 724),
        "acute": (150, 583, 356, 730),
        "mu": (-37, -200, 584, 518),
        "paragraph": (0, -196, 452, 715),
        "periodcentered": (108, 290, 245, 425),
        "cedilla": (5, -207, 218, -12),
        "onesuperior": (93, 354, 296, 725),
        "ordmasculine": (59, 362, 339, 728),
        "guillemotright": (18, 34, 435, 477),
        "onequarter": (81, -29, 688, 725),
        "onehalf": (69, -29, 684, 725),
        "threequarters": (62, -29, 698, 724),
        "questiondown": (21, -209, 428, 518),
        "Agrave": (-9, 0, 551, 913),
        "Aacute": (-9, 0, 562, 912),
        "Acircumflex": (-9, 0, 551, 912),
        "Atilde": (-9, 0, 556, 892),
        "Adieresis": (-9, 0, 562, 898),
        "Aring": (-9, 0, 551, 935),
        "AE": (-26, 0, 868, 715),
        "Ccedilla": (77, -204, 611, 728),
        "Egrave": (33, 0, 591, 913),
        "Eacute": (33, 0, 591, 912),
        "Ecircumflex": (33, 0, 591, 912),
        "Edieresis": (33, 0, 591, 898),
        "Igrave": (28, 0, 297, 913),
        "Iacute": (28, 0, 368, 912),
        "Icircumflex": (28, 0, 347, 912),
        "Idieresis": (28, 0, 383, 898),
        "Eth": (29, 0, 594, 715),
        "Ntilde": (36, 0, 625, 892),
        "Ograve": (71, -12, 643, 913),
        "Oacute": (71, -12, 643, 912),
        "Ocircumflex": (71, -12, 643, 912),
        "Otilde": (71, -12, 643, 892),
        "Odieresis": (71, -12, 643, 898),
        "multiply": (43, 114, 434, 591),
        "Oslash": (63, -59, 645, 766),
        "Ugrave": (74, -12, 626, 913),
        "Uacute": (74, -12, 626, 912),
        "Ucircumflex": (74, -12, 626, 912),
        "Udieresis": (74, -12, 626, 898),
        "Yacute": (94, 0, 643, 912),
        "Thorn": (33, 0, 552, 715),
        "germandbls": (28, -11, 476, 728),
        "agrave": (37, -11, 437, 731),
        "aacute": (37, -11, 437, 730),
        "acircumflex": (37, -11, 437, 731),
        "atilde": (37, -11, 447, 710),
        "adieresis": (37, -11, 454, 716),
        "aring": (37, -11, 437, 753),
        "ae": (25, -11, 709, 530),
        "ccedilla": (49, -203, 462, 530),
        "egrave": (47, -11, 454, 731),
        "eacute": (47, -11, 454, 730),
        "ecircumflex": (47, -11, 454, 731),
        "edieresis": (47, -11, 454, 716),
        "igrave": (33, 0, 258, 731),
        "iacute": (33, 0, 319, 730),
        "icircumflex": (33, 0, 319, 731),
        "idieresis": (33, 0, 342, 716),
        "eth": (49, -11, 498, 715),
        "ntilde": (34, 0, 484, 710),
        "ograve": (49, -11, 491, 731),
        "oacute": (49, -11, 491, 730),
        "ocircumflex": (49, -11, 491, 731),
        "otilde": (49, -11, 491, 710),
        "odieresis": (49, -11, 491, 716),
        "divide": (23, 90, 524, 616),
        "oslash": (42, -52, 495, 571),
        "ugrave": (57, -11, 507, 731),
        "uacute": (57, -11, 507, 730),
        "ucircumflex": (57, -11, 507, 731),
        "udieresis": (57, -11, 507, 716),
        "yacute": (5, -210, 509, 730),
        "thorn": (-7, -197, 494, 715),
        "ydieresis": (5, -210, 509, 716),
    },
    "ArialNarrow,Italic": {
        "space": (0, 0, 0, 0),
        "exclam": (46, 0, 249, 715),
        "quotedbl": (106, 462, 346, 715),
        "numbersign": (7, -12, 444, 728),
        "dollar": (41, -95, 469, 763),
        "percent": (79, -26, 698, 728),
        "ampersand": (64, -17, 534, 728),
        "quotesingle": (104, 462, 212, 715),
        "parenleft": (69, -210, 338, 728),
        "parenright": (-43, -210, 225, 728),
        "asterisk": (92, 422, 357, 727),
        "plus": (45, 115, 433, 588),
        "comma": (20, -144, 144, 100),
        "hyphen": (37, 214, 273, 303),
        "period": (47, 0, 146, 100),
        "slash": (-41, -11, 336, 728),
        "zero": (58, -12, 463, 718),
        "one": (121, 0, 393, 718),
        "two": (48, 0, 460, 718),
        "three": (44, -12, 457, 718),
        "four": (37, 0, 445, 715),
        "five": (57, -12, 469, 706),
        "six": (68, -12, 465, 718),
        "seven": (99, 0, 488, 706),
        "eight": (61, -12, 462, 718),
        "nine": (55, -12, 452, 718),
        "colon": (46, 0, 217, 518),
        "semicolon": (20, -144, 215, 518),
        "less": (44, 110, 433, 595),
        "equal": (45, 203, 433, 502),
        "greater": (44, 110, 433, 595),
        "question": (104, 0, 459, 728),
        "at": (44, -210, 803, 729),
        "A": (-16, 0, 505, 715),
        "B": (35, 0, 537, 715),
        "C": (74, -12, 598, 728),
        "D": (36, 0, 583, 715),
        "E": (37, 0, 583, 715),
        "F": (37, 0, 541, 715),
        "G": (79, -12, 628, 728),
        "H": (34, 0, 618, 715),
        "I": (46, 0, 248, 715),
        "J": (26, -12, 438, 715),
        "K": (36, 0, 607, 715),
        "L": (32, 0, 429, 715),
        "M": (36, 0, 715, 715),
        "N": (39, 0, 620, 715),
        "O": (75, -12, 633, 728),
        "P": (35, 0, 572, 715),
        "Q": (76, -82, 634, 728),
        "R": (38, 0, 599, 715),
        "S": (58, -12, 551, 728),
        "T": (102, 0, 578, 715),
        "U": (79, -12, 618, 715),
        "V": (101, 0, 620, 715),
        "W": (102, 0, 870, 715),
        "X": (-25, 0, 630, 715),
        "Y": (96, 0, 634, 715),
        "Z": (20, 0, 521, 715),
        "bracketleft": (5, -195, 320, 715),
        "backslash": (69, -11, 224, 728),
        "bracketright": (-47, -195, 270, 715),
        "asciicircum": (21, 336, 363, 728),
        "underscore": (-5, -125, 460, -75),
        "grave": (119, 581, 254, 715),
        "a": (36, -11, 431, 530),
        "b": (27, -11, 438, 715),
        "c": (45, -11, 418, 530),
        "d": (42, -11, 490, 715),
        "e": (42, -11, 436, 530),
        "f": (37, 0, 334, 728),
        "g": (21, -207, 462, 530),
        "h": (27, 0, 433, 715),
        "i": (24, 0, 219, 715),
        "j": (-99, -207, 218, 715),
        "k": (27, 0, 454, 715),
        "l": (21, 0, 216, 715),
        "m": (26, 0, 666, 530),
        "n": (27, 0, 433, 530),
        "o": (40, -11, 442, 530),
        "p": (-8, -198, 438, 530),
        "q": (42, -198, 453, 530),
        "r": (27, 0, 344, 530),
        "s": (31, -11, 408, 530),
        "t": (45, -8, 263, 707),
        "u": (51, -11, 457, 518),
        "v": (64, 0, 458, 518),
        "w": (63, 0, 636, 518),
        "x": (-1, 0, 440, 518),
        "y": (0, -210, 459, 518),
        "z": (16, 0, 419, 518),
        "braceleft": (55, -210, 376, 728),
        "bar": (75, -210, 138, 728),
        "braceright": (-68, -210, 253, 728),
        "asciitilde": (35, 271, 444, 432),
        "bullet": (43, 226, 246, 474),
        "Euro": (33, -12, 528, 728),
        "quotesinglbase": (-5, -144, 118, 100),
        "florin": (18, -210, 434, 728),
        "quotedblbase": (-16, -144, 238, 100),
        "ellipsis": (117, 0, 764, 100),
        "dagger": (74, -170, 478, 706),
        "daggerdbl": (4, -170, 482, 706),
        "circumflex": (82, 581, 317, 715),
        "perthousand": (54, -26, 822, 728),
        "Scaron": (58, -12, 551, 896),
        "guilsinglleft": (39, 35, 257, 478),
        "OE": (65, -12, 856, 728),
        "Zcaron": (20, 0, 521, 896),
        "quoteleft": (103, 470, 228, 715),
        "quoteright": (103, 470, 227, 715),
        "quotedblleft": (83, 470, 336, 715),
        "quotedblright": (85, 470, 342, 715),
        "endash": (-1, 223, 454, 294),
        "emdash": (0, 223, 819, 294),
        "tilde": (76, 596, 347, 706),
        "trademark": (90, 317, 713, 715),
        "scaron": (31, -11, 410, 715),
        "guilsinglright": (13, 35, 235, 478),
        "oe": (51, -11, 752, 530),
        "zcaron": (16, 0, 419, 715),
        "Ydieresis": (96, 0, 634, 880),
        "exclamdown": (24, -197, 228, 518),
        "cent": (62, -198, 434, 725),
        "sterling": (25, -12, 498, 728),
        "currency": (28, 114, 421, 593),
        "yen": (29, 0, 546, 715),
        "brokenbar": (75, -210, 138, 728),
        "section": (24, -210, 455, 728),
        "dieresis": (95, 599, 335, 699),
        "copyright": (0, -8, 605, 728),
        "ordfeminine": (66, 359, 335, 728),
        "guillemotleft": (64, 35, 440, 478),
        "logicalnot": (45, 207, 433, 502),
        "registered": (0, -8, 605, 728),
        "macron": (88, 790, 600, 840),
        "degree": (133, 457, 404, 728),
        "plusminus": (38, 0, 510, 600),
        "twosuperior": (61, 357, 329, 724),
        "threesuperior": (67, 349, 327, 724),
        "acute": (138, 581, 304, 715),
        "mu": (5, -200, 571, 518),
        "paragraph": (2, -198, 444, 715),
        "periodcentered": (124, 307, 223, 407),
        "cedilla": (30, -207, 235, 5),
        "onesuperior": (111, 357, 290, 724),
        "ordmasculine": (57, 360, 337, 728),
        "guillemotright": (33, 35, 414, 478),
        "onequarter": (68, -29, 697, 728),
        "onehalf": (48, -29, 677, 728),
        "threequarters": (67, -29, 708, 728),
        "questiondown": (46, -209, 401, 518),
        "Agrave": (-16, 0, 505, 896),
        "Aacute": (-16, 0, 505, 896),
        "Acircumflex": (-16, 0, 505, 896),
        "Atilde": (-16, 0, 514, 887),
        "Adieresis": (-16, 0, 505, 880),
        "Aring": (-16, 0, 505, 914),
        "AE": (-33, 0, 855, 715),
        "Ccedilla": (74, -210, 598, 728),
        "Egrave": (37, 0, 583, 896),
        "Eacute": (37, 0, 583, 896),
        "Ecircumflex": (37, 0, 583, 896),
        "Edieresis": (37, 0, 583, 880),
        "Igrave": (46, 0, 262, 896),
        "Iacute": (46, 0, 312, 896),
        "Icircumflex": (46, 0, 326, 896),
        "Idieresis": (46, 0, 343, 880),
        "Eth": (29, 0, 583, 715),
        "Ntilde": (39, 0, 620, 887),
        "Ograve": (75, -12, 633, 896),
        "Oacute": (75, -12, 633, 896),
        "Ocircumflex": (75, -12, 633, 896),
        "Otilde": (75, -12, 633, 887),
        "Odieresis": (75, -12, 633, 880),
        "multiply": (63, 140, 412, 566),
        "Oslash": (69, -50, 636, 764),
        "Ugrave": (79, -12, 618, 896),
        "Uacute": (79, -12, 618, 896),
        "Ucircumflex": (79, -12, 618, 896),
        "Udieresis": (79, -12, 618, 880),
        "Yacute": (96, 0, 634, 896),
        "Thorn": (35, 0, 547, 715),
        "germandbls": (29, -12, 465, 728),
        "agrave": (36, -11, 431, 715),
        "aacute": (36, -11, 431, 715),
        "acircumflex": (36, -11, 431, 715),
        "atilde": (36, -11, 443, 706),
        "adieresis": (36, -11, 431, 699),
        "aring": (36, -11, 431, 733),
        "ae": (34, -12, 708, 530),
        "ccedilla": (45, -207, 418, 530),
        "egrave": (42, -11, 436, 715),
        "eacute": (42, -11, 436, 715),
        "ecircumflex": (42, -11, 436, 715),
        "edieresis": (42, -11, 436, 699),
        "igrave": (50, 0, 254, 715),
        "iacute": (50, 0, 270, 715),
        "icircumflex": (50, 0, 305, 715),
        "idieresis": (50, 0, 310, 699),
        "eth": (40, -11, 447, 715),
        "ntilde": (27, 0, 436, 706),
        "ograve": (40, -11, 442, 715),
        "oacute": (40, -11, 442, 715),
        "ocircumflex": (40, -11, 442, 715),
        "otilde": (40, -11, 442, 706),
        "odieresis": (40, -11, 442, 699),
        "divide": (38, 155, 510, 550),
        "oslash": (58, -49, 476, 565),
        "ugrave": (51, -11, 457, 715),
        "uacute": (51, -11, 457, 715),
        "ucircumflex": (51, -11, 457, 715),
        "udieresis": (51, -11, 457, 699),
        "yacute": (0, -210, 459, 715),
        "thorn": (-8, -198, 438, 715),
        "ydieresis": (0, -210, 459, 699),
    },
    "Arial,Black": {
        "space": (0, 0, 0, 0),
        "exclam": (60, 0, 272, 715),
        "quotedbl": (23, 452, 476, 715),
        "numbersign": (29, -11, 627, 728),
        "dollar": (26, -104, 631, 770),
        "percent": (48, -36, 951, 728),
        "ampersand": (74, -11, 848, 728),
        "quotesingle": (41, 452, 239, 715),
        "parenleft": (54, -210, 350, 728),
        "parenright": (39, -210, 334, 728),
        "asterisk": (86, 370, 465, 728),
        "plus": (62, 91, 594, 624),
        "comma": (60, -201, 272, 197),
        "hyphen": (21, 184, 311, 337),
        "period": (60, 0, 272, 199),
        "slash": (0, -11, 280, 728),
        "zero": (41, -12, 625, 728),
        "one": (81, 0, 491, 728),
        "two": (26, 0, 623, 728),
        "three": (35, -12, 626, 728),
        "four": (20, 0, 645, 728),
        "five": (32, -12, 627, 715),
        "six": (41, -12, 631, 728),
        "seven": (44, 0, 625, 715),
        "eight": (41, -12, 625, 728),
        "nine": (34, -12, 624, 728),
        "colon": (60, 0, 272, 518),
        "semicolon": (60, -201, 272, 518),
        "less": (52, 54, 607, 660),
        "equal": (61, 158, 594, 557),
        "greater": (52, 54, 607, 660),
        "question": (35, 0, 575, 728),
        "at": (-2, -113, 741, 728),
        "A": (0, 0, 780, 715),
        "B": (73, 0, 735, 715),
        "C": (47, -12, 743, 728),
        "D": (76, 0, 734, 715),
        "E": (72, 0, 676, 715),
        "F": (74, 0, 621, 715),
        "G": (45, -12, 774, 728),
        "H": (74, 0, 759, 715),
        "I": (82, 0, 303, 715),
        "J": (17, -12, 592, 715),
        "K": (74, 0, 833, 715),
        "L": (73, 0, 639, 715),
        "M": (70, 0, 875, 715),
        "N": (74, 0, 759, 715),
        "O": (45, -12, 787, 728),
        "P": (72, 0, 679, 715),
        "Q": (45, -80, 814, 728),
        "R": (76, 0, 780, 715),
        "S": (34, -12, 684, 728),
        "T": (22, 0, 695, 715),
        "U": (73, -12, 759, 715),
        "V": (2, 0, 778, 715),
        "W": (0, 0, 1000, 715),
        "X": (1, 0, 779, 715),
        "Y": (0, 0, 779, 715),
        "Z": (16, 0, 695, 715),
        "bracketleft": (65, -198, 366, 715),
        "backslash": (-2, -11, 277, 728),
        "bracketright": (22, -198, 323, 715),
        "asciicircum": (61, 331, 595, 728),
        "underscore": (-5, -125, 505, -75),
        "grave": (0, 582, 250, 728),
        "a": (35, -11, 632, 530),
        "b": (61, -11, 631, 715),
        "c": (36, -12, 635, 530),
        "d": (35, -11, 605, 715),
        "e": (35, -11, 635, 530),
        "f": (7, 0, 418, 728),
        "g": (35, -210, 607, 530),
        "h": (60, 0, 608, 715),
        "i": (67, 0, 266, 715),
        "j": (-48, -210, 267, 715),
        "k": (60, 0, 666, 715),
        "l": (66, 0, 266, 715),
        "m": (61, 0, 941, 530),
        "n": (60, 0, 608, 530),
        "o": (35, -11, 631, 530),
        "p": (61, -197, 631, 530),
        "q": (35, -197, 605, 530),
        "r": (62, 0, 470, 530),
        "s": (24, -12, 576, 530),
        "t": (27, -11, 416, 715),
        "u": (58, -11, 606, 518),
        "v": (0, 0, 613, 518),
        "w": (1, 0, 945, 518),
        "x": (5, 0, 661, 518),
        "y": (2, -210, 614, 518),
        "z": (18, 0, 534, 518),
        "braceleft": (12, -210, 377, 728),
        "bar": (78, -197, 202, 715),
        "braceright": (11, -210, 376, 728),
        "asciitilde": (48, 240, 608, 475),
        "bullet": (87, 189, 412, 514),
        "Euro": (8, -12, 641, 728),
        "quotesinglbase": (34, -201, 246, 197),
        "florin": (18, -210, 651, 728),
        "quotedblbase": (26, -201, 486, 197),
        "ellipsis": (60, 0, 939, 199),
        "dagger": (68, -198, 604, 715),
        "daggerdbl": (68, -198, 604, 715),
        "circumflex": (-13, 582, 347, 721),
        "perthousand": (0, -36, 1000, 728),
        "Scaron": (34, -12, 684, 898),
        "guilsinglleft": (11, 34, 319, 486),
        "OE": (34, -12, 968, 728),
        "Zcaron": (16, 0, 695, 898),
        "quoteleft": (34, 329, 246, 728),
        "quoteright": (34, 329, 246, 728),
        "quotedblleft": (26, 329, 486, 728),
        "quotedblright": (26, 329, 486, 728),
        "endash": (-5, 207, 505, 315),
        "emdash": (-5, 207, 1005, 315),
        "tilde": (-9, 580, 342, 715),
        "trademark": (17, 317, 910, 715),
        "scaron": (24, -12, 576, 721),
        "guilsinglright": (13, 34, 321, 486),
        "oe": (28, -11, 972, 530),
        "zcaron": (18, 0, 534, 721),
        "Ydieresis": (0, 0, 779, 883),
        "exclamdown": (60, -197, 272, 518),
        "cent": (36, -190, 635, 706),
        "sterling": (55, -12, 662, 728),
        "currency": (47, 0, 607, 560),
        "yen": (0, 0, 667, 715),
        "brokenbar": (78, -197, 202, 715),
        "section": (31, -210, 628, 728),
        "dieresis": (0, 583, 334, 706),
        "copyright": (28, -17, 773, 728),
        "ordfeminine": (16, 363, 371, 728),
        "guillemotleft": (46, 34, 607, 486),
        "logicalnot": (61, 154, 594, 553),
        "registered": (28, -17, 773, 728),
        "macron": (-5, 780, 505, 830),
        "degree": (58, 449, 337, 728),
        "plusminus": (62, 0, 594, 705),
        "twosuperior": (10, 361, 386, 728),
        "threesuperior": (15, 352, 384, 728),
        "acute": (79, 582, 332, 728),
        "mu": (58, -196, 607, 518),
        "paragraph": (65, -198, 789, 715),
        "periodcentered": (60, 258, 272, 457),
        "cedilla": (8, -210, 304, -11),
        "onesuperior": (68, 361, 306, 728),
        "ordmasculine": (11, 362, 384, 728),
        "guillemotright": (59, 34, 620, 486),
        "onequarter": (76, -25, 962, 728),
        "onehalf": (76, -25, 971, 728),
        "threequarters": (34, -25, 962, 728),
        "questiondown": (35, -209, 575, 518),
        "Agrave": (0, 0, 780, 905),
        "Aacute": (0, 0, 780, 905),
        "Acircumflex": (0, 0, 780, 898),
        "Atilde": (0, 0, 780, 893),
        "Adieresis": (0, 0, 780, 883),
        "Aring": (0, 0, 780, 892),
        "AE": (-37, 0, 964, 715),
        "Ccedilla": (47, -210, 743, 728),
        "Egrave": (72, 0, 676, 905),
        "Eacute": (72, 0, 676, 905),
        "Ecircumflex": (72, 0, 676, 898),
        "Edieresis": (72, 0, 676, 883),
        "Igrave": (28, 0, 303, 905),
        "Iacute": (82, 0, 360, 905),
        "Icircumflex": (14, 0, 375, 898),
        "Idieresis": (27, 0, 362, 883),
        "Eth": (0, 0, 734, 715),
        "Ntilde": (74, 0, 759, 893),
        "Ograve": (45, -12, 787, 905),
        "Oacute": (45, -12, 787, 905),
        "Ocircumflex": (45, -12, 787, 898),
        "Otilde": (45, -12, 787, 893),
        "Odieresis": (45, -12, 787, 883),
        "multiply": (61, 90, 595, 625),
        "Oslash": (17, -25, 815, 740),
        "Ugrave": (73, -12, 759, 905),
        "Uacute": (73, -12, 759, 905),
        "Ucircumflex": (73, -12, 759, 898),
        "Udieresis": (73, -12, 759, 883),
        "Yacute": (0, 0, 779, 905),
        "Thorn": (72, 0, 679, 715),
        "germandbls": (58, -11, 631, 728),
        "agrave": (35, -11, 632, 728),
        "aacute": (35, -11, 632, 728),
        "acircumflex": (35, -11, 632, 721),
        "atilde": (35, -11, 632, 715),
        "adieresis": (35, -11, 632, 706),
        "aring": (35, -11, 632, 802),
        "ae": (33, -11, 971, 530),
        "ccedilla": (36, -210, 635, 530),
        "egrave": (35, -11, 635, 728),
        "eacute": (35, -11, 635, 728),
        "ecircumflex": (35, -11, 635, 721),
        "edieresis": (35, -11, 635, 706),
        "igrave": (0, 0, 266, 728),
        "iacute": (67, 0, 332, 728),
        "icircumflex": (-13, 0, 347, 721),
        "idieresis": (0, 0, 334, 706),
        "eth": (36, -11, 629, 715),
        "ntilde": (60, 0, 608, 715),
        "ograve": (35, -11, 631, 728),
        "oacute": (35, -11, 631, 728),
        "ocircumflex": (35, -11, 631, 721),
        "otilde": (35, -11, 631, 715),
        "odieresis": (35, -11, 631, 706),
        "divide": (62, 51, 594, 662),
        "oslash": (35, -47, 630, 564),
        "ugrave": (58, -11, 606, 728),
        "uacute": (58, -11, 606, 728),
        "ucircumflex": (58, -11, 606, 721),
        "udieresis": (58, -11, 606, 706),
        "yacute": (2, -210, 614, 728),
        "thorn": (61, -197, 631, 715),
        "ydieresis": (2, -210, 614, 706),
    },
    "Garamond": {
        "space": (0, 0, 0, 0),
        "exclam": (61, -12, 160, 638),
        "quotedbl": (64, 392, 341, 677),
        "numbersign": (45, -22, 620, 666),
        "dollar": (41, -133, 404, 655),
        "percent": (36, -32, 789, 637),
        "ampersand": (26, -14, 713, 594),
        "quotesingle": (39, 392, 137, 677),
        "parenleft": (76, -245, 309, 639),
        "parenright": (-21, -244, 213, 640),
        "asterisk": (28, 240, 393, 631),
        "plus": (70, 49, 595, 572),
        "comma": (41, -173, 189, 68),
        "hyphen": (37, 171, 275, 217),
        "period": (58, -14, 160, 93),
        "slash": (56, -135, 443, 696),
        "zero": (35, -14, 437, 636),
        "one": (75, 0, 354, 633),
        "two": (21, 0, 441, 633),
        "three": (38, -13, 424, 636),
        "four": (26, -11, 456, 636),
        "five": (51, -16, 418, 638),
        "six": (48, -13, 427, 639),
        "seven": (45, -12, 431, 619),
        "eight": (56, -13, 429, 633),
        "nine": (43, -14, 421, 638),
        "colon": (57, -13, 161, 387),
        "semicolon": (42, -156, 188, 391),
        "less": (71, 70, 594, 551),
        "equal": (71, 176, 595, 445),
        "greater": (71, 70, 594, 551),
        "question": (43, -14, 330, 640),
        "at": (47, -215, 896, 694),
        "A": (-7, 0, 669, 655),
        "B": (13, 0, 568, 633),
        "C": (43, -13, 601, 640),
        "D": (10, -8, 722, 635),
        "E": (23, -6, 632, 622),
        "F": (28, -9, 540, 631),
        "G": (46, -12, 758, 640),
        "H": (19, -10, 734, 629),
        "I": (20, 0, 324, 624),
        "J": (-84, -252, 277, 624),
        "K": (28, -8, 759, 625),
        "L": (5, -2, 574, 622),
        "M": (6, -4, 826, 629),
        "N": (12, -22, 732, 627),
        "O": (45, -9, 733, 630),
        "P": (18, -9, 536, 632),
        "Q": (47, -217, 748, 642),
        "R": (20, -2, 641, 629),
        "S": (37, -16, 437, 642),
        "T": (-1, -12, 602, 649),
        "U": (18, -16, 675, 627),
        "V": (-8, -19, 686, 628),
        "W": (-9, -27, 891, 624),
        "X": (4, -10, 707, 623),
        "Y": (-9, -6, 664, 629),
        "Z": (35, -7, 608, 657),
        "bracketleft": (101, -231, 295, 627),
        "backslash": (55, -135, 444, 696),
        "bracketright": (-20, -232, 174, 627),
        "asciicircum": (32, 382, 469, 670),
        "underscore": (-5, -125, 505, -75),
        "grave": (97, 479, 261, 631),
        "a": (32, -11, 399, 398),
        "b": (16, -20, 471, 658),
        "c": (38, -15, 390, 398),
        "d": (32, -18, 487, 658),
        "e": (38, -12, 392, 401),
        "f": (46, 0, 402, 653),
        "g": (6, -257, 460, 400),
        "h": (14, -3, 497, 650),
        "i": (0, -2, 221, 639),
        "j": (20, -263, 153, 634),
        "k": (25, 0, 477, 654),
        "l": (4, 0, 227, 648),
        "m": (17, 0, 753, 417),
        "n": (17, 0, 500, 411),
        "o": (35, -13, 474, 400),
        "p": (11, -256, 474, 434),
        "q": (34, -255, 498, 412),
        "r": (18, -1, 332, 422),
        "s": (55, -15, 321, 404),
        "t": (27, -10, 295, 482),
        "u": (16, -9, 483, 383),
        "v": (-5, -20, 477, 387),
        "w": (-10, -22, 675, 385),
        "x": (13, 0, 444, 385),
        "y": (3, -246, 430, 386),
        "z": (26, -2, 389, 422),
        "braceleft": (138, -215, 410, 694),
        "bar": (228, -257, 271, 653),
        "braceright": (86, -215, 358, 694),
        "asciitilde": (73, 243, 593, 378),
        "bullet": (54, 208, 299, 453),
        "Euro": (-13, -13, 454, 640),
        "quotesinglbase": (45, -173, 188, 68),
        "florin": (0, -256, 615, 642),
        "quotedblbase": (31, -172, 406, 71),
        "ellipsis": (114, -9, 885, 96),
        "dagger": (0, -243, 422, 640),
        "daggerdbl": (15, -240, 411, 643),
        "circumflex": (71, 477, 286, 650),
        "perthousand": (35, -32, 987, 637),
        "Scaron": (37, -16, 437, 859),
        "guilsinglleft": (6, 6, 190, 393),
        "OE": (46, -8, 909, 629),
        "Zcaron": (35, -7, 608, 859),
        "quoteleft": (51, 393, 199, 637),
        "quoteright": (49, 393, 193, 636),
        "quotedblleft": (43, 392, 418, 635),
        "quotedblright": (35, 395, 412, 643),
        "endash": (-5, 168, 505, 213),
        "emdash": (-5, 168, 1005, 213),
        "tilde": (42, 504, 322, 604),
        "trademark": (14, 268, 963, 662),
        "scaron": (55, -15, 321, 650),
        "guilsinglright": (8, 7, 190, 395),
        "oe": (38, -16, 666, 400),
        "zcaron": (26, -2, 389, 650),
        "Ydieresis": (-9, -6, 664, 770),
        "exclamdown": (59, -240, 159, 408),
        "cent": (38, -168, 389, 580),
        "sterling": (29, -235, 591, 633),
        "currency": (98, 89, 564, 555),
        "yen": (-9, -6, 664, 629),
        "brokenbar": (228, -257, 271, 653),
        "section": (56, -243, 369, 641),
        "dieresis": (64, 515, 316, 600),
        "copyright": (33, -15, 726, 677),
        "ordfeminine": (13, 377, 264, 630),
        "guillemotleft": (5, 5, 365, 390),
        "logicalnot": (71, 180, 595, 461),
        "registered": (33, -15, 726, 677),
        "macron": (-5, 743, 505, 793),
        "degree": (47, 376, 348, 676),
        "plusminus": (70, -18, 595, 660),
        "twosuperior": (24, 305, 284, 635),
        "threesuperior": (35, 297, 274, 636),
        "acute": (119, 479, 284, 630),
        "mu": (22, -216, 497, 383),
        "paragraph": (-6, -215, 454, 662),
        "periodcentered": (115, 284, 217, 391),
        "cedilla": (0, -210, 146, 6),
        "onesuperior": (56, 305, 231, 635),
        "ordmasculine": (18, 376, 314, 630),
        "guillemotright": (0, 5, 360, 390),
        "onequarter": (56, -34, 785, 635),
        "onehalf": (56, -32, 776, 637),
        "threequarters": (35, -32, 791, 637),
        "questiondown": (16, -245, 302, 408),
        "Agrave": (-7, 0, 669, 837),
        "Aacute": (-7, 0, 669, 836),
        "Acircumflex": (-7, 0, 669, 859),
        "Atilde": (-7, 0, 669, 785),
        "Adieresis": (-7, 0, 669, 770),
        "Aring": (-7, 0, 669, 807),
        "AE": (-62, -4, 828, 627),
        "Ccedilla": (43, -210, 601, 640),
        "Egrave": (23, -6, 632, 837),
        "Eacute": (23, -6, 632, 836),
        "Ecircumflex": (23, -6, 632, 859),
        "Edieresis": (23, -6, 632, 770),
        "Igrave": (20, 0, 324, 837),
        "Iacute": (20, 0, 324, 836),
        "Icircumflex": (20, 0, 324, 859),
        "Idieresis": (20, 0, 324, 770),
        "Eth": (7, -8, 722, 635),
        "Ntilde": (12, -22, 732, 785),
        "Ograve": (45, -9, 733, 837),
        "Oacute": (45, -9, 733, 836),
        "Ocircumflex": (45, -9, 733, 859),
        "Otilde": (45, -9, 733, 785),
        "Odieresis": (45, -9, 733, 770),
        "multiply": (96, 73, 571, 548),
        "Oslash": (45, -30, 733, 651),
        "Ugrave": (18, -16, 675, 837),
        "Uacute": (18, -16, 675, 836),
        "Ucircumflex": (18, -16, 675, 859),
        "Udieresis": (18, -16, 675, 770),
        "Yacute": (-9, -6, 664, 836),
        "Thorn": (18, -9, 536, 625),
        "germandbls": (7, -15, 469, 643),
        "agrave": (32, -11, 399, 631),
        "aacute": (32, -11, 399, 630),
        "acircumflex": (32, -11, 399, 650),
        "atilde": (32, -11, 399, 604),
        "adieresis": (32, -11, 399, 600),
        "aring": (32, -11, 399, 614),
        "ae": (36, -15, 561, 399),
        "ccedilla": (38, -210, 390, 398),
        "egrave": (38, -12, 392, 631),
        "eacute": (38, -12, 392, 630),
        "ecircumflex": (38, -12, 392, 650),
        "edieresis": (38, -12, 392, 600),
        "igrave": (-1, -2, 219, 631),
        "iacute": (-1, -2, 231, 630),
        "icircumflex": (-1, -2, 224, 650),
        "idieresis": (-1, -2, 250, 600),
        "eth": (44, -11, 485, 642),
        "ntilde": (17, 0, 500, 604),
        "ograve": (35, -13, 474, 631),
        "oacute": (35, -13, 474, 630),
        "ocircumflex": (35, -13, 474, 650),
        "otilde": (35, -13, 474, 604),
        "odieresis": (35, -13, 474, 600),
        "divide": (11, 136, 537, 524),
        "oslash": (38, -23, 476, 412),
        "ugrave": (16, -9, 483, 631),
        "uacute": (16, -9, 483, 630),
        "ucircumflex": (16, -9, 483, 650),
        "udieresis": (16, -9, 483, 600),
        "yacute": (3, -246, 430, 630),
        "thorn": (11, -256, 474, 648),
        "ydieresis": (3, -246, 430, 600),
    },
    "Garamond,Bold": {
        "space": (0, 0, 0, 0),
        "exclam": (61, -8, 202, 649),
        "quotedbl": (85, 352, 465, 677),
        "numbersign": (41, -21, 625, 675),
        "dollar": (39, -94, 437, 635),
        "percent": (31, -12, 800, 653),
        "ampersand": (45, -10, 762, 613),
        "quotesingle": (68, 352, 212, 677),
        "parenleft": (68, -236, 350, 647),
        "parenright": (11, -236, 294, 647),
        "asterisk": (32, 213, 457, 649),
        "plus": (65, 50, 601, 584),
        "comma": (45, -179, 221, 134),
        "hyphen": (34, 158, 302, 251),
        "period": (61, -8, 202, 132),
        "slash": (57, -135, 495, 696),
        "zero": (27, -10, 438, 645),
        "one": (25, 3, 368, 644),
        "two": (19, 1, 449, 642),
        "three": (14, -13, 437, 642),
        "four": (23, -10, 445, 644),
        "five": (31, -10, 428, 641),
        "six": (29, -10, 439, 648),
        "seven": (34, -10, 430, 628),
        "eight": (42, -10, 434, 641),
        "nine": (30, -14, 442, 644),
        "colon": (57, -8, 199, 423),
        "semicolon": (48, -178, 224, 424),
        "less": (66, 59, 600, 576),
        "equal": (66, 164, 600, 471),
        "greater": (66, 59, 600, 576),
        "question": (48, -9, 375, 650),
        "at": (44, -215, 908, 677),
        "A": (-12, 3, 676, 647),
        "B": (35, 0, 627, 639),
        "C": (45, -6, 645, 649),
        "D": (24, 3, 736, 645),
        "E": (17, 0, 670, 635),
        "F": (29, 0, 585, 638),
        "G": (45, -8, 711, 646),
        "H": (31, 4, 826, 639),
        "I": (40, 1, 352, 639),
        "J": (-58, -235, 345, 638),
        "K": (26, 2, 709, 639),
        "L": (19, 1, 632, 641),
        "M": (20, 0, 894, 637),
        "N": (3, -13, 814, 636),
        "O": (43, -5, 744, 647),
        "P": (23, 0, 587, 639),
        "Q": (43, -170, 750, 648),
        "R": (39, 1, 710, 640),
        "S": (49, -6, 476, 649),
        "T": (0, 1, 657, 664),
        "U": (17, -13, 718, 634),
        "V": (-11, -4, 675, 640),
        "W": (0, -14, 898, 633),
        "X": (4, 1, 687, 635),
        "Y": (-18, 2, 672, 635),
        "Z": (21, 1, 620, 660),
        "bracketleft": (122, -225, 340, 631),
        "backslash": (58, -135, 494, 696),
        "bracketright": (20, -224, 240, 633),
        "asciicircum": (73, 325, 511, 675),
        "underscore": (-5, -125, 505, -75),
        "grave": (59, 468, 242, 625),
        "a": (48, -2, 468, 415),
        "b": (20, -8, 516, 646),
        "c": (38, -7, 447, 419),
        "d": (38, -11, 543, 652),
        "e": (35, -8, 435, 418),
        "f": (26, 1, 393, 648),
        "g": (24, -250, 539, 415),
        "h": (18, 0, 540, 646),
        "i": (14, 2, 268, 645),
        "j": (21, -229, 199, 645),
        "k": (15, 0, 539, 647),
        "l": (3, 1, 260, 647),
        "m": (20, 3, 833, 434),
        "n": (19, 0, 539, 440),
        "o": (36, -8, 484, 418),
        "p": (-1, -246, 515, 447),
        "q": (38, -248, 545, 443),
        "r": (17, 3, 343, 437),
        "s": (43, -8, 374, 417),
        "t": (27, -1, 301, 497),
        "u": (20, -8, 536, 401),
        "v": (-6, -6, 466, 402),
        "w": (-6, -6, 717, 400),
        "x": (9, 2, 485, 400),
        "y": (-7, -237, 471, 400),
        "z": (29, 3, 426, 447),
        "braceleft": (80, -202, 351, 677),
        "bar": (231, -249, 309, 644),
        "braceright": (44, -202, 315, 677),
        "asciitilde": (67, 238, 599, 396),
        "bullet": (37, 190, 316, 469),
        "Euro": (-17, -5, 448, 649),
        "quotesinglbase": (40, -179, 216, 134),
        "florin": (0, -236, 708, 645),
        "quotedblbase": (43, -177, 457, 134),
        "ellipsis": (94, -7, 904, 135),
        "dagger": (14, -236, 486, 648),
        "daggerdbl": (21, -232, 479, 652),
        "circumflex": (32, 460, 322, 633),
        "perthousand": (31, -12, 998, 653),
        "Scaron": (49, -6, 476, 848),
        "guilsinglleft": (11, 13, 251, 402),
        "OE": (50, 0, 943, 646),
        "Zcaron": (21, 1, 620, 848),
        "quoteleft": (45, 326, 223, 640),
        "quoteright": (34, 326, 210, 639),
        "quotedblleft": (46, 325, 461, 640),
        "quotedblright": (33, 326, 450, 639),
        "endash": (-5, 205, 505, 295),
        "emdash": (-5, 205, 1005, 295),
        "tilde": (10, 486, 334, 615),
        "trademark": (-1, 268, 1005, 662),
        "scaron": (43, -8, 374, 635),
        "guilsinglright": (22, 10, 262, 399),
        "oe": (36, -6, 699, 419),
        "zcaron": (29, 3, 428, 635),
        "Ydieresis": (-18, 2, 672, 822),
        "exclamdown": (58, -238, 199, 419),
        "cent": (27, -171, 436, 584),
        "sterling": (46, -229, 645, 647),
        "currency": (81, 78, 581, 578),
        "yen": (-18, 2, 672, 635),
        "brokenbar": (231, -249, 309, 644),
        "section": (41, -241, 463, 647),
        "dieresis": (33, 488, 319, 609),
        "copyright": (28, -15, 721, 677),
        "ordfeminine": (22, 393, 303, 645),
        "guillemotleft": (2, 12, 430, 396),
        "logicalnot": (65, 168, 601, 483),
        "registered": (28, -15, 721, 677),
        "macron": (-5, 682, 505, 732),
        "degree": (28, 337, 366, 675),
        "plusminus": (65, -23, 601, 676),
        "twosuperior": (23, 310, 287, 644),
        "threesuperior": (20, 302, 282, 644),
        "acute": (114, 467, 298, 625),
        "mu": (25, -186, 453, 401),
        "paragraph": (0, -215, 541, 662),
        "periodcentered": (96, 253, 237, 394),
        "cedilla": (43, -228, 291, 7),
        "onesuperior": (43, 311, 258, 645),
        "ordmasculine": (17, 389, 316, 647),
        "guillemotright": (17, 12, 444, 396),
        "onequarter": (46, -12, 804, 653),
        "onehalf": (46, -12, 805, 653),
        "threequarters": (23, -12, 804, 653),
        "questiondown": (42, -239, 369, 421),
        "Agrave": (-12, 3, 676, 837),
        "Aacute": (-12, 3, 676, 837),
        "Acircumflex": (-12, 3, 676, 846),
        "Atilde": (-12, 3, 676, 828),
        "Adieresis": (-12, 3, 676, 822),
        "Aring": (-12, 3, 676, 802),
        "AE": (-44, -2, 841, 633),
        "Ccedilla": (45, -228, 645, 649),
        "Egrave": (17, 0, 670, 837),
        "Eacute": (17, 0, 670, 837),
        "Ecircumflex": (17, 0, 670, 846),
        "Edieresis": (17, 0, 670, 822),
        "Igrave": (40, 1, 352, 837),
        "Iacute": (40, 1, 352, 837),
        "Icircumflex": (40, 1, 354, 846),
        "Idieresis": (40, 1, 352, 822),
        "Eth": (24, 3, 736, 645),
        "Ntilde": (3, -13, 814, 828),
        "Ograve": (43, -5, 744, 837),
        "Oacute": (43, -5, 744, 837),
        "Ocircumflex": (43, -5, 744, 846),
        "Otilde": (43, -5, 744, 828),
        "Odieresis": (43, -5, 744, 822),
        "multiply": (85, 70, 582, 565),
        "Oslash": (43, -7, 744, 650),
        "Ugrave": (17, -13, 718, 837),
        "Uacute": (17, -13, 718, 837),
        "Ucircumflex": (17, -13, 718, 846),
        "Udieresis": (17, -13, 718, 822),
        "Yacute": (-18, 2, 672, 837),
        "Thorn": (23, 0, 588, 639),
        "germandbls": (17, -1, 514, 647),
        "agrave": (48, -2, 468, 625),
        "aacute": (48, -2, 468, 625),
        "acircumflex": (48, -2, 468, 633),
        "atilde": (48, -2, 468, 615),
        "adieresis": (48, -2, 468, 609),
        "aring": (48, -2, 468, 629),
        "ae": (41, -8, 664, 416),
        "ccedilla": (38, -228, 447, 419),
        "egrave": (35, -8, 435, 625),
        "eacute": (35, -8, 435, 625),
        "ecircumflex": (35, -8, 435, 633),
        "edieresis": (35, -8, 435, 609),
        "igrave": (16, 2, 268, 625),
        "iacute": (16, 2, 271, 625),
        "icircumflex": (5, 2, 296, 633),
        "idieresis": (7, 2, 292, 609),
        "eth": (33, -8, 482, 648),
        "ntilde": (19, 0, 539, 615),
        "ograve": (36, -8, 484, 625),
        "oacute": (36, -8, 484, 625),
        "ocircumflex": (36, -8, 484, 633),
        "otilde": (36, -8, 484, 615),
        "odieresis": (36, -8, 484, 609),
        "divide": (65, 69, 601, 569),
        "oslash": (36, -38, 485, 449),
        "ugrave": (20, -8, 536, 625),
        "uacute": (20, -8, 536, 625),
        "ucircumflex": (20, -8, 536, 633),
        "udieresis": (20, -8, 536, 609),
        "yacute": (-7, -237, 471, 625),
        "thorn": (-1, -246, 515, 647),
        "ydieresis": (-7, -237, 471, 609),
    },
    "Garamond,Italic": {
        "space": (0, 0, 0, 0),
        "exclam": (49, -11, 299, 623),
        "quotedbl": (124, 392, 465, 677),
        "numbersign": (81, -22, 656, 666),
        "dollar": (11, -105, 460, 629),
        "percent": (71, -32, 734, 633),
        "ampersand": (91, -9, 978, 655),
        "quotesingle": (131, 392, 261, 677),
        "parenleft": (95, -255, 428, 651),
        "parenright": (-78, -253, 257, 652),
        "asterisk": (95, 245, 490, 631),
        "plus": (105, 49, 630, 572),
        "comma": (-17, -160, 154, 119),
        "hyphen": (51, 169, 269, 219),
        "period": (41, -14, 142, 93),
        "slash": (56, -135, 443, 696),
        "zero": (52, -11, 471, 633),
        "one": (148, 0, 407, 631),
        "two": (16, 0, 485, 632),
        "three": (21, -11, 453, 632),
        "four": (16, 0, 443, 631),
        "five": (15, -11, 499, 640),
        "six": (56, -11, 505, 633),
        "seven": (81, -11, 518, 613),
        "eight": (45, -13, 475, 631),
        "nine": (28, -12, 478, 633),
        "colon": (42, -10, 238, 396),
        "semicolon": (0, -157, 251, 398),
        "less": (106, 69, 629, 551),
        "equal": (106, 175, 630, 445),
        "greater": (106, 69, 629, 551),
        "question": (110, -12, 416, 635),
        "at": (47, -215, 896, 694),
        "A": (-55, -8, 746, 641),
        "B": (12, -7, 544, 640),
        "C": (70, -15, 702, 646),
        "D": (18, -6, 734, 639),
        "E": (-2, -8, 673, 636),
        "F": (7, -8, 648, 640),
        "G": (70, -16, 708, 641),
        "H": (16, -7, 833, 639),
        "I": (7, -8, 393, 640),
        "J": (-117, -248, 390, 639),
        "K": (14, -8, 677, 637),
        "L": (1, -4, 674, 632),
        "M": (-25, -19, 883, 646),
        "N": (-9, -18, 865, 640),
        "O": (81, -13, 674, 648),
        "P": (12, -6, 574, 643),
        "Q": (-97, -235, 690, 643),
        "R": (30, -5, 673, 636),
        "S": (28, -15, 523, 645),
        "T": (69, -10, 682, 652),
        "U": (115, -15, 784, 641),
        "V": (118, -19, 925, 638),
        "W": (106, -18, 1003, 637),
        "X": (-10, -8, 826, 645),
        "Y": (71, -3, 760, 643),
        "Z": (41, 0, 631, 635),
        "bracketleft": (47, -229, 479, 625),
        "backslash": (55, -135, 444, 696),
        "bracketright": (-104, -229, 322, 625),
        "asciicircum": (67, 382, 504, 670),
        "underscore": (-5, -125, 505, -75),
        "grave": (194, 461, 357, 612),
        "a": (38, -12, 426, 387),
        "b": (66, -14, 429, 646),
        "c": (48, -10, 334, 400),
        "d": (44, -20, 509, 656),
        "e": (50, -16, 315, 395),
        "f": (-182, -256, 434, 642),
        "g": (-92, -246, 380, 400),
        "h": (35, -16, 422, 649),
        "i": (37, -11, 291, 621),
        "j": (-216, -245, 284, 606),
        "k": (32, -23, 512, 645),
        "l": (35, -13, 334, 649),
        "m": (24, -13, 649, 396),
        "n": (45, -14, 434, 403),
        "o": (55, -11, 354, 399),
        "p": (-141, -252, 409, 516),
        "q": (38, -252, 450, 402),
        "r": (55, -11, 397, 400),
        "s": (25, -8, 331, 399),
        "t": (38, -8, 335, 522),
        "u": (38, -12, 452, 400),
        "v": (52, -15, 379, 407),
        "w": (35, -18, 577, 401),
        "x": (8, -9, 556, 397),
        "y": (-215, -243, 350, 399),
        "z": (58, -253, 486, 399),
        "braceleft": (138, -215, 410, 694),
        "bar": (263, -246, 307, 641),
        "braceright": (133, -215, 406, 694),
        "asciitilde": (108, 243, 628, 377),
        "bullet": (102, 208, 347, 453),
        "Euro": (44, -16, 611, 645),
        "quotesinglbase": (7, -137, 151, 119),
        "florin": (0, -256, 615, 642),
        "quotedblbase": (6, -162, 357, 95),
        "ellipsis": (114, -9, 886, 96),
        "dagger": (84, -242, 499, 644),
        "daggerdbl": (-18, -254, 499, 654),
        "circumflex": (163, 439, 390, 622),
        "perthousand": (70, -32, 891, 633),
        "Scaron": (28, -15, 600, 856),
        "guilsinglleft": (61, -5, 317, 404),
        "OE": (80, -4, 963, 642),
        "Zcaron": (41, 0, 648, 853),
        "quoteleft": (177, 386, 326, 650),
        "quoteright": (152, 393, 297, 650),
        "quotedblleft": (188, 385, 536, 646),
        "quotedblright": (146, 388, 495, 645),
        "endash": (-5, 168, 505, 213),
        "emdash": (-5, 168, 1005, 213),
        "tilde": (158, 489, 437, 589),
        "trademark": (61, 268, 1010, 662),
        "scaron": (25, -8, 455, 624),
        "guilsinglright": (-19, -7, 236, 404),
        "oe": (52, -11, 493, 398),
        "zcaron": (58, -253, 522, 624),
        "Ydieresis": (71, -3, 760, 786),
        "exclamdown": (-17, -227, 232, 408),
        "cent": (-7, -121, 351, 534),
        "sterling": (31, -235, 593, 633),
        "currency": (133, 89, 600, 555),
        "yen": (45, -9, 741, 638),
        "brokenbar": (263, -246, 307, 641),
        "section": (-4, -227, 464, 644),
        "dieresis": (179, 494, 422, 574),
        "copyright": (81, -15, 773, 677),
        "ordfeminine": (103, 392, 365, 638),
        "guillemotleft": (52, -7, 458, 403),
        "logicalnot": (106, 180, 630, 461),
        "registered": (81, -15, 773, 677),
        "macron": (80, 669, 591, 719),
        "degree": (104, 378, 404, 678),
        "plusminus": (105, -18, 630, 660),
        "twosuperior": (49, 303, 338, 632),
        "threesuperior": (52, 297, 319, 632),
        "acute": (242, 460, 404, 611),
        "mu": (-62, -215, 481, 383),
        "paragraph": (-6, -215, 454, 662),
        "periodcentered": (162, 263, 264, 371),
        "cedilla": (23, -223, 147, 7),
        "onesuperior": (127, 303, 293, 632),
        "ordmasculine": (115, 392, 321, 645),
        "guillemotright": (-12, -6, 394, 404),
        "onequarter": (127, -32, 729, 633),
        "onehalf": (127, -32, 754, 633),
        "threequarters": (52, -32, 729, 633),
        "questiondown": (-4, -237, 301, 409),
        "Agrave": (-55, -8, 762, 845),
        "Aacute": (-55, -8, 853, 845),
        "Acircumflex": (-55, -8, 827, 861),
        "Atilde": (-55, -8, 890, 801),
        "Adieresis": (-55, -8, 844, 786),
        "Aring": (-55, -8, 758, 791),
        "AE": (-32, -6, 869, 637),
        "Ccedilla": (70, -226, 702, 646),
        "Egrave": (-2, -8, 673, 845),
        "Eacute": (-2, -8, 673, 845),
        "Ecircumflex": (-2, -8, 673, 861),
        "Edieresis": (-2, -8, 673, 786),
        "Igrave": (7, -8, 393, 845),
        "Iacute": (7, -8, 408, 845),
        "Icircumflex": (7, -8, 393, 861),
        "Idieresis": (7, -8, 446, 786),
        "Eth": (33, -6, 750, 639),
        "Ntilde": (-9, -18, 865, 801),
        "Ograve": (81, -13, 674, 845),
        "Oacute": (81, -13, 674, 845),
        "Ocircumflex": (81, -13, 674, 861),
        "Otilde": (81, -13, 674, 801),
        "Odieresis": (81, -13, 674, 786),
        "multiply": (131, 73, 606, 548),
        "Oslash": (81, -16, 674, 650),
        "Ugrave": (115, -15, 784, 845),
        "Uacute": (115, -15, 784, 845),
        "Ucircumflex": (115, -15, 784, 861),
        "Udieresis": (115, -15, 784, 786),
        "Yacute": (71, -3, 760, 845),
        "Thorn": (22, -6, 556, 642),
        "germandbls": (-145, -250, 538, 648),
        "agrave": (38, -12, 445, 612),
        "aacute": (38, -12, 444, 611),
        "acircumflex": (38, -12, 429, 622),
        "atilde": (38, -12, 476, 589),
        "adieresis": (38, -12, 495, 574),
        "aring": (38, -12, 426, 616),
        "ae": (26, -13, 514, 406),
        "ccedilla": (-7, -223, 334, 400),
        "egrave": (50, -16, 335, 612),
        "eacute": (50, -16, 382, 611),
        "ecircumflex": (50, -16, 367, 622),
        "edieresis": (50, -16, 399, 574),
        "igrave": (38, -9, 302, 612),
        "iacute": (38, -9, 349, 611),
        "icircumflex": (38, -9, 341, 622),
        "idieresis": (38, -9, 378, 574),
        "eth": (58, -13, 425, 642),
        "ntilde": (45, -14, 536, 589),
        "ograve": (55, -11, 369, 612),
        "oacute": (55, -11, 416, 611),
        "ocircumflex": (55, -11, 401, 622),
        "otilde": (55, -11, 448, 589),
        "odieresis": (55, -11, 433, 574),
        "divide": (106, 81, 630, 543),
        "oslash": (43, -10, 373, 400),
        "ugrave": (38, -12, 452, 612),
        "uacute": (38, -12, 455, 611),
        "ucircumflex": (38, -12, 452, 622),
        "udieresis": (38, -12, 472, 574),
        "yacute": (-215, -243, 404, 611),
        "thorn": (-141, -252, 409, 648),
        "ydieresis": (-215, -243, 363, 574),
    },
}


================================================
FILE: babeldoc/format/pdf/converter.py
================================================
import logging
import re
import unicodedata

import numpy as np
from pymupdf import Font

from babeldoc.format.pdf.document_il.frontend.il_creater import ILCreater
from babeldoc.pdfminer.converter import PDFConverter
from babeldoc.pdfminer.layout import LTChar
from babeldoc.pdfminer.layout import LTComponent
from babeldoc.pdfminer.layout import LTCurve
from babeldoc.pdfminer.layout import LTFigure
from babeldoc.pdfminer.layout import LTLine
from babeldoc.pdfminer.layout import LTPage
from babeldoc.pdfminer.layout import LTText
from babeldoc.pdfminer.pdfcolor import PDFColorSpace
from babeldoc.pdfminer.pdffont import PDFCIDFont
from babeldoc.pdfminer.pdffont import PDFFont
from babeldoc.pdfminer.pdffont import PDFUnicodeNotDefined
from babeldoc.pdfminer.pdfinterp import PDFGraphicState
from babeldoc.pdfminer.pdfinterp import PDFResourceManager
from babeldoc.pdfminer.utils import Matrix
from babeldoc.pdfminer.utils import apply_matrix_pt
from babeldoc.pdfminer.utils import bbox2str
from babeldoc.pdfminer.utils import matrix2str
from babeldoc.pdfminer.utils import mult_matrix

log = logging.getLogger(__name__)


class PDFConverterEx(PDFConverter):
    """PDFConverter variant that reports page data to an ILCreater.

    Page geometry and page numbers are forwarded to ``il_creater``; every
    rendered glyph is wrapped in an ``AWLTChar`` that additionally carries
    the font id, the current xobject id and a render order.
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        il_creater: ILCreater | None = None,
    ) -> None:
        """Initialise the base converter and remember ``il_creater``.

        The base class is initialised with no output stream, the "utf-8"
        codec, page number 1 and no layout parameters.
        """
        PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None)
        self.il_creater = il_creater

    def begin_page(self, page, ctm) -> None:
        """Start a new page, using the transformed crop box as its media box.

        Both crop-box corners are mapped through ``ctm``; the resulting box is
        anchored at the origin and only keeps the absolute width and height.
        """
        left, bottom, right, top = page.cropbox
        ax, ay = apply_matrix_pt(ctm, (left, bottom))
        bx, by = apply_matrix_pt(ctm, (right, top))
        mediabox = (0, 0, abs(ax - bx), abs(ay - by))
        # Report the page geometry and number before creating the layout page.
        self.il_creater.on_page_media_box(*mediabox)
        self.il_creater.on_page_number(page.pageno)
        self.cur_item = LTPage(page.pageno, mediabox)

    def end_page(self, _page) -> None:
        """Finish the page and return whatever ``receive_layout`` yields.

        Note: a value is returned even though the annotation says ``None``.
        """
        finished_page = self.cur_item
        return self.receive_layout(finished_page)

    def begin_figure(self, name, bbox, matrix) -> None:
        """Open a nested figure that inherits the page id of its parent."""
        parent = self.cur_item
        self._stack.append(parent)
        figure = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
        self.cur_item = figure
        figure.pageid = parent.pageid

    def end_figure(self, _: str) -> None:
        """Close the current figure and return the result of ``receive_layout``.

        Raises:
            ValueError: if the current item is not an ``LTFigure``.
        """
        fig = self.cur_item
        if not isinstance(fig, LTFigure):
            raise ValueError(f"Unexpected item type: {type(self.cur_item)}")
        # Restore the enclosing container and attach the finished figure to it.
        enclosing = self._stack.pop()
        self.cur_item = enclosing
        enclosing.add(fig)
        return self.receive_layout(fig)

    def render_char(
        self,
        matrix,
        font,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Create an ``AWLTChar`` for ``cid`` and add it to the current item.

        Returns:
            The advance (``adv``) of the created character item.

        Raises:
            TypeError: if ``font.to_unichr`` returns something that is not a str.
        """
        try:
            text = font.to_unichr(cid)
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        else:
            if not isinstance(text, str):
                raise TypeError(f"Expected string, got {type(text)}")

        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)

        # Resolve the font id: a temporary id on the font wins; otherwise look
        # the font up by its xobj_id, falling back to a placeholder.
        font_id = font.font_id_temp
        if font_id is None:
            if hasattr(font, "xobj_id"):
                font_id = self.il_creater.current_page_font_name_id_map.get(
                    font.xobj_id, None
                )
            else:
                log.debug(
                    f"Font {font.fontname} does not have xobj_id attribute.",
                )
                font_id = "UNKNOW"

        current_xobj_id = self.il_creater.xobj_id
        render_order = self.il_creater.get_render_order_and_increase()
        item = AWLTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
            current_xobj_id,
            font_id,
            render_order,
        )
        self.cur_item.add(item)
        # Hack: attach the original character code and font object to the item.
        item.cid = cid
        item.font = font
        return item.adv


class AWLTChar(LTChar):
    """Actual letter in the text as a Unicode string.

    Besides the text and bounding box, each instance stores the xobject id,
    the font id (``aw_font_id``) and the render order it was created with.
    """

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: float | tuple[float | None, float],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
        xobj_id: int,
        font_id: str,
        render_order: int,
    ) -> None:
        """Store the glyph attributes and compute its bounding box and size."""
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.xobj_id = xobj_id
        self.adv = textwidth * fontsize * scaling
        self.aw_font_id = font_id
        self.render_order = render_order

        # Glyph-space corners of the boundary rectangle.
        vertical = font.is_vertical()
        if vertical:
            assert isinstance(textdisp, tuple)
            vx, vy = textdisp
            # A missing horizontal displacement defaults to half the font size.
            vx = fontsize * 0.5 if vx is None else vx * fontsize * 0.001
            vy = (1000 - vy) * fontsize * 0.001
            corner_lo = (-vx, vy + rise + self.adv)
            corner_hi = (-vx + fontsize, vy + rise)
        else:
            descent = font.get_descent() * fontsize
            baseline = descent + rise
            corner_lo = (0, baseline)
            corner_hi = (self.adv, baseline + fontsize)

        a, b, c, d, _e, _f = self.matrix
        self.upright = (a * d * scaling > 0) and (b * c <= 0)

        # Map both corners through the matrix, then order the coordinates so
        # that (x0, y0) is the lower-left and (x1, y1) the upper-right corner.
        x0, y0 = apply_matrix_pt(self.matrix, corner_lo)
        x1, y1 = apply_matrix_pt(self.matrix, corner_hi)
        if x1 < x0:
            x0, x1 = x1, x0
        if y1 < y0:
            y0, y1 = y1, y0
        LTComponent.__init__(self, (x0, y0, x1, y1))

        # The size is the box width for vertical fonts or when the first
        # matrix component is zero; otherwise it is the box height.
        self.size = self.width if (vertical or matrix[0] == 0) else self.height

    def __repr__(self) -> str:
        """Return a debug string with bbox, matrix, font name, advance and text."""
        return f"<{self.__class__.__name__} {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)} font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>"

    def get_text(self) -> str:
        """Return the stored text of this character."""
        return self._text


class Paragraph:
    """Plain holder for the layout attributes of one paragraph.

    Attributes:
        y: initial vertical coordinate.
        x: initial horizontal coordinate.
        x0: left boundary.
        x1: right boundary.
        size: font size.
        brk: line-break flag.
    """

    def __init__(
        self,
        y: float,
        x: float,
        x0: float,
        x1: float,
        size: float,
        brk: bool,
    ) -> None:
        # Starting position.
        self.y, self.x = y, x
        # Horizontal extent (left, right).
        self.x0, self.x1 = x0, x1
        # Font size and line-break flag.
        self.size, self.brk = size, brk


# fmt: off
class TranslateConverter(PDFConverterEx):
    """Converter that hands every parsed page element to the IL creator.

    ``receive_layout`` walks the children of a page and dispatches characters,
    figures and curves to the ``il_creater`` callbacks. No content-stream
    operators are produced here.
    """

    def __init__(
        self,
        rsrcmgr,
        vfont: str | None = None,
        vchar: str | None = None,
        thread: int = 0,
        layout: dict | None = None,
        lang_in: str = "",  # kept for compatibility; not used here
        _lang_out: str = "",  # unused
        _service: str = "",  # unused
        resfont: str = "",
        noto: Font | None = None,
        envs: dict | None = None,
        _prompt: list | None = None,  # unused
        il_creater: ILCreater | None = None,
    ):
        """Store the converter options and initialise the base converter.

        Args:
            rsrcmgr: Resource manager forwarded to the base class.
            vfont: Stored on the instance as ``self.vfont``.
            vchar: Stored on the instance as ``self.vchar``.
            thread: Stored on the instance as ``self.thread``.
            layout: Per-page layout mapping; ``None`` (or any falsy value)
                is replaced by an empty dict.
            lang_in, _lang_out, _service, envs, _prompt: Accepted for
                signature compatibility and ignored by this constructor.
            resfont: Stored on the instance as ``self.resfont``.
            noto: Stored on the instance as ``self.noto``.
            il_creater: Forwarded to the base class constructor.
        """
        layout = layout or {}
        super().__init__(rsrcmgr, il_creater)
        self.vfont = vfont
        self.vchar = vchar
        self.thread = thread
        self.layout = layout
        self.resfont = resfont
        self.noto = noto

    def receive_layout(self, ltpage: LTPage):
        """Dispatch each child of ``ltpage`` to the matching IL creator hook.

        * ``LTChar``   -> ``il_creater.on_lt_char``; any exception raised by
          the hook is logged and swallowed so one bad character does not
          abort the page.
        * ``LTFigure`` -> ``il_creater.on_pdf_figure``
        * ``LTCurve``  -> ``il_creater.on_lt_curve``

        Children of any other type are ignored. Always returns ``None``.
        """
        for child in ltpage:
            if isinstance(child, LTChar):
                try:
                    self.il_creater.on_lt_char(child)
                except Exception:
                    log.exception(
                        'Error processing LTChar',
                    )
            elif isinstance(child, LTFigure):
                self.il_creater.on_pdf_figure(child)
            elif isinstance(child, LTCurve):
                self.il_creater.on_lt_curve(child)
        return


================================================
FILE: babeldoc/format/pdf/document_il/__init__.py
================================================
from babeldoc.format.pdf.document_il.il_version_1 import BaseOperations
from babeldoc.format.pdf.document_il.il_version_1 import Box
from babeldoc.format.pdf.document_il.il_version_1 import Cropbox
from babeldoc.format.pdf.document_il.il_version_1 import Document
from babeldoc.format.pdf.document_il.il_version_1 import GraphicState
from babeldoc.format.pdf.document_il.il_version_1 import Mediabox
from babeldoc.format.pdf.document_il.il_version_1 import Page
from babeldoc.format.pdf.document_il.il_version_1 import PageLayout
from babeldoc.format.pdf.document_il.il_version_1 import PdfAffineTransform
from babeldoc.format.pdf.document_il.il_version_1 import PdfCharacter
from babeldoc.format.pdf.document_il.il_version_1 import PdfCurve
from babeldoc.format.pdf.document_il.il_version_1 import PdfFigure
from babeldoc.format.pdf.document_il.il_version_1 import PdfFont
from babeldoc.format.pdf.document_il.il_version_1 import PdfFontCharBoundingBox
from babeldoc.format.pdf.document_il.il_version_1 import PdfForm
from babeldoc.format.pdf.document_il.il_version_1 import PdfFormSubtype
from babeldoc.format.pdf.document_il.il_version_1 import PdfFormula
from babeldoc.format.pdf.document_il.il_version_1 import PdfInlineForm
from babeldoc.format.pdf.document_il.il_version_1 import PdfLine
from babeldoc.format.pdf.document_il.il_version_1 import PdfMatrix
from babeldoc.format.pdf.document_il.il_version_1 import PdfOriginalPath
from babeldoc.format.pdf.document_il.il_version_1 import PdfParagraph
from babeldoc.format.pdf.document_il.il_version_1 import PdfParagraphComposition
from babeldoc.format.pdf.document_il.il_version_1 import PdfPath
from babeldoc.format.pdf.document_il.il_version_1 import PdfRectangle
from babeldoc.format.pdf.document_il.il_version_1 import PdfSameStyleCharacters
from babeldoc.format.pdf.document_il.il_version_1 import PdfSameStyleUnicodeCharacters
from babeldoc.format.pdf.document_il.il_version_1 import PdfStyle
from babeldoc.format.pdf.document_il.il_version_1 import PdfXobject
from babeldoc.format.pdf.document_il.il_version_1 import PdfXobjForm
from babeldoc.format.pdf.document_il.il_version_1 import VisualBbox

# Public names of this package: every entry is re-exported from
# babeldoc.format.pdf.document_il.il_version_1 by the imports above.
__all__ = [
    "BaseOperations",
    "Box",
    "Cropbox",
    "Document",
    "GraphicState",
    "Mediabox",
    "Page",
    "PageLayout",
    "PdfAffineTransform",
    "PdfCharacter",
    "PdfCurve",
    "PdfFigure",
    "PdfFont",
    "PdfFontCharBoundingBox",
    "PdfForm",
    "PdfFormSubtype",
    "PdfFormula",
    "PdfInlineForm",
    "PdfLine",
    "PdfMatrix",
    "PdfOriginalPath",
    "PdfParagraph",
    "PdfParagraphComposition",
    "PdfPath",
    "PdfRectangle",
    "PdfSameStyleCharacters",
    "PdfSameStyleUnicodeCharacters",
    "PdfStyle",
    "PdfXobjForm",
    "PdfXobject",
    "VisualBbox",
]


================================================
FILE: babeldoc/format/pdf/document_il/backend/__init__.py
================================================


================================================
FILE: babeldoc/format/pdf/document_il/backend/pdf_creater.py
================================================
import io
import itertools
import logging
import os
import re
import time
import unicodedata
from abc import ABC
from abc import abstractmethod
from multiprocessing import Process
from pathlib import Path

import freetype
import pymupdf
from bitstring import BitStream

from babeldoc.assets.embedding_assets_metadata import FONT_NAMES
from babeldoc.format.pdf.document_il import PdfOriginalPath
from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper
from babeldoc.format.pdf.document_il.utils.matrix_helper import matrix_to_bytes
from babeldoc.format.pdf.document_il.utils.zstd_helper import zstd_decompress
from babeldoc.format.pdf.translation_config import TranslateResult
from babeldoc.format.pdf.translation_config import TranslationConfig
from babeldoc.format.pdf.translation_config import WatermarkOutputMode

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Stage name strings -- presumably labels reported for progress tracking;
# verify against their usages elsewhere in this module.
SUBSET_FONT_STAGE_NAME = "Subset font"
SAVE_PDF_STAGE_NAME = "Save PDF"


class RenderUnit(ABC):
    """Base class for anything that can be drawn into a ``BitStream``.

    Units are ordered by ``(render_order, sub_render_order)``. Passing
    ``None`` for either value replaces it with a very large integer.
    """

    def __init__(
        self,
        render_order: int,
        sub_render_order: int = 0,
        xobj_id: str | None = None,
    ):
        # A missing order falls back to a huge sentinel value.
        self.render_order = (
            9999999999999999 if render_order is None else render_order
        )
        self.sub_render_order = (
            9999999999999999 if sub_render_order is None else sub_render_order
        )
        self.xobj_id = xobj_id

    @abstractmethod
    def render(
        self,
        draw_op: BitStream,
        context: "RenderContext",
    ) -> None:
        """Append this unit's drawing operators to ``draw_op``."""

    def get_sort_key(self) -> tuple[int, int]:
        """Return ``(render_order, sub_render_order)`` for sorting units."""
        return self.render_order, self.sub_render_order


class CharacterRenderUnit(RenderUnit):
    """Render unit that draws a single PDF character."""

    def __init__(
        self,
        char: il_version_1.PdfCharacter,
        render_order: int,
        sub_render_order: int = 0,
    ):
        super().__init__(render_order, sub_render_order, char.xobj_id)
        self.char = char

    def render(self, draw_op: BitStream, context: "RenderContext") -> None:
        """Append the operators that show this character to ``draw_op``.

        Nothing is written when the character is a newline, has no
        ``pdf_character_id``, fails the optional font-availability check, or
        uses a font whose encoding length is unknown. All of these checks run
        before the first operator is emitted, so an early exit never leaves
        an unbalanced ``q`` / ``BT`` in the stream.
        """
        char = self.char
        if char.char_unicode == "\n":
            return
        if char.pdf_character_id is None:
            return

        char_size = char.pdf_style.font_size
        font_id = char.pdf_style.font_id

        # Pick the encoding-length map of the enclosing XObject if it has
        # one, otherwise fall back to the page-level map.
        if self.xobj_id in context.xobj_encoding_length_map:
            encoding_length_map = context.xobj_encoding_length_map[self.xobj_id]
        else:
            encoding_length_map = context.page_encoding_length_map

        # Optionally skip characters whose font is not available.
        if context.check_font_exists:
            if self.xobj_id in context.xobj_available_fonts:
                if font_id not in context.xobj_available_fonts[self.xobj_id]:
                    return
            elif font_id not in context.available_font_list:
                return

        # Resolve the encoding length BEFORE emitting anything: bailing out
        # after "q ... BT ... Tm" would leave the graphics state and the text
        # object unterminated.
        encoding_length = encoding_length_map.get(font_id, None)
        if encoding_length is None:
            if font_id in context.all_encoding_length_map:
                encoding_length = context.all_encoding_length_map[font_id]
            else:
                logger.debug(
                    f"Font {font_id} not found in encoding length map for page {context.page.page_number}"
                )
                return

        draw_op.append(b"q ")
        context.pdf_creator.render_graphic_state(draw_op, char.pdf_style.graphic_state)

        if char.vertical:
            # Text matrix "0 1 -1 0" is used for vertical characters,
            # anchored at (box.x2, box.y).
            draw_op.append(
                f"BT /{font_id} {char_size:f} Tf 0 1 -1 0 {char.box.x2:f} {char.box.y:f} Tm ".encode(),
            )
        else:
            draw_op.append(
                f"BT /{font_id} {char_size:f} Tf 1 0 0 1 {char.box.x:f} {char.box.y:f} Tm ".encode(),
            )

        # The character id is written as an upper-case hex string padded to
        # two hex digits per encoded byte.
        draw_op.append(
            f"<{char.pdf_character_id:0{encoding_length * 2}x}>".upper().encode(),
        )
        draw_op.append(b" Tj ET Q \n")


class FormRenderUnit(RenderUnit):
    """Render unit that draws a PDF form: an XObject reference or an inline image."""

    def __init__(
        self,
        form: il_version_1.PdfForm,
        render_order: int,
        sub_render_order: int = 0,
    ):
        super().__init__(render_order, sub_render_order, form.xobj_id)
        self.form = form

    def render(self, draw_op: BitStream, context: "RenderContext") -> None:
        """Append the operators that draw this form to ``draw_op``.

        The output is wrapped in ``q`` ... ``Q``. Inside it come the optional
        relocation matrix, the form matrix, the pass-through graphic-state
        instructions and finally either a ``Do`` call or an inline image.
        """
        form = self.form
        emit = draw_op.append
        emit(b"q ")

        # The relocation transform goes first (before the pass-through
        # instructions) so masks in passthrough_per_char_instruction use the
        # correct coordinate system.
        assert form.pdf_matrix is not None
        relocation = form.relocation_transform
        if relocation and len(relocation) == 6:
            try:
                emit(matrix_to_bytes(tuple(float(v) for v in relocation)))
            except (ValueError, TypeError):
                # Conversion failed: skip it and rely on the form matrix below.
                pass

        emit(matrix_to_bytes(form.pdf_matrix))
        emit(b" ")
        emit(form.graphic_state.passthrough_per_char_instruction.encode())
        emit(b" ")

        subtype = form.pdf_form_subtype
        assert subtype is not None
        if subtype.pdf_xobj_form:
            emit(f" /{subtype.pdf_xobj_form.do_args} Do ".encode())
        elif subtype.pdf_inline_form:
            self._render_inline_image(draw_op, subtype.pdf_inline_form)
        emit(b" Q\n")

    @staticmethod
    def _render_inline_image(draw_op: BitStream, inline_form) -> None:
        """Append a ``BI`` ... ``ID`` ... ``EI`` inline image for ``inline_form``."""
        draw_op.append(b" BI ")

        # Image parameters are stored as a JSON object; undecodable JSON is
        # ignored and the image is written without parameters.
        if inline_form.image_parameters:
            import json

            try:
                params = json.loads(inline_form.image_parameters)
            except json.JSONDecodeError:
                pass
            else:
                for key, value in params.items():
                    key = key.removeprefix("/")  # drop one leading slash
                    # Booleans (real or spelled "True"/"False") become the
                    # lower-case PDF keywords.
                    if isinstance(value, bool):
                        value = "true" if value else "false"
                    elif isinstance(value, str) and value in ("True", "False"):
                        value = value.lower()
                    draw_op.append(f"/{key} {value} ".encode())

        draw_op.append(b"ID ")

        # Image data is stored base64-encoded; data that fails to decode is
        # skipped (best effort).
        if inline_form.form_data:
            import base64

            try:
                draw_op.append(base64.b64decode(inline_form.form_data))
            except Exception:
                pass

        draw_op.append(b" EI ")


class RectangleRenderUnit(RenderUnit):
    """Render unit that draws a PDF rectangle, either filled or stroked."""

    def __init__(
        self,
        rectangle: il_version_1.PdfRectangle,
        render_order: int,
        sub_render_order: int = 0,
        line_width: float = 0.4,
    ):
        super().__init__(render_order, sub_render_order, rectangle.xobj_id)
        self.rectangle = rectangle
        self.line_width = line_width

    def render(self, draw_op: BitStream, context: "RenderContext") -> None:
        """Append a ``re`` path plus a fill (``f``) or stroke (``S``) to ``draw_op``."""
        rect = self.rectangle
        left = rect.box.x
        bottom = rect.box.y
        rect_width = rect.box.x2 - left
        rect_height = rect.box.y2 - bottom

        draw_op.append(b"q n ")
        draw_op.append(
            rect.graphic_state.passthrough_per_char_instruction.encode(),
        )

        # The rectangle's own line width wins over the unit default; a
        # non-positive width emits no "w" operator at all.
        stroke_width = (
            rect.line_width if rect.line_width is not None else self.line_width
        )
        if stroke_width > 0:
            draw_op.append(f" {stroke_width:.6f} w ".encode())

        draw_op.append(
            f"{left:.6f} {bottom:.6f} {rect_width:.6f} {rect_height:.6f} re ".encode()
        )
        draw_op.append(b" f " if rect.fill_background else b" S ")

        draw_op.append(b"Q\n")


class CurveRenderUnit(RenderUnit):
    """Render unit that draws a PDF curve (an arbitrary path)."""

    def __init__(
        self,
        curve: il_version_1.PdfCurve,
        render_order: int,
        sub_render_order: int = 0,
    ):
        super().__init__(render_order, sub_render_order, curve.xobj_id)
        self.curve = curve

    def render(self, draw_op: BitStream, context: "RenderContext") -> None:
        """Append the operators that paint this curve to ``draw_op``.

        The output is wrapped in ``q`` ... ``Q``. Inside it come the optional
        relocation matrix, the optional original CTM, the pass-through
        graphic-state instructions, and then the path followed by ``f`` /
        ``f*`` when filling and the path followed by ``S`` when stroking.
        """
        curve = self.curve
        draw_op.append(b"q n ")

        # Apply relocation transform first if present (before passthrough instructions)
        # This ensures masks in passthrough_per_char_instruction use the correct coordinate system
        if curve.relocation_transform and len(curve.relocation_transform) == 6:
            try:
                relocation_matrix = tuple(float(x) for x in curve.relocation_transform)
                draw_op.append(matrix_to_bytes(relocation_matrix))
            except (ValueError, TypeError):
                # If relocation transform conversion fails, skip it and use original CTM later
                pass

        draw_op.append(b" ")

        # Apply original CTM if present
        if curve.ctm and len(curve.ctm) == 6:
            ctm = curve.ctm
            draw_op.append(
                f"{ctm[0]:.6f} {ctm[1]:.6f} {ctm[2]:.6f} {ctm[3]:.6f} {ctm[4]:.6f} {ctm[5]:.6f} cm ".encode()
            )

        draw_op.append(b" ")

        draw_op.append(
            curve.graphic_state.passthrough_per_char_instruction.encode(),
        )

        draw_op.append(b" ")
        path_op = BitStream(b" ")

        # Use original path if available, otherwise fall back to transformed path
        path_to_use = (
            curve.pdf_original_path
            if curve.pdf_original_path is not None
            else curve.pdf_path
        )
        for path in path_to_use:
            if isinstance(path, PdfOriginalPath):
                # Original-path entries wrap the actual path segment.
                path = path.pdf_path
            if path.has_xy:
                path_op.append(f"{path.x:F} {path.y:F} {path.op} ".encode())
            else:
                path_op.append(f"{path.op} ".encode())

        if curve.fill_background:
            draw_op.append(path_op)
            # The even-odd marker is only meaningful as part of the fill
            # operator ("f*"); emitting a bare "*" without a preceding "f"
            # would put an unknown token into the content stream.
            draw_op.append(b" f* " if curve.evenodd else b" f ")
        else:
            draw_op.append(b" ")
        if curve.stroke_path:
            # The path is emitted again because the fill above consumed it.
            draw_op.append(path_op)
            draw_op.append(b"S ")

        draw_op.append(b" n Q\n")


class RenderContext:
    """Plain holder for the state shared by all render units of one page.

    Every constructor argument is stored unchanged under an attribute of the
    same name.
    """

    def __init__(
        self,
        pdf_creator: "PDFCreater",
        page: il_version_1.Page,
        available_font_list: set[str],
        page_encoding_length_map: dict[str, int],
        all_encoding_length_map: dict[str, int],
        xobj_available_fonts: dict[str, set[str]],
        xobj_encoding_length_map: dict[str, dict[str, int]],
        ctm_for_ops: bytes,
        check_font_exists: bool = False,
    ):
        # Owning creator and the page being rendered.
        self.pdf_creator = pdf_creator
        self.page = page
        self.ctm_for_ops = ctm_for_ops

        # Font availability, page-level and per XObject.
        self.check_font_exists = check_font_exists
        self.available_font_list = available_font_list
        self.xobj_available_fonts = xobj_available_fonts

        # Encoding-length lookups: per page, per XObject, and the global map.
        self.page_encoding_length_map = page_encoding_length_map
        self.xobj_encoding_length_map = xobj_encoding_length_map
        self.all_encoding_length_map = all_encoding_length_map


def to_int(src):
    """Return the first run of decimal digits found in ``src`` as an ``int``."""
    first_digits = re.search(r"\d+", src)
    return int(first_digits.group(0))


def parse_mapping(text):
    """Return every ``<hex>`` token found in the bytes ``text`` as an integer."""
    return [
        int(token.group("num"), 16)
        for token in re.finditer(rb"<(?P<num>[a-fA-F0-9]+)>", text)
    ]


def apply_normalization(cmap, gid, code):
    """Store ``code`` for ``gid`` in ``cmap``, normalizing look-alike CJK code points.

    Code points in the Kangxi Radicals range (U+2F00-U+2FD5) and the CJK
    Compatibility Ideographs block (U+F900-U+FAFF) are replaced by their
    normalized form; every other code point is stored unchanged.

    Args:
        cmap: Mapping that is updated in place.
        gid: Key to store under.
        code: Unicode code point (int) to store, possibly after normalization.
    """
    is_kangxi_radical = 0x2F00 <= code <= 0x2FD5
    is_cjk_compat_ideograph = 0xF900 <= code <= 0xFAFF
    if is_kangxi_radical or is_cjk_compat_ideograph:
        # Kangxi radicals only have *compatibility* decompositions, so the
        # canonical form NFD would leave them untouched; NFKD covers both
        # ranges (compatibility ideographs decompose canonically).
        norm = unicodedata.normalize("NFKD", chr(code))
        # ord() needs exactly one character; keep the original code point if
        # normalization ever expands to several.
        if len(norm) == 1:
            code = ord(norm)
    cmap[gid] = code


def batched(iterable, n, *, strict=False):
    """Yield consecutive tuples of at most *n* items taken from *iterable*.

    Example: ``batched('ABCDEFG', 3)`` → ``ABC DEF G``.

    Args:
        iterable: Any iterable to split.
        n: Batch size; must be at least 1.
        strict: When true, a final batch shorter than *n* raises instead of
            being yielded.

    Raises:
        ValueError: if ``n < 1``, or if *strict* is set and the last batch is
            incomplete. Being a generator, both are raised on iteration.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    source = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(source, n))
        if not chunk:
            return
        if strict and len(chunk) != n:
            raise ValueError("batched(): incomplete batch")
        yield chunk


def update_tounicode_cmap_pair(cmap, data):
    """Apply flattened ``(start, stop, value)`` triples to *cmap*.

    Each triple maps the inclusive glyph range ``start..stop`` onto consecutive
    code points beginning at ``value``.
    """
    for first_gid, last_gid, base_code in batched(data, 3):
        shift = base_code - first_gid
        for gid in range(first_gid, last_gid + 1):
            apply_normalization(cmap, gid, gid + shift)


def update_tounicode_cmap_code(cmap, data):
    """Apply flattened ``(gid, code)`` pairs to *cmap*, one glyph per pair."""
    for pair in batched(data, 2):
        gid, code = pair
        apply_normalization(cmap, gid, code)


def parse_tounicode_cmap(data):
    """Build a ``gid -> code point`` dict from the bytes of a ToUnicode CMap.

    ``bfrange`` sections are applied first and ``bfchar`` sections second, so
    a ``bfchar`` entry overrides a range entry for the same glyph. Only
    sections made purely of ``<hex>`` tokens are matched by the patterns.
    """
    bfrange_pattern = (
        rb"\s+beginbfrange\s*(?P<r>(<[0-9a-fA-F]+>\s*)+)endbfrange\s+"
    )
    bfchar_pattern = rb"\s+beginbfchar\s*(?P<c>(<[0-9a-fA-F]+>\s*)+)endbfchar"

    cmap = {}
    for section in re.finditer(bfrange_pattern, data):
        update_tounicode_cmap_pair(cmap, parse_mapping(section.group("r")))
    for section in re.finditer(bfchar_pattern, data):
        update_tounicode_cmap_code(cmap, parse_mapping(section.group("c")))
    return cmap


def parse_truetype_data(data):
    """Return the glyph indices of a font whose loaded outline has contours.

    Args:
        data: Raw font program bytes, opened through ``freetype.Face``.

    Returns:
        List of glyph indices (ascending) for which
        ``face.glyph.outline.contours`` is non-empty after loading the glyph.
    """
    face = freetype.Face(io.BytesIO(data))
    outlined_glyphs = []
    for glyph_index in range(face.num_glyphs):
        face.load_glyph(glyph_index)
        if not face.glyph.outline.contours:
            continue
        outlined_glyphs.append(glyph_index)
    return outlined_glyphs


# Fixed prologue of the ToUnicode CMap text emitted by make_tounicode(); it
# declares a single two-byte codespace range (<0000>..<FFFF>). The bfchar
# sections are inserted between this prologue and TOUNICODE_TAIL.
TOUNICODE_HEAD = """\
/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/CIDSystemInfo <</Registry(Adobe)/Ordering(UCS)/Supplement 0>> def
/CMapName /Adobe-Identity-UCS def
/CMapType 2 def
1 begincodespacerange
<0000> <FFFF>
endcodespacerange"""
# Fixed epilogue that closes the CMap opened by TOUNICODE_HEAD.
TOUNICODE_TAIL = """\
endcmap
CMapName currentdict /CMap defineresource pop
end
end"""


def make_tounicode(cmap, used):
    """Serialize a ToUnicode CMap covering only the glyphs listed in *used*.

    Args:
        cmap: Mapping ``gid -> code point``.
        used: Iterable of glyph ids to keep; ids missing from *cmap* are
            skipped.

    Returns:
        The CMap as text: the fixed header, ``bfchar`` sections of at most
        100 entries each, and the fixed trailer. Code points above the BMP
        are written as a UTF-16 surrogate pair.
    """
    entries = [(gid, cmap[gid]) for gid in used if gid in cmap]

    lines = [TOUNICODE_HEAD]
    for block in batched(entries, 100):
        lines.append(f"{len(block)} beginbfchar")
        for glyph, code in block:
            if code >= 0x10000:
                # Split a supplementary-plane code point into surrogates.
                offset = code - 0x10000
                high = 0xD800 + (offset >> 10)
                low = 0xDC00 + (offset & 0x3FF)
                target = f"{high:04x}{low:04x}"
            else:
                target = f"{code:04x}"
            lines.append(f"<{glyph:04x}><{target}>")
        lines.append("endbfchar")
    lines.append(TOUNICODE_TAIL)
    return "\n".join(lines)


def reproduce_one_font(doc, index):
    """Rewrite the ToUnicode CMap of one font so it lists only outlined glyphs.

    The font object at xref *index* is processed only when its ``ToUnicode``
    entry is an indirect reference and its ``DescendantFonts`` entry is an
    array. The existing CMap stream is parsed, filtered down to the glyphs
    that have contours in the embedded ``FontFile2`` program, and written back
    to the same stream.

    Args:
        doc: Document object providing ``xref_get_key``, ``xref_stream`` and
            ``update_stream``.
        index: xref number of the font dictionary.
    """
    m = doc.xref_get_key(index, "ToUnicode")
    f = doc.xref_get_key(index, "DescendantFonts")
    if m[0] != "xref" or f[0] != "array":
        return

    mi = to_int(m[1])
    fi = to_int(f[1])
    ff = doc.xref_get_key(fi, "FontDescriptor/FontFile2")
    if ff[0] != "xref":
        # Without an indirect FontFile2 reference there is no font program to
        # inspect; skip this font instead of failing inside to_int().
        logger.debug(f"Font xref {index} has no FontFile2 stream, skipped")
        return

    ms = doc.xref_stream(mi)
    fs = doc.xref_stream(to_int(ff[1]))
    cmap = parse_tounicode_cmap(ms)
    used = parse_truetype_data(fs)
    text = make_tounicode(cmap, used)
    doc.update_stream(mi, bytes(text, "U8"))


def reproduce_cmap(doc):
    """Regenerate the ToUnicode CMap of every matching TrueType font in *doc*.

    A font entry from ``page.get_fonts()`` is selected when its type field is
    ``"ttf"``, its name field is in ``FONT_NAMES`` and its reference-name
    field contains ``".ttf"``. Errors while listing a page's fonts are logged
    and that page is skipped.

    Returns:
        The same *doc* object, modified in place.
    """
    assert doc
    selected_fonts = set()
    for page in doc:
        try:
            for font in page.get_fonts():
                matches = (
                    font[1] == "ttf"
                    and font[3] in FONT_NAMES
                    and ".ttf" in font[4]
                )
                if matches:
                    selected_fonts.add(font)
        except Exception as e:
            logger.error(f"Error in getting page fonts: {e}")
    for font in selected_fonts:
        reproduce_one_font(doc, font[0])
    return doc


def _subset_fonts_process(pdf_path, output_path):
    """Subprocess entry point: subset the fonts of a PDF and save the result.

    The process always ends through ``os._exit``: status 0 on success,
    status 1 when an ``Exception`` was raised (after logging it).

    Args:
        pdf_path: Path to the PDF file to subset
        output_path: Path where to save the result
    """
    try:
        document = pymupdf.open(pdf_path)
        document.subset_fonts(fallback=False)
        document.save(output_path)
    except Exception as e:
        logger.error(f"Error in font subsetting subprocess: {e}")
        # Exit status 1 reports failure to the parent process.
        os._exit(1)
    else:
        # Exit status 0 reports success to the parent process.
        os._exit(0)


def _save_pdf_clean_process(
    pdf_path,
    output_path,
    garbage=1,
    deflate=True,
    clean=True,
    deflate_fonts=True,
    linear=False,
):
    """Subprocess entry point: re-save a PDF with the given save options.

    Saving with ``clean=True`` can be time-consuming, which is why it runs in
    a separate process. The process always ends through ``os._exit``:
    status 0 on success, status 1 when an ``Exception`` was raised.

    Args:
        pdf_path: Path to the PDF file to save
        output_path: Path where to save the result
        garbage: Garbage collection level (0, 1, 2, 3, 4)
        deflate: Whether to deflate the PDF
        clean: Whether to clean the PDF
        deflate_fonts: Whether to deflate fonts
        linear: Whether to linearize the PDF
    """
    save_options = {
        "garbage": garbage,
        "deflate": deflate,
        "clean": clean,
        "deflate_fonts": deflate_fonts,
        "linear": linear,
    }
    try:
        document = pymupdf.open(pdf_path)
        document.save(output_path, **save_options)
    except Exception as e:
        logger.error(f"Error in save PDF with clean=True subprocess: {e}")
        # Exit status 1 reports failure to the parent process.
        os._exit(1)
    else:
        # Exit status 0 reports success to the parent process.
        os._exit(0)


class PDFCreater:
    stage_name = "Generate drawing instructions"

    def __init__(
        self,
        original_pdf_path: str,
        document: il_version_1.Document,
        translation_config: TranslationConfig,
        mediabox_data: dict,
    ):
        """Store the inputs needed to render *document* back into a PDF.

        Args:
            original_pdf_path: Path of the PDF the output is built from.
            document: Intermediate-language document to render.
            translation_config: Configuration; also supplies the font path
                and is used to build the font mapper.
            mediabox_data: Page box data kept for ``restore_media_box``.
        """
        # Inputs stored as-is.
        self.original_pdf_path = original_pdf_path
        self.docs = document
        self.mediabox_data = mediabox_data

        # Configuration and the objects derived from it.
        self.translation_config = translation_config
        self.font_path = translation_config.font
        self.font_mapper = FontMapper(translation_config)

    def render_graphic_state(
        self,
        draw_op: BitStream,
        graphic_state: il_version_1.GraphicState,
    ):
        """Append the pass-through instruction of *graphic_state* to *draw_op*.

        Nothing is written when *graphic_state* is ``None`` or carries no
        pass-through instruction. Explicit colour-space and colour operators
        (CS/cs, g/G, sc/SC) are not emitted by this method; only the
        pass-through instruction is written.

        Args:
            draw_op: Stream receiving the encoded instruction.
            graphic_state: Graphic state to render, or ``None``.
        """
        if graphic_state is None:
            return

        instruction = graphic_state.passthrough_per_char_instruction
        if not instruction:
            return

        draw_op.append(f"{instruction} \n".encode())

    def render_paragraph_to_char(
        self,
        paragraph: il_version_1.PdfParagraph,
    ) -> list[il_version_1.PdfCharacter]:
        """Flatten the compositions of *paragraph* into a list of characters.

        A composition contributes its single character, or every character of
        its formula. Any other composition is logged as an error and skipped.
        An error is also logged when nothing was collected although the
        paragraph has both ``unicode`` and ``debug_id`` set.

        Returns:
            The collected characters in composition order (possibly empty).
        """
        flattened = []
        for composition in paragraph.pdf_paragraph_composition:
            if composition.pdf_character:
                flattened.append(composition.pdf_character)
                continue
            if composition.pdf_formula:
                # A formula is flattened into the characters it contains.
                flattened.extend(composition.pdf_formula.pdf_character)
                continue
            logger.error(
                f"Unknown composition type. "
                f"This type only appears in the IL "
                f"after the translation is completed."
                f"During pdf rendering, this type is not supported."
                f"Composition: {composition}. "
                f"Paragraph: {paragraph}. ",
            )

        nothing_collected = not flattened
        if nothing_collected and paragraph.unicode and paragraph.debug_id:
            logger.error(
                f"Unable to export paragraphs that have "
                f"not yet been formatted: {paragraph}",
            )
        return flattened

    def create_render_units_for_page(
        self,
        page: il_version_1.Page,
        translation_config: TranslationConfig,
    ) -> list[RenderUnit]:
        """Wrap every renderable object of *page* in a render unit.

        Units are appended in this order: characters, forms, rectangles,
        curves. Each unit takes ``render_order`` / ``sub_render_order`` from
        the wrapped object when present; otherwise the defaults are 100
        (characters), 50 (forms), 10 (rectangles), 20 (curves) and the
        object's index within its own group.

        Args:
            page: Page whose objects are converted.
            translation_config: Supplies ``skip_form_render``,
                ``skip_curve_render``, ``ocr_workaround`` and ``debug``.

        Returns:
            The unsorted list of render units.
        """
        units = []

        # Characters: page-level ones first, then those held by paragraphs.
        all_chars = list(page.pdf_character) if page.pdf_character else []
        for paragraph in page.pdf_paragraph:
            all_chars.extend(self.render_paragraph_to_char(paragraph))
        units.extend(
            CharacterRenderUnit(
                char,
                getattr(char, "render_order", 100),
                getattr(char, "sub_render_order", index),
            )
            for index, char in enumerate(all_chars)
        )

        # Formulas embedded in paragraphs contribute extra forms and curves.
        formulas = [
            composition.pdf_formula
            for paragraph in page.pdf_paragraph
            for composition in paragraph.pdf_paragraph_composition
            if composition.pdf_formula
        ]

        # Forms: page-level forms followed by forms found inside formulas.
        formula_forms = [form for formula in formulas for form in formula.pdf_form]
        if not translation_config.skip_form_render:
            for index, form in enumerate([*page.pdf_form, *formula_forms]):
                units.append(
                    FormRenderUnit(
                        form,
                        getattr(form, "render_order", 50),
                        getattr(form, "sub_render_order", index),
                    )
                )

        # Rectangles: filled non-debug ones under the OCR workaround, or
        # debug ones when debugging is enabled.
        for index, rect in enumerate(page.pdf_rectangle):
            wanted_for_ocr = (
                translation_config.ocr_workaround
                and not rect.debug_info
                and rect.fill_background
            )
            if not (
                wanted_for_ocr or (translation_config.debug and rect.debug_info)
            ):
                continue
            stroke_width = 0.1 if translation_config.ocr_workaround else 0.4
            units.append(
                RectangleRenderUnit(
                    rect,
                    getattr(rect, "render_order", 10),
                    getattr(rect, "sub_render_order", index),
                    stroke_width,
                )
            )

        # Curves: page-level curves followed by curves found inside formulas;
        # a curve is kept when it is a debug curve or debugging is enabled.
        formula_curves = [
            curve for formula in formulas for curve in formula.pdf_curve
        ]
        if not translation_config.skip_curve_render:
            for index, curve in enumerate([*page.pdf_curve, *formula_curves]):
                if not (curve.debug_info or translation_config.debug):
                    continue
                units.append(
                    CurveRenderUnit(
                        curve,
                        getattr(curve, "render_order", 20),
                        getattr(curve, "sub_render_order", index),
                    )
                )

        return units

    def render_units_to_stream(
        self,
        render_units: list[RenderUnit],
        context: RenderContext,
        page_op: BitStream,
        xobj_draw_ops: dict[str, BitStream],
    ) -> None:
        """Render *render_units* in sort-key order into their target streams.

        Args:
            render_units: Units to draw; the list itself is not modified.
            context: Shared state passed to each unit's ``render``.
            page_op: Stream used when a unit's ``xobj_id`` has no dedicated
                stream.
            xobj_draw_ops: Streams keyed by XObject id.
        """
        ordered = sorted(render_units, key=lambda item: item.get_sort_key())
        for unit in ordered:
            # Units tied to a known XObject draw into it; the rest go to the page.
            target_stream = xobj_draw_ops.get(unit.xobj_id, page_op)
            unit.render(target_stream, context)

    def get_available_font_list(self, pdf, page):
        """Return the font resource names available on *page* of *pdf*.

        The page object is looked up by ``page.page_number`` and its xref is
        handed to ``get_xobj_available_fonts``.
        """
        return self.get_xobj_available_fonts(pdf[page.page_number].xref, pdf)

    def get_xobj_available_fonts(self, page_xref_id, pdf):
        """Return the font names declared in the Resources of an object.

        The ``Resources`` entry of the object at *page_xref_id* is resolved
        (following one indirect reference if needed), its ``/Font`` entry is
        located either as an indirect object or as an inline dictionary, and
        the names found in it are extracted with a regular expression.

        Returns:
            A set of font resource names; empty when no fonts are declared or
            when anything goes wrong while parsing (errors are swallowed).
        """
        try:
            kind, resources = pdf.xref_get_key(page_xref_id, "Resources")
            if kind == "xref":
                # Follow the indirect reference to the resources object.
                target = re.search("(\\d+) 0 R", resources).group(1)
                resources = pdf.xref_object(int(target))
                kind = "dict"

            if kind != "dict":
                holder_xref = int(resources.split(" ")[0])
                _, font_dict = pdf.xref_get_key(holder_xref, "Font")
            else:
                indirect_font = re.search("/Font (\\d+) 0 R", resources)
                if indirect_font is not None:
                    font_dict = pdf.xref_object(int(indirect_font.group(1)))
                else:
                    inline_font = re.search(
                        "/Font *<<(.+?)>>", resources.replace("\n", " ")
                    )
                    if inline_font is None:
                        # Have resources but no fonts
                        return set()
                    font_dict = inline_font.group(1)

            return set(re.findall("/([^ ]+?) ", font_dict))
        except Exception:
            return set()

    def _render_rectangle(
        self,
        draw_op: BitStream,
        rectangle: il_version_1.PdfRectangle,
        line_width: float = 0.4,
    ):
        """Draw a rectangle in PDF for visualization purposes.

        The emitted sequence is wrapped in ``q`` … ``Q`` and consists of the
        rectangle's pass-through colour instruction (when it has one), an
        optional line width, the ``re`` path, and either a fill (``f``) or a
        stroke (``S``).

        Args:
            draw_op: BitStream to append PDF drawing operations
            rectangle: Rectangle object containing position information
            line_width: Line width used when the rectangle does not define
                its own; no ``w`` operator is written for values <= 0
        """
        x1 = rectangle.box.x
        y1 = rectangle.box.y
        x2 = rectangle.box.x2
        y2 = rectangle.box.y2
        width = x2 - x1
        height = y2 - y1
        # Save graphics state
        draw_op.append(b"q ")

        # Colour setup comes from the rectangle's own graphic state. A missing
        # graphic state or instruction is tolerated, matching
        # render_graphic_state().
        graphic_state = rectangle.graphic_state
        instruction = (
            graphic_state.passthrough_per_char_instruction
            if graphic_state is not None
            else None
        )
        if instruction:
            draw_op.append(instruction.encode())
            # Always separate the instruction from the next token: when no
            # line width is written, the first "re" operand would otherwise
            # fuse with the preceding operator.
            draw_op.append(b" ")
        if rectangle.line_width is not None:
            line_width = rectangle.line_width
        if line_width > 0:
            draw_op.append(f" {line_width:.6f} w ".encode())  # Line width
        draw_op.append(f"{x1:.6f} {y1:.6f} {width:.6f} {height:.6f} re ".encode())
        if rectangle.fill_background:
            draw_op.append(b" f ")
        else:
            draw_op.append(b" S ")

        # Restore graphics state
        draw_op.append(b" n Q\n")

    def create_side_by_side_dual_pdf(
        self,
        original_pdf: pymupdf.Document,
        translated_pdf: pymupdf.Document,
        dual_out_path: str,
        translation_config: TranslationConfig,
    ) -> pymupdf.Document:
        """Build a PDF whose pages show original and translation next to each other.

        For every page index present in both documents, a new page as wide as
        both source pages together is created. The original goes on the left
        and the translation on the right, unless
        ``translation_config.dual_translate_first`` is set, in which case the
        sides are swapped. The rotation of both source pages is reset to 0
        (the source documents are modified) and the original page's rotation
        is compensated when the pages are placed. A page that cannot be
        placed is logged as a warning and left out.

        Args:
            original_pdf: Original PDF document
            translated_pdf: Translated PDF document
            dual_out_path: Output path for the dual PDF (not used here; the
                document is only returned, not saved)
            translation_config: Translation configuration

        Returns:
            The created dual PDF document
        """
        dual = pymupdf.open()
        shared_page_count = min(original_pdf.page_count, translated_pdf.page_count)

        for page_id in range(shared_page_count):
            orig_page = original_pdf[page_id]
            trans_page = translated_pdf[page_id]

            # Measure everything before the rotation is reset below.
            rotate_angle = orig_page.rotation
            total_width = orig_page.rect.width + trans_page.rect.width
            max_height = max(orig_page.rect.height, trans_page.rect.height)
            if translation_config.dual_translate_first:
                left_width = trans_page.rect.width
            else:
                left_width = orig_page.rect.width

            orig_page.set_rotation(0)
            trans_page.set_rotation(0)

            dual_page = dual.new_page(width=total_width, height=max_height)

            left_side = pymupdf.Rect(0, 0, left_width, max_height)
            right_side = pymupdf.Rect(left_width, 0, total_width, max_height)
            if translation_config.dual_translate_first:
                original_rect, translated_rect = right_side, left_side
            else:
                original_rect, translated_rect = left_side, right_side

            # The original is placed first, then the translation; each
            # placement fails independently of the other.
            placements = (
                (
                    original_rect,
                    original_pdf,
                    "Failed to show original page on left and translated on right (default). ",
                ),
                (
                    translated_rect,
                    translated_pdf,
                    "Failed to show translated page on left and original on right. ",
                ),
            )
            for target_rect, source_pdf, failure_prefix in placements:
                try:
                    dual_page.show_pdf_page(
                        target_rect,
                        source_pdf,
                        page_id,
                        keep_proportion=True,
                        rotate=-rotate_angle,
                    )
                except Exception as e:
                    logger.warning(
                        f"{failure_prefix}"
                        f"Page ID: {page_id}. "
                        f"Original PDF: {self.original_pdf_path}. "
                        f"Translated PDF: {translation_config.input_file}. ",
                        exc_info=e,
                    )
        return dual

    def create_alternating_pages_dual_pdf(
        self,
        original_pdf: pymupdf.Document,
        translated_pdf: pymupdf.Document,
        translation_config: TranslationConfig,
    ) -> pymupdf.Document:
        """Interleave the pages of the translation with those of the original.

        The translated pages are appended to *original_pdf* (which is modified
        in place and returned) and then moved so that each one sits directly
        after its original page, or directly before it when
        ``translation_config.dual_translate_first`` is set.

        Args:
            original_pdf: Original PDF document; receives the inserted pages
            translated_pdf: Translated PDF document
            translation_config: Translation configuration

        Returns:
            The created dual PDF document (the same object as *original_pdf*)
        """
        dual = original_pdf
        dual.insert_file(translated_pdf)

        # Translated page k currently sits at index page_count + k; it is
        # moved to slot 2k (translation first) or 2k + 1 (original first).
        page_count = translated_pdf.page_count
        for page_id in range(page_count):
            slot_offset = 0 if translation_config.dual_translate_first else 1
            dual.move_page(page_count + page_id, page_id * 2 + slot_offset)

        return dual

    def write_debug_info(
        self,
        pdf: pymupdf.Document,
        translation_config: TranslationConfig,
    ):
        """Append debug characters and debug rectangles to every page of *pdf*.

        For each page of ``self.docs`` the existing content stream is wrapped
        in ``q`` … ``Q``, followed by a translation to the crop box origin and
        the drawing operators for characters and rectangles flagged with
        ``debug_info``. The stream is then written back to the same xref.

        Args:
            pdf: Document to modify in place.
            translation_config: Used for cancellation checks and the
                ``skip_clean`` switch.

        Returns:
            The document to use afterwards: the result of font subsetting
            unless ``skip_clean`` is set, otherwise *pdf* itself.
        """
        self.font_mapper.add_font(pdf, self.docs)

        for page in self.docs.page:
            # Resolve the page's "Contents" entry; the first "N 0 R" reference
            # found in it is taken as the content stream to rewrite.
            _, r_id = pdf.xref_get_key(pdf[page.page_number].xref, "Contents")
            resource_xref_id = re.search("(\\d+) 0 R", r_id).group(1)
            base_op = pdf.xref_stream(int(resource_xref_id))
            translation_config.raise_if_cancelled()
            # NOTE(review): the three xobj_* dicts below are initialised but
            # never read in this method.
            xobj_available_fonts = {}
            xobj_draw_ops = {}
            xobj_encoding_length_map = {}
            available_font_list = self.get_available_font_list(pdf, page)

            page_encoding_length_map = {
                f.font_id: f.encoding_length for f in page.pdf_font
            }
            page_op = BitStream()
            # q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}
            page_op.append(b"q ")
            if base_op is not None:
                page_op.append(base_op)
            page_op.append(b" Q ")
            page_op.append(
                f"q Q 1 0 0 1 {page.cropbox.box.x:.6f} {page.cropbox.box.y:.6f} cm \n".encode(),
            )
            # Collect all characters
            chars = []
            # Page-level characters come first
            if page.pdf_character:
                chars.extend(page.pdf_character)
            # Then the characters contained in paragraphs
            for paragraph in page.pdf_paragraph:
                chars.extend(self.render_paragraph_to_char(paragraph))

            # Render the characters (only those flagged with debug_info)
            for char in chars:
                if not getattr(char, "debug_info", False):
                    continue
                if char.char_unicode == "\n":
                    continue
                if char.pdf_character_id is None:
                    # dummy char
                    continue
                char_size = char.pdf_style.font_size
                font_id = char.pdf_style.font_id

                # Skip characters whose font is not declared on this page.
                if font_id not in available_font_list:
                    continue
                draw_op = page_op
                encoding_length_map = page_encoding_length_map

                draw_op.append(b"q ")
                self.render_graphic_state(draw_op, char.pdf_style.graphic_state)
                if char.vertical:
                    # Vertical text uses a 90-degree rotated text matrix
                    # anchored at (x2, y).
                    draw_op.append(
                        f"BT /{font_id} {char_size:f} Tf 0 1 -1 0 {char.box.x2:f} {char.box.y:f} Tm ".encode(),
                    )
                else:
                    draw_op.append(
                        f"BT /{font_id} {char_size:f} Tf 1 0 0 1 {char.box.x:f} {char.box.y:f} Tm ".encode(),
                    )

                # Raises KeyError if the font is missing from page.pdf_font.
                encoding_length = encoding_length_map[font_id]
                # pdf32000-2008 page14:
                # As hexadecimal data enclosed in angle brackets < >
                # see 7.3.4.3, "Hexadecimal Strings."
                draw_op.append(
                    f"<{char.pdf_character_id:0{encoding_length * 2}x}>".upper().encode(),
                )

                draw_op.append(b" Tj ET Q \n")
            for rect in page.pdf_rectangle:
                if not rect.debug_info:
                    continue
                self._render_rectangle(page_op, rect)
            draw_op = page_op
            # Since this is a draw instruction container,
            # no additional information is needed
            pdf.update_stream(int(resource_xref_id), draw_op.tobytes())
        translation_config.raise_if_cancelled()

        # Subset fonts in a subprocess
        if not translation_config.skip_clean:
            pdf = self.subset_fonts_in_subprocess(pdf, translation_config, tag="debug")
        return pdf

    @staticmethod
    def subset_fonts_in_subprocess(
        pdf: pymupdf.Document, translation_config: TranslationConfig, tag: str
    ) -> pymupdf.Document:
        """Run font subsetting in a subprocess with timeout.

        The document is saved to a temporary file in the working directory,
        subsetted by a child process, and reopened from the child's output.

        Args:
            pdf: The PDF document object
            translation_config: Translation configuration
            tag: Suffix that keeps the temporary file names of different
                callers apart

        Returns:
            A document opened from the subsetted file, or the original *pdf*
            object if subsetting failed or timed out
        """
        original_pdf = pdf
        # Create temporary file paths
        temp_input = str(
            translation_config.get_working_file_path(f"temp_subset_input_{tag}.pdf")
        )
        temp_output = str(
            translation_config.get_working_file_path(f"temp_subset_output_{tag}.pdf")
        )

        # Save PDF to temporary file without subsetting
        pdf.save(temp_input)

        # Create and start subprocess
        process = Process(target=_subset_fonts_process, args=(temp_input, temp_output))
        process.start()

        # Wait for the subprocess, at most one minute
        timeout = 60  # seconds
        process.join(timeout)

        if process.is_alive():
            logger.warning(
                f"Font subsetting timeout after {timeout} seconds, terminating subprocess"
            )
            try:
                process.terminate()
                process.join(5)  # Give it 5 seconds to clean up
                if process.is_alive():
                    logger.warning("Subprocess did not terminate, killing it")
                    process.kill()
                    # Reap the killed child so it is not left behind as a zombie.
                    process.join(5)
            except Exception as e:
                logger.error(f"Error terminating font subsetting process: {e}")

            return original_pdf

        # Process completed, check exit code
        exit_code = process.exitcode
        success = exit_code == 0

        # Check if subsetting was successful
        if (
            success
            and Path(temp_output).exists()
            and Path(temp_output).stat().st_size > 0
        ):
            logger.info("Font subsetting completed successfully")
            return pymupdf.open(temp_output)

        logger.warning(
            f"Font subsetting failed with exit code {exit_code} or produced empty file"
        )
        return original_pdf

    @staticmethod
    def save_pdf_with_timeout(
        pdf: pymupdf.Document,
        output_path: str,
        translation_config: TranslationConfig,
        garbage: int = 1,
        deflate: bool = True,
        clean: bool = True,
        deflate_fonts: bool = True,
        linear: bool = False,
        timeout: int = 120,
        tag: str = "",
    ) -> bool:
        """Save a PDF document with a timeout for the clean=True operation.

        The document is first written to a temporary file, then re-saved with
        the requested options by a child process. If the child times out,
        fails, or produces no output, the document is saved directly from this
        process with ``clean=False`` (and, if that fails too, with default
        options).

        Args:
            pdf: The PDF document object
            output_path: Path where to save the PDF
            translation_config: Translation configuration
            garbage: Garbage collection level (0, 1, 2, 3, 4)
            deflate: Whether to deflate the PDF
            clean: Whether to clean the PDF
            deflate_fonts: Whether to deflate fonts
            linear: Whether to linearize the PDF
            timeout: Timeout in seconds (default: 2 minutes)
            tag: Suffix that keeps the temporary file names of different
                callers apart

        Returns:
            True if the subprocess result was copied to *output_path*,
            False if any fallback save was used
        """

        def _save_without_clean() -> None:
            """Save from this process with clean=False; plain save as last resort."""
            try:
                pdf.save(
                    output_path,
                    garbage=garbage,
                    deflate=deflate,
                    clean=False,
                    deflate_fonts=deflate_fonts,
                    linear=linear,
                )
            except Exception as e:
                logger.error(f"Error in fallback save: {e}")
                # Last resort: basic save
                pdf.save(output_path)

        # Create temporary file paths
        temp_input = str(
            translation_config.get_working_file_path(f"temp_save_input_{tag}.pdf")
        )
        temp_output = str(
            translation_config.get_working_file_path(f"temp_save_output_{tag}.pdf")
        )

        # Save PDF to temporary file first
        pdf.save(temp_input)

        # Try to save with the requested options in a subprocess
        process = Process(
            target=_save_pdf_clean_process,
            args=(
                temp_input,
                temp_output,
                garbage,
                deflate,
                clean,
                deflate_fonts,
                linear,
            ),
        )
        process.start()

        # Wait for subprocess with timeout
        process.join(timeout)

        if process.is_alive():
            logger.warning(
                f"PDF save with clean={clean} timeout after {timeout} seconds, terminating subprocess"
            )
            try:
                process.terminate()
                process.join(5)  # Give it 5 seconds to clean up
                if process.is_alive():
                    logger.warning("Subprocess did not terminate, killing it")
                    process.kill()
                    # Reap the killed child so it is not left behind as a zombie.
                    process.join(5)
            except Exception as e:
                logger.error(f"Error terminating PDF save process: {e}")

            # Fallback to save without clean parameter
            logger.info("Falling back to save with clean=False")
            _save_without_clean()
            return False

        # Process completed, check exit code
        exit_code = process.exitcode
        success = exit_code == 0

        # Check if save was successful
        if (
            success
            and Path(temp_output).exists()
            and Path(temp_output).stat().st_size > 0
        ):
            logger.info(f"PDF save with clean={clean} completed successfully")
            # Copy the successfully created file to the target path
            try:
                import shutil

                shutil.copy2(temp_output, output_path)
                return True
            except Exception as e:
                logger.error(f"Error copying saved PDF: {e}")
                pdf.save(output_path)  # Fallback to direct save
                return False
            finally:
                # missing_ok keeps cleanup from raising FileNotFoundError and
                # replacing the return value chosen above.
                Path(temp_input).unlink(missing_ok=True)
                Path(temp_output).unlink(missing_ok=True)

        logger.warning(
            f"PDF save with clean={clean} failed with exit code {exit_code} or produced empty file"
        )
        # Fallback to save without clean parameter
        _save_without_clean()
        return False

    def restore_media_box(self, doc: pymupdf.Document, mediabox_data: dict) -> None:
        """Write the saved page box entries back into *doc*.

        Args:
            doc: Document whose objects are updated through ``xref_set_key``.
            mediabox_data: Mapping ``xref -> {key name -> value}``; each
                entry is written independently and a failure is only logged
                at debug level.
        """
        box_entries = (
            (xref, name, box)
            for xref, page_box_data in mediabox_data.items()
            for name, box in page_box_data.items()
        )
        for xref, name, box in box_entries:
            try:
                doc.xref_set_key(xref, name, box)
            except Exception:
                logger.debug(f"Error restoring media box {name} from PDF")

    def write(
        self,
        translation_config: TranslationConfig,
        check_font_exists: bool = False,
    ) -> TranslateResult:
        """Render the translated document and save the mono / dual PDFs.

        Steps, in order: rewrite every page's content stream, optionally
        subset fonts, restore the saved page boxes, optionally drop pages that
        were not translated, save the mono PDF, build and save the dual PDF,
        and optionally dump the auto-extracted glossary as CSV.

        Args:
            translation_config: Settings controlling output names and which
                artifacts are produced.
            check_font_exists: Forwarded to ``update_page_content_stream``.
                The first attempt runs with ``False``; if anything fails the
                method calls itself once more with ``True``.

        Returns:
            A ``TranslateResult`` built from the mono path, the dual path and
            the glossary path; each one is ``None`` when that artifact was
            not produced.

        Raises:
            Exception: whatever the retry (``check_font_exists=True``) raised.
        """
        try:
            basename = Path(translation_config.input_file).stem
            debug_suffix = ".debug" if translation_config.debug else ""
            if (
                translation_config.watermark_output_mode
                != WatermarkOutputMode.Watermarked
            ):
                debug_suffix += ".no_watermark"
            mono_out_path = translation_config.get_output_file_path(
                f"{basename}{debug_suffix}.{translation_config.lang_out}.mono.pdf",
            )
            # Start from the original PDF and replace its page content in place.
            pdf = pymupdf.open(self.original_pdf_path)
            self.font_mapper.add_font(pdf, self.docs)
            with self.translation_config.progress_monitor.stage_start(
                self.stage_name,
                len(self.docs.page),
            ) as pbar:
                for page in self.docs.page:
                    self.update_page_content_stream(
                        check_font_exists, page, pdf, translation_config
                    )
                    pbar.advance()
            translation_config.raise_if_cancelled()
            # Value later passed as ``garbage=`` when saving; raised to 4 when
            # the OCR workaround is enabled.
            gc_level = 1
            if self.translation_config.ocr_workaround:
                gc_level = 4
            with self.translation_config.progress_monitor.stage_start(
                SUBSET_FONT_STAGE_NAME,
                1,
            ) as pbar:
                if not translation_config.skip_clean:
                    pdf = self.subset_fonts_in_subprocess(
                        pdf, translation_config, tag="mono"
                    )

                pbar.advance()
            # Best effort: a failure here is logged and the save continues.
            try:
                self.restore_media_box(pdf, self.mediabox_data)
            except Exception:
                logger.exception("restore media box failed")

            if translation_config.only_include_translated_page:
                total_page = set(range(0, len(pdf)))

                # page.page_number is used as a 0-based index here, while
                # should_translate_page is queried with page_number + 1.
                pages_to_translate = {
                    page.page_number
                    for page in self.docs.page
                    if self.translation_config.should_translate_page(
                        page.page_number + 1
                    )
                }

                should_removed_page = list(total_page - pages_to_translate)

                pdf.delete_pages(should_removed_page)

            with self.translation_config.progress_monitor.stage_start(
                SAVE_PDF_STAGE_NAME,
                2,
            ) as pbar:
                if not translation_config.no_mono:
                    if translation_config.debug:
                        translation_config.raise_if_cancelled()
                        # Debug aid: an expanded, pretty-printed copy.
                        pdf.save(
                            f"{mono_out_path}.decompressed.pdf",
                            expand=True,
                            pretty=True,
                        )
                    translation_config.raise_if_cancelled()
                    self.save_pdf_with_timeout(
                        pdf,
                        mono_out_path,
                        translation_config,
                        garbage=gc_level,
                        deflate=True,
                        clean=not translation_config.skip_clean,
                        deflate_fonts=True,
                        linear=False,
                        tag="mono",
                    )
                pbar.advance()
                dual_out_path = None
                if not translation_config.no_dual:
                    dual_out_path = translation_config.get_output_file_path(
                        f"{basename}{debug_suffix}.{translation_config.lang_out}.dual.pdf",
                    )
                    translation_config.raise_if_cancelled()
                    original_pdf = pymupdf.open(self.original_pdf_path)

                    if translation_config.debug:
                        translation_config.raise_if_cancelled()
                        try:
                            original_pdf = self.write_debug_info(
                                original_pdf, translation_config
                            )
                        except Exception:
                            logger.warning(
                                "Failed to write debug info to dual PDF",
                                exc_info=True,
                            )

                    # should_removed_page is only bound when
                    # only_include_translated_page was set above; the
                    # short-circuiting ``and`` avoids touching it otherwise.
                    # NOTE(review): this relies on self.translation_config and
                    # the translation_config argument agreeing - confirm.
                    if (
                        self.translation_config.only_include_translated_page
                        and should_removed_page
                    ):
                        original_pdf.delete_pages(should_removed_page)
                    translated_pdf = pdf

                    # Choose between alternating pages and side-by-side format
                    # Default to side-by-side if not specified
                    use_alternating_pages = (
                        translation_config.use_alternating_pages_dual
                    )

                    if use_alternating_pages:
                        # Create a dual PDF with alternating pages (original and translation)
                        dual = self.create_alternating_pages_dual_pdf(
                            original_pdf,
                            translated_pdf,
                            translation_config,
                        )
                    else:
                        # Create a dual PDF with side-by-side pages (original and translation)
                        dual = self.create_side_by_side_dual_pdf(
                            original_pdf,
                            translated_pdf,
                            dual_out_path,
                            translation_config,
                        )

                    self.save_pdf_with_timeout(
                        dual,
                        dual_out_path,
                        translation_config,
                        garbage=gc_level,
                        deflate=True,
                        clean=not translation_config.skip_clean,
                        deflate_fonts=True,
                        linear=False,
                        tag="dual",
                    )
                    if translation_config.debug:
                        translation_config.raise_if_cancelled()
                        dual.save(
                            f"{dual_out_path}.decompressed.pdf",
                            expand=True,
                            pretty=True,
                        )
                pbar.advance()
            # Report only the artifacts that were actually requested.
            if self.translation_config.no_mono:
                mono_out_path = None
            if self.translation_config.no_dual:
                dual_out_path = None
            auto_extracted_glossary_path = None
            if (
                self.translation_config.save_auto_extracted_glossary
                and self.translation_config.shared_context_cross_split_part.auto_extracted_glossary
            ):
                auto_extracted_glossary_path = self.translation_config.get_output_file_path(
                    f"{basename}{debug_suffix}.{translation_config.lang_out}.glossary.csv"
                )
                with auto_extracted_glossary_path.open("w", encoding="utf-8") as f:
                    logger.info(
                        f"save auto extracted glossary to {auto_extracted_glossary_path}"
                    )
                    f.write(
                        self.translation_config.shared_context_cross_split_part.auto_extracted_glossary.to_csv()
                    )

            return TranslateResult(
                mono_out_path, dual_out_path, auto_extracted_glossary_path
            )
        except Exception:
            logger.exception(
                "Failed to create PDF: %s",
                translation_config.input_file,
            )
            # Retry exactly once with font checking enabled; a failure of the
            # retry itself propagates to the caller.
            if not check_font_exists:
                return self.write(translation_config, True)
            raise

    def update_page_content_stream(
        self, check_font_exists, page, pdf, translation_config, skip_char: bool = False
    ):
        """Rebuild the content streams of one page and of its form XObjects.

        Args:
            check_font_exists: Stored in the ``RenderContext`` handed to the
                render step.
            page: IL page; must carry a crop box.
            pdf: Open document that is modified in place.
            translation_config: Used for cancellation checks and passed to
                ``create_render_units_for_page``.
            skip_char: When true, ``CharacterRenderUnit`` instances are dropped
                before rendering.
        """
        assert page.cropbox is not None and page.cropbox.box is not None
        page_crop_box = page.cropbox.box
        # Pure translation by the negated crop-box origin, emitted below as a
        # PDF ``cm`` operator.
        ctm_for_ops = (
            1,
            0,
            0,
            1,
            -page_crop_box.x,
            -page_crop_box.y,
        )
        ctm_for_ops = f" {' '.join(f'{x:f}' for x in ctm_for_ops)} cm ".encode()
        translation_config.raise_if_cancelled()
        # Per-XObject state, keyed by xobj_id.
        xobj_available_fonts = {}
        xobj_draw_ops = {}
        xobj_encoding_length_map = {}
        available_font_list = self.get_available_font_list(pdf, page)
        page_encoding_length_map: dict[str | None, int | None] = {
            f.font_id: f.encoding_length for f in page.pdf_font
        }
        all_encoding_length_map = page_encoding_length_map.copy()
        for xobj in page.pdf_xobject:
            # Start from the page-level fonts and add the XObject's own ones.
            xobj_available_fonts[xobj.xobj_id] = available_font_list.copy()
            try:
                xobj_available_fonts[xobj.xobj_id].update(
                    self.get_xobj_available_fonts(xobj.xref_id, pdf),
                )
            except Exception:
                # Best effort: fall back to the page-level fonts only.
                pass
            xobj_encoding_length_map[xobj.xobj_id] = {
                f.font_id: f.encoding_length for f in xobj.pdf_font
            }
            all_encoding_length_map.update(xobj_encoding_length_map[xobj.xobj_id])
            # Page-level entries override the XObject's entries for the same
            # font_id, because they are applied last.
            xobj_encoding_length_map[xobj.xobj_id].update(page_encoding_length_map)
            # Seed the XObject stream with its stored (compressed) base operations.
            xobj_op = BitStream()
            base_op = xobj.base_operations.value
            base_op = zstd_decompress(base_op)
            xobj_op.append(base_op.encode())
            xobj_draw_ops[xobj.xobj_id] = xobj_op
        page_op = BitStream()
        # q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}
        # page_op.append(b"q ")
        # base_op = page.base_operations.value
        # base_op = zstd_decompress(base_op)
        # page_op.append(base_op.encode())
        # page_op.append(b" \n")
        page_op.append(ctm_for_ops)
        page_op.append(b" \n")
        # Create render context
        context = RenderContext(
            pdf_creator=self,
            page=page,
            available_font_list=available_font_list,
            page_encoding_length_map=page_encoding_length_map,
            all_encoding_length_map=all_encoding_length_map,
            xobj_available_fonts=xobj_available_fonts,
            xobj_encoding_length_map=xobj_encoding_length_map,
            ctm_for_ops=ctm_for_ops,
            check_font_exists=check_font_exists,
        )
        # Create render units for all renderable objects
        render_units = self.create_render_units_for_page(page, translation_config)
        if skip_char:
            render_units = [
                unit
                for unit in render_units
                if not isinstance(unit, CharacterRenderUnit)
            ]
        # Render all units to their appropriate streams
        self.render_units_to_stream(render_units, context, page_op, xobj_draw_ops)
        # Update xobject streams
        for xobj in page.pdf_xobject:
            draw_op = xobj_draw_ops[xobj.xobj_id]
            try:
                pdf.update_stream(xobj.xref_id, draw_op.tobytes())
            except Exception:
                # A failing XObject is reported and skipped; the page itself
                # is still written below.
                logger.warning(f"update xref {xobj.xref_id} stream fail, continue")
        draw_op = page_op
        op_container = pdf.get_new_xref()
        # Since this is a draw instruction container,
        # no additional information is needed
        pdf.update_object(op_container, "<<>>")
        pdf.update_stream(op_container, draw_op.tobytes())
        # Point the page at the freshly created stream object.
        pdf[page.page_number].set_contents(op_container)


================================================
FILE: babeldoc/format/pdf/document_il/frontend/__init__.py
================================================


================================================
FILE: babeldoc/format/pdf/document_il/frontend/il_creater.py
================================================
import base64
import functools
import logging
import math
import re
import unicodedata
from io import BytesIO
from itertools import islice
from typing import Literal

import freetype
import pymupdf
import tiktoken

import babeldoc.pdfminer.pdfinterp
from babeldoc.format.pdf.babelpdf.base14 import get_base14_bbox
from babeldoc.format.pdf.babelpdf.cidfont import get_cidfont_bbox
from babeldoc.format.pdf.babelpdf.cidfont import get_glyph_bbox
from babeldoc.format.pdf.babelpdf.encoding import WinAnsiEncoding
from babeldoc.format.pdf.babelpdf.encoding import get_type1_encoding
from babeldoc.format.pdf.babelpdf.type3 import get_type3_bbox
from babeldoc.format.pdf.babelpdf.utils import guarded_bbox
from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.format.pdf.document_il.utils import zstd_helper
from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper
from babeldoc.format.pdf.document_il.utils.matrix_helper import decompose_ctm
from babeldoc.format.pdf.document_il.utils.style_helper import BLACK
from babeldoc.format.pdf.document_il.utils.style_helper import YELLOW
from babeldoc.format.pdf.translation_config import TranslationConfig
from babeldoc.pdfminer.layout import LTChar
from babeldoc.pdfminer.layout import LTFigure
from babeldoc.pdfminer.pdffont import PDFCIDFont
from babeldoc.pdfminer.pdffont import PDFFont

# from babeldoc.pdfminer.pdfpage import PDFPage as PDFMinerPDFPage
# from babeldoc.pdfminer.pdftypes import PDFObjRef as PDFMinerPDFObjRef
# from babeldoc.pdfminer.pdftypes import resolve1 as pdftypes_resolve1
from babeldoc.pdfminer.psparser import PSLiteral
from babeldoc.pdfminer.utils import apply_matrix_pt
from babeldoc.pdfminer.utils import get_bound
from babeldoc.pdfminer.utils import mult_matrix


def invert_matrix(
    ctm: tuple[float, float, float, float, float, float],
) -> tuple[float, float, float, float, float, float]:
    """Invert a 2D affine transform given as ``(a, b, c, d, e, f)``.

    The six values stand for the matrix::

        [a c e]
        [b d f]
        [0 0 1]

    When the determinant is (almost) zero the matrix has no usable inverse
    and the identity transform is returned instead.
    """
    a, b, c, d, e, f = ctm
    det = a * d - b * c

    # Singular matrix: fall back to the identity transform.
    if abs(det) < 1e-10:
        return (1.0, 0.0, 0.0, 1.0, 0.0, 0.0)

    return (
        d / det,
        -b / det,
        -c / det,
        a / det,
        (c * f - d * e) / det,
        (b * e - a * f) / det,
    )


def batched(iterable, n, *, strict=False):
    """Yield successive tuples of up to ``n`` items taken from ``iterable``.

    ``batched('ABCDEFG', 3)`` yields ``ABC``, ``DEF``, ``G``.

    Raises:
        ValueError: if ``n`` is below one, or if ``strict`` is set and the
            final batch is shorter than ``n``.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            return
        if strict and len(chunk) != n:
            raise ValueError("batched(): incomplete batch")
        yield chunk


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

#
# def create_hook(func, hook):
#     @wraps(func)
#     def wrapper(*args, **kwargs):
#         hook(*args, **kwargs)
#         return func(*args, **kwargs)
#
#     return wrapper
#
#
# def hook_pdfminer_pdf_page_init(*args):
#     attrs = args[3]
#     try:
#         while isinstance(attrs["MediaBox"], PDFMinerPDFObjRef):
#             attrs["MediaBox"] = pdftypes_resolve1(attrs["MediaBox"])
#     except Exception:
#         logger.exception(f"try to fix mediabox failed: {attrs}")
#
#
# PDFMinerPDFPage.__init__ = create_hook(
#     PDFMinerPDFPage.__init__, hook_pdfminer_pdf_page_init
# )


def indirect(obj):
    """Return the leading integer of an ``("xref", "<num> ...")`` tuple.

    Any other value yields ``None``.
    """
    is_xref = isinstance(obj, tuple) and obj[0] == "xref"
    if not is_xref:
        return None
    number, *_ = obj[1].split(" ")
    return int(number)


def get_char_cbox(face, idx):
    """Return the glyph bbox for character code ``idx`` in ``face``.

    The code is resolved to a glyph index through ``face.get_char_index`` and
    then handed to ``get_glyph_bbox``.
    """
    return get_glyph_bbox(face, face.get_char_index(idx))


def get_name_cbox(face, name):
    """Return the glyph bbox for the glyph called ``name`` in ``face``.

    ``name`` may be ``str`` (encoded as UTF-8 before the lookup) or bytes.
    An empty or missing name yields the all-zero box ``(0, 0, 0, 0)``.
    """
    if not name:
        return (0, 0, 0, 0)
    glyph_name = name.encode("utf-8") if isinstance(name, str) else name
    return get_glyph_bbox(face, face.get_name_index(glyph_name))


def font_encoding_lookup(doc, idx, key):
    """Look up a named encoding stored under ``key`` of object ``idx``.

    Returns ``(encoding_name, encoding_vector)`` when the key holds a PDF name
    for which ``get_type1_encoding`` returns a truthy vector, otherwise
    ``None``.
    """
    entry = doc.xref_get_key(idx, key)
    if entry[0] != "name":
        return None
    # Drop the leading "/" of the PDF name.
    enc_name = entry[1][1:]
    enc_vector = get_type1_encoding(enc_name)
    if enc_vector:
        return enc_name, enc_vector
    return None


def parse_font_encoding(doc, idx):
    """Resolve the encoding of font object ``idx``.

    ``Encoding/BaseEncoding`` is tried first, then ``Encoding``. When neither
    gives a known named encoding, the result is labelled ``"Custom"`` and
    paired with the ``StandardEncoding`` vector.
    """
    for key in ("Encoding/BaseEncoding", "Encoding"):
        found = font_encoding_lookup(doc, idx, key)
        if found:
            return found
    return ("Custom", get_type1_encoding("StandardEncoding"))


def get_truetype_ansi_bbox_list(face):
    """Return one scaled bbox per entry of ``WinAnsiEncoding``.

    Each box comes from ``get_char_cbox`` and is multiplied by
    ``1000 / face.units_per_EM``.
    """
    scale = 1000 / face.units_per_EM
    return [
        [value * scale for value in get_char_cbox(face, code)]
        for code in WinAnsiEncoding
    ]


def collect_face_cmap(face):
    """Split ``face.charmaps`` into ``(unicode_maps, other_maps)``.

    A charmap counts as Unicode when its ``encoding_name`` equals
    ``"FT_ENCODING_UNICODE"``; the original order is kept in both lists.
    """
    unicode_maps = []
    legacy_maps = []
    for charmap in face.charmaps:
        is_unicode = charmap.encoding_name == "FT_ENCODING_UNICODE"
        bucket = unicode_maps if is_unicode else legacy_maps
        bucket.append(charmap)
    return unicode_maps, legacy_maps


def get_truetype_custom_bbox_list(face):
    """Return scaled bboxes for codes 0..255 using the face's own charmap.

    The first Unicode charmap is selected when one exists, otherwise the first
    non-Unicode charmap. A face without any charmap yields an empty list.
    """
    unicode_maps, legacy_maps = collect_face_cmap(face)
    candidates = unicode_maps or legacy_maps
    if not candidates:
        return []
    face.set_charmap(candidates[0])
    scale = 1000 / face.units_per_EM
    return [
        [value * scale for value in get_char_cbox(face, code)]
        for code in range(256)
    ]


def parse_font_file(doc, idx, encoding, differences):
    """Compute per-code glyph bboxes from the font program in stream ``idx``.

    Args:
        doc: Document providing ``xref_stream``.
        idx: xref of the embedded font file stream.
        encoding: ``(encoding_name, encoding_vector)`` pair.
        differences: Optional iterable of ``(code, glyph_name)`` overrides.

    Returns:
        A list of boxes scaled by ``1000 / units_per_EM``. TrueType fonts
        with ``WinAnsiEncoding`` or ``Custom`` encoding are delegated to the
        dedicated TrueType helpers.
    """
    font_bytes = doc.xref_stream(idx)
    face = freetype.Face(BytesIO(font_bytes))
    if face.get_format() == b"TrueType":
        if encoding[0] == "WinAnsiEncoding":
            return get_truetype_ansi_bbox_list(face)
        if encoding[0] == "Custom":
            return get_truetype_custom_bbox_list(face)

    glyph_names = {
        face.get_glyph_name(glyph_index).decode("U8")
        for glyph_index in range(face.num_glyphs)
    }
    scale = 1000 / face.units_per_EM
    enc_name, enc_vector = encoding
    _, legacy_maps = collect_face_cmap(face)
    short_name = enc_name.removesuffix("Encoding")
    # For these encodings, select the first non-Unicode charmap when present.
    if legacy_maps and short_name in (
        "Custom",
        "MacRoman",
        "Standard",
        "WinAnsi",
        "MacExpert",
    ):
        face.set_charmap(legacy_maps[0])

    # Prefer a lookup by glyph name; fall back to the character code.
    boxes = [
        get_name_cbox(face, glyph.encode("U8"))
        if glyph in glyph_names
        else get_char_cbox(face, code)
        for code, glyph in enumerate(enc_vector)
    ]
    if differences:
        for code, glyph in differences:
            boxes[code] = get_name_cbox(face, glyph.encode("U8"))
    return [[value * scale for value in box] for box in boxes]


def parse_encoding(obj_str):
    """Extract ``(code, glyph_name)`` pairs from a Differences-style string.

    An integer token sets the current code; every ``/name`` token is recorded
    at the current code (without its slash) and then advances the code by one.
    All other characters are ignored.
    """
    token_pattern = r"(?P<p>[\[\]])|(?P<c>\d+)|(?P<n>/[^\s/\[\]()<>]+)|(?P<s>.)"
    pairs = []
    code = 0
    for token in re.finditer(token_pattern, obj_str):
        kind = token.lastgroup
        if kind == "c":
            code = int(token.group())
        elif kind == "n":
            pairs.append((code, token.group()[1:]))
            code += 1
    return pairs


def parse_mapping(text):
    """Return the hex payload of every ``<...>`` token in ``text``, in order."""
    return re.findall(r"<(?P<num>[a-fA-F0-9]+)>", text)


def update_cmap_pair(cmap, data):
    """Apply ``bfrange`` entries ``<start> <stop> <dst>`` to ``cmap``.

    Args:
        cmap: Dict updated in place, mapping character code -> text.
        data: Flat sequence of hex strings, three per range entry.

    A range maps ``start`` to ``dst``, ``start + 1`` to ``dst + 1`` and so on,
    as a ToUnicode ``bfrange`` requires, rather than giving every code in the
    range the same text. A trailing incomplete triple is ignored.
    Destinations that are not valid hex or do not decode as UTF-16-BE (for
    example lone surrogates D800-DFFF) are skipped individually.
    """
    tokens = iter(data)
    # zip over one shared iterator groups the tokens in threes and silently
    # drops an incomplete tail instead of failing on unpacking.
    for start_str, stop_str, value_str in zip(tokens, tokens, tokens):
        start = int(start_str, 16)
        stop = int(stop_str, 16)
        try:
            raw = base64.b16decode(value_str, True)
        except ValueError:
            continue  # malformed hex (binascii.Error is a ValueError)
        width = len(raw)
        first = int.from_bytes(raw, "big")
        for offset, code in enumerate(range(start, stop + 1)):
            try:
                text = (first + offset).to_bytes(width, "big").decode("UTF-16-BE")
            except (OverflowError, UnicodeDecodeError):
                continue  # overflowed the destination width / lone surrogate
            cmap[code] = text


def update_cmap_code(cmap, data):
    """Apply ``bfchar`` entries ``<code> <dst>`` to ``cmap``.

    Args:
        cmap: Dict updated in place, mapping character code -> text.
        data: Flat sequence of hex strings, two per entry.

    A trailing unpaired token is ignored instead of raising. Destinations
    that are not valid hex or do not decode as UTF-16-BE (for example lone
    surrogates D800-DFFF) are skipped.
    """
    tokens = iter(data)
    # zip over one shared iterator pairs the tokens and drops an odd tail.
    for code_str, value_str in zip(tokens, tokens):
        code = int(code_str, 16)
        try:
            value = base64.b16decode(value_str, True).decode("UTF-16-BE")
        except ValueError:
            # binascii.Error and UnicodeDecodeError are both ValueErrors.
            continue
        cmap[code] = value


def parse_cmap(cmap_str):
    """Build a code -> text dict from the text of a ToUnicode CMap.

    ``beginbfrange ... endbfrange`` sections are applied first, then
    ``beginbfchar ... endbfchar`` sections, so a bfchar entry wins over a
    range entry for the same code. Only sections made purely of ``<hex>``
    tokens are recognised by the patterns used here.
    """
    mapping = {}
    range_sections = re.finditer(
        r"\s+beginbfrange\s*(?P<r>(<[0-9a-fA-F]+>\s*)+)endbfrange\s+", cmap_str
    )
    for section in range_sections:
        update_cmap_pair(mapping, parse_mapping(section.group("r")))
    char_sections = re.finditer(
        r"\s+beginbfchar\s*(?P<c>(<[0-9a-fA-F]+>\s*)+)endbfchar", cmap_str
    )
    for section in char_sections:
        update_cmap_code(mapping, parse_mapping(section.group("c")))
    return mapping


def get_code(cmap, c):
    """Return the first key of ``cmap`` whose value equals ``c``, or ``-1``."""
    return next((code for code, text in cmap.items() if text == c), -1)


def get_bbox(bbox, size, c, x, y):
    """Build a ``pymupdf.Quad`` for entry ``c`` of ``bbox`` placed at ``(x, y)``.

    The stored box ``(x_min, y_min, x_max, y_max)`` is scaled by
    ``size / 1000``; its vertical extents are negated before being added
    to ``y``.
    """
    x_min, y_min, x_max, y_max = bbox[c]
    factor = 1 / 1000 * size
    left = x + x_min * factor
    right = x + x_max * factor
    y_from_min = y + -y_min * factor
    y_from_max = y + -y_max * factor
    return pymupdf.Quad(
        (left, y_from_min),
        (right, y_from_min),
        (left, y_from_max),
        (right, y_from_max),
    )


# Code points of common Unicode space characters
unicode_spaces = [
    "\u0020",  # space
    "\u00a0",  # no-break space
    "\u1680",  # Ogham space mark
    "\u2000",  # en quad
    "\u2001",  # em quad
    "\u2002",  # en space
    "\u2003",  # em space
    "\u2004",  # three-per-em space
    "\u2005",  # four-per-em space
    "\u2006",  # six-per-em space
    "\u2007",  # figure space
    "\u2008",  # punctuation space
    "\u2009",  # thin space
    "\u200a",  # hair space
    "\u202f",  # narrow no-break space
    "\u205f",  # medium mathematical space
    "\u3000",  # ideographic (full-width) space
    "\u200b",  # zero width space
    "\u2060",  # word joiner (zero width no-break)
    "\t",  # horizontal tab
]

# Build a pattern matching strings made only of the characters above
pattern = "^[" + "".join(unicode_spaces) + "]+$"

# Compile the regex once at import time
space_regex = re.compile(pattern)


def get_rotation_angle(matrix):
    """Return the rotation angle, in degrees, of a PDF character matrix.

    ``matrix`` is a tuple/list ``(a, b, c, d, e, f)``; the angle is
    ``atan2(b, a)`` converted to degrees.
    """
    a, b, _c, _d, _e, _f = matrix
    return math.degrees(math.atan2(b, a))


class ILCreater:
    stage_name = "Parse PDF and Create Intermediate Representation"

    def __init__(self, translation_config: TranslationConfig):
        """Set up empty parsing state for a new document.

        Args:
            translation_config: Supplies the layout model, the graphic-element
                switch and the settings used to build the font mapper.
        """
        # Configuration and collaborators.
        self.translation_config = translation_config
        self.model = translation_config.doc_layout_model
        self.enable_graphic_element_process = (
            translation_config.enable_graphic_element_process
        )
        self.progress = None
        self.mupdf: pymupdf.Document = None
        self.mupdf_font_map: dict[int, pymupdf.Font] = {}

        # Document being built and the page currently being filled.
        self.docs = il_version_1.Document(page=[])
        self.current_page: il_version_1.Page = None
        self.render_order = 0

        # Graphics state tracked while walking content streams.
        self.stroking_color_space_name = None
        self.non_stroking_color_space_name = None
        self.passthrough_per_char_instruction: list[tuple[str, str]] = []
        self.passthrough_per_char_instruction_stack: list[list[tuple[str, str]]] = []
        self.graphic_state_pool = {}
        self.current_clip_paths: list[tuple] = []
        self.clip_paths_stack: list[list[tuple]] = []

        # Form XObject bookkeeping.
        self.xobj_id = 0
        self.xobj_inc = 0
        self.xobj_map: dict[int, il_version_1.PdfXobject] = {}
        self.xobj_stack = []

        # Font bookkeeping for the current page.
        self.current_page_font_name_id_map = {}
        self.current_page_font_char_bounding_box_map = {}
        self.current_available_fonts = {}

        # For valid character collection
        self.font_mapper = FontMapper(translation_config)
        self.tokenizer = tiktoken.encoding_for_model("gpt-4o")
        self._page_valid_chars_buffer: list[str] | None = None

    def transform_clip_path(
        self,
        clip_path,
        source_ctm: tuple[float, float, float, float, float, float],
        target_ctm: tuple[float, float, float, float, float, float],
    ):
        """Transform clip path coordinates from source CTM to target CTM."""
        if source_ctm == target_ctm:
            return clip_path

        # Calculate transformation matrix: inverse(target_ctm) * source_ctm
        inv_target_ctm = invert_matrix(target_ctm)
        transform_matrix = mult_matrix(source_ctm, inv_target_ctm)

        transformed_path = []
        for path_element in clip_path:
            if len(path_element) == 1:
                # Path operation without coordinates (e.g., 'h' for close path)
                transformed_path.append(path_element)
            else:
                # Path operation with coordinates
                op = path_element[0]
                coords = path_element[1:]
                transformed_coords = []

                # Transform coordinate pairs
                for i in range(0, len(coords), 2):
                    if i + 1 < len(coords):
                        x, y = coords[i], coords[i + 1]
                        transformed_point = apply_matrix_pt(transform_matrix, (x, y))
                        transformed_coords.extend(transformed_point)
                    else:
                        # Handle odd number of coordinates (shouldn't happen in well-formed paths)
                        transformed_coords.append(coords[i])

                transformed_path.append([op] + transformed_coords)

        return transformed_path

    def get_render_order_and_increase(self):
        self.render_order += 1
        return self.render_order

    def get_render_order(self):
        """Return the current render-order counter without changing it."""
        return self.render_order

    def on_finish(self):
        """Leave the progress context by calling its ``__exit__`` with no exception."""
        self.progress.__exit__(None, None, None)

    def is_graphic_operation(self, operator: str):
        if not self.enable_graphic_element_process:
            return False

        return re.match(
            "^(m|l|c|v|y|re|h|S|s|f|f*|F|B|B*|b|b*|n|Do)$",
            operator,
        )

    def is_passthrough_per_char_operation(self, operator: str):
        return re.match(
            "^(sc|SC|sh|scn|SCN|g|G|rg|RG|k|K|cs|CS|gs|ri|w|J|j|M|i)$",
            operator,
        )

    def can_remove_old_passthrough_per_char_instruction(self, operator: str):
        return re.match(
            "^(sc|SC|sh|scn|SCN|g|G|rg|RG|k|K|cs|CS|ri|w|J|j|M|i|d)$",
            operator,
        )

    def on_line_dash(self, dash, phase):
        """Record a dash pattern as a ``d`` passthrough instruction.

        ``dash`` is rendered as a bracketed, space-separated array and
        ``phase`` as its string form.
        """
        dash_items = " ".join(f"{arg}" for arg in dash)
        self.on_passthrough_per_char("d", [f"[{dash_items}]", str(phase)])

    def on_passthrough_per_char(self, operator: str, args: list[str]):
        """Remember a graphics-state operator for replay with later characters.

        Unknown operators are logged as errors and ignored. For operators
        accepted by ``can_remove_old_passthrough_per_char_instruction`` the
        earliest stored entry with the same operator is dropped before the
        new ``(operator, "arg arg ...")`` entry is appended.
        """
        is_known = self.is_passthrough_per_char_operation(operator) or operator in (
            "W n",
            "W* n",
            "d",
            "W",
            "W*",
        )
        if not is_known:
            logger.error("Unknown passthrough_per_char operation: %s", operator)
            return

        rendered_args = [self.parse_arg(arg) for arg in args]
        if self.can_remove_old_passthrough_per_char_instruction(operator):
            instructions = self.passthrough_per_char_instruction
            stale = next(
                (entry for entry in instructions if entry[0] == operator),
                None,
            )
            if stale is not None:
                instructions.remove(stale)
        self.passthrough_per_char_instruction.append(
            (operator, " ".join(rendered_args))
        )

    def remove_latest_passthrough_per_char_instruction(self):
        if self.passthrough_per_char_instruction:
            self.passthrough_per_char_instruction.pop()

    def parse_arg(self, arg: str):
        """Render one operand as content-stream text.

        ``PSLiteral`` becomes ``/name``, floats use fixed-point ``f``
        formatting, strings are returned unchanged and everything else goes
        through ``str()``.
        """
        if isinstance(arg, PSLiteral):
            return f"/{arg.name}"
        if isinstance(arg, float):
            return f"{arg:f}"
        if isinstance(arg, str):
            return arg
        return str(arg)

    def pop_passthrough_per_char_instruction(self):
        if self.passthrough_per_char_instruction_stack:
            self.passthrough_per_char_instruction = (
                self.passthrough_per_char_instruction_stack.pop()
            )
        else:
            self.passthrough_per_char_instruction = []
            logging.error(
                "pop_passthrough_per_char_instruction error on page: %s",
                self.current_page.page_number,
            )

        if self.clip_paths_stack:
            self.current_clip_paths = self.clip_paths_stack.pop()
        else:
            self.current_clip_paths = []

    def push_passthrough_per_char_instruction(self):
        self.passthrough_per_char_instruction_stack.append(
            self.passthrough_per_char_instruction.copy(),
        )
        self.clip_paths_stack.append(self.current_clip_paths.copy())

    # pdf32000 page 171
    def on_stroking_color_space(self, color_space_name):
        """Remember the name of the current stroking color space."""
        self.stroking_color_space_name = color_space_name

    def on_non_stroking_color_space(self, color_space_name):
        """Remember the name of the current non-stroking color space."""
        self.non_stroking_color_space_name = color_space_name

    def on_new_stream(self):
        self.stroking_color_space_name = None
        self.non_stroking_color_space_name = None
        self.passthrough_per_char_instruction = []
        self.current_clip_paths = []

    def push_xobj(self):
        self.xobj_stack.append(
            (
                self.xobj_id,
                self.current_clip_paths.copy(),
                self.current_available_fonts.copy(),
            ),
        )
        self.current_clip_paths = []

    def pop_xobj(self):
        (self.xobj_id, self.current_clip_paths, self.current_available_fonts) = (
            self.xobj_stack.pop()
        )

    def on_xobj_begin(self, bbox, xref_id):
        """Enter a form XObject and register it on the current page.

        Saves the passthrough and XObject state, allocates a fresh xobj id,
        creates the ``PdfXobject`` record (seeded with the currently
        available fonts) and returns the new id.
        """
        logger.debug(f"on_xobj_begin: {bbox} @ {xref_id}")
        self.push_passthrough_per_char_instruction()
        self.push_xobj()
        self.xobj_inc += 1
        new_id = self.xobj_inc
        self.xobj_id = new_id
        xobj_box = il_version_1.Box(
            x=float(bbox[0]),
            y=float(bbox[1]),
            x2=float(bbox[2]),
            y2=float(bbox[3]),
        )
        record = il_version_1.PdfXobject(
            box=xobj_box,
            xobj_id=new_id,
            xref_id=xref_id,
            pdf_font=[],
        )
        self.current_page.pdf_xobject.append(record)
        self.xobj_map[new_id] = record
        record.pdf_font.extend(self.current_available_fonts.values())
        return new_id

    def on_xobj_end(self, xobj_id, base_op):
        """Leave a form XObject and store its compressed base operations.

        Restores the state saved on entry, attaches ``base_op`` (zstd
        compressed) to the record of ``xobj_id`` and bumps ``xobj_inc``.
        """
        self.pop_passthrough_per_char_instruction()
        self.pop_xobj()
        record = self.xobj_map[xobj_id]
        compressed = zstd_helper.zstd_compress(base_op)
        record.base_operations = il_version_1.BaseOperations(value=compressed)
        self.xobj_inc += 1

    def on_page_start(self):
        """Create a fresh page, reset per-page state and add the page to the document."""
        new_page = il_version_1.Page(
            pdf_font=[],
            pdf_character=[],
            page_layout=[],
            pdf_curve=[],
            pdf_form=[],
            # currently don't support UserUnit page parameter
            # pdf32000 page 79
            unit="point",
        )
        self.current_page = new_page

        # Font lookups are per page.
        self.current_page_font_name_id_map = {}
        self.current_page_font_char_bounding_box_map = {}

        # Saved-state stacks and graphics state start empty on every page.
        self.passthrough_per_char_instruction_stack = []
        self.xobj_stack = []
        self.non_stroking_color_space_name = None
        self.stroking_color_space_name = None
        self.current_clip_paths = []
        self.clip_paths_stack = []

        self.docs.page.append(new_page)
        # Prepare per-page buffer for valid characters on translated pages
        self._page_valid_chars_buffer = []

    def on_page_end(self):
        # Accumulate this page's valid characters and tokens into shared context
        try:
            if (
                self._page_valid_chars_buffer is not None
                and len(self._page_valid_chars_buffer) > 0
            ):
                page_text = "".join(self._page_valid_chars_buffer)
                char_count = len(page_text)
                try:
                    token_count = len(
                        self.tokenizer.encode(page_text, disallowed_special=())
                    )
                except Exception as e:
                    logger.warning("Failed to compute token count for page: %s", e)
                    token_count = 0
                self.translation_config.shared_context_cross_split_part.add_valid_counts(
                    char_count, token_count
                )
        except Exception as e:
            logger.warning("Failed to accumulate page valid stats: %s", e)
        finally:
            self._page_valid_chars_buffer = []
        self.progress.advance(1)

    def on_page_crop_box(
        self,
        x0: float | int,
        y0: float | int,
        x1: float | int,
        y1: float | int,
    ):
        """Store the crop box of the current page, with coordinates as floats."""
        left, bottom, right, top = map(float, (x0, y0, x1, y1))
        crop = il_version_1.Box(x=left, y=bottom, x2=right, y2=top)
        self.current_page.cropbox = il_version_1.Cropbox(box=crop)

    def on_page_media_box(
        self,
        x0: float | int,
        y0: float | int,
        x1: float | int,
        y1: float | int,
    ):
        """Store the media box of the current page, with coordinates as floats."""
        left, bottom, right, top = map(float, (x0, y0, x1, y1))
        media = il_version_1.Box(x=left, y=bottom, x2=right, y2=top)
        self.current_page.mediabox = il_version_1.Mediabox(box=media)

    def on_page_number(self, page_number: int):
        """Record ``page_number`` on the current page.

        Asserts that the value is an ``int`` and not negative.
        """
        assert isinstance(page_number, int) and page_number >= 0
        self.current_page.page_number = page_number

    def on_page_base_operation(self, operation: str):
        """Attach the page's base operations to the current page.

        ``operation`` is passed through ``zstd_helper.zstd_compress`` and the
        result is stored in an ``il_version_1.BaseOperations`` wrapper.
        """
        compressed = zstd_helper.zstd_compress(operation)
        self.current_page.base_operations = il_version_1.BaseOperations(
            value=compressed
        )

    def on_page_resource_font(self, font: PDFFont, xref_id: int, font_id: str):
        """Register a font resource for the current page or XObject.

        Builds an ``il_version_1.PdfFont`` entry (name, encoding length, style
        flags, ascent/descent and per-glyph bounding boxes), stores the glyph
        bounding boxes in ``current_page_font_char_bounding_box_map`` and
        replaces any font already registered under the same ``font_id``.

        Args:
            font: Font object of the resource; ``fontname``, ``ascent`` and
                ``descent`` are read from it.
            xref_id: Xref number of the font object in ``self.mupdf``. When it
                is None, glyph bounding boxes are not parsed and an error is
                logged.
            font_id: Identifier the font is registered under.
        """
        font_name = font.fontname
        logger.debug(f"handle font {font_name} @ {xref_id} in {self.xobj_id}")
        if isinstance(font_name, bytes):
            try:
                font_name = font_name.decode("utf-8")
            except UnicodeDecodeError:
                # Keep undecodable names lossless by base64-encoding the bytes.
                font_name = "BASE64:" + base64.b64encode(font_name).decode("utf-8")

        # Number of bytes per character code; only CID fonts may use more than 1.
        encoding_length = 1
        if isinstance(font, PDFCIDFont):
            try:
                # pdf 32000:2008 page 273
                # Table 118 - Predefined CJK CMap names
                _, encoding = self.mupdf.xref_get_key(xref_id, "Encoding")
                if encoding == "/Identity-H" or encoding == "/Identity-V":
                    encoding_length = 2
                elif encoding == "/WinAnsiEncoding":
                    encoding_length = 1
                else:
                    _, to_unicode_id = self.mupdf.xref_get_key(xref_id, "ToUnicode")
                    if to_unicode_id is not None:
                        to_unicode_bytes = self.mupdf.xref_stream(
                            int(to_unicode_id.split(" ")[0]),
                        )
                        # The hex width of the first code space range gives the
                        # code length; a missing match raises and falls through
                        # to the heuristic below.
                        code_range = re.search(
                            b"begincodespacerange\n?.*<(\\d+?)>.*",
                            to_unicode_bytes,
                        ).group(1)
                        encoding_length = len(code_range) // 2
            except Exception:
                # Fallback heuristic: any CID above 255 needs two bytes.
                if (
                    font.unicode_map
                    and font.unicode_map.cid2unichr
                    and max(font.unicode_map.cid2unichr.keys()) > 255
                ):
                    encoding_length = 2
                else:
                    encoding_length = 1

        # Style flags come from the embedded font program; they stay None when
        # the font cannot be extracted or loaded.
        try:
            if xref_id in self.mupdf_font_map:
                mupdf_font = self.mupdf_font_map[xref_id]
            else:
                mupdf_font = pymupdf.Font(
                    fontbuffer=self.mupdf.extract_font(xref_id)[3]
                )
                mupdf_font.has_glyph = functools.lru_cache(maxsize=10240, typed=True)(
                    mupdf_font.has_glyph,
                )
            bold = mupdf_font.is_bold
            italic = mupdf_font.is_italic
            monospaced = mupdf_font.is_monospaced
            serif = mupdf_font.is_serif
            self.mupdf_font_map[xref_id] = mupdf_font
        except Exception:
            bold = None
            italic = None
            monospaced = None
            serif = None

        il_font_metadata = il_version_1.PdfFont(
            name=font_name,
            xref_id=xref_id,
            font_id=font_id,
            encoding_length=encoding_length,
            bold=bold,
            italic=italic,
            monospace=monospaced,
            serif=serif,
            ascent=font.ascent,
            descent=font.descent,
            pdf_font_char_bounding_box=[],
        )

        try:
            if xref_id is None:
                logger.warning("xref_id is None for font %s", font_name)
                # Format the message eagerly: exceptions do not interpolate
                # %-style arguments the way logging calls do.
                raise ValueError(f"xref_id is None for font {font_name}")
            bbox_list, cmap = self.parse_font_xobj_id(xref_id)
            if not cmap:
                cmap = {x: x for x in range(257)}
            # Every parsed glyph box is available for lookup by char id ...
            font_char_bounding_box_map = dict(enumerate(bbox_list))
            # ... but only mapped, non-default boxes are written to the IL.
            for char_id in cmap:
                if char_id < 0 or char_id >= len(bbox_list):
                    continue
                x, y, x2, y2 = bbox_list[char_id]
                if (x, y, x2, y2) in ((0, 0, 500, 698), (0, 0, 0, 0)):
                    # ignore default bounding box
                    continue
                il_font_metadata.pdf_font_char_bounding_box.append(
                    il_version_1.PdfFontCharBoundingBox(
                        x=x,
                        y=y,
                        x2=x2,
                        y2=y2,
                        char_id=char_id,
                    )
                )
            if self.xobj_id in self.xobj_map:
                # Fonts declared inside an XObject are keyed by xobj id first.
                if self.xobj_id not in self.current_page_font_char_bounding_box_map:
                    self.current_page_font_char_bounding_box_map[self.xobj_id] = {}
                self.current_page_font_char_bounding_box_map[self.xobj_id][xref_id] = (
                    font_char_bounding_box_map
                )
            else:
                self.current_page_font_char_bounding_box_map[xref_id] = (
                    font_char_bounding_box_map
                )
        except Exception as e:
            if xref_id is None:
                logger.error("failed to parse font xobj id None: %s", e)
            else:
                logger.error("failed to parse font xobj id %d: %s", xref_id, e)

        self.current_page_font_name_id_map[xref_id] = font_id
        self.current_available_fonts[font_id] = il_font_metadata

        fonts = self.current_page.pdf_font
        if self.xobj_id in self.xobj_map:
            fonts = self.xobj_map[self.xobj_id].pdf_font
        # Replace, in place, any font previously registered under this font_id.
        fonts[:] = [f for f in fonts if f.font_id != font_id]
        fonts.append(il_font_metadata)

    def parse_font_xobj_id(self, xobj_id: int):
        """Collect glyph bounding boxes and the ToUnicode map of a font object.

        Returns a ``(bbox_list, cmap)`` pair; both are empty when ``xobj_id``
        is None. Bounding boxes are taken from the embedded font file when one
        is referenced, otherwise from ``get_base14_bbox`` for a named
        ``BaseFont``. A truthy ``get_cidfont_bbox`` result overrides either,
        and a ``/Type3`` subtype overrides everything with ``get_type3_bbox``.
        """
        if xobj_id is None:
            return [], {}

        doc = self.mupdf

        encoding = parse_font_encoding(doc, xobj_id)
        raw_differences = doc.xref_get_key(xobj_id, "Encoding/Differences")
        differences = parse_encoding(raw_differences[1]) if raw_differences else []

        # Later font-file keys win when more than one is present.
        bbox_list = []
        for file_key in ("FontFile", "FontFile2", "FontFile3"):
            font_file = doc.xref_get_key(xobj_id, f"FontDescriptor/{file_key}")
            file_idx = indirect(font_file)
            if file_idx:
                bbox_list = parse_font_file(doc, file_idx, encoding, differences)

        to_unicode_idx = indirect(doc.xref_get_key(xobj_id, "ToUnicode"))
        if to_unicode_idx:
            cmap = parse_cmap(doc.xref_stream(to_unicode_idx).decode("U8"))
        else:
            cmap = {}

        if not bbox_list:
            obj_type, obj_val = doc.xref_get_key(xobj_id, "BaseFont")
            if obj_type == "name":
                # Strip the leading "/" of the PDF name.
                bbox_list = get_base14_bbox(obj_val[1:])

        cid_bbox = get_cidfont_bbox(doc, xobj_id)
        if cid_bbox:
            bbox_list = cid_bbox

        if doc.xref_get_key(xobj_id, "Subtype")[1] == "/Type3":
            bbox_list = get_type3_bbox(doc, xobj_id)
        return bbox_list, cmap

    def create_graphic_state(
        self,
        gs: babeldoc.pdfminer.pdfinterp.PDFGraphicState | list[tuple[str, str]],
        include_clipping: bool = False,
        target_ctm: tuple[float, float, float, float, float, float] = None,
        clip_paths=None,
    ):
        """Return a pooled ``GraphicState`` for the given passthrough instructions.

        ``gs`` is either an object exposing ``passthrough_instruction`` or the
        instruction list itself, as ``(op, arg)`` pairs. Each pair is rendered
        as ``"arg op"``. Unless ``include_clipping`` is set, the ``"W n"`` and
        ``"W* n"`` operators are dropped. With ``include_clipping`` and a
        ``target_ctm``, every clip path (``clip_paths``, defaulting to
        ``self.current_clip_paths``) is transformed via
        ``self.transform_clip_path`` and appended as a clipping instruction;
        failures there are logged and skipped.
        """
        if clip_paths is None:
            clip_paths = self.current_clip_paths
        instructions = getattr(gs, "passthrough_instruction", gs)

        if include_clipping:
            parts = [f"{arg} {op}" for op, arg in instructions]
        else:
            parts = [
                f"{arg} {op}"
                for op, arg in instructions
                if op not in ("W n", "W* n")
            ]

        # Add transformed clipping paths if requested and target CTM is provided
        if include_clipping and target_ctm and clip_paths:
            for clip_path, source_ctm, evenodd in clip_paths:
                try:
                    # Transform clip path from source CTM to target CTM
                    transformed = self.transform_clip_path(
                        clip_path, source_ctm, target_ctm
                    )

                    clip_op = "W* n" if evenodd else "W n"
                    operands = []
                    for segment in transformed:
                        if len(segment) == 0:
                            continue
                        # Coordinates first, then the path operator itself.
                        operands.extend(f"{x:F}" for x in segment[1:])
                        operands.append(segment[0])

                    if operands:
                        parts.append(f"{' '.join(operands)} {clip_op}")

                except Exception as e:
                    logger.warning("Error transforming clip path: %s", e)

        pool_key = " ".join(parts)

        # Pooling may reduce the accuracy of some graphic states. BabelDOC only
        # uses passthrough_per_char_instruction, so this should have no real
        # impact, while pooling graphic states reduces memory usage.
        pool = self.graphic_state_pool
        if pool_key not in pool:
            pool[pool_key] = il_version_1.GraphicState(
                passthrough_per_char_instruction=pool_key
            )
        return pool[pool_key]

    def on_lt_char(self, char: LTChar):
        """Convert a parsed character into a ``PdfCharacter`` on the current page.

        The character is skipped when it has no ``aw_font_id``, when its
        rotation angle lies outside both [-0.1, 0.1] and [89.9, 90.1], or when
        its font size is 0. Otherwise its text is offered to the per-page
        valid-character statistics, a graphic state is created, and a
        ``PdfCharacter`` with layout and visual bounding boxes is appended to
        ``current_page.pdf_character``. When ``show_char_box`` is enabled a
        debug rectangle for the visual box is appended as well.

        Args:
            char: Character reported by the PDF interpreter.
        """
        if char.aw_font_id is None:
            return
        try:
            rotation_angle = get_rotation_angle(char.matrix)
            if not (-0.1 <= rotation_angle <= 0.1 or 89.9 <= rotation_angle <= 90.1):
                return
        except Exception:
            # Best effort: a character whose angle cannot be computed is kept.
            logger.warning(
                "Failed to get rotation angle for char %s",
                char.get_text(),
            )
        # Collect valid characters for statistics
        try:
            self._collect_valid_char(char.get_text())
        except Exception as e:
            logger.warning("Error collecting valid char: %s", e)
        gs = self.create_graphic_state(char.graphicstate)

        # Get font from current page or xobject
        font = None
        for candidate in self.xobj_map.get(char.xobj_id, self.current_page).pdf_font:
            if candidate.font_id == char.aw_font_id:
                font = candidate
                break

        # Scale the font descent to the character size; fall back to 0 when the
        # font is unknown or carries no descent value.
        descent = 0
        font_descent = getattr(font, "descent", None)
        if font_descent is not None:
            descent = font_descent * char.size / 1000

        char_id = char.cid

        # Glyph bounding box parsed from the font program, if available.
        char_bounding_box = None
        if font is not None:
            try:
                bbox_maps = self.current_page_font_char_bounding_box_map
                # XObject fonts are nested under the xobj id; page fonts are
                # keyed directly by xref id.
                font_bounding_box_map = bbox_maps.get(char.xobj_id, bbox_maps).get(
                    font.xref_id
                )
                if font_bounding_box_map:
                    char_bounding_box = font_bounding_box_map.get(char_id, None)
            except Exception:
                char_bounding_box = None

        char_unicode = char.get_text()
        if space_regex.match(char_unicode):
            char_unicode = " "
        advance = char.adv
        bbox = il_version_1.Box(
            x=char.bbox[0],
            y=char.bbox[1],
            x2=char.bbox[2],
            y2=char.bbox[3],
        )
        if bbox.x2 < bbox.x or bbox.y2 < bbox.y:
            logger.warning(
                "Invalid bounding box for character %s: %s",
                char_unicode,
                bbox,
            )

        if char.matrix[0] == 0 and char.matrix[3] == 0:
            # Treated as vertical text: the descent shifts the box along x.
            vertical = True
            visual_bbox = il_version_1.Box(
                x=char.bbox[0] - descent,
                y=char.bbox[1],
                x2=char.bbox[2] - descent,
                y2=char.bbox[3],
            )
        else:
            vertical = False
            # Add descent to y coordinates
            visual_bbox = il_version_1.Box(
                x=char.bbox[0],
                y=char.bbox[1] + descent,
                x2=char.bbox[2],
                y2=char.bbox[3] + descent,
            )
        visual_bbox = il_version_1.VisualBbox(box=visual_bbox)
        pdf_style = il_version_1.PdfStyle(
            font_id=char.aw_font_id,
            font_size=char.size,
            graphic_state=gs,
        )

        pdf_char = il_version_1.PdfCharacter(
            box=bbox,
            pdf_character_id=char_id,
            advance=advance,
            char_unicode=char_unicode,
            vertical=vertical,
            pdf_style=pdf_style,
            xobj_id=char.xobj_id,
            visual_bbox=visual_bbox,
            render_order=char.render_order,
            sub_render_order=0,
        )
        if self.translation_config.ocr_workaround:
            pdf_char.pdf_style.graphic_state = BLACK
            pdf_char.render_order = None
        if pdf_style.font_size == 0.0:
            logger.warning(
                "Font size is 0.0 for character %s. Skip it.",
                char_unicode,
            )
            return

        if char_bounding_box and len(char_bounding_box) == 4:
            # Prefer the glyph's own box when it is usable: scale it by
            # font_size / 1000 and anchor it at the character origin.
            x_min, y_min, x_max, y_max = char_bounding_box
            factor = 1 / 1000 * pdf_style.font_size
            x_min = x_min * factor
            y_min = y_min * factor
            x_max = x_max * factor
            y_max = y_max * factor
            ll = (char.bbox[0] + x_min, char.bbox[1] + y_min)
            ur = (char.bbox[0] + x_max, char.bbox[1] + y_max)

            # Ignore degenerate glyph boxes (area of at most 1).
            area = (ur[0] - ll[0]) * (ur[1] - ll[1])
            if area > 1:
                pdf_char.visual_bbox = il_version_1.VisualBbox(
                    il_version_1.Box(ll[0], ll[1], ur[0], ur[1])
                )

        self.current_page.pdf_character.append(pdf_char)

        if self.translation_config.show_char_box:
            self.current_page.pdf_rectangle.append(
                il_version_1.PdfRectangle(
                    box=pdf_char.visual_bbox.box,
                    graphic_state=YELLOW,
                    debug_info=True,
                    line_width=0.2,
                )
            )

    def _collect_valid_char(self, ch: str):
        """Append a valid character into the current page buffer according to rules.
        Rules:
        - Include whitespace matched by space_regex directly.
        - Ignore categories that are never normal text: {Cc, Cs, Co, Cn}.
        - Apply inverted criteria from formular_helper.py (21-28):
          empty -> invalid, contains '(cid:' -> invalid,
          not has_char(ch) -> invalid unless len(ch) > 1 and all(has_char(x)).
        """
        if self._page_valid_chars_buffer is None:
            return
        if space_regex.match(ch):
            self._page_valid_chars_buffer.append(ch)
            return
        try:
            cat = unicodedata.category(ch[0]) if ch else None
        except Exception:
            cat = None
        if cat in {"Cc", "Cs", "Co", "Cn"}:
            return
        is_invalid = False
        if not ch:
            is_invalid = True
        elif "(cid:" in ch:
            is_invalid = True
        else:
            try:
                if not self.font_mapper.has_char(ch):
                    if len(ch) > 1 and all(self.font_mapper.has_char(x) for x in ch):
                        is_invalid = False
                    else:
                        is_invalid = True
            except Exception:
                is_invalid = True
        if not is_invalid:
            self._page_valid_chars_buffer.append(ch)

    def on_lt_curve(self, curve: babeldoc.pdfminer.layout.LTCurve):
        """Convert a parsed curve into a ``PdfCurve`` on the current page.

        Does nothing unless ``enable_graphic_element_process`` is set. The
        curve's ``original_path`` is flattened into ``PdfPath`` points: every
        intermediate point gets an empty operator and the last point of a
        segment carries the segment's operator. When the curve has a
        ``raw_path`` it is flattened the same way into ``PdfOriginalPath``
        entries.
        """
        if not self.enable_graphic_element_process:
            return

        x0, y0, x1, y1 = curve.bbox[0], curve.bbox[1], curve.bbox[2], curve.bbox[3]
        bbox = il_version_1.Box(x=x0, y=y0, x2=x1, y2=y1)

        # Extract CTM from curve object if it exists
        ctm = getattr(curve, "ctm", None)
        gs = self.create_graphic_state(
            curve.passthrough_instruction,
            include_clipping=True,
            target_ctm=ctm,
            clip_paths=curve.clip_paths,
        )

        paths = []
        for segment in curve.original_path:
            op = segment[0]
            if len(segment) == 1:
                # Operator without coordinates.
                paths.append(il_version_1.PdfPath(op=op, x=None, y=None, has_xy=False))
                continue
            paths.extend(
                il_version_1.PdfPath(op="", x=pt[0], y=pt[1], has_xy=True)
                for pt in segment[1:-1]
            )
            end_point = segment[-1]
            paths.append(
                il_version_1.PdfPath(
                    op=op, x=end_point[0], y=end_point[1], has_xy=True
                )
            )

        # Extract raw path from curve object if it exists
        raw_path = getattr(curve, "raw_path", None)
        raw_pdf_paths = None
        if raw_path is not None:
            raw_pdf_paths = []
            for entry in raw_path:
                raw_op = entry[0]
                if raw_op == "h":  # h command (close path)
                    raw_pdf_paths.append(
                        il_version_1.PdfOriginalPath(
                            pdf_path=il_version_1.PdfPath(
                                x=0.0, y=0.0, op=raw_op, has_xy=False
                            )
                        )
                    )
                    continue
                # commands with coordinates (m, l, c, v, y, etc.)
                for pair in batched(entry[1:-2], 2, strict=True):
                    raw_pdf_paths.append(
                        il_version_1.PdfOriginalPath(
                            pdf_path=il_version_1.PdfPath(
                                x=float(pair[0]), y=float(pair[1]), op="", has_xy=True
                            )
                        )
                    )
                # Last point in the path
                raw_pdf_paths.append(
                    il_version_1.PdfOriginalPath(
                        pdf_path=il_version_1.PdfPath(
                            x=float(entry[-2]),
                            y=float(entry[-1]),
                            op=raw_op,
                            has_xy=True,
                        )
                    )
                )

        self.current_page.pdf_curve.append(
            il_version_1.PdfCurve(
                box=bbox,
                graphic_state=gs,
                pdf_path=paths,
                fill_background=curve.fill,
                stroke_path=curve.stroke,
                evenodd=curve.evenodd,
                debug_info="a",
                xobj_id=curve.xobj_id,
                render_order=curve.render_order,
                ctm=list(ctm) if ctm is not None else None,
                pdf_original_path=raw_pdf_paths,
            )
        )

    def on_xobj_form(
        self,
        ctm: tuple[float, float, float, float, float, float],
        xobj_id: int,
        xref_id: int,
        form_type: Literal["image", "form"],
        do_args: str,
        bbox: tuple[float, float, float, float],
        matrix: tuple[float, float, float, float, float, float],
    ):
        """Record an XObject invocation as a ``PdfForm`` on the current page.

        The form's bounding box is obtained by mapping the four corners of
        ``bbox`` through ``matrix`` combined with ``ctm``. The form keeps the
        CTM (as ``PdfMatrix``, as a decomposed affine transform and as a plain
        list), a graphic state including clipping, and a fresh render order.
        """
        logger.debug(f"on_xobj_form: {do_args}[{bbox}] @ {xref_id} in {self.xobj_id}")

        combined = mult_matrix(matrix, ctm)
        (x, y, w, h) = guarded_bbox(bbox)
        corners = ((x, y), (x + w, y), (x, y + h), (x + w, y + h))
        bound = get_bound(apply_matrix_pt(combined, (p, q)) for (p, q) in corners)

        gs = self.create_graphic_state(
            self.passthrough_per_char_instruction, include_clipping=True, target_ctm=ctm
        )

        figure_bbox = il_version_1.Box(
            x=bound[0], y=bound[1], x2=bound[2], y2=bound[3]
        )
        pdf_matrix = il_version_1.PdfMatrix(
            a=ctm[0], b=ctm[1], c=ctm[2], d=ctm[3], e=ctm[4], f=ctm[5]
        )
        affine_transform = decompose_ctm(ctm)
        subtype = il_version_1.PdfFormSubtype(
            pdf_xobj_form=il_version_1.PdfXobjForm(xref_id=xref_id, do_args=do_args),
        )

        self.current_page.pdf_form.append(
            il_version_1.PdfForm(
                xobj_id=xobj_id,
                box=figure_bbox,
                pdf_matrix=pdf_matrix,
                graphic_state=gs,
                pdf_affine_transform=affine_transform,
                render_order=self.get_render_order_and_increase(),
                form_type=form_type,
                pdf_form_subtype=subtype,
                ctm=list(ctm),
            )
        )

    def on_pdf_clip_path(
        self,
        clip_path,
        evenodd: bool,
        ctm: tuple[float, float, float, float, float, float],
    ):
        """Remember a clip path together with its CTM and fill rule.

        A copy of ``clip_path`` is appended to ``current_clip_paths`` as a
        ``(path, ctm, evenodd)`` tuple. Any failure is logged as a warning.
        """
        try:
            snapshot = clip_path.copy()
            self.current_clip_paths.append((snapshot, ctm, evenodd))
        except Exception as e:
            logger.warning("Error in on_pdf_clip_path: %s", e)

    def create_il(self):
        """Keep only the pages selected for translation and return the document.

        ``translation_config.should_translate_page`` is asked with
        ``page_number + 1`` for every page in ``self.docs.page``.
        """
        selected = []
        for page in self.docs.page:
            if self.translation_config.should_translate_page(page.page_number + 1):
                selected.append(page)
        self.docs.page = selected
        return self.docs

    def on_total_pages(self, total_pages: int):
        """Record the page count and start the progress stage.

        Asserts that ``total_pages`` is a positive ``int``. The stage is
        started with the number of pages for which ``should_translate_page``
        (asked with page numbers starting at 1) does not return ``False``.
        """
        assert isinstance(total_pages, int) and total_pages > 0
        self.docs.total_pages = total_pages
        should_translate = self.translation_config.should_translate_page
        total = sum(
            1
            for page_index in range(total_pages)
            if should_translate(page_index + 1) is not False
        )
        self.progress = self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            total,
        )

    def on_pdf_figure(self, figure: LTFigure):
        """Append a ``PdfFigure`` with the figure's bounding box to the current page."""
        bounds = figure.bbox
        figure_box = il_version_1.Box(bounds[0], bounds[1], bounds[2], bounds[3])
        self.current_page.pdf_figure.append(il_version_1.PdfFigure(box=figure_box))

    def on_inline_image_begin(self):
        """Reset the inline-image state at the start of an inline image.

        ``_inline_image_state`` is replaced by a fresh mapping with ``ctm`` set
        to None and an empty ``parameters`` dict.
        """
        self._inline_image_state = dict(ctm=None, parameters={})

    def on_inline_image_end(self, stream_obj, ctm):
        """Turn a finished inline image into a ``PdfForm`` on the current page.

        The stream's ``attrs`` become a JSON parameter string (values with a
        ``name`` attribute contribute that name, everything else its ``str``),
        and the stream's ``data`` (or ``rawdata`` when ``data`` is missing or
        None) is stored base64-encoded. The form's box is the unit square
        mapped through ``ctm``.
        """
        import base64
        import json

        from babeldoc.format.pdf.babelpdf.utils import guarded_bbox
        from babeldoc.format.pdf.document_il.utils.matrix_helper import decompose_ctm
        from babeldoc.pdfminer.utils import apply_matrix_pt
        from babeldoc.pdfminer.utils import get_bound

        # Extract image parameters from stream dictionary
        image_dict = getattr(stream_obj, "attrs", {})
        parameters = {
            key: (value.name if hasattr(value, "name") else str(value))
            for key, value in image_dict.items()
        }

        # Get image data (encoded as base64), preferring ``data`` over ``rawdata``
        payload = getattr(stream_obj, "data", None)
        if payload is None:
            payload = getattr(stream_obj, "rawdata", None)
        image_data = ""
        if payload is not None:
            image_data = base64.b64encode(payload).decode("ascii")

        # Create inline form with parameters as JSON string
        inline_form = il_version_1.PdfInlineForm(
            form_data=image_data, image_parameters=json.dumps(parameters)
        )

        # Calculate bounding box - inline images are typically 1x1 unit square in user space
        (x, y, w, h) = guarded_bbox((0, 0, 1, 1))
        corners = ((x, y), (x + w, y), (x, y + h), (x + w, y + h))
        final_bbox = get_bound(apply_matrix_pt(ctm, (p, q)) for (p, q) in corners)

        gs = self.create_graphic_state(
            self.passthrough_per_char_instruction, include_clipping=True, target_ctm=ctm
        )
        pdf_matrix = il_version_1.PdfMatrix(
            a=ctm[0], b=ctm[1], c=ctm[2], d=ctm[3], e=ctm[4], f=ctm[5]
        )
        affine_transform = decompose_ctm(ctm)
        pdf_form_subtype = il_version_1.PdfFormSubtype(pdf_inline_form=inline_form)
        form_box = il_version_1.Box(
            x=final_bbox[0],
            y=final_bbox[1],
            x2=final_bbox[2],
            y2=final_bbox[3],
        )

        # Create PdfForm for the inline image and add it to the current page
        self.current_page.pdf_form.append(
            il_version_1.PdfForm(
                box=form_box,
                graphic_state=gs,
                pdf_matrix=pdf_matrix,
                pdf_affine_transform=affine_transform,
                pdf_form_subtype=pdf_form_subtype,
                xobj_id=self.xobj_id,
                ctm=list(ctm),
                render_order=self.get_render_order_and_increase(),
                form_type="image",
            )
        )


================================================
FILE: babeldoc/format/pdf/document_il/il_version_1.py
================================================
from dataclasses import dataclass
from dataclasses import field


@dataclass(slots=True)
class BaseOperations:
    class Meta:
        name = "baseOperations"

    value: str = field(
        default="",
        metadata={
            "required": True,
        },
    )


@dataclass(slots=True)
class Box:
    class Meta:
        name = "box"

    x: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    y: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    x2: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    y2: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )


@dataclass(slots=True)
class GraphicState:
    class Meta:
        name = "graphicState"

    passthrough_per_char_instruction: str | None = field(
        default=None,
        metadata={
            "name": "passthroughPerCharInstruction",
            "type": "Attribute",
        },
    )


@dataclass(slots=True)
class PdfAffineTransform:
    class Meta:
        name = "pdfAffineTransform"

    translation_x: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    translation_y: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    rotation: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    scale_x: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    scale_y: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    shear: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )


@dataclass(slots=True)
class PdfFontCharBoundingBox:
    class Meta:
        name = "pdfFontCharBoundingBox"

    x: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    y: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    x2: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    y2: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    char_id: int | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )


@dataclass(slots=True)
class PdfInlineForm:
    class Meta:
        name = "pdfInlineForm"

    form_data: str | None = field(
        default=None,
        metadata={
            "name": "formData",
            "type": "Attribute",
        },
    )
    image_parameters: str | None = field(
        default=None,
        metadata={
            "name": "imageParameters",
            "type": "Attribute",
        },
    )


@dataclass(slots=True)
class PdfMatrix:
    class Meta:
        name = "pdfMatrix"

    a: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    b: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    c: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    d: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    e: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    f: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )


@dataclass(slots=True)
class PdfPath:
    class Meta:
        name = "pdfPath"

    x: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    y: float | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    op: str | None = field(
        default=None,
        metadata={
            "type": "Attribute",
            "required": True,
        },
    )
    has_xy: bool | None = field(
        default=None,
        metadata={
            "type": "Attribute",
        },
    )


@dataclass(slots=True)
class PdfXobjForm:
    class Meta:
        name = "pdfXobjForm"

    xref_id: int | None = field(
        default=None,
        metadata={
            "name": "xrefId",
            "type": "Attribute",
            "required": True,
        },
    )
    do_args: str | None = field(
        default=None,
        metadata={
            "name": "doArgs",
            "type": "Attribute",
            "required": True,
        },
    )


@dataclass(slots=True)
class Cropbox:
    """Binding for the ``cropbox`` XML element, a wrapper around one ``box``."""

    class Meta:
        name = "cropbox"

    # Required child element; None until it is populated.
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )


@dataclass(slots=True)
class Mediabox:
    """Binding for the ``mediabox`` XML element, a wrapper around one ``box``."""

    class Meta:
        name = "mediabox"

    # Required child element; None until it is populated.
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )


@dataclass(slots=True)
class PageLayout:
    """Binding for the ``pageLayout`` XML element.

    Holds one ``box`` child plus the ``id``, ``conf`` and ``class_name``
    attributes. All four are flagged as required and default to ``None``.
    """

    class Meta:
        name = "pageLayout"

    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    id: int | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
    conf: float | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
    class_name: str | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )


@dataclass(slots=True)
class PdfFigure:
    """Binding for the ``pdfFigure`` XML element; its only content is a ``box``."""

    class Meta:
        name = "pdfFigure"

    # Required child element; None until it is populated.
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )


@dataclass(slots=True)
class PdfFont:
    """Binding for the ``pdfFont`` XML element.

    ``name``, ``font_id``, ``xref_id`` and ``encoding_length`` are flagged as
    required attributes. The style flags and the ascent/descent metrics are
    optional. Per-character bounding boxes are kept in a list that starts
    out empty.
    """

    class Meta:
        name = "pdfFont"

    # Repeated child element ``pdfFontCharBoundingBox``.
    pdf_font_char_bounding_box: list[PdfFontCharBoundingBox] = field(
        default_factory=list,
        metadata={"name": "pdfFontCharBoundingBox", "type": "Element"},
    )

    # Required attributes.
    name: str | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
    font_id: str | None = field(
        default=None,
        metadata={"name": "fontId", "type": "Attribute", "required": True},
    )
    xref_id: int | None = field(
        default=None,
        metadata={"name": "xrefId", "type": "Attribute", "required": True},
    )
    encoding_length: int | None = field(
        default=None,
        metadata={"name": "encodingLength", "type": "Attribute", "required": True},
    )

    # Optional style flags.
    bold: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    italic: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    monospace: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    serif: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )

    # Optional metrics.
    ascent: float | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    descent: float | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )


@dataclass(slots=True)
class PdfFormSubtype:
    """Binding for the ``pdfFormSubtype`` XML element.

    Can carry an inline form or an XObject form. Both children are optional
    on the dataclass; nothing in this class enforces that exactly one of
    them is set.
    """

    class Meta:
        name = "pdfFormSubtype"

    pdf_inline_form: PdfInlineForm | None = field(
        default=None,
        metadata={"name": "pdfInlineForm", "type": "Element"},
    )
    pdf_xobj_form: PdfXobjForm | None = field(
        default=None,
        metadata={"name": "pdfXobjForm", "type": "Element"},
    )


@dataclass(slots=True)
class PdfOriginalPath:
    """Binding for the ``pdfOriginalPath`` XML element, a wrapper around one ``pdfPath``."""

    class Meta:
        name = "pdfOriginalPath"

    # Required child element; None until it is populated.
    pdf_path: PdfPath | None = field(
        default=None,
        metadata={"name": "pdfPath", "type": "Element", "required": True},
    )


@dataclass(slots=True)
class PdfRectangle:
    """Binding for the ``pdfRectangle`` XML element.

    The ``box`` and ``graphicState`` children are flagged as required; all
    attributes are optional. Every field defaults to ``None``.
    """

    class Meta:
        name = "pdfRectangle"

    # Child elements.
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    graphic_state: GraphicState | None = field(
        default=None,
        metadata={"name": "graphicState", "type": "Element", "required": True},
    )

    # Optional attributes.
    debug_info: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    fill_background: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    xobj_id: int | None = field(
        default=None,
        metadata={"name": "xobjId", "type": "Attribute"},
    )
    line_width: float | None = field(
        default=None,
        metadata={"name": "lineWidth", "type": "Attribute"},
    )
    render_order: int | None = field(
        default=None,
        metadata={"name": "renderOrder", "type": "Attribute"},
    )


@dataclass(slots=True)
class PdfStyle:
    """Binding for the ``pdfStyle`` XML element.

    Combines a ``graphicState`` child with the ``font_id`` and ``font_size``
    attributes. All three are flagged as required and default to ``None``.
    """

    class Meta:
        name = "pdfStyle"

    graphic_state: GraphicState | None = field(
        default=None,
        metadata={"name": "graphicState", "type": "Element", "required": True},
    )
    font_id: str | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
    font_size: float | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )


@dataclass(slots=True)
class VisualBbox:
    """Binding for the ``visual_bbox`` XML element, a wrapper around one ``box``."""

    class Meta:
        name = "visual_bbox"

    # Required child element; None until it is populated.
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )


@dataclass(slots=True)
class PdfCharacter:
    """Binding for the ``pdfCharacter`` XML element.

    Flagged as required: the ``pdfStyle`` and ``box`` children and the
    ``char_unicode`` attribute. The ``visual_bbox`` child and every other
    attribute are optional. All fields default to ``None``.
    """

    class Meta:
        name = "pdfCharacter"

    # Child elements.
    pdf_style: PdfStyle | None = field(
        default=None,
        metadata={"name": "pdfStyle", "type": "Element", "required": True},
    )
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    visual_bbox: VisualBbox | None = field(
        default=None,
        metadata={"type": "Element"},
    )

    # Attributes.
    vertical: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    scale: float | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    pdf_character_id: int | None = field(
        default=None,
        metadata={"name": "pdfCharacterId", "type": "Attribute"},
    )
    char_unicode: str | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
    advance: float | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    xobj_id: int | None = field(
        default=None,
        metadata={"name": "xobjId", "type": "Attribute"},
    )
    debug_info: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    formula_layout_id: int | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    render_order: int | None = field(
        default=None,
        metadata={"name": "renderOrder", "type": "Attribute"},
    )
    sub_render_order: int | None = field(
        default=None,
        metadata={"name": "subRenderOrder", "type": "Attribute"},
    )


@dataclass(slots=True)
class PdfCurve:
    """Binding for the ``pdfCurve`` XML element.

    The ``box`` and ``graphicState`` children are flagged as required. Path
    and original-path children are kept in lists that start out empty, and
    every attribute is optional.
    """

    class Meta:
        name = "pdfCurve"

    # Child elements.
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    graphic_state: GraphicState | None = field(
        default=None,
        metadata={"name": "graphicState", "type": "Element", "required": True},
    )
    pdf_path: list[PdfPath] = field(
        default_factory=list,
        metadata={"name": "pdfPath", "type": "Element"},
    )
    pdf_original_path: list[PdfOriginalPath] = field(
        default_factory=list,
        metadata={"name": "pdfOriginalPath", "type": "Element"},
    )

    # Optional scalar attributes.
    debug_info: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    fill_background: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    stroke_path: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    evenodd: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    xobj_id: int | None = field(
        default=None,
        metadata={"name": "xobjId", "type": "Attribute"},
    )
    render_order: int | None = field(
        default=None,
        metadata={"name": "renderOrder", "type": "Attribute"},
    )

    # Token-list attributes with a declared length of 6; the item type is
    # left as ``object`` and the default is an empty list.
    ctm: list[object] = field(
        default_factory=list,
        metadata={"type": "Attribute", "length": 6, "tokens": True},
    )
    relocation_transform: list[object] = field(
        default_factory=list,
        metadata={"type": "Attribute", "length": 6, "tokens": True},
    )


@dataclass(slots=True)
class PdfForm:
    """Binding for the ``pdfForm`` XML element.

    All five child elements and the ``xobjId``, ``renderOrder`` and
    ``formType`` attributes are flagged as required. The two token-list
    attributes ``ctm`` and ``relocation_transform`` are optional and start
    out as empty lists.
    """

    class Meta:
        name = "pdfForm"

    # Child elements.
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    graphic_state: GraphicState | None = field(
        default=None,
        metadata={"name": "graphicState", "type": "Element", "required": True},
    )
    pdf_matrix: PdfMatrix | None = field(
        default=None,
        metadata={"name": "pdfMatrix", "type": "Element", "required": True},
    )
    pdf_affine_transform: PdfAffineTransform | None = field(
        default=None,
        metadata={"name": "pdfAffineTransform", "type": "Element", "required": True},
    )
    pdf_form_subtype: PdfFormSubtype | None = field(
        default=None,
        metadata={"name": "pdfFormSubtype", "type": "Element", "required": True},
    )

    # Attributes.
    xobj_id: int | None = field(
        default=None,
        metadata={"name": "xobjId", "type": "Attribute", "required": True},
    )
    # Token-list attributes with a declared length of 6; the item type is
    # left as ``object``.
    ctm: list[object] = field(
        default_factory=list,
        metadata={"type": "Attribute", "length": 6, "tokens": True},
    )
    relocation_transform: list[object] = field(
        default_factory=list,
        metadata={"type": "Attribute", "length": 6, "tokens": True},
    )
    render_order: int | None = field(
        default=None,
        metadata={"name": "renderOrder", "type": "Attribute", "required": True},
    )
    form_type: str | None = field(
        default=None,
        metadata={"name": "formType", "type": "Attribute", "required": True},
    )


@dataclass(slots=True)
class PdfSameStyleUnicodeCharacters:
    """Binding for the ``pdfSameStyleUnicodeCharacters`` XML element.

    Only the ``unicode`` attribute is flagged as required; the ``pdfStyle``
    child and the ``debug_info`` attribute are optional.
    """

    class Meta:
        name = "pdfSameStyleUnicodeCharacters"

    pdf_style: PdfStyle | None = field(
        default=None,
        metadata={"name": "pdfStyle", "type": "Element"},
    )
    unicode: str | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
    debug_info: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )


@dataclass(slots=True)
class PdfXobject:
    """Binding for the ``pdfXobject`` XML element.

    The ``box`` and ``baseOperations`` children and both id attributes are
    flagged as required. Fonts are kept in a list that starts out empty.
    """

    class Meta:
        name = "pdfXobject"

    # Child elements.
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    pdf_font: list[PdfFont] = field(
        default_factory=list,
        metadata={"name": "pdfFont", "type": "Element"},
    )
    base_operations: BaseOperations | None = field(
        default=None,
        metadata={"name": "baseOperations", "type": "Element", "required": True},
    )

    # Attributes.
    xobj_id: int | None = field(
        default=None,
        metadata={"name": "xobjId", "type": "Attribute", "required": True},
    )
    xref_id: int | None = field(
        default=None,
        metadata={"name": "xrefId", "type": "Attribute", "required": True},
    )


@dataclass(slots=True)
class PdfFormula:
    """Binding for the ``pdfFormula`` XML element.

    Holds a required ``box``, a character list declared with
    ``min_occurs`` 1, and optional curve and form lists. ``x_offset`` and
    ``y_offset`` are flagged as required attributes; the others are
    optional. The lists start out empty; ``min_occurs`` is metadata only.
    """

    class Meta:
        name = "pdfFormula"

    # Child elements.
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={"name": "pdfCharacter", "type": "Element", "min_occurs": 1},
    )
    pdf_curve: list[PdfCurve] = field(
        default_factory=list,
        metadata={"name": "pdfCurve", "type": "Element"},
    )
    pdf_form: list[PdfForm] = field(
        default_factory=list,
        metadata={"name": "pdfForm", "type": "Element"},
    )

    # Attributes.
    x_offset: float | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
    y_offset: float | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
    x_advance: float | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    line_id: int | None = field(
        default=None,
        metadata={"name": "lineId", "type": "Attribute"},
    )
    is_corner_mark: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )


@dataclass(slots=True)
class PdfLine:
    """Binding for the ``pdfLine`` XML element.

    Holds a required ``box``, a character list declared with
    ``min_occurs`` 1 (it still starts out empty), and an optional
    ``renderOrder`` attribute.
    """

    class Meta:
        name = "pdfLine"

    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={"name": "pdfCharacter", "type": "Element", "min_occurs": 1},
    )
    render_order: int | None = field(
        default=None,
        metadata={"name": "renderOrder", "type": "Attribute"},
    )


@dataclass(slots=True)
class PdfSameStyleCharacters:
    """Binding for the ``pdfSameStyleCharacters`` XML element.

    Holds a required ``box``, a required ``pdfStyle``, and a character list
    declared with ``min_occurs`` 1 (it still starts out empty).
    """

    class Meta:
        name = "pdfSameStyleCharacters"

    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    pdf_style: PdfStyle | None = field(
        default=None,
        metadata={"name": "pdfStyle", "type": "Element", "required": True},
    )
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={"name": "pdfCharacter", "type": "Element", "min_occurs": 1},
    )


@dataclass(slots=True)
class PdfParagraphComposition:
    """Binding for the ``pdfParagraphComposition`` XML element.

    Offers five optional child slots (line, formula, same-style characters,
    single character, same-style unicode characters). All default to
    ``None``; this class does not enforce that exactly one is set.
    """

    class Meta:
        name = "pdfParagraphComposition"

    pdf_line: PdfLine | None = field(
        default=None,
        metadata={"name": "pdfLine", "type": "Element"},
    )
    pdf_formula: PdfFormula | None = field(
        default=None,
        metadata={"name": "pdfFormula", "type": "Element"},
    )
    pdf_same_style_characters: PdfSameStyleCharacters | None = field(
        default=None,
        metadata={"name": "pdfSameStyleCharacters", "type": "Element"},
    )
    pdf_character: PdfCharacter | None = field(
        default=None,
        metadata={"name": "pdfCharacter", "type": "Element"},
    )
    pdf_same_style_unicode_characters: PdfSameStyleUnicodeCharacters | None = field(
        default=None,
        metadata={"name": "pdfSameStyleUnicodeCharacters", "type": "Element"},
    )


@dataclass(slots=True)
class PdfParagraph:
    """Binding for the ``pdfParagraph`` XML element.

    Flagged as required: the ``box`` and ``pdfStyle`` children and the
    ``unicode`` attribute. Compositions are kept in a list that starts out
    empty; every other attribute is optional and defaults to ``None``.
    """

    class Meta:
        name = "pdfParagraph"

    # Child elements.
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    pdf_style: PdfStyle | None = field(
        default=None,
        metadata={"name": "pdfStyle", "type": "Element", "required": True},
    )
    pdf_paragraph_composition: list[PdfParagraphComposition] = field(
        default_factory=list,
        metadata={"name": "pdfParagraphComposition", "type": "Element"},
    )

    # Attributes.
    xobj_id: int | None = field(
        default=None,
        metadata={"name": "xobjId", "type": "Attribute"},
    )
    unicode: str | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
    scale: float | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    optimal_scale: float | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    vertical: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    # The XML attribute name is capitalised: ``FirstLineIndent``.
    first_line_indent: bool | None = field(
        default=None,
        metadata={"name": "FirstLineIndent", "type": "Attribute"},
    )
    debug_id: str | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    layout_label: str | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    layout_id: int | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    render_order: int | None = field(
        default=None,
        metadata={"name": "renderOrder", "type": "Attribute"},
    )


@dataclass(slots=True)
class Page:
    """Binding for the ``page`` XML element.

    Flagged as required: the ``mediabox``, ``cropbox`` and
    ``baseOperations`` children and the ``pageNumber`` and ``Unit``
    attributes. Each kind of page content is kept in its own list, all of
    which start out empty.
    """

    class Meta:
        name = "page"

    # Page boxes.
    mediabox: Mediabox | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    cropbox: Cropbox | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )

    # Repeated content elements.
    pdf_xobject: list[PdfXobject] = field(
        default_factory=list,
        metadata={"name": "pdfXobject", "type": "Element"},
    )
    page_layout: list[PageLayout] = field(
        default_factory=list,
        metadata={"name": "pageLayout", "type": "Element"},
    )
    pdf_rectangle: list[PdfRectangle] = field(
        default_factory=list,
        metadata={"name": "pdfRectangle", "type": "Element"},
    )
    pdf_font: list[PdfFont] = field(
        default_factory=list,
        metadata={"name": "pdfFont", "type": "Element"},
    )
    pdf_paragraph: list[PdfParagraph] = field(
        default_factory=list,
        metadata={"name": "pdfParagraph", "type": "Element"},
    )
    pdf_figure: list[PdfFigure] = field(
        default_factory=list,
        metadata={"name": "pdfFigure", "type": "Element"},
    )
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={"name": "pdfCharacter", "type": "Element"},
    )
    pdf_curve: list[PdfCurve] = field(
        default_factory=list,
        metadata={"name": "pdfCurve", "type": "Element"},
    )
    pdf_form: list[PdfForm] = field(
        default_factory=list,
        metadata={"name": "pdfForm", "type": "Element"},
    )

    base_operations: BaseOperations | None = field(
        default=None,
        metadata={"name": "baseOperations", "type": "Element", "required": True},
    )

    # Page attributes.
    page_number: int | None = field(
        default=None,
        metadata={"name": "pageNumber", "type": "Attribute", "required": True},
    )
    # The XML attribute name is capitalised: ``Unit``.
    unit: str | None = field(
        default=None,
        metadata={"name": "Unit", "type": "Attribute", "required": True},
    )


@dataclass(slots=True)
class Document:
    """Binding for the root ``document`` XML element.

    Holds the page list, declared with ``min_occurs`` 1 although it starts
    out empty, and the required ``totalPages`` attribute.
    """

    class Meta:
        name = "document"

    page: list[Page] = field(
        default_factory=list,
        metadata={"type": "Element", "min_occurs": 1},
    )
    total_pages: int | None = field(
        default=None,
        metadata={"name": "totalPages", "type": "Attribute", "required": True},
    )


================================================
FILE: babeldoc/format/pdf/document_il/il_version_1.rnc
================================================
# Grammar entry point. A document holds one or more pages and a required
# totalPages attribute.
start = Document
Document =
  element document {
    Page+,
    attribute totalPages { xsd:int }
  }
# A page: mediabox and cropbox (each wrapping a Box), then zero or more of
# each content kind in the order listed, a baseOperations string element,
# and the required pageNumber and Unit attributes.
Page =
  element page {
    element mediabox { Box },
    element cropbox { Box },
    PDFXobject*,
    PageLayout*,
    PDFRectangle*,
    PDFFont*,
    PDFParagraph*,
    PDFFigure*,
    PDFCharacter*,
    PDFCurve*,
    PDFForm*,
    attribute pageNumber { xsd:int },
    attribute Unit { xsd:string },
    element baseOperations { xsd:string }
  }
# Rectangle with four required float coordinates; referenced by every
# pattern below that contains a Box.
Box =
  element box {
    # from (x,y) to (x2,y2)
    attribute x { xsd:float },
    attribute y { xsd:float },
    attribute x2 { xsd:float },
    attribute y2 { xsd:float }
  }
# Named datatype used by the xrefId attributes (a plain xsd:int).
PDFXrefId = xsd:int
# A font: name, fontId, xrefId and encodingLength are required; the style
# flags and ascent/descent are optional; followed by zero or more
# per-character bounding boxes.
PDFFont =
  element pdfFont {
    attribute name { xsd:string },
    attribute fontId { xsd:string },
    attribute xrefId { PDFXrefId },
    attribute encodingLength { xsd:int },
    attribute bold { xsd:boolean }?,
    attribute italic { xsd:boolean }?,
    attribute monospace { xsd:boolean }?,
    attribute serif { xsd:boolean }?,
    attribute ascent { xsd:float }?,
    attribute descent { xsd:float }?,
    PDFFontCharBoundingBox*
  }
# Bounding box (x, y, x2, y2) recorded for one char_id; all five attributes
# are required.
PDFFontCharBoundingBox =
  element pdfFontCharBoundingBox {
    attribute x { xsd:float },
    attribute y { xsd:float },
    attribute x2 { xsd:float },
    attribute y2 { xsd:float },
    attribute char_id { xsd:int }
  }
# An XObject entry: required xobjId and xrefId, a Box, zero or more fonts,
# and a baseOperations string element.
PDFXobject =
  element pdfXobject {
    attribute xobjId { xsd:int },
    attribute xrefId { PDFXrefId },
    Box,
    PDFFont*,
    element baseOperations { xsd:string }
  }
# One character. Required: char_unicode, PDFStyle and Box. Every other
# attribute and the visual_bbox element are optional.
PDFCharacter =
  element pdfCharacter {
    attribute vertical { xsd:boolean }?,
    attribute scale { xsd:float }?,
    attribute pdfCharacterId { xsd:int }?,
    attribute char_unicode { xsd:string },
    attribute advance { xsd:float }?,
    # xobject nesting depth
    attribute xobjId { xsd:int }?,
    attribute debug_info { xsd:boolean }?,
    attribute formula_layout_id { xsd:int }?,
    attribute renderOrder { xsd:int }?,
    attribute subRenderOrder { xsd:int }?,
    PDFStyle,
    Box,
    element visual_bbox { Box }?
  }
# A layout region: id, conf, class_name and its Box are all required.
PageLayout =
  element pageLayout {
    attribute id { xsd:int },
    attribute conf { xsd:float },
    attribute class_name { xsd:string },
    Box
  }
# Graphic state; its only content is the optional
# passthroughPerCharInstruction string attribute.
GraphicState =
  element graphicState {
    attribute passthroughPerCharInstruction { xsd:string }?
  }
# Text style: required font_id and font_size plus a GraphicState.
PDFStyle =
  element pdfStyle {
    attribute font_id { xsd:string },
    attribute font_size { xsd:float },
    GraphicState
  }
# A paragraph. Required: unicode, Box and PDFStyle. All other attributes
# are optional; zero or more compositions follow.
PDFParagraph =
  element pdfParagraph {
    attribute xobjId { xsd:int }?,
    attribute unicode { xsd:string },
    attribute scale { xsd:float }?,
    attribute optimal_scale { xsd:float }?,
    attribute vertical { xsd:boolean }?,
    attribute FirstLineIndent { xsd:boolean }?,
    attribute debug_id { xsd:string }?,
    attribute layout_label { xsd:string }?,
    attribute layout_id { xsd:int }?,
    attribute renderOrder { xsd:int }?,
    Box,
    PDFStyle,
    PDFParagraphComposition*
  }
# Wrapper element holding exactly one of the five alternatives.
PDFParagraphComposition =
  element pdfParagraphComposition {
    PDFLine
    | PDFFormula
    | PDFSameStyleCharacters
    | PDFCharacter
    | PDFSameStyleUnicodeCharacters
  }
# A line: a Box, one or more characters, and an optional renderOrder.
PDFLine =
  element pdfLine {
    Box,
    PDFCharacter+,
    attribute renderOrder { xsd:int }?
  }
# A Box, one PDFStyle, and one or more characters.
PDFSameStyleCharacters =
  element pdfSameStyleCharacters { Box, PDFStyle, PDFCharacter+ }
# A required unicode string with an optional PDFStyle and an optional
# debug_info flag.
PDFSameStyleUnicodeCharacters =
  element pdfSameStyleUnicodeCharacters {
    PDFStyle?,
    attribute unicode { xsd:string },
    attribute debug_info { xsd:boolean }?
  }
# A formula: a Box, one or more characters, then zero or more curves and
# forms. x_offset and y_offset are required; the other attributes are
# optional.
PDFFormula =
  element pdfFormula {
    Box,
    PDFCharacter+,
    PDFCurve*,
    PDFForm*,
    attribute x_offset { xsd:float },
    attribute y_offset { xsd:float },
    attribute x_advance { xsd:float }?,
    attribute lineId { xsd:int }?,
    attribute is_corner_mark { xsd:boolean }?
  }
# A figure is described by its Box only.
PDFFigure = element pdfFigure { Box }
# A rectangle: Box and GraphicState are required; all attributes are
# optional.
PDFRectangle =
  element pdfRectangle {
    Box,
    GraphicState,
    attribute debug_info { xsd:boolean }?,
    attribute fill_background { xsd:boolean }?,
    attribute xobjId { xsd:int }?,
    attribute lineWidth { xsd:float }?,
    attribute renderOrder { xsd:int }?
  }
# A curve: Box and GraphicState, then zero or more paths and original
# paths. All attributes are optional; ctm and relocation_transform, when
# present, are lists of exactly six floats.
PDFCurve =
  element pdfCurve {
    Box,
    GraphicState,
    PDFPath*,
    PDFOriginalPath*,
    attribute debug_info { xsd:boolean }?,
    attribute fill_background { xsd:boolean }?,
    attribute stroke_path { xsd:boolean }?,
    attribute evenodd { xsd:boolean }?,
    attribute xobjId { xsd:int }?,
    attribute renderOrder { xsd:int }?,
    attribute ctm {
      list {
        xsd:float, xsd:float, xsd:float, xsd:float, xsd:float, xsd:float
      }
    }?,
    attribute relocation_transform {
      list {
        xsd:float, xsd:float, xsd:float, xsd:float, xsd:float, xsd:float
      }
    }?
  }
# Wrapper element containing exactly one PDFPath.
PDFOriginalPath = element pdfOriginalPath { PDFPath }
# One path entry: x, y and op are required; has_xy is optional.
PDFPath =
  element pdfPath {
    attribute x { xsd:float },
    attribute y { xsd:float },
    attribute op { xsd:string },
    attribute has_xy { xsd:boolean }?
  }
# A form. Required attributes: xobjId, renderOrder, formType. Required
# children: Box, GraphicState, PDFMatrix, PDFAffineTransform and
# PDFFormSubtype. ctm and relocation_transform are optional lists of
# exactly six floats.
PDFForm =
  element pdfForm {
    attribute xobjId { xsd:int },
    Box,
    GraphicState,
    PDFMatrix,
    PDFAffineTransform,
    attribute ctm {
      list {
        xsd:float, xsd:float, xsd:float, xsd:float, xsd:float, xsd:float
      }
    }?,
    attribute relocation_transform {
      list {
        xsd:float, xsd:float, xsd:float, xsd:float, xsd:float, xsd:float
      }
    }?,
    attribute renderOrder { xsd:int },
    attribute formType { xsd:string },
    PDFFormSubtype
  }
# Exactly one of an inline form or an XObject form.
PDFFormSubtype = element pdfFormSubtype { PDFInlineForm | PDFXobjForm }
# Inline form: both string attributes are optional.
PDFInlineForm =
  element pdfInlineForm {
    attribute formData { xsd:string }?,
    attribute imageParameters { xsd:string }?
  }
# XObject form: xrefId and doArgs are both required.
PDFXobjForm =
  element pdfXobjForm {
    attribute xrefId { PDFXrefId },
    attribute doArgs { xsd:string }
  }
# Six required float attributes, a through f.
PDFMatrix =
  element pdfMatrix {
    attribute a { xsd:float },
    attribute b { xsd:float },
    attribute c { xsd:float },
    attribute d { xsd:float },
    attribute e { xsd:float },
    attribute f { xsd:float }
  }
# Decomposed transform parameters for a CTM.
# All six attributes are required floats.
PDFAffineTransform =
  element pdfAffineTransform {
    attribute translation_x { xsd:float },
    attribute translation_y { xsd:float },
    attribute rotation { xsd:float },
    attribute scale_x { xsd:float },
    attribute scale_y { xsd:float },
    attribute shear { xsd:float }
  }


================================================
FILE: babeldoc/format/pdf/document_il/il_version_1.rng
================================================
<?xml version="1.0" encoding="UTF-8"?>
<grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
  <!-- Grammar entry point: the root pattern is Document. -->
  <start>
    <ref name="Document"/>
  </start>
  <!-- A document holds one or more pages and a required totalPages attribute. -->
  <define name="Document">
    <element name="document">
      <oneOrMore>
        <ref name="Page"/>
      </oneOrMore>
      <attribute name="totalPages">
        <data type="int"/>
      </attribute>
    </element>
  </define>
  <!-- A page: mediabox and cropbox (each wrapping a Box), zero or more of each
       content kind in the order listed, the required pageNumber and Unit
       attributes, and a baseOperations string element. -->
  <define name="Page">
    <element name="page">
      <element name="mediabox">
        <ref name="Box"/>
      </element>
      <element name="cropbox">
        <ref name="Box"/>
      </element>
      <zeroOrMore>
        <ref name="PDFXobject"/>
      </zeroOrMore>
      <zeroOrMore>
        <ref name="PageLayout"/>
      </zeroOrMore>
      <zeroOrMore>
        <ref name="PDFRectangle"/>
      </zeroOrMore>
      <zeroOrMore>
        <ref name="PDFFont"/>
      </zeroOrMore>
      <zeroOrMore>
        <ref name="PDFParagraph"/>
      </zeroOrMore>
      <zeroOrMore>
        <ref name="PDFFigure"/>
      </zeroOrMore>
      <zeroOrMore>
        <ref name="PDFCharacter"/>
      </zeroOrMore>
      <zeroOrMore>
        <ref name="PDFCurve"/>
      </zeroOrMore>
      <zeroOrMore>
        <ref name="PDFForm"/>
      </zeroOrMore>
      <attribute name="pageNumber">
        <data type="int"/>
      </attribute>
      <attribute name="Unit">
        <data type="string"/>
      </attribute>
      <element name="baseOperations">
        <data type="string"/>
      </element>
    </element>
  </define>
  <!-- Rectangle with four required float coordinates. -->
  <define name="Box">
    <element name="box">
      <!-- from (x,y) to (x2,y2) -->
      <attribute name="x">
        <data type="float"/>
      </attribute>
      <attribute name="y">
        <data type="float"/>
      </attribute>
      <attribute name="x2">
        <data type="float"/>
      </attribute>
      <attribute name="y2">
        <data type="float"/>
      </attribute>
    </element>
  </define>
  <!-- Named datatype used by the xrefId attributes (a plain int). -->
  <define name="PDFXrefId">
    <data type="int"/>
  </define>
  <define name="PDFFont">
    <element name="pdfFont">
      <attribute name="name">
        <data type="string"/>
      </attribute>
      <attribute name="fontId">
        <data type="string"/>
      </attribute>
      <attribute name="xrefId">
        <ref name="PDFXrefId"/>
      </attribute>
      <attribute name="encodingLength">
        <data type="int"/>
      </attribute>
      <optional>
        <attribute name="bold">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="italic">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="monospace">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="serif">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="ascent">
          <data type="float"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="descent">
          <data type="float"/>
        </attribute>
      </optional>
      <zeroOrMore>
        <ref name="PDFFontCharBoundingBox"/>
      </zeroOrMore>
    </element>
  </define>
  <define name="PDFFontCharBoundingBox">
    <element name="pdfFontCharBoundingBox">
      <attribute name="x">
        <data type="float"/>
      </attribute>
      <attribute name="y">
        <data type="float"/>
      </attribute>
      <attribute name="x2">
        <data type="float"/>
      </attribute>
      <attribute name="y2">
        <data type="float"/>
      </attribute>
      <attribute name="char_id">
        <data type="int"/>
      </attribute>
    </element>
  </define>
  <define name="PDFXobject">
    <element name="pdfXobject">
      <attribute name="xobjId">
        <data type="int"/>
      </attribute>
      <attribute name="xrefId">
        <ref name="PDFXrefId"/>
      </attribute>
      <ref name="Box"/>
      <zeroOrMore>
        <ref name="PDFFont"/>
      </zeroOrMore>
      <element name="baseOperations">
        <data type="string"/>
      </element>
    </element>
  </define>
  <define name="PDFCharacter">
    <element name="pdfCharacter">
      <optional>
        <attribute name="vertical">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="scale">
          <data type="float"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="pdfCharacterId">
          <data type="int"/>
        </attribute>
      </optional>
      <attribute name="char_unicode">
        <data type="string"/>
      </attribute>
      <optional>
        <attribute name="advance">
          <data type="float"/>
        </attribute>
      </optional>
      <optional>
        <!-- xobject nesting depth -->
        <attribute name="xobjId">
          <data type="int"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="debug_info">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="formula_layout_id">
          <data type="int"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="renderOrder">
          <data type="int"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="subRenderOrder">
          <data type="int"/>
        </attribute>
      </optional>
      <ref name="PDFStyle"/>
      <ref name="Box"/>
      <optional>
        <element name="visual_bbox">
          <ref name="Box"/>
        </element>
      </optional>
    </element>
  </define>
  <define name="PageLayout">
    <element name="pageLayout">
      <attribute name="id">
        <data type="int"/>
      </attribute>
      <attribute name="conf">
        <data type="float"/>
      </attribute>
      <attribute name="class_name">
        <data type="string"/>
      </attribute>
      <ref name="Box"/>
    </element>
  </define>
  <define name="GraphicState">
    <element name="graphicState">
      <optional>
        <attribute name="passthroughPerCharInstruction">
          <data type="string"/>
        </attribute>
      </optional>
    </element>
  </define>
  <define name="PDFStyle">
    <element name="pdfStyle">
      <attribute name="font_id">
        <data type="string"/>
      </attribute>
      <attribute name="font_size">
        <data type="float"/>
      </attribute>
      <ref name="GraphicState"/>
    </element>
  </define>
  <define name="PDFParagraph">
    <element name="pdfParagraph">
      <optional>
        <attribute name="xobjId">
          <data type="int"/>
        </attribute>
      </optional>
      <attribute name="unicode">
        <data type="string"/>
      </attribute>
      <optional>
        <attribute name="scale">
          <data type="float"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="optimal_scale">
          <data type="float"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="vertical">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="FirstLineIndent">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="debug_id">
          <data type="string"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="layout_label">
          <data type="string"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="layout_id">
          <data type="int"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="renderOrder">
          <data type="int"/>
        </attribute>
      </optional>
      <ref name="Box"/>
      <ref name="PDFStyle"/>
      <zeroOrMore>
        <ref name="PDFParagraphComposition"/>
      </zeroOrMore>
    </element>
  </define>
  <!-- Wrapper element that holds exactly one of: pdfLine, pdfFormula,
       pdfSameStyleCharacters, pdfCharacter or
       pdfSameStyleUnicodeCharacters. -->
  <define name="PDFParagraphComposition">
    <element name="pdfParagraphComposition">
      <choice>
        <ref name="PDFLine"/>
        <ref name="PDFFormula"/>
        <ref name="PDFSameStyleCharacters"/>
        <ref name="PDFCharacter"/>
        <ref name="PDFSameStyleUnicodeCharacters"/>
      </choice>
    </element>
  </define>
  <define name="PDFLine">
    <element name="pdfLine">
      <ref name="Box"/>
      <oneOrMore>
        <ref name="PDFCharacter"/>
      </oneOrMore>
      <optional>
        <attribute name="renderOrder">
          <data type="int"/>
        </attribute>
      </optional>
    </element>
  </define>
  <define name="PDFSameStyleCharacters">
    <element name="pdfSameStyleCharacters">
      <ref name="Box"/>
      <ref name="PDFStyle"/>
      <oneOrMore>
        <ref name="PDFCharacter"/>
      </oneOrMore>
    </element>
  </define>
  <define name="PDFSameStyleUnicodeCharacters">
    <element name="pdfSameStyleUnicodeCharacters">
      <optional>
        <ref name="PDFStyle"/>
      </optional>
      <attribute name="unicode">
        <data type="string"/>
      </attribute>
      <optional>
        <attribute name="debug_info">
          <data type="boolean"/>
        </attribute>
      </optional>
    </element>
  </define>
  <define name="PDFFormula">
    <element name="pdfFormula">
      <ref name="Box"/>
      <oneOrMore>
        <ref name="PDFCharacter"/>
      </oneOrMore>
      <zeroOrMore>
        <ref name="PDFCurve"/>
      </zeroOrMore>
      <zeroOrMore>
        <ref name="PDFForm"/>
      </zeroOrMore>
      <attribute name="x_offset">
        <data type="float"/>
      </attribute>
      <attribute name="y_offset">
        <data type="float"/>
      </attribute>
      <optional>
        <attribute name="x_advance">
          <data type="float"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="lineId">
          <data type="int"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="is_corner_mark">
          <data type="boolean"/>
        </attribute>
      </optional>
    </element>
  </define>
  <define name="PDFFigure">
    <element name="pdfFigure">
      <ref name="Box"/>
    </element>
  </define>
  <define name="PDFRectangle">
    <element name="pdfRectangle">
      <ref name="Box"/>
      <ref name="GraphicState"/>
      <optional>
        <attribute name="debug_info">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="fill_background">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="xobjId">
          <data type="int"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="lineWidth">
          <data type="float"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="renderOrder">
          <data type="int"/>
        </attribute>
      </optional>
    </element>
  </define>
  <define name="PDFCurve">
    <element name="pdfCurve">
      <ref name="Box"/>
      <ref name="GraphicState"/>
      <zeroOrMore>
        <ref name="PDFPath"/>
      </zeroOrMore>
      <zeroOrMore>
        <ref name="PDFOriginalPath"/>
      </zeroOrMore>
      <optional>
        <attribute name="debug_info">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="fill_background">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="stroke_path">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="evenodd">
          <data type="boolean"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="xobjId">
          <data type="int"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="renderOrder">
          <data type="int"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="ctm">
          <list>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
          </list>
        </attribute>
      </optional>
      <optional>
        <attribute name="relocation_transform">
          <list>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
          </list>
        </attribute>
      </optional>
    </element>
  </define>
  <!-- Wrapper element containing exactly one pdfPath; pdfCurve may carry
       zero or more of these after its own pdfPath children. -->
  <define name="PDFOriginalPath">
    <element name="pdfOriginalPath">
      <ref name="PDFPath"/>
    </element>
  </define>
  <define name="PDFPath">
    <element name="pdfPath">
      <attribute name="x">
        <data type="float"/>
      </attribute>
      <attribute name="y">
        <data type="float"/>
      </attribute>
      <attribute name="op">
        <data type="string"/>
      </attribute>
      <optional>
        <attribute name="has_xy">
          <data type="boolean"/>
        </attribute>
      </optional>
    </element>
  </define>
  <define name="PDFForm">
    <element name="pdfForm">
      <attribute name="xobjId">
        <data type="int"/>
      </attribute>
      <ref name="Box"/>
      <ref name="GraphicState"/>
      <ref name="PDFMatrix"/>
      <ref name="PDFAffineTransform"/>
      <optional>
        <attribute name="ctm">
          <list>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
          </list>
        </attribute>
      </optional>
      <optional>
        <attribute name="relocation_transform">
          <list>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
            <data type="float"/>
          </list>
        </attribute>
      </optional>
      <attribute name="renderOrder">
        <data type="int"/>
      </attribute>
      <attribute name="formType">
        <data type="string"/>
      </attribute>
      <ref name="PDFFormSubtype"/>
    </element>
  </define>
  <!-- Wrapper element that holds exactly one of pdfInlineForm or
       pdfXobjForm; required as a child of pdfForm. -->
  <define name="PDFFormSubtype">
    <element name="pdfFormSubtype">
      <choice>
        <ref name="PDFInlineForm"/>
        <ref name="PDFXobjForm"/>
      </choice>
    </element>
  </define>
  <define name="PDFInlineForm">
    <element name="pdfInlineForm">
      <optional>
        <attribute name="formData">
          <data type="string"/>
        </attribute>
      </optional>
      <optional>
        <attribute name="imageParameters">
          <data type="string"/>
        </attribute>
      </optional>
    </element>
  </define>
  <define name="PDFXobjForm">
    <element name="pdfXobjForm">
      <attribute name="xrefId">
        <ref name="PDFXrefId"/>
      </attribute>
      <attribute name="doArgs">
        <data type="string"/>
      </attribute>
    </element>
  </define>
  <!-- Six required float attributes a through f.
       NOTE(review): presumably the coefficients of a PDF transformation
       matrix [a b c d e f]; confirm against the code that fills it in. -->
  <define name="PDFMatrix">
    <element name="pdfMatrix">
      <attribute name="a">
        <data type="float"/>
      </attribute>
      <attribute name="b">
        <data type="float"/>
      </attribute>
      <attribute name="c">
        <data type="float"/>
      </attribute>
      <attribute name="d">
        <data type="float"/>
      </attribute>
      <attribute name="e">
        <data type="float"/>
      </attribute>
      <attribute name="f">
        <data type="float"/>
      </attribute>
    </element>
  </define>
  <!-- Decomposed transform parameters for a CTM -->
  <define name="PDFAffineTransform">
    <element name="pdfAffineTransform">
      <attribute name="translation_x">
        <data type="float"/>
      </attribute>
      <attribute name="translation_y">
        <data type="float"/>
      </attribute>
      <attribute name="rotation">
        <data type="float"/>
      </attribute>
      <attribute name="scale_x">
        <data type="float"/>
      </attribute>
      <attribute name="scale_y">
        <data type="float"/>
      </attribute>
      <attribute name="shear">
        <data type="float"/>
      </attribute>
    </element>
  </define>
</grammar>


================================================
FILE: babeldoc/format/pdf/document_il/il_version_1.xsd
================================================
<?xml version="1.0" encoding="UTF-8"?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified">
  <!-- The <document> element: a sequence of one or more <page> elements and
       a required xs:int totalPages attribute. -->
  <xs:element name="document">
    <xs:complexType>
      <xs:sequence>
        <xs:element maxOccurs="unbounded" ref="page"/>
      </xs:sequence>
      <xs:attribute name="totalPages" use="required" type="xs:int"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="page">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="mediabox"/>
        <xs:element ref="cropbox"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfXobject"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pageLayout"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfRectangle"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraph"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFigure"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfCharacter"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfCurve"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfForm"/>
        <xs:element ref="baseOperations"/>
      </xs:sequence>
      <xs:attribute name="pageNumber" use="required" type="xs:int"/>
      <xs:attribute name="Unit" use="required" type="xs:string"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="mediabox">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
  <xs:element name="cropbox">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
  <xs:element name="baseOperations" type="xs:string"/>
  <xs:element name="box">
    <xs:complexType>
      <xs:attribute name="x" use="required" type="xs:float"/>
      <xs:attribute name="y" use="required" type="xs:float"/>
      <xs:attribute name="x2" use="required" type="xs:float"/>
      <xs:attribute name="y2" use="required" type="xs:float"/>
    </xs:complexType>
  </xs:element>
  <!-- Named xs:int type with no extra facets; used as the type of the
       xrefId attributes on pdfFont, pdfXobject and pdfXobjForm. -->
  <xs:simpleType name="PDFXrefId">
    <xs:restriction base="xs:int"/>
  </xs:simpleType>
  <xs:element name="pdfFont">
    <xs:complexType>
      <xs:sequence>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFontCharBoundingBox"/>
      </xs:sequence>
      <xs:attribute name="name" use="required" type="xs:string"/>
      <xs:attribute name="fontId" use="required" type="xs:string"/>
      <xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
      <xs:attribute name="encodingLength" use="required" type="xs:int"/>
      <xs:attribute name="bold" type="xs:boolean"/>
      <xs:attribute name="italic" type="xs:boolean"/>
      <xs:attribute name="monospace" type="xs:boolean"/>
      <xs:attribute name="serif" type="xs:boolean"/>
      <xs:attribute name="ascent" type="xs:float"/>
      <xs:attribute name="descent" type="xs:float"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfFontCharBoundingBox">
    <xs:complexType>
      <xs:attribute name="x" use="required" type="xs:float"/>
      <xs:attribute name="y" use="required" type="xs:float"/>
      <xs:attribute name="x2" use="required" type="xs:float"/>
      <xs:attribute name="y2" use="required" type="xs:float"/>
      <xs:attribute name="char_id" use="required" type="xs:int"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfXobject">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
        <xs:element ref="baseOperations"/>
      </xs:sequence>
      <xs:attribute name="xobjId" use="required" type="xs:int"/>
      <xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfCharacter">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="pdfStyle"/>
        <xs:element ref="box"/>
        <xs:element minOccurs="0" ref="visual_bbox"/>
      </xs:sequence>
      <xs:attribute name="vertical" type="xs:boolean"/>
      <xs:attribute name="scale" type="xs:float"/>
      <xs:attribute name="pdfCharacterId" type="xs:int"/>
      <xs:attribute name="char_unicode" use="required" type="xs:string"/>
      <xs:attribute name="advance" type="xs:float"/>
      <xs:attribute name="xobjId" type="xs:int"/>
      <xs:attribute name="debug_info" type="xs:boolean"/>
      <xs:attribute name="formula_layout_id" type="xs:int"/>
      <xs:attribute name="renderOrder" type="xs:int"/>
      <xs:attribute name="subRenderOrder" type="xs:int"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="visual_bbox">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
  <xs:element name="pageLayout">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
      </xs:sequence>
      <xs:attribute name="id" use="required" type="xs:int"/>
      <xs:attribute name="conf" use="required" type="xs:float"/>
      <xs:attribute name="class_name" use="required" type="xs:string"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="graphicState">
    <xs:complexType>
      <xs:attribute name="passthroughPerCharInstruction" type="xs:string"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfStyle">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="graphicState"/>
      </xs:sequence>
      <xs:attribute name="font_id" use="required" type="xs:string"/>
      <xs:attribute name="font_size" use="required" type="xs:float"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfParagraph">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element ref="pdfStyle"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraphComposition"/>
      </xs:sequence>
      <xs:attribute name="xobjId" type="xs:int"/>
      <xs:attribute name="unicode" use="required" type="xs:string"/>
      <xs:attribute name="scale" type="xs:float"/>
      <xs:attribute name="optimal_scale" type="xs:float"/>
      <xs:attribute name="vertical" type="xs:boolean"/>
      <xs:attribute name="FirstLineIndent" type="xs:boolean"/>
      <xs:attribute name="debug_id" type="xs:string"/>
      <xs:attribute name="layout_label" type="xs:string"/>
      <xs:attribute name="layout_id" type="xs:int"/>
      <xs:attribute name="renderOrder" type="xs:int"/>
    </xs:complexType>
  </xs:element>
  <!-- Wrapper element whose content is exactly one of the five listed
       child elements (xs:choice with default occurrence of 1). -->
  <xs:element name="pdfParagraphComposition">
    <xs:complexType>
      <xs:choice>
        <xs:element ref="pdfLine"/>
        <xs:element ref="pdfFormula"/>
        <xs:element ref="pdfSameStyleCharacters"/>
        <xs:element ref="pdfCharacter"/>
        <xs:element ref="pdfSameStyleUnicodeCharacters"/>
      </xs:choice>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfLine">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
      </xs:sequence>
      <xs:attribute name="renderOrder" type="xs:int"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfSameStyleCharacters">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element ref="pdfStyle"/>
        <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfSameStyleUnicodeCharacters">
    <xs:complexType>
      <xs:sequence>
        <xs:element minOccurs="0" ref="pdfStyle"/>
      </xs:sequence>
      <xs:attribute name="unicode" use="required" type="xs:string"/>
      <xs:attribute name="debug_info" type="xs:boolean"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfFormula">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfCurve"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfForm"/>
      </xs:sequence>
      <xs:attribute name="x_offset" use="required" type="xs:float"/>
      <xs:attribute name="y_offset" use="required" type="xs:float"/>
      <xs:attribute name="x_advance" type="xs:float"/>
      <xs:attribute name="lineId" type="xs:int"/>
      <xs:attribute name="is_corner_mark" type="xs:boolean"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfFigure">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfRectangle">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element ref="graphicState"/>
      </xs:sequence>
      <xs:attribute name="debug_info" type="xs:boolean"/>
      <xs:attribute name="fill_background" type="xs:boolean"/>
      <xs:attribute name="xobjId" type="xs:int"/>
      <xs:attribute name="lineWidth" type="xs:float"/>
      <xs:attribute name="renderOrder" type="xs:int"/>
    </xs:complexType>
  </xs:element>
  <!-- The <pdfCurve> element: a box and a graphicState, followed by zero or
       more pdfPath and then zero or more pdfOriginalPath children. All
       attributes are optional. -->
  <xs:element name="pdfCurve">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element ref="graphicState"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfPath"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfOriginalPath"/>
      </xs:sequence>
      <xs:attribute name="debug_info" type="xs:boolean"/>
      <xs:attribute name="fill_background" type="xs:boolean"/>
      <xs:attribute name="stroke_path" type="xs:boolean"/>
      <xs:attribute name="evenodd" type="xs:boolean"/>
      <xs:attribute name="xobjId" type="xs:int"/>
      <xs:attribute name="renderOrder" type="xs:int"/>
      <!-- ctm: a whitespace-separated list restricted to exactly six items
           (xs:length value="6"). Every member of the item union is
           xs:float, so each item must be a float. -->
      <xs:attribute name="ctm">
        <xs:simpleType>
          <xs:restriction>
            <xs:simpleType>
              <xs:list>
                <xs:simpleType>
                  <xs:union memberTypes="xs:float xs:float xs:float xs:float xs:float xs:float"/>
                </xs:simpleType>
              </xs:list>
            </xs:simpleType>
            <xs:length value="6"/>
          </xs:restriction>
        </xs:simpleType>
      </xs:attribute>
      <!-- relocation_transform: same six-float list type as ctm. -->
      <xs:attribute name="relocation_transform">
        <xs:simpleType>
          <xs:restriction>
            <xs:simpleType>
              <xs:list>
                <xs:simpleType>
                  <xs:union memberTypes="xs:float xs:float xs:float xs:float xs:float xs:float"/>
                </xs:simpleType>
              </xs:list>
            </xs:simpleType>
            <xs:length value="6"/>
          </xs:restriction>
        </xs:simpleType>
      </xs:attribute>
    </xs:complexType>
  </xs:element>
  <!-- Wrapper element containing exactly one <pdfPath>. -->
  <xs:element name="pdfOriginalPath">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="pdfPath"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfPath">
    <xs:complexType>
      <xs:attribute name="x" use="required" type="xs:float"/>
      <xs:attribute name="y" use="required" type="xs:float"/>
      <xs:attribute name="op" use="required" type="xs:string"/>
      <xs:attribute name="has_xy" type="xs:boolean"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfForm">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element ref="graphicState"/>
        <xs:element ref="pdfMatrix"/>
        <xs:element ref="pdfAffineTransform"/>
        <xs:element ref="pdfFormSubtype"/>
      </xs:sequence>
      <xs:attribute name="xobjId" use="required" type="xs:int"/>
      <xs:attribute name="ctm">
        <xs:simpleType>
          <xs:restriction>
            <xs:simpleType>
              <xs:list>
                <xs:simpleType>
                  <xs:union memberTypes="xs:float xs:float xs:float xs:float xs:float xs:float"/>
                </xs:simpleType>
              </xs:list>
            </xs:simpleType>
            <xs:length value="6"/>
          </xs:restriction>
        </xs:simpleType>
      </xs:attribute>
      <xs:attribute name="relocation_transform">
        <xs:simpleType>
          <xs:restriction>
            <xs:simpleType>
              <xs:list>
                <xs:simpleType>
                  <xs:union memberTypes="xs:float xs:float xs:float xs:float xs:float xs:float"/>
                </xs:simpleType>
              </xs:list>
            </xs:simpleType>
            <xs:length value="6"/>
          </xs:restriction>
        </xs:simpleType>
      </xs:attribute>
      <xs:attribute name="renderOrder" use="required" type="xs:int"/>
      <xs:attribute name="formType" use="required" type="xs:string"/>
    </xs:complexType>
  </xs:element>
  <!-- Wrapper element whose content is exactly one of <pdfInlineForm> or
       <pdfXobjForm>. -->
  <xs:element name="pdfFormSubtype">
    <xs:complexType>
      <xs:choice>
        <xs:element ref="pdfInlineForm"/>
        <xs:element ref="pdfXobjForm"/>
      </xs:choice>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfInlineForm">
    <xs:complexType>
      <xs:attribute name="formData" type="xs:string"/>
      <xs:attribute name="imageParameters" type="xs:string"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfXobjForm">
    <xs:complexType>
      <xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
      <xs:attribute name="doArgs" use="required" type="xs:string"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfMatrix">
    <xs:complexType>
      <xs:attribute name="a" use="required" type="xs:float"/>
      <xs:attribute name="b" use="required" type="xs:float"/>
      <xs:attribute name="c" use="required" type="xs:float"/>
      <xs:attribute name="d" use="required" type="xs:float"/>
      <xs:attribute name="e" use="required" type="xs:float"/>
      <xs:attribute name="f" use="required" type="xs:float"/>
    </xs:complexType>
  </xs:element>
  <!-- Decomposed transform parameters for a CTM -->
  <xs:element name="pdfAffineTransform">
    <xs:complexType>
      <xs:attribute name="translation_x" use="required" type="xs:float"/>
      <xs:attribute name="translation_y" use="required" type="xs:float"/>
      <xs:attribute name="rotation" use="required" type="xs:float"/>
      <xs:attribute name="scale_x" use="required" type="xs:float"/>
      <xs:attribute name="scale_y" use="required" type="xs:float"/>
      <xs:attribute name="shear" use="required" type="xs:float"/>
    </xs:complexType>
  </xs:element>
</xs:schema>


================================================
FILE: babeldoc/format/pdf/document_il/midend/__init__.py
================================================


================================================
FILE: babeldoc/format/pdf/document_il/midend/add_debug_information.py
================================================
import logging

import babeldoc.format.pdf.document_il.il_version_1 as il_version_1
from babeldoc.format.pdf.document_il import GraphicState
from babeldoc.format.pdf.document_il.utils.style_helper import BLUE
from babeldoc.format.pdf.document_il.utils.style_helper import ORANGE
from babeldoc.format.pdf.document_il.utils.style_helper import PINK
from babeldoc.format.pdf.document_il.utils.style_helper import TEAL
from babeldoc.format.pdf.document_il.utils.style_helper import YELLOW
from babeldoc.format.pdf.translation_config import TranslationConfig

logger = logging.getLogger(__name__)


class AddDebugInformation:
    """Pipeline stage that overlays debug annotations onto IL pages.

    The stage is a no-op unless ``translation_config.debug`` is truthy. When
    enabled, each page receives:

    * a ``pagenumber: N`` label,
    * a BLUE outline and label for every non-debug paragraph,
    * an ORANGE outline and label for every formula inside those paragraphs,
      plus a thin TEAL outline around each formula character,
    * a YELLOW outline for every XObject on the page,
    * a PINK outline and label for every form on the page.
    """

    stage_name = "Add Debug Information"

    def __init__(self, translation_config: TranslationConfig):
        self.translation_config = translation_config
        self.model = translation_config.doc_layout_model

    def process(self, docs: il_version_1.Document):
        """Annotate every page of ``docs`` in place when debug mode is on."""
        if not self.translation_config.debug:
            return

        for page in docs.page:
            self.process_page(page)

    def _create_rectangle(
        self,
        box: il_version_1.Box,
        color: GraphicState,
        line_width: float | None = None,
    ):
        """Build a debug-flagged ``PdfRectangle`` outlining ``box``."""
        return il_version_1.PdfRectangle(
            box=box,
            graphic_state=color,
            debug_info=True,
            line_width=line_width,
        )

    def _create_text(
        self,
        text: str,
        color: GraphicState,
        box: il_version_1.Box,
        font_size: float = 4,
    ):
        """Build a debug-flagged one-run paragraph labelling ``box``.

        The label box spans the width of ``box`` and sits directly above it
        (from ``box.y2`` to ``box.y2 + 5``); PDF coordinates start at the
        bottom-left, so ``y2`` is the top edge.
        """
        style = il_version_1.PdfStyle(
            font_id="base",
            font_size=font_size,
            graphic_state=color,
        )
        label_box = il_version_1.Box(
            x=box.x,
            y=box.y2,
            x2=box.x2,
            y2=box.y2 + 5,
        )
        run = il_version_1.PdfSameStyleUnicodeCharacters(
            unicode=text,
            pdf_style=style,
            debug_info=True,
        )
        return il_version_1.PdfParagraph(
            first_line_indent=False,
            box=label_box,
            vertical=False,
            pdf_style=style,
            unicode=text,
            pdf_paragraph_composition=[
                il_version_1.PdfParagraphComposition(
                    pdf_same_style_unicode_characters=run,
                ),
            ],
            xobj_id=-1,
        )

    @staticmethod
    def _is_debug_paragraph(paragraph) -> bool:
        """Return True when any text run of ``paragraph`` is flagged as debug."""
        return any(
            composition.pdf_same_style_unicode_characters.debug_info
            for composition in paragraph.pdf_paragraph_composition
            if composition.pdf_same_style_unicode_characters
        )

    def _annotate_paragraph(self, page, paragraph, new_paragraphs: list) -> None:
        """Outline and label one paragraph and the formulas it contains."""
        page.pdf_rectangle.append(self._create_rectangle(paragraph.box, BLUE))

        debug_text = "paragraph"
        if getattr(paragraph, "debug_id", None):
            debug_text = f"paragraph[{paragraph.debug_id}]-[{paragraph.layout_label}]"
        new_paragraphs.append(self._create_text(debug_text, BLUE, paragraph.box))

        for composition in paragraph.pdf_paragraph_composition:
            formula = composition.pdf_formula
            if not formula:
                continue
            new_paragraphs.append(self._create_text("formula", ORANGE, formula.box))
            page.pdf_rectangle.append(self._create_rectangle(formula.box, ORANGE))
            for char in formula.pdf_character:
                page.pdf_rectangle.append(
                    self._create_rectangle(char.visual_bbox.box, TEAL, line_width=0.2),
                )

    def _annotate_xobjects(self, page) -> None:
        """Outline every XObject of the page (no text label)."""
        for xobj in page.pdf_xobject:
            page.pdf_rectangle.append(self._create_rectangle(xobj.box, YELLOW))

    def _annotate_forms(self, page, new_paragraphs: list) -> None:
        """Outline and label every form of the page."""
        for form in page.pdf_form:
            debug_text = "Form"
            if form.pdf_form_subtype.pdf_xobj_form:
                debug_text += f"[{form.pdf_form_subtype.pdf_xobj_form.do_args}]"
            elif form.pdf_form_subtype.pdf_inline_form:
                debug_text += "[inline]"

            new_paragraphs.append(
                self._create_text(debug_text, PINK, form.box, font_size=0.4),
            )
            page.pdf_rectangle.append(self._create_rectangle(form.box, PINK))

    def process_page(self, page: il_version_1.Page):
        """Add all debug annotations to a single page, in place.

        XObject and form annotations are emitted exactly once per page. They
        used to be nested inside the paragraph loop, which duplicated them for
        every annotated paragraph and skipped them entirely on pages without
        any annotatable paragraph.
        """
        cropbox = page.cropbox.box
        page_width = cropbox.x2 - cropbox.x
        page_height = cropbox.y2 - cropbox.y

        # Page-number label, inset 2% from the left and top edges of the cropbox.
        page_number_box = il_version_1.Box(
            x=cropbox.x + page_width * 0.02,
            y=cropbox.y,
            x2=cropbox.x2,
            y2=cropbox.y2 - page_height * 0.02,
        )
        page.pdf_paragraph.append(
            self._create_text(
                f"pagenumber: {page.page_number + 1}",
                BLUE,
                page_number_box,
            ),
        )

        # Labels are collected separately and appended at the end so that the
        # list being iterated is not extended while looping over it.
        new_paragraphs = []

        for paragraph in page.pdf_paragraph:
            if not paragraph.pdf_paragraph_composition:
                continue
            # Skip paragraphs that are themselves debug labels.
            if self._is_debug_paragraph(paragraph):
                continue
            self._annotate_paragraph(page, paragraph, new_paragraphs)

        self._annotate_xobjects(page)
        self._annotate_forms(page, new_paragraphs)

        page.pdf_paragraph.extend(new_paragraphs)


================================================
FILE: babeldoc/format/pdf/document_il/midend/automatic_term_extractor.py
================================================
from __future__ import annotations

import json
import logging
from pathlib import Path
from typing import TYPE_CHECKING

import tiktoken
from tqdm import tqdm

from babeldoc.format.pdf.document_il import (
    Document as ILDocument,  # Renamed to avoid conflict
)
from babeldoc.format.pdf.document_il import PdfParagraph  # Renamed to avoid conflict
from babeldoc.format.pdf.document_il.midend.il_translator import Page
from babeldoc.format.pdf.document_il.utils.paragraph_helper import is_cid_paragraph
from babeldoc.format.pdf.document_il.utils.paragraph_helper import (
    is_placeholder_only_paragraph,
)
from babeldoc.format.pdf.document_il.utils.paragraph_helper import (
    is_pure_numeric_paragraph,
)
from babeldoc.utils.priority_thread_pool_executor import PriorityThreadPoolExecutor

if TYPE_CHECKING:
    from babeldoc.format.pdf.translation_config import TranslationConfig
    from babeldoc.translator.translator import BaseTranslator

logger = logging.getLogger(__name__)

# Prompt sent to the LLM for term extraction. It is rendered with
# ``str.format`` and expects four fields: ``target_language``,
# ``reference_glossary_section``, ``example_output`` and ``text_to_process``.
# Doubled braces (``{{`` / ``}}``) are literal braces in the rendered prompt.
LLM_PROMPT_TEMPLATE: str = """
You are an expert multilingual terminologist. Extract key terms from the text and translate them into {target_language}.

### Extraction Rules
1. Include only: named entities (people, orgs, locations, theorem/algorithm names, dates) and domain-specific nouns/noun phrases essential to meaning.
2. No full sentences. Ignore function words.
3. Use minimal noun phrases (≤5 words unless a named entity). No generic academic nouns (e.g., model, case, property) unless part of a standard term.
4. No mathematical items: variables (X1, a, ε), symbols (=, +, →, ⊥⊥, ∈), subscripts/superscripts, formula fragments, mappings (T: H1→H2), etc. Keep only natural-language concepts.
5. Extract each term once. Keep order of first appearance.

### Translation Rules
1. Translate each term into {target_language}.
2. If in the reference glossary, use its translation exactly.
3. Keep proper names in original language unless a well-known translation exists.
4. Ensure consistent translations.

{reference_glossary_section}

### Output Format
- Return ONLY a valid JSON array.
- Each element: {{"src": "...", "tgt": "..."}}.
- No comments, no backticks, no extra text.
- If no terms: [].

### Example
For terms “LLM”, “GPT”:
{example_output}

Input Text:
```
{text_to_process}
```

Return JSON ONLY. NO OTHER TEXT.
Result:
"""


class BatchParagraph:
    """A group of paragraphs handled as one unit, paired with a tracker entry.

    Constructing a batch registers a new entry on ``page_tracker`` (via
    ``new_paragraph()``) and keeps it as ``tracker``.
    """

    def __init__(
        self,
        paragraphs: list[PdfParagraph],
        page_tracker: PageTermExtractTracker,
    ):
        batch_tracker = page_tracker.new_paragraph()
        self.tracker = batch_tracker
        self.paragraphs = paragraphs


class DocumentTermExtractTracker:
    """Collects per-page term-extraction trackers and serialises them to JSON."""

    def __init__(self):
        self.page = []

    def new_page(self):
        """Create a page tracker, register it on this document and return it."""
        page_tracker = PageTermExtractTracker()
        self.page.append(page_tracker)
        return page_tracker

    def to_json(self):
        """Return the tracked data as an indented, non-ASCII-escaped JSON string.

        Entries whose ``pdf_unicodes`` is missing or empty are left out;
        ``output`` and ``input`` fall back to ``None`` when never set.
        """

        def _entry_to_dict(entry):
            output_text = getattr(entry, "output", None)
            input_text = getattr(entry, "input", None)
            return {
                "pdf_unicodes": getattr(entry, "pdf_unicodes", None),
                "output": output_text,
                "input": input_text,
            }

        document = {
            "page": [
                {
                    "paragraph": [
                        _entry_to_dict(entry)
                        for entry in page_tracker.paragraph
                        if getattr(entry, "pdf_unicodes", None)
                    ],
                }
                for page_tracker in self.page
            ],
        }
        return json.dumps(document, ensure_ascii=False, indent=2)


class PageTermExtractTracker:
    """Holds the tracker entries created for one page, in creation order."""

    def __init__(self):
        self.paragraph = []

    def new_paragraph(self):
        """Append a fresh ``ParagraphTermExtractTracker`` and return it."""
        self.paragraph.append(ParagraphTermExtractTracker())
        return self.paragraph[-1]


class ParagraphTermExtractTracker:
    """Records the source texts, prompt and raw LLM output of one request.

    ``output`` and ``input`` are initialised to ``None`` so that they can be
    read safely before the corresponding setter has been called.
    """

    def __init__(self):
        self.pdf_unicodes = []
        # Raw LLM response; stays None until set_output() is called.
        self.output = None
        # Prompt sent to the LLM; stays None until set_input() is called.
        self.input = None

    def append_paragraph_unicode(self, unicode: str):
        """Record the text of one paragraph belonging to this request."""
        self.pdf_unicodes.append(unicode)

    def set_output(self, output: str):
        """Store the raw LLM response."""
        self.output = output

    def set_input(self, _input: str):
        """Store the prompt that was sent to the LLM."""
        self.input = _input


class AutomaticTermExtractor:
    """Pipeline stage that asks an LLM to extract and translate key terms.

    Paragraphs of each page are grouped into batches, every batch is turned
    into one prompt (``LLM_PROMPT_TEMPLATE``) and sent through
    ``translate_engine.llm_translate``. The returned ``src``/``tgt`` pairs are
    handed to the shared context, which builds the auto-extracted glossary
    once all batches have finished.
    """

    stage_name = "Automatic Term Extraction"

    # A batch is submitted as soon as it exceeds either of these limits.
    _MAX_BATCH_TOKENS = 600
    _MAX_BATCH_PARAGRAPHS = 12
    # Executor priority of a batch is this base minus the batch token count.
    _PRIORITY_BASE = 1048576

    # Example JSON shown to the model inside the prompt.
    _EXAMPLE_OUTPUT = """[
  {"src": "LLM", "tgt": "大语言模型"},
  {"src": "GPT", "tgt": "GPT"}
]"""

    def __init__(
        self,
        translate_engine: BaseTranslator,
        translation_config: TranslationConfig,
    ):
        """Store collaborators and validate the translation engine.

        Raises:
            ValueError: if ``translate_engine`` has no callable
                ``llm_translate`` attribute.
        """
        self.translate_engine = translate_engine
        self.translation_config = translation_config
        self.shared_context = translation_config.shared_context_cross_split_part
        self.tokenizer = tiktoken.encoding_for_model("gpt-4o")

        # Check if the translate_engine has llm_translate capability
        if not callable(getattr(self.translate_engine, "llm_translate", None)):
            raise ValueError(
                "The provided translate_engine does not support LLM-based translation, which is required for AutomaticTermExtractor."
            )

    @staticmethod
    def _advance(pbar, count: int = 1) -> None:
        """Advance ``pbar`` by ``count``; do nothing when no bar was supplied."""
        if pbar is not None:
            pbar.advance(count)

    def calc_token_count(self, text: str) -> int:
        """Return the token count of ``text``, or 0 if tokenisation fails."""
        try:
            return len(self.tokenizer.encode(text, disallowed_special=()))
        except Exception:
            # Best effort: the count only steers batching and priority.
            return 0

    def _snapshot_token_usage(self) -> tuple[int, int, int, int]:
        """Read the engine's token counters.

        Returns:
            ``(total, prompt, completion, cache_hit_prompt)``; a counter that
            is missing or falsy contributes 0.
        """
        if not self.translate_engine:
            return 0, 0, 0, 0

        def _read(counter_name: str) -> int:
            counter = getattr(self.translate_engine, counter_name, None)
            return counter.value if counter else 0

        return (
            _read("token_count"),
            _read("prompt_token_count"),
            _read("completion_token_count"),
            _read("cache_hit_prompt_token_count"),
        )

    def _clean_json_output(self, llm_output: str) -> str:
        """Strip ``<json>`` tags and Markdown code fences around a response.

        Whitespace is stripped between the two unwrapping steps so that nested
        wrappers such as ``<json>\\n```json ... ```\\n</json>`` are fully
        removed.
        """
        text = llm_output.strip()
        text = text.removeprefix("<json>").removesuffix("</json>").strip()
        text = text.removeprefix("```json").removeprefix("```").removesuffix("```")
        return text.strip()

    def _process_llm_response(self, llm_response_text: str, request_id: str):
        """Parse a JSON-array response and register its term pairs.

        Malformed items and parse errors are logged, never raised. This helper
        is not called from within this class.
        """
        try:
            cleaned_response_text = self._clean_json_output(llm_response_text)
            extracted_data = json.loads(cleaned_response_text)

            if not isinstance(extracted_data, list):
                logger.warning(
                    f"Request ID {request_id}: LLM response was not a JSON list, but type: {type(extracted_data)}. Content: {cleaned_response_text[:200]}"
                )
                return

            for item in extracted_data:
                if isinstance(item, dict) and "src" in item and "tgt" in item:
                    src_term = str(item["src"]).strip()
                    tgt_term = str(item["tgt"]).strip()
                    if (
                        src_term and tgt_term and len(src_term) < 100
                    ):  # Basic validation
                        self.shared_context.add_raw_extracted_term_pair(
                            src_term, tgt_term
                        )
                else:
                    logger.warning(
                        f"Request ID {request_id}: Skipping malformed item in LLM JSON response: {item}"
                    )

        except json.JSONDecodeError as e:
            logger.error(
                f"Request ID {request_id}: JSON Parsing Error: {e}. Problematic LLM Response after cleaning (start): {cleaned_response_text[:200]}..."
            )
        except Exception as e:
            logger.error(f"Request ID {request_id}: Error processing LLM response: {e}")

    @staticmethod
    def _should_skip_paragraph(paragraph) -> bool:
        """Return True for paragraphs that are not sent to the LLM."""
        return bool(
            paragraph.debug_id is None
            or paragraph.unicode is None
            or is_cid_paragraph(paragraph)
            or is_pure_numeric_paragraph(paragraph)
            or is_placeholder_only_paragraph(paragraph)
        )

    def _submit_batch(self, executor, paragraphs, token_count, pbar, tracker):
        """Queue one batch on the executor and return the page tracker used.

        A throwaway ``PageTermExtractTracker`` is created when the caller did
        not supply one, so ``BatchParagraph`` always has a tracker to register on.
        """
        if tracker is None:
            tracker = PageTermExtractTracker()
        executor.submit(
            self.extract_terms_from_paragraphs,
            BatchParagraph(paragraphs, tracker),
            pbar,
            token_count,
            priority=self._PRIORITY_BASE - token_count,
        )
        return tracker

    def process_page(
        self,
        page: Page,
        executor: PriorityThreadPoolExecutor,
        pbar: tqdm | None = None,
        tracker: PageTermExtractTracker | None = None,
    ):
        """Batch the paragraphs of ``page`` and submit them to ``executor``.

        Skipped paragraphs advance ``pbar`` immediately; submitted ones are
        accounted for by ``extract_terms_from_paragraphs``. Both ``pbar`` and
        ``tracker`` may be omitted.
        """
        self.translation_config.raise_if_cancelled()
        batch = []
        batch_tokens = 0
        for paragraph in page.pdf_paragraph:
            if self._should_skip_paragraph(paragraph):
                self._advance(pbar)
                continue
            batch_tokens += self.calc_token_count(paragraph.unicode)
            batch.append(paragraph)
            if (
                batch_tokens > self._MAX_BATCH_TOKENS
                or len(batch) > self._MAX_BATCH_PARAGRAPHS
            ):
                tracker = self._submit_batch(
                    executor, batch, batch_tokens, pbar, tracker
                )
                batch = []
                batch_tokens = 0

        if batch:
            self._submit_batch(executor, batch, batch_tokens, pbar, tracker)

    def _build_reference_glossary_section(self, inputs: list[str]) -> str:
        """Render the user-glossary entries that are active for ``inputs``.

        Returns an empty string when there are no user glossaries or none of
        their entries applies to the text.
        """
        user_glossaries = self.shared_context.user_glossaries
        if not user_glossaries:
            return ""

        text_for_glossary = "\n\n".join(inputs)

        # Group entries by glossary name
        glossary_entries = {}
        for glossary in user_glossaries:
            active_entries = glossary.get_active_entries_for_text(text_for_glossary)
            if active_entries:
                glossary_entries[glossary.name] = active_entries

        if not glossary_entries:
            return ""

        parts = ["Reference Glossaries (for consistency and quality):\n"]
        for glossary_name, entries in glossary_entries.items():
            parts.append(f"\n{glossary_name}:\n")
            parts.extend(f"- {src} → {tgt}\n" for src, tgt in sorted(set(entries)))
        parts.append(
            "\nPlease consider these existing translations for consistency when extracting new terms. IMPORTANT: You should also extract terms that appear in the reference glossaries above if they are found in the input text - don't skip them just because they already exist in the reference."
        )
        return "".join(parts)

    def extract_terms_from_paragraphs(
        self,
        paragraphs: BatchParagraph,
        pbar: tqdm | None = None,
        paragraph_token_count: int = 0,
    ):
        """Run one LLM request for a batch and register the returned terms.

        Any exception raised while building the prompt, calling the engine or
        parsing the answer is logged as a warning and swallowed. ``pbar`` (if
        given) is always advanced by the number of paragraphs in the batch.
        """
        self.translation_config.raise_if_cancelled()
        try:
            inputs = [p.unicode for p in paragraphs.paragraphs if p.unicode]
            tracker = paragraphs.tracker
            for text in inputs:
                tracker.append_paragraph_unicode(text)
            if not inputs:
                return

            prompt = LLM_PROMPT_TEMPLATE.format(
                target_language=self.translation_config.lang_out,
                text_to_process="\n\n".join(inputs),
                reference_glossary_section=self._build_reference_glossary_section(
                    inputs
                ),
                example_output=self._EXAMPLE_OUTPUT,
            )
            tracker.set_input(prompt)
            output = self.translate_engine.llm_translate(
                prompt,
                rate_limit_params={
                    "paragraph_token_count": paragraph_token_count,
                    "request_json_mode": True,
                },
            )
            tracker.set_output(output)
            response = json.loads(self._clean_json_output(output))
            if not isinstance(response, list):
                response = [response]  # Ensure we have a list

            for term in response:
                if not (isinstance(term, dict) and "src" in term and "tgt" in term):
                    continue
                src_term = str(term["src"]).strip()
                tgt_term = str(term["tgt"]).strip()
                # Very short terms that map to themselves carry no information.
                if src_term == tgt_term and len(src_term) < 3:
                    continue
                if src_term and tgt_term and len(src_term) < 100:
                    self.shared_context.add_raw_extracted_term_pair(src_term, tgt_term)

        except Exception as e:
            logger.warning(f"Error during automatic terms extract: {e}")
            return
        finally:
            self._advance(pbar, len(paragraphs.paragraphs))

    def procress(self, doc_il: ILDocument):
        """Extract terms for the whole document and finalise the glossary.

        Also records the token usage consumed by this stage and, when debug
        mode is on or a working directory is configured, writes the tracking
        JSON, the raw term frequencies and the glossary CSV to working files.

        The misspelled name is kept because it is the established entry point;
        ``process`` is the correctly spelled alias.
        """
        logger.info(f"{self.stage_name}: Starting term extraction for document.")
        start_total, start_prompt, start_completion, start_cache_hit_prompt = (
            self._snapshot_token_usage()
        )
        tracker = DocumentTermExtractTracker()
        total = sum(len(page.pdf_paragraph) for page in doc_il.page)
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            total,
        ) as pbar:
            max_workers = self.translation_config.term_pool_max_workers
            logger.info(
                f"Using {max_workers} worker threads for automatic term extraction."
            )
            # Leaving the executor context waits for all submitted batches.
            with PriorityThreadPoolExecutor(
                max_workers=max_workers,
            ) as executor:
                for page in doc_il.page:
                    self.process_page(page, executor, pbar, tracker.new_page())

        self.shared_context.finalize_auto_extracted_glossary()
        end_total, end_prompt, end_completion, end_cache_hit_prompt = (
            self._snapshot_token_usage()
        )
        self.translation_config.record_term_extraction_usage(
            end_total - start_total,
            end_prompt - start_prompt,
            end_completion - start_completion,
            end_cache_hit_prompt - start_cache_hit_prompt,
        )

        if (
            self.translation_config.debug
            or self.translation_config.working_dir is not None
        ):
            path = self.translation_config.get_working_file_path(
                "term_extractor_tracking.json"
            )
            logger.debug(f"save translate tracking to {path}")
            with Path(path).open("w", encoding="utf-8") as f:
                f.write(tracker.to_json())

            path = self.translation_config.get_working_file_path(
                "term_extractor_freq.json"
            )
            logger.debug(f"save term frequency to {path}")
            with Path(path).open("w", encoding="utf-8") as f:
                json.dump(
                    self.shared_context.raw_extracted_terms,
                    f,
                    ensure_ascii=False,
                    indent=2,
                )

            path = self.translation_config.get_working_file_path(
                "auto_extractor_glossary.csv"
            )
            logger.debug(f"save auto extracted glossary to {path}")
            with Path(path).open("w", encoding="utf-8") as f:
                auto_extracted_glossary = self.shared_context.auto_extracted_glossary
                if auto_extracted_glossary:
                    f.write(auto_extracted_glossary.to_csv())

    def process(self, doc_il: ILDocument):
        """Correctly spelled alias of :meth:`procress`."""
        return self.procress(doc_il)


================================================
FILE: babeldoc/format/pdf/document_il/midend/detect_scanned_file.py
================================================
import logging

import cv2
import numpy as np
import pymupdf
import regex
from skimage.metrics import structural_similarity

from babeldoc.babeldoc_exception.BabelDOCException import ScannedPDFError
from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.format.pdf.document_il.backend.pdf_creater import PDFCreater
from babeldoc.format.pdf.document_il.utils.style_helper import BLACK
from babeldoc.format.pdf.document_il.utils.style_helper import GREEN
from babeldoc.format.pdf.translation_config import TranslationConfig

logger = logging.getLogger(__name__)


class DetectScannedFile:
    """Pipeline stage that decides whether the input PDF is a scanned document.

    A page counts as scanned when its rendering barely changes after its
    content stream has been rewritten through ``PDFCreater`` (structural
    similarity above 0.95). When at least 80% of the pages to translate are
    scanned, the stage either switches on the OCR workaround or raises
    ``ScannedPDFError``, depending on the configuration.
    """

    stage_name = "DetectScannedFile"

    def __init__(self, translation_config: TranslationConfig):
        self.translation_config = translation_config

    def _save_debug_box_to_page(self, page: il_version_1.Page, similarity: float):
        """Save debug boxes and text labels to the PDF page."""
        if not self.translation_config.debug:
            return

        color = GREEN

        # Create text label at top-left corner
        # Note: PDF coordinates are from bottom-left,
        # so we use y2 for top position
        style = il_version_1.PdfStyle(
            font_id="base",
            font_size=4,
            graphic_state=color,
        )
        cropbox = page.cropbox.box
        page_width = cropbox.x2 - cropbox.x
        page_height = cropbox.y2 - cropbox.y
        unicode = f"scanned score: {similarity * 100:.2f} %"
        page.pdf_paragraph.append(
            il_version_1.PdfParagraph(
                first_line_indent=False,
                box=il_version_1.Box(
                    x=cropbox.x + page_width * 0.03,
                    y=cropbox.y,
                    x2=cropbox.x2,
                    y2=cropbox.y2 - page_height * 0.03,
                ),
                vertical=False,
                pdf_style=style,
                unicode=unicode,
                pdf_paragraph_composition=[
                    il_version_1.PdfParagraphComposition(
                        pdf_same_style_unicode_characters=il_version_1.PdfSameStyleUnicodeCharacters(
                            unicode=unicode,
                            pdf_style=style,
                            debug_info=True,
                        ),
                    ),
                ],
                xobj_id=-1,
            ),
        )

    def fast_check(self, doc: pymupdf.Document) -> bool:
        """Cheap heuristic based on raw content-stream patterns.

        Every content stream of every page scores one hit if it contains a
        marked-content pattern (``/Artifact`` or ``/P`` followed by an MCID
        dictionary or ``BDC``) and one more hit if it contains ``3 Tr``.
        Returns True when the total number of hits exceeds 80% of the page
        count; a falsy ``doc`` yields False.
        """
        if not doc:
            return False

        hit_list = [0] * len(doc)
        for page in doc:
            for xref in page.get_contents():
                contents = doc.xref_stream(xref)
                if regex.search(
                    rb"(/Artifact|/P)(\s*\<\<\s*/MCID\s+|\s+BDC)", contents
                ):
                    hit_list[page.number] += 1
                if regex.search(rb"\s3\s+Tr\s", contents):
                    hit_list[page.number] += 1
        return bool(sum(hit_list) > len(doc) * 0.8)

    @staticmethod
    def _count_scanned_pages(pages, threshold, is_scanned, on_page_done=None) -> int:
        """Count scanned pages, skipping detection once the outcome is fixed.

        Detection stops as soon as ``threshold`` scanned pages were found or so
        many non-scanned pages were seen that the threshold can no longer be
        reached (``non_scanned > len(pages) - threshold``). The comparison is
        deliberately ``<=``: with ``<`` a single-page document was never
        inspected and the verdict depended on page order whenever exactly
        ``threshold`` pages were scanned.

        Args:
            pages: pages to examine, in order.
            threshold: number of scanned pages that marks the document as scanned.
            is_scanned: callable returning True for a scanned page.
            on_page_done: optional callable invoked once per page (progress).

        Returns:
            The number of pages detected as scanned.
        """
        max_non_scanned = len(pages) - threshold
        scanned = 0
        non_scanned = 0
        for page in pages:
            if scanned < threshold and non_scanned <= max_non_scanned:
                if is_scanned(page):
                    scanned += 1
                else:
                    non_scanned += 1
            else:
                # We have enough information to determine document type
                non_scanned += 1
            if on_page_done is not None:
                on_page_done()
        return scanned

    def process(
        self, docs: il_version_1.Document, original_pdf_path, mediabox_data: dict
    ):
        """Detect scanned documents and react according to the configuration.

        Raises:
            ScannedPDFError: when the document is detected as scanned and
                ``auto_enable_ocr_workaround`` is off.
        """
        pdf_creater = PDFCreater(
            original_pdf_path, docs, self.translation_config, mediabox_data
        )

        # Get pages that need to be translated
        pages_to_translate = [
            page
            for page in docs.page
            if self.translation_config.should_translate_page(page.page_number + 1)
        ]
        if not pages_to_translate:
            return

        total = len(pages_to_translate)
        # At least 80% of the pages, but never less than one page.
        threshold = max(0.8 * total, 1)

        mupdf = pymupdf.open(self.translation_config.get_working_file_path("input.pdf"))
        try:
            with self.translation_config.progress_monitor.stage_start(
                self.stage_name,
                total,
            ) as progress:
                scanned = self._count_scanned_pages(
                    pages_to_translate,
                    threshold,
                    lambda page: self.detect_page_is_scanned(page, mupdf, pdf_creater),
                    lambda: progress.advance(1),
                )
        finally:
            # The document is only needed for rendering during detection.
            mupdf.close()

        if scanned < threshold:
            return

        if not self.translation_config.auto_enable_ocr_workaround:
            logger.warning(
                f"Detected {scanned} scanned pages, which is more than 80% of the total pages. "
                "Please check the input PDF file.",
            )
            raise ScannedPDFError("Scanned PDF detected.")

        logger.warning(
            f"Detected {scanned} scanned pages, which is more than 80% of the total pages. "
            "Turning on OCR workaround.",
        )
        self.translation_config.shared_context_cross_split_part.auto_enabled_ocr_workaround = True
        self.translation_config.ocr_workaround = True
        self.translation_config.skip_scanned_detection = True
        self.translation_config.disable_rich_text_translate = True
        self.clean_render_order_for_chars(docs)
        self.translation_config.remove_non_formula_lines = False

    def clean_render_order_for_chars(self, docs: il_version_1.Document):
        """Reset ``render_order`` of every page character and paint non-debug ones BLACK."""
        for page in docs.page:
            for char in page.pdf_character:
                char.render_order = None
                if not char.debug_info:
                    char.pdf_style.graphic_state = BLACK

    @staticmethod
    def _render_page_as_gray(pdf: pymupdf.Document, page_number: int):
        """Render one page with default pixmap settings and return a 2-D gray image.

        NOTE(review): the pixmap samples are reshaped assuming 3 channels, the
        channel axis is reversed and the result is then converted with
        ``COLOR_RGB2GRAY``; if the reversed data is BGR the red/blue weights
        are swapped. Both compared images are treated the same way, so this is
        left untouched to keep similarity scores stable.
        """
        pixmap = pdf[page_number].get_pixmap()
        image = np.frombuffer(pixmap.samples, np.uint8).reshape(
            pixmap.height,
            pixmap.width,
            3,
        )[:, :, ::-1]
        return cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    def detect_page_is_scanned(
        self, page: il_version_1.Page, pdf: pymupdf.Document, pdf_creater: PDFCreater
    ) -> bool:
        """Return True when rewriting the page's content stream barely changes it.

        The page is rendered, its content stream is updated in ``pdf`` through
        ``pdf_creater.update_page_content_stream`` and the page is rendered
        again; a structural similarity above 0.95 marks the page as scanned.
        """
        before_gray = self._render_page_as_gray(pdf, page.page_number)

        pdf_creater.update_page_content_stream(
            False, page, pdf, self.translation_config, True
        )

        after_gray = self._render_page_as_gray(pdf, page.page_number)
        similarity = structural_similarity(before_gray, after_gray)
        return similarity > 0.95


================================================
FILE: babeldoc/format/pdf/document_il/midend/il_translator.py
================================================
from __future__ import annotations

import copy
import json
import logging
import re
import threading
from pathlib import Path
from string import Template

import tiktoken
from tqdm import tqdm

import babeldoc.format.pdf.document_il.il_version_1 as il_version_1
from babeldoc.babeldoc_exception.BabelDOCException import ContentFilterError
from babeldoc.format.pdf.document_il import Document
from babeldoc.format.pdf.document_il import GraphicState
from babeldoc.format.pdf.document_il import Page
from babeldoc.format.pdf.document_il import PdfFont
from babeldoc.format.pdf.document_il import PdfFormula
from babeldoc.format.pdf.document_il import PdfParagraph
from babeldoc.format.pdf.document_il import PdfParagraphComposition
from babeldoc.format.pdf.document_il import PdfSameStyleCharacters
from babeldoc.format.pdf.document_il import PdfSameStyleUnicodeCharacters
from babeldoc.format.pdf.document_il import PdfStyle
from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper
from babeldoc.format.pdf.document_il.utils.layout_helper import get_char_unicode_string
from babeldoc.format.pdf.document_il.utils.layout_helper import get_paragraph_unicode
from babeldoc.format.pdf.document_il.utils.layout_helper import is_same_style
from babeldoc.format.pdf.document_il.utils.layout_helper import (
    is_same_style_except_font,
)
from babeldoc.format.pdf.document_il.utils.layout_helper import (
    is_same_style_except_size,
)
from babeldoc.format.pdf.document_il.utils.paragraph_helper import (
    is_placeholder_only_paragraph,
)
from babeldoc.format.pdf.document_il.utils.paragraph_helper import (
    is_pure_numeric_paragraph,
)
from babeldoc.format.pdf.document_il.utils.style_helper import GRAY80
from babeldoc.format.pdf.translation_config import TranslationConfig
from babeldoc.translator.translator import BaseTranslator
from babeldoc.utils.priority_thread_pool_executor import PriorityThreadPoolExecutor

logger = logging.getLogger(__name__)


# Prompt skeleton for translating one piece of text, expressed as a
# ``string.Template``. The substitution fields it expects are:
#   $role_block, $lang_out, $glossary_block, $context_block, $text_to_translate
# The rules section tells the model to leave tags and placeholder tokens
# untouched and to answer with the translated text only.
# NOTE(review): the code that fills this template is not in this part of the
# file - confirm every field is supplied before calling ``substitute``.
PROMPT_TEMPLATE = Template(
    """$role_block

## Rules

1. Keep the structure exactly unchanged: do NOT add/remove/reorder any tags, placeholders, or tokens.
2. Keep all tags unchanged (e.g., <style>, <b>, </style>).
   - Translate human-readable text inside tags.
   - Do NOT translate text inside <code>…</code>.
3. Do NOT translate or alter placeholders: {v1}, {name}, %s, %d, [[...]], %%...%%.
4. If the entire input is pure code/identifiers, return it unchanged.
5. Translate ALL human-readable content into $lang_out.

$glossary_block

$context_block

## Output

Output ONLY the translated $lang_out text. No explanations, no backticks, no extra text.

Now translate the following text:

$text_to_translate"""
)


class RichTextPlaceholder:
    """Opening/closing marker pair that stands in for a styled run of text.

    Holds the styled composition together with the left and right marker
    strings and, optionally, the regex patterns used to recognise each
    marker again in translated output.
    """

    def __init__(
        self,
        placeholder_id: int,
        composition: PdfSameStyleCharacters,
        left_placeholder: str,
        right_placeholder: str,
        left_regex_pattern: str = None,
        right_regex_pattern: str = None,
    ):
        self.id, self.composition = placeholder_id, composition
        self.left_placeholder, self.left_regex_pattern = (
            left_placeholder,
            left_regex_pattern,
        )
        self.right_placeholder, self.right_regex_pattern = (
            right_placeholder,
            right_regex_pattern,
        )

    def to_dict(self) -> dict:
        """Return a plain-dict view of this placeholder for the tracking dump.

        ``composition_chars`` is ``None`` when there is no composition or the
        composition carries no characters.
        """
        composition = self.composition
        composition_chars = None
        if composition and composition.pdf_character:
            composition_chars = get_char_unicode_string(composition.pdf_character)

        summary = {"type": "rich_text", "id": self.id}
        summary["left_placeholder"] = self.left_placeholder
        summary["right_placeholder"] = self.right_placeholder
        summary["left_regex_pattern"] = self.left_regex_pattern
        summary["right_regex_pattern"] = self.right_regex_pattern
        summary["composition_chars"] = composition_chars
        return summary


class FormulaPlaceholder:
    """Single marker that replaces a formula inside the text to translate.

    Keeps the formula object next to the marker string and the regex pattern
    used to find that marker again in translated output.
    """

    def __init__(
        self,
        placeholder_id: int,
        formula: PdfFormula,
        placeholder: str,
        regex_pattern: str,
    ):
        self.id, self.formula = placeholder_id, formula
        self.placeholder, self.regex_pattern = placeholder, regex_pattern

    def to_dict(self) -> dict:
        """Return a plain-dict view of this placeholder for the tracking dump.

        ``formula_chars`` is ``None`` when there is no formula or the formula
        carries no characters.
        """
        formula = self.formula
        formula_chars = None
        if formula and formula.pdf_character:
            formula_chars = get_char_unicode_string(formula.pdf_character)

        summary = {"type": "formula", "id": self.id}
        summary["placeholder"] = self.placeholder
        summary["regex_pattern"] = self.regex_pattern
        summary["formula_chars"] = formula_chars
        return summary


class PbarContext:
    """Context manager that advances a progress bar once when its block ends."""

    def __init__(self, pbar):
        # Stored as-is; the only method used on it here is ``advance()``.
        self.pbar = pbar

    def __enter__(self):
        """Give the wrapped progress bar to the ``with`` body."""
        return self.pbar

    def __exit__(self, exc_type, exc_value, traceback):
        """Advance the bar one step, whether or not the body raised.

        Nothing is returned, so an exception from the body keeps propagating.
        """
        progress_bar = self.pbar
        progress_bar.advance()


class DocumentTranslateTracker:
    """Collects translation records for a document and dumps them as JSON.

    Records are kept in three separate lists of page trackers: regular pages,
    cross-page groups and cross-column groups.
    """

    def __init__(self):
        self.page = []
        self.cross_page = []
        # Track paragraphs that are combined due to cross-column detection within the same page
        self.cross_column = []

    def new_page(self):
        """Create, register and return a tracker for a regular page."""
        page = PageTranslateTracker()
        self.page.append(page)
        return page

    def new_cross_page(self):
        """Create, register and return a tracker for a cross-page group."""
        page = PageTranslateTracker()
        self.cross_page.append(page)
        return page

    def new_cross_column(self):
        """Create and return a new PageTranslateTracker dedicated to cross-column merging."""
        page = PageTranslateTracker()
        self.cross_column.append(page)
        return page

    def _convert_group(self, page_trackers):
        """Convert a list of page trackers into ``[{"paragraph": [...]}, ...]``."""
        return [
            {"paragraph": self.convert_paragraph(page)} for page in page_trackers
        ]

    def to_json(self):
        """Serialize every recorded paragraph to an indented JSON string.

        The top-level keys are ``cross_page``, ``cross_column`` and ``page``.
        Non-ASCII characters are written verbatim (``ensure_ascii=False``).
        """
        pages = self._convert_group(self.page)
        cross_page = self._convert_group(self.cross_page)
        cross_column = self._convert_group(self.cross_column)
        return json.dumps(
            {
                "cross_page": cross_page,
                "cross_column": cross_column,
                "page": pages,
            },
            ensure_ascii=False,
            indent=2,
        )

    def convert_paragraph(self, page):
        """Return JSON-ready dicts for the paragraphs recorded on ``page``.

        Paragraph trackers that never received both an input text and the
        PDF unicode text are left out of the result.
        """
        paragraphs = []
        for para in page.paragraph:
            i_str = getattr(para, "input", None)
            pdf_unicode = getattr(para, "pdf_unicode", None)
            # Decide whether the paragraph is kept before doing any
            # serialization work, so discarded paragraphs cost nothing.
            if pdf_unicode is None or i_str is None:
                continue

            llm_translate_trackers = getattr(para, "llm_translate_trackers", None)
            placeholders = getattr(para, "placeholders", None)
            paragraphs.append(
                {
                    "input": i_str,
                    "output": getattr(para, "output", None),
                    "pdf_unicode": pdf_unicode,
                    "llm_translate_trackers": [
                        tracker.to_dict() for tracker in llm_translate_trackers or []
                    ],
                    "placeholders": [
                        placeholder.to_dict() for placeholder in placeholders or []
                    ],
                    "multi_paragraph_id": getattr(para, "multi_paragraph_id", None),
                    "multi_paragraph_index": getattr(
                        para, "multi_paragraph_index", None
                    ),
                    "original_placeholders": getattr(
                        para, "original_placeholders", None
                    ),
                    "removed_hallucinated_placeholders": getattr(
                        para,
                        "removed_hallucinated_placeholders",
                        None,
                    ),
                }
            )
        return paragraphs


class PageTranslateTracker:
    """Ordered list of the paragraph trackers created for one page."""

    def __init__(self):
        self.paragraph = []

    def new_paragraph(self):
        """Append a fresh paragraph tracker and return it."""
        self.paragraph.append(ParagraphTranslateTracker())
        return self.paragraph[-1]


class ParagraphTranslateTracker:
    """Record of what happened while translating one paragraph.

    Only the LLM tracker list and the two placeholder counters exist from
    construction; every other attribute appears once its setter is called.
    """

    def __init__(self):
        self.llm_translate_trackers = []
        self.original_placeholders: dict[str, int] = {}
        self.removed_hallucinated_placeholders: dict[str, int] = {}

    def set_pdf_unicode(self, unicode: str):
        """Store the paragraph text as it appears in the PDF."""
        self.pdf_unicode = unicode

    def set_input(self, input_text: str):
        """Store the text handed to the translator."""
        self.input = input_text

    def set_placeholders(
        self, placeholders: list[RichTextPlaceholder | FormulaPlaceholder]
    ):
        """Store the placeholders used in the translator input."""
        self.placeholders = placeholders

    def set_original_placeholders(self, placeholders: dict[str, int] | None):
        """Record original placeholder-like tokens from the source text."""
        self.original_placeholders = placeholders if placeholders else {}

    def record_multi_paragraph_id(self, mid):
        """Store the id of the multi-paragraph group this paragraph is in."""
        self.multi_paragraph_id = mid

    def record_multi_paragraph_index(self, index):
        """Store this paragraph's index inside its multi-paragraph group."""
        self.multi_paragraph_index = index

    def set_output(self, output: str):
        """Store the translated text."""
        self.output = output

    def record_removed_hallucinated_placeholder(self, token: str):
        """Record placeholder-like tokens removed from translated text."""
        if token:
            counts = self.removed_hallucinated_placeholders
            counts[token] = counts.get(token, 0) + 1

    def new_llm_translate_tracker(self) -> LLMTranslateTracker:
        """Append a new LLM call tracker and return it."""
        self.llm_translate_trackers.append(LLMTranslateTracker())
        return self.llm_translate_trackers[-1]

    def last_llm_translate_tracker(self) -> LLMTranslateTracker | None:
        """Return the most recent LLM call tracker, or ``None`` if there is none."""
        trackers = self.llm_translate_trackers
        return trackers[-1] if trackers else None


class LLMTranslateTracker:
    """Record of a single LLM translation attempt."""

    # Attribute names reported by ``to_dict``, in output order.
    _FIELDS = (
        "input",
        "output",
        "has_error",
        "error_message",
        "placeholder_full_match",
        "fallback_to_translate",
    )

    def __init__(self):
        self.input = self.output = self.error_message = ""
        self.has_error = False
        self.placeholder_full_match = False
        self.fallback_to_translate = False

    def set_input(self, input_text: str):
        """Store the text sent to the LLM."""
        self.input = input_text

    def set_output(self, output_text: str):
        """Store the text returned by the LLM."""
        self.output = output_text

    def set_error_message(self, error_message: str):
        """Flag the attempt as failed and keep the error message."""
        self.error_message = error_message
        self.has_error = True

    def set_placeholder_full_match(self):
        """Flag that every placeholder was found in the output."""
        self.placeholder_full_match = True

    def set_fallback_to_translate(self):
        """Flag that the fallback translation path was taken."""
        self.fallback_to_translate = True

    def to_dict(self):
        """Return all recorded fields as a plain dict."""
        return {name: getattr(self, name) for name in self._FIELDS}


class ILTranslator:
    stage_name = "Translate Paragraphs"

    def __init__(
        self,
        translate_engine: BaseTranslator,
        translation_config: TranslationConfig,
        tokenizer=None,
    ):
        """Prepare the paragraph translation stage.

        Args:
            translate_engine: Engine that supplies the placeholder markers
                and, optionally, ``do_llm_translate``.
            translation_config: Configuration; its
                ``shared_context_cross_split_part`` is kept on the instance.
            tokenizer: Object with an ``encode`` method used for token
                counting. When ``None``, the tiktoken encoding for "gpt-4o"
                is loaded.
        """
        self.translate_engine = translate_engine
        self.translation_config = translation_config
        self.font_mapper = FontMapper(translation_config)
        self.shared_context_cross_split_part = (
            translation_config.shared_context_cross_split_part
        )
        if tokenizer is None:
            self.tokenizer = tiktoken.encoding_for_model("gpt-4o")
        else:
            self.tokenizer = tokenizer

        # Cache glossaries at initialization
        self._cached_glossaries = (
            self.shared_context_cross_split_part.get_glossaries_for_translation(
                self.translation_config.auto_extract_glossary
            )
        )

        # Probe the engine: LLM translation counts as supported when
        # ``do_llm_translate(None)`` does not raise NotImplementedError.
        self.support_llm_translate = False
        try:
            if translate_engine and hasattr(translate_engine, "do_llm_translate"):
                translate_engine.do_llm_translate(None)
                self.support_llm_translate = True
        except NotImplementedError:
            self.support_llm_translate = False

        self.use_as_fallback = False
        self.add_content_filter_hint_lock = threading.Lock()
        self.docs = None

        def _shape_pattern(getter):
            """Compile the "any id" pattern for one placeholder getter.

            The getter may return ``(placeholder, regex)`` or just the
            placeholder string, exactly as the ``create_*_placeholder``
            methods accept. Indexing ``[1]`` on a plain string would pick
            its second character, so that case is handled explicitly.
            """
            result = getter(r"\d+")
            if isinstance(result, tuple):
                regex = result[1]
            else:
                # Escape the literal text, then turn the escaped id slot
                # back into the ``\d+`` wildcard.
                regex = re.escape(result).replace(re.escape(r"\d+"), r"\d+")
            return re.compile(regex, re.IGNORECASE)

        # Pre-compile patterns for placeholder-like tokens that may be hallucinated by LLM.
        # We only consider the same shapes as our own formula & rich-text placeholders.
        self._formula_placeholder_pattern = _shape_pattern(
            self.translate_engine.get_formular_placeholder
        )
        self._style_left_placeholder_pattern = _shape_pattern(
            self.translate_engine.get_rich_text_left_placeholder
        )
        self._style_right_placeholder_pattern = _shape_pattern(
            self.translate_engine.get_rich_text_right_placeholder
        )

    def calc_token_count(self, text: str) -> int:
        try:
            return len(self.tokenizer.encode(text, disallowed_special=()))
        except Exception:
            return 0

    def translate(self, docs: Document):
        """Submit every paragraph of ``docs`` for translation.

        If the shared context has no first paragraph yet, the first title
        paragraph of the document (possibly ``None``) is deep-copied into it
        as both the first and the most recent title. Pages are then processed
        through a priority thread pool under the stage progress monitor.
        When debug mode is on or a working directory is configured, the
        tracking data is written to ``translate_tracking.json``.
        """
        self.docs = docs
        document_tracker = DocumentTranslateTracker()
        config = self.translation_config
        shared_context = config.shared_context_cross_split_part

        if not shared_context.first_paragraph:
            # Try to find the first title paragraph
            first_title = self.find_title_paragraph(docs)
            shared_context.first_paragraph = copy.deepcopy(first_title)
            shared_context.recent_title_paragraph = copy.deepcopy(first_title)
            if first_title:
                logger.info(f"Found first title paragraph: {first_title.unicode}")

        # The progress monitor needs the paragraph count of the whole document.
        paragraph_total = 0
        for page in docs.page:
            paragraph_total += len(page.pdf_paragraph)

        with config.progress_monitor.stage_start(
            self.stage_name,
            paragraph_total,
        ) as pbar, PriorityThreadPoolExecutor(
            max_workers=config.pool_max_workers,
        ) as executor:
            for page in docs.page:
                self.process_page(page, executor, pbar, document_tracker.new_page())

        tracking_path = config.get_working_file_path("translate_tracking.json")
        if not config.debug and config.working_dir is None:
            return

        logger.debug(f"save translate tracking to {tracking_path}")
        with Path(tracking_path).open("w", encoding="utf-8") as tracking_file:
            tracking_file.write(document_tracker.to_json())

    def find_title_paragraph(self, docs: Document) -> PdfParagraph | None:
        """Find the first paragraph with layout_label 'title' in the document.

        Args:
            docs: The document to search in

        Returns:
            The first title paragraph found, or None if no title paragraph exists
        """
        for page in docs.page:
            for paragraph in page.pdf_paragraph:
                if paragraph.layout_label == "title":
                    logger.info(f"Found title paragraph: {paragraph.unicode}")
                    return paragraph
        return None

    def process_page(
        self,
        page: Page,
        executor: PriorityThreadPoolExecutor,
        pbar: tqdm | None = None,
        tracker: PageTranslateTracker = None,
    ):
        self.translation_config.raise_if_cancelled()
        for paragraph in page.pdf_paragraph:
            page_font_map = {}
            for font in page.pdf_font:
                page_font_map[font.font_id] = font
            page_xobj_font_map = {}
            for xobj in page.pdf_xobject:
                page_xobj_font_map[xobj.xobj_id] = page_font_map.copy()
                for font in xobj.pdf_font:
                    page_xobj_font_map[xobj.xobj_id][font.font_id] = font
            # self.translate_paragraph(paragraph, pbar,tracker.new_paragraph(), page_font_map, page_xobj_font_map)
            paragraph_token_count = self.calc_token_count(paragraph.unicode)
            if paragraph.layout_label == "title":
                self.shared_context_cross_split_part.recent_title_paragraph = (
                    copy.deepcopy(paragraph)
                )
            executor.submit(
                self.translate_paragraph,
                paragraph,
                page,
                pbar,
                tracker.new_paragraph(),
                page_font_map,
                page_xobj_font_map,
                priority=1048576 - paragraph_token_count,
                paragraph_token_count=paragraph_token_count,
                title_paragraph=self.translation_config.shared_context_cross_split_part.first_paragraph,
                local_title_paragraph=self.translation_config.shared_context_cross_split_part.recent_title_paragraph,
            )

    class TranslateInput:
        """Text prepared for the translator plus the placeholders inside it."""

        def __init__(
            self,
            unicode: str,
            placeholders: list[RichTextPlaceholder | FormulaPlaceholder],
            base_style: PdfStyle = None,
        ):
            self.unicode, self.placeholders = unicode, placeholders
            self.base_style = base_style
            # Original placeholder-like tokens extracted from the source text.
            # Key: exact matched token string; Value: occurrence count.
            self.original_placeholder_tokens: dict[str, int] = {}

        def set_original_placeholder_tokens(self, tokens: dict[str, int] | None):
            """Attach original placeholder-like tokens from source text."""
            self.original_placeholder_tokens = tokens if tokens else {}

        def get_placeholders_hint(self) -> dict[str, str] | None:
            """Map each formula placeholder string to the formula's text.

            Rich-text placeholders are ignored. A formula is left out when
            more than 80% of its characters look like ``(cid:N)``. Returns
            ``None`` when nothing is left to report.
            """
            hint = {}
            for placeholder in self.placeholders:
                if not isinstance(placeholder, FormulaPlaceholder):
                    continue
                formula_chars = placeholder.formula.pdf_character
                cid_count = sum(
                    1
                    for char in formula_chars
                    if re.match(r"^\(cid:\d+\)$", char.char_unicode)
                )
                if cid_count > len(formula_chars) * 0.8:
                    continue
                hint[placeholder.placeholder] = get_char_unicode_string(
                    formula_chars
                )
            return hint or None

    def create_formula_placeholder(
        self,
        formula: PdfFormula,
        formula_id: int,
        paragraph: PdfParagraph,
    ):
        placeholder = self.translate_engine.get_formular_placeholder(formula_id)
        if isinstance(placeholder, tuple):
            placeholder, regex_pattern = placeholder
        else:
            regex_pattern = re.escape(placeholder)
        if re.match(regex_pattern, paragraph.unicode, re.IGNORECASE):
            return self.create_formula_placeholder(formula, formula_id + 1, paragraph)

        return FormulaPlaceholder(formula_id, formula, placeholder, regex_pattern)

    def create_rich_text_placeholder(
        self,
        composition: PdfSameStyleCharacters,
        composition_id: int,
        paragraph: PdfParagraph,
    ):
        left_placeholder = self.translate_engine.get_rich_text_left_placeholder(
            composition_id,
        )
        right_placeholder = self.translate_engine.get_rich_text_right_placeholder(
            composition_id,
        )
        if isinstance(left_placeholder, tuple):
            left_placeholder, left_placeholder_regex_pattern = left_placeholder
        else:
            left_placeholder_regex_pattern = re.escape(left_placeholder)
        if isinstance(right_placeholder, tuple):
            right_placeholder, right_placeholder_regex_pattern = right_placeholder
        else:
            right_placeholder_regex_pattern = re.escape(right_placeholder)
        if re.match(
            f"{left_placeholder_regex_pattern}|{right_placeholder_regex_pattern}",
            paragraph.unicode,
            re.IGNORECASE,
        ):
            return self.create_rich_text_placeholder(
                composition,
                composition_id + 1,
                paragraph,
            )

        return RichTextPlaceholder(
            composition_id,
            composition,
            left_placeholder,
            right_placeholder,
            left_placeholder_regex_pattern,
            right_placeholder_regex_pattern,
        )

    def get_translate_input(
        self,
        paragraph: PdfParagraph,
        page_font_map: dict[str, PdfFont] = None,
        disable_rich_text_translate: bool | None = None,
    ):
        """Build the text and placeholder list to translate for a paragraph.

        Formulas are replaced by a single placeholder. Runs of same-style
        characters whose style differs enough from the paragraph's base style
        are wrapped in a left/right placeholder pair, unless rich text
        translation is disabled.

        Args:
            paragraph: Paragraph to prepare.
            page_font_map: Maps font ids to fonts; indexed with the font id of
                styled runs and of the paragraph's base style.
            disable_rich_text_translate: When ``None``, the value from the
                translation config is used.

        Returns:
            A ``TranslateInput``, or ``None`` when the paragraph has no
            compositions, is purely numeric, contains only placeholders,
            consists of a single formula or single unicode-character run, or
            contains a composition of an unexpected kind.
        """
        if not paragraph.pdf_paragraph_composition:
            return

        # Skip pure numeric paragraphs
        if is_pure_numeric_paragraph(paragraph):
            return None

        # Skip paragraphs with only placeholders
        if is_placeholder_only_paragraph(paragraph):
            return None

        # Extract original placeholder-like tokens from the raw paragraph text
        original_placeholder_tokens: dict[str, int] = {}

        def scan_placeholder_tokens(text: str, tokens: dict[str, int]):
            """Count every token in ``text`` shaped like one of our placeholders."""
            for pattern in (
                self._formula_placeholder_pattern,
                self._style_left_placeholder_pattern,
                self._style_right_placeholder_pattern,
            ):
                for match in pattern.finditer(text):
                    token = match.group(0)
                    tokens[token] = tokens.get(token, 0) + 1

        if paragraph.unicode:
            scan_placeholder_tokens(paragraph.unicode, original_placeholder_tokens)
        if len(paragraph.pdf_paragraph_composition) == 1:
            # The paragraph has a single composition: return it directly,
            # no placeholders are needed.
            composition = paragraph.pdf_paragraph_composition[0]
            if (
                composition.pdf_line
                or composition.pdf_same_style_characters
                or composition.pdf_character
            ):
                translate_input = self.TranslateInput(
                    paragraph.unicode,
                    [],
                    paragraph.pdf_style,
                )
                translate_input.set_original_placeholder_tokens(
                    original_placeholder_tokens,
                )
                return translate_input
            elif composition.pdf_formula:
                # A pure formula does not need translation.
                return None
            elif composition.pdf_same_style_unicode_characters:
                # DEBUG INSERT CHAR, NOT TRANSLATE
                return None
            else:
                logger.error(
                    f"Unknown composition type. "
                    f"Composition: {composition}. "
                    f"Paragraph: {paragraph}. ",
                )
                return None

        # If disable_rich_text_translate was not given, use the configured value.
        if disable_rich_text_translate is None:
            disable_rich_text_translate = (
                self.translation_config.disable_rich_text_translate
            )

        placeholder_id = 1
        placeholders = []
        chars = []
        for composition in paragraph.pdf_paragraph_composition:
            if composition.pdf_line:
                chars.extend(composition.pdf_line.pdf_character)
            elif composition.pdf_formula:
                formula_placeholder = self.create_formula_placeholder(
                    composition.pdf_formula,
                    placeholder_id,
                    paragraph,
                )
                placeholders.append(formula_placeholder)
                # A formula needs only one placeholder, so advance the id by 1.
                placeholder_id = formula_placeholder.id + 1
                chars.extend(formula_placeholder.placeholder)
            elif composition.pdf_character:
                chars.append(composition.pdf_character)
            elif composition.pdf_same_style_characters:
                if disable_rich_text_translate:
                    # Rich text translation is disabled: add the characters directly.
                    chars.extend(composition.pdf_same_style_characters.pdf_character)
                    continue

                fonta = self.font_mapper.map(
                    page_font_map[
                        composition.pdf_same_style_characters.pdf_style.font_id
                    ],
                    "1",
                )
                fontb = self.font_mapper.map(
                    page_font_map[paragraph.pdf_style.font_id],
                    "1",
                )
                if (
                    # Style equals the paragraph's base style: no placeholder needed.
                    is_same_style(
                        composition.pdf_same_style_characters.pdf_style,
                        paragraph.pdf_style,
                    )
                    # Font size differs by a factor within 0.7-1.3, possibly a
                    # drop-cap effect: no placeholder needed.
                    or is_same_style_except_size(
                        composition.pdf_same_style_characters.pdf_style,
                        paragraph.pdf_style,
                    )
                    or (
                        # Everything except the font matches the base style and
                        # both fonts map to the same font: no placeholder needed.
                        is_same_style_except_font(
                            composition.pdf_same_style_characters.pdf_style,
                            paragraph.pdf_style,
                        )
                        and fonta
                        and fontb
                        and fonta.font_id == fontb.font_id
                    )
                    # or len(composition.pdf_same_style_characters.pdf_character) == 1
                ):
                    chars.extend(composition.pdf_same_style_characters.pdf_character)
                    continue
                placeholder = self.create_rich_text_placeholder(
                    composition.pdf_same_style_characters,
                    placeholder_id,
                    paragraph,
                )
                placeholders.append(placeholder)
                # A style needs a left and a right placeholder, so advance the id by 2.
                placeholder_id = placeholder.id + 2
                chars.append(placeholder.left_placeholder)
                chars.extend(composition.pdf_same_style_characters.pdf_character)
                chars.append(placeholder.right_placeholder)
            else:
                logger.error(
                    "Unexpected PdfParagraphComposition type "
                    "in PdfParagraph during translation. "
                    f"Composition: {composition}. "
                    f"Paragraph: {paragraph}. ",
                )
                return None

            # If the placeholder count exceeds the threshold and rich text
            # translation is not disabled, start over with it disabled.
            if len(placeholders) > 40 and not disable_rich_text_translate:
                logger.warning(
                    f"Too many placeholders ({len(placeholders)}) in paragraph[{paragraph.debug_id}], "
                    "disabling rich text translation for this paragraph",
                )
                return self.get_translate_input(paragraph, page_font_map, True)

        text = get_char_unicode_string(chars)
        translate_input = self.TranslateInput(text, placeholders, paragraph.pdf_style)
        translate_input.set_original_placeholder_tokens(original_placeholder_tokens)
        return translate_input

    def process_formula(
        self,
        formula: PdfFormula,
        formula_id: int,
        paragraph: PdfParagraph,
    ):
        placeholder = self.create_formula_placeholder(formula, formula_id, paragraph)
        if placeholder.placeholder in paragraph.unicode:
            return self.process_formula(formula, formula_id + 1, paragraph)

        return placeholder

    def process_composition(
        self,
        composition: PdfSameStyleCharacters,
        composition_id: int,
        paragraph: PdfParagraph,
    ):
        placeholder = self.create_rich_text_placeholder(
            composition,
            composition_id,
            paragraph,
        )
        if (
            placeholder.left_placeholder in paragraph.unicode
            or placeholder.right_placeholder in paragraph.unicode
        ):
            return self.process_composition(
                composition,
                composition_id + 1,
                paragraph,
            )

        return placeholder

    def parse_translate_output(
        self,
        input_text: TranslateInput,
        output: str,
        tracker: ParagraphTranslateTracker | None = None,
        llm_translate_tracker: LLMTranslateTracker | None = None,
    ) -> [PdfParagraphComposition]:
        result = []

        # 如果没有占位符，直接返回整个文本
        if not input_text.placeholders:
            comp = PdfParagraphComposition()
            comp.pdf_same_style_unicode_characters = PdfSameStyleUnicodeCharacters()
            comp.pdf_same_style_unicode_characters.unicode = output
            comp.pdf_same_style_unicode_characters.pdf_style = input_text.base_style
            if llm_translate_tracker:
                llm_translate_tracker.set_placeholder_full_match()
            return [comp]

        # 构建正则表达式模式
        patterns = []
        placeholder_patterns = []
        placeholder_map = {}

        for placeholder in input_text.placeholders:
            if isinstance(placeholder, FormulaPlaceholder):
                # 转义特殊字符
                # pattern = re.escape(placeholder.placeholder)
                pattern = placeholder.regex_pattern
                patterns.append(f"({pattern})")
                placeholder_patterns.append(f"({pattern})")
                placeholder_map[placeholder.placeholder] = placeholder
            else:
                left = placeholder.left_regex_pattern
                right = placeholder.right_regex_pattern
                patterns.append(f"({left}.*?{right})")
                placeholder_patterns.append(f"({left})")
                placeholder_patterns.append(f"({right})")
                placeholder_map[placeholder.left_placeholder] = placeholder
        all_match = True
        for pattern in patterns:
            if not re.search(pattern, output, flags=re.IGNORECASE):
                all_match = False
                break
        if all_match:
            if llm_translate_tracker:
                llm_translate_tracker.set_placeholder_full_match()
        else:
            logger.debug(f"Failed to match all placeholder for {input_text.unicode}")
        # 合并所有模式
        combined_pattern = "|".join(patterns)
        combined_placeholder_pattern = "|".join(placeholder_patterns)
        # Build allowed placeholder tokens: originals from source + placeholders we injected.
        allowed_placeholder_tokens: set[str] = set()
        if getattr(input_text, "original_placeholder_tokens", None):
            allowed_placeholder_tokens.update(input_text.original_placeholder_tokens)
        for placeholder in input_text.placeholders:
            if isinstance(placeholder, FormulaPlaceholder):
                allowed_placeholder_tokens.add(placeholder.placeholder)
            else:
                allowed_placeholder_tokens.add(placeholder.left_placeholder)
                allowed_placeholder_tokens.add(placeholder.right_placeholder)

        def remove_placeholder(text: str):
            """Remove placeholder artifacts and hallucinated placeholder-like tokens."""
            # First, remove any leftover placeholders built from our own regex patterns.
            if combined_placeholder_pattern:
                text = re.sub(
                    combined_placeholder_pattern,
                    "",
                    text,
                    flags=re.IGNORECASE,
                )

            # Then, detect placeholder-like tokens of the same shapes as our own
            # formula and rich-text placeholders. Only keep those in the allowed set.
            def _replace_token(match: re.Match) -> str:
                token = match.group(0)
                if token in allowed_placeholder_tokens:
                    return token
                if tracker is not None:
                    tracker.record_removed_hallucinated_placeholder(token)
                return ""

            text = self._formula_placeholder_pattern.sub(_replace_token, text)
            text = self._style_left_placeholder_pattern.sub(_replace_token, text)
            text = self._style_right_placeholder_pattern.sub(_replace_token, text)
            return text

        # 找到所有匹配
        last_end = 0
        for match in re.finditer(combined_pattern, output, flags=re.IGNORECASE):
            # 处理匹配之前的普通文本
            if match.start() > last_end:
                text = output[last_end : match.start()]
                if text:
                    comp = PdfParagraphComposition()
                    comp.pdf_same_style_unicode_characters = (
                        PdfSameStyleUnicodeCharacters()
                    )
                    comp.pdf_same_style_unicode_characters.unicode = remove_placeholder(
                        text,
                    )
                    comp.pdf_same_style_unicode_characters.pdf_style = (
                        input_text.base_style
                    )
                    result.append(comp)

            matched_text = match.group(0)

            # 处理占位符
            if any(
                isinstance(p, FormulaPlaceholder)
                and re.match(f"^{p.regex_pattern}$", matched_text, re.IGNORECASE)
                for p in input_text.placeholders
            ):
                # 处理公式占位符
                placeholder = next(
                    p
                    for p in input_text.placeholders
                    if isinstance(p, FormulaPlaceholder)
                    and re.match(f"^{p.regex_pattern}$", matched_text, re.IGNORECASE)
                )
                comp = PdfParagraphComposition()
                comp.pdf_formula = placeholder.formula
                result.append(comp)
            else:
                # 处理富文本占位符
                placeholder = next(
                    p
                    for p in input_text.placeholders
                    if not isinstance(p, FormulaPlaceholder)
                    and re.match(
                        f"^{p.left_regex_pattern}", matched_text, re.IGNORECASE
                    )
                )
                text = re.match(
                    f"^{placeholder.left_regex_pattern}(.*){placeholder.right_regex_pattern}$",
                    matched_text,
                    re.IGNORECASE,
                ).group(1)

                if isinstance(
                    placeholder.composition,
                    PdfSameStyleCharacters,
                ) and text.replace(" ", "") == "".join(
                    x.char_unicode for x in placeholder.composition.pdf_character
                ).replace(
                    " ",
                    "",
                ):
                    comp = PdfParagraphComposition(
                        pdf_same_style_characters=placeholder.composition,
                    )
                else:
                    comp = PdfParagraphComposition()
                    comp.pdf_same_style_unicode_characters = (
                        PdfSameStyleUnicodeCharacters()
                    )
                    comp.pdf_same_style_unicode_characters.pdf_style = (
                        placeholder.composition.pdf_style
                    )
                    comp.pdf_same_style_unicode_characters.unicode = remove_placeholder(
                        text,
                    )
                result.append(comp)

            last_end = match.end()

        # 处理最后的普通文本
        if last_end < len(output):
            text = output[last_end:]
            if text:
                comp = PdfParagraphComposition()
                comp.pdf_same_style_unicode_characters = PdfSameStyleUnicodeCharacters()
                comp.pdf_same_style_unicode_characters.unicode = remove_placeholder(
                    text,
                )
                comp.pdf_same_style_unicode_characters.pdf_style = input_text.base_style
                result.append(comp)

        return result

    def pre_translate_paragraph(
        self,
        paragraph: PdfParagraph,
        tracker: ParagraphTranslateTracker,
        page_font_map: dict[str, PdfFont],
        xobj_font_map: dict[int, dict[str, PdfFont]],
    ):
        """Prepare a paragraph for translation.

        The paragraph text, the translate input text and its placeholders are
        recorded on ``tracker`` along the way.

        Args:
            paragraph: Paragraph to prepare.
            tracker: Per-paragraph tracker that receives the intermediate values.
            page_font_map: Fonts of the page, keyed by font id.
            xobj_font_map: Font maps keyed by XObject id; the entry matching
                ``paragraph.xobj_id`` is used instead of ``page_font_map`` when
                it exists.

        Returns:
            ``(text, translate_input)`` when the paragraph should be translated,
            otherwise ``(None, None)`` (vertical paragraph, no translate input,
            or text shorter than ``min_text_length``).
        """
        nothing_to_translate = (None, None)
        if paragraph.vertical:
            return nothing_to_translate

        tracker.set_pdf_unicode(paragraph.unicode)

        fonts = (
            xobj_font_map[paragraph.xobj_id]
            if paragraph.xobj_id in xobj_font_map
            else page_font_map
        )

        configured = self.translation_config.disable_rich_text_translate
        # Without LLM translation support rich-text translation is always disabled.
        no_rich_text = configured if self.support_llm_translate else True

        translate_input = self.get_translate_input(paragraph, fonts, no_rich_text)
        if not translate_input:
            return nothing_to_translate

        tracker.set_input(translate_input.unicode)
        tracker.set_placeholders(translate_input.placeholders)
        original_tokens = getattr(translate_input, "original_placeholder_tokens", None)
        tracker.set_original_placeholders(original_tokens)

        text = translate_input.unicode
        if len(text) >= self.translation_config.min_text_length:
            return text, translate_input

        logger.debug(
            f"Text too short to translate, skip. Text: {text}. Paragraph id: {paragraph.debug_id}."
        )
        return nothing_to_translate

    def post_translate_paragraph(
        self,
        paragraph: PdfParagraph,
        tracker: ParagraphTranslateTracker,
        translate_input,
        translated_text: str,
    ):
        """Post-translation processing: update paragraph with translated text.

        Args:
            paragraph: Paragraph to update in place.
            tracker: Per-paragraph tracker; always receives ``translated_text``.
            translate_input: The translate input built for this paragraph; its
                ``unicode`` attribute is the text that was sent for translation.
                A plain string is accepted as well.
            translated_text: Text returned by the translation engine.

        Returns:
            ``False`` when the translation is identical to the source text (the
            paragraph is left untouched), ``True`` once the paragraph has been
            rewritten from the translated text.
        """
        tracker.set_output(translated_text)

        # The translate input is an object carrying the source text in
        # ``.unicode``; comparing the translated string with the object itself
        # never matches, so compare against the text it carries.
        source_text = getattr(translate_input, "unicode", translate_input)
        if translated_text == source_text:
            if llm_translate_tracker := tracker.last_llm_translate_tracker():
                llm_translate_tracker.set_placeholder_full_match()
            return False

        paragraph.unicode = translated_text
        paragraph.pdf_paragraph_composition = self.parse_translate_output(
            translate_input,
            translated_text,
            tracker,
            tracker.last_llm_translate_tracker(),
        )
        # Text runs produced without a style inherit the paragraph style.
        for composition in paragraph.pdf_paragraph_composition:
            if (
                composition.pdf_same_style_unicode_characters
                and composition.pdf_same_style_unicode_characters.pdf_style is None
            ):
                composition.pdf_same_style_unicode_characters.pdf_style = (
                    paragraph.pdf_style
                )
        return True

    def _build_role_block(self) -> str:
        """Build the role block for LLM prompt.

        Returns:
            Role block string with custom_system_prompt or default role description.
        """
        custom_prompt = getattr(self.translation_config, "custom_system_prompt", None)
        if custom_prompt:
            role_block = custom_prompt.strip()
            if "Follow all rules strictly." not in role_block:
                if not role_block.endswith("\n"):
                    role_block += "\n"
                role_block += "Follow all rules strictly."
        else:
            role_block = (
                f"You are a professional {self.translation_config.lang_out} native translator who needs to fluently translate text "
                f"into {self.translation_config.lang_out}.\n\n"
                "Follow all rules strictly."
            )
        return role_block

    def _build_context_block(
        self,
        title_paragraph: PdfParagraph | None = None,
        local_title_paragraph: PdfParagraph | None = None,
        translate_input: TranslateInput | None = None,
    ) -> str:
        """Build the context/hints block for LLM prompt.

        Args:
            title_paragraph: First title paragraph in the document
            local_title_paragraph: Most recent title paragraph
            translate_input: TranslateInput containing placeholder hints

        Returns:
            Context block string, empty if no context hints available
        """
        context_lines: list[str] = []
        hint_idx = 1

        if title_paragraph:
            context_lines.append(
                f"{hint_idx}. First title in the full text: {title_paragraph.unicode}"
            )
            hint_idx += 1

        if local_title_paragraph:
            is_different_from_global = True
            if title_paragraph:
                if local_title_paragraph.debug_id == title_paragraph.debug_id:
                    is_different_from_global = False

            if is_different_from_global:
                context_lines.append(
                    f"{hint_idx}. The most recent title is: {local_title_paragraph.unicode}"
                )
                hint_idx += 1

        if translate_input and self.translation_config.add_formula_placehold_hint:
            placeholders_hint = translate_input.get_placeholders_hint()
            if placeholders_hint:
                context_lines.append(
                    f"{hint_idx}. Formula placeholder hint:\n{placeholders_hint}"
                )

        if context_lines:
            return "## Context / Hints\n" + "\n".join(context_lines) + "\n"
        return ""

    def _build_glossary_block(self, text: str) -> str:
        """Build the glossary block for LLM prompt.

        Args:
            text: Text to match against glossary entries

        Returns:
            Glossary block string with tables, empty if no active glossary entries
        """
        if not self._cached_glossaries:
            return ""

        glossary_entries_per_glossary: dict[str, list[tuple[str, str]]] = {}

        for glossary in self._cached_glossaries:
            active_entries = glossary.get_active_entries_for_text(text)
            if active_entries:
                glossary_entries_per_glossary[glossary.name] = sorted(active_entries)

        if not glossary_entries_per_glossary:
            return ""

        glossary_block_lines: list[str] = [
            "## Glossary",
            "",
            "Always use the glossary's **Target Term** for any occurrence of its **Source Term** "
            "(including variants, inside tags, or broken across lines).",
            "",
            "Unlisted terms are translated naturally.",
            "",
        ]

        for glossary_name, entries in glossary_entries_per_glossary.items():
            glossary_block_lines.append(f"### Glossary: {glossary_name}")
            glossary_block_lines.append("")
            glossary_block_lines.append(
                "| Source Term | Target Term |\n|-------------|-------------|"
            )
            for original_source, target_text in entries:
                glossary_block_lines.append(f"| {original_source} | {target_text} |")
            glossary_block_lines.append("")

        return "\n".join(glossary_block_lines)

    def generate_prompt_for_llm(
        self,
        text: str,
        title_paragraph: PdfParagraph | None = None,
        local_title_paragraph: PdfParagraph | None = None,
        translate_input: TranslateInput | None = None,
    ):
        """Fill the prompt template for a single piece of text.

        Args:
            text: Text to be translated.
            title_paragraph: First title paragraph in the document.
            local_title_paragraph: Most recent title paragraph.
            translate_input: TranslateInput containing placeholder information.

        Returns:
            The prompt produced by substituting the role, context and glossary
            blocks, the output language and ``text`` into ``PROMPT_TEMPLATE``.
        """
        fields = {
            "role_block": self._build_role_block(),
            "context_block": self._build_context_block(
                title_paragraph, local_title_paragraph, translate_input
            ),
            "glossary_block": self._build_glossary_block(text),
            "lang_out": self.translation_config.lang_out,
            "text_to_translate": text,
        }
        return PROMPT_TEMPLATE.substitute(**fields)

    def add_content_filter_hint(self, page: Page, paragraph: PdfParagraph):
        """Append a content-filter notice paragraph to ``page``.

        The notice box spans the x range of ``paragraph.box`` and runs from that
        box's ``y2`` to ``y2 + 1.1``. The whole operation is performed while
        holding ``add_content_filter_hint_lock``.

        Args:
            page: Page whose paragraph list receives the notice.
            paragraph: Paragraph that triggered the content filter.
        """
        with self.add_content_filter_hint_lock:
            source_box = paragraph.box
            hint_box = il_version_1.Box(
                x=source_box.x,
                y=source_box.y2,
                x2=source_box.x2,
                y2=source_box.y2 + 1.1,
            )
            # The message (in Chinese) says the translation service detected
            # possibly unsafe or sensitive content and asks the user to avoid
            # translating such content.
            hint_paragraph = self._create_text(
                "翻译服务检测到内容可能包含不安全或敏感内容，请您避免翻译敏感内容，感谢您的配合。",
                GRAY80,
                hint_box,
                1,
            )
            page.pdf_paragraph.append(hint_paragraph)
            logger.info("success add content filter hint")

    def _create_text(
        self,
        text: str,
        color: GraphicState,
        box: il_version_1.Box,
        font_size: float = 4,
    ):
        """Build a horizontal paragraph holding ``text`` as one unicode run.

        Args:
            text: Text to show.
            color: Graphic state applied to the text style.
            box: Box assigned to the paragraph.
            font_size: Font size of the style (font id is always ``"base"``).

        Returns:
            A new ``il_version_1.PdfParagraph`` with ``xobj_id=-1`` whose single
            composition is flagged with ``debug_info=True``.
        """
        text_style = il_version_1.PdfStyle(
            font_id="base",
            font_size=font_size,
            graphic_state=color,
        )
        text_run = il_version_1.PdfSameStyleUnicodeCharacters(
            unicode=text,
            pdf_style=text_style,
            debug_info=True,
        )
        composition = il_version_1.PdfParagraphComposition(
            pdf_same_style_unicode_characters=text_run,
        )
        return il_version_1.PdfParagraph(
            first_line_indent=False,
            box=box,
            vertical=False,
            pdf_style=text_style,
            unicode=text,
            pdf_paragraph_composition=[composition],
            xobj_id=-1,
        )

    def translate_paragraph(
        self,
        paragraph: PdfParagraph,
        page: Page,
        pbar: tqdm | None = None,
        tracker: ParagraphTranslateTracker = None,
        page_font_map: dict[str, PdfFont] = None,
        xobj_font_map: dict[int, dict[str, PdfFont]] = None,
        paragraph_token_count: int = 0,
        title_paragraph: PdfParagraph | None = None,
        local_title_paragraph: PdfParagraph | None = None,
    ):
        """Translate one paragraph in place: pre-process, translate, post-process.

        Cancellation is checked first (outside the error handling). A
        ``ContentFilterError`` adds a content-filter notice to ``page``; any
        other exception is logged and swallowed so the remaining paragraphs can
        still be processed.

        Args:
            paragraph: Paragraph to translate.
            page: Page the paragraph belongs to (target of the filter notice).
            pbar: Progress bar wrapped by ``PbarContext``.
            tracker: Per-paragraph translate tracker.
            page_font_map: Fonts of the page, keyed by font id.
            xobj_font_map: Font maps keyed by XObject id.
            paragraph_token_count: Token count forwarded to the engine as a
                rate-limit parameter.
            title_paragraph: First title paragraph, used for the LLM prompt.
            local_title_paragraph: Most recent title paragraph, used for the
                LLM prompt.
        """
        self.translation_config.raise_if_cancelled()
        with PbarContext(pbar):
            try:
                if self.use_as_fallback:
                    # In fallback mode the LLM-only translator may have left
                    # paragraph.unicode stale, so recompute it first.
                    paragraph.unicode = get_paragraph_unicode(paragraph)

                text, translate_input = self.pre_translate_paragraph(
                    paragraph, tracker, page_font_map, xobj_font_map
                )
                if text is None:
                    return

                llm_translate_tracker = tracker.new_llm_translate_tracker()
                rate_limit_params = {"paragraph_token_count": paragraph_token_count}

                if not self.support_llm_translate:
                    translated_text = self.translate_engine.translate(
                        text,
                        rate_limit_params=rate_limit_params,
                    )
                else:
                    prompt = self.generate_prompt_for_llm(
                        text,
                        title_paragraph,
                        local_title_paragraph,
                        translate_input,
                    )
                    llm_translate_tracker.set_input(prompt)
                    translated_text = self.translate_engine.llm_translate(
                        prompt,
                        rate_limit_params=rate_limit_params,
                    )
                    llm_translate_tracker.set_output(translated_text)

                # Collapse runs of 20 or more dots / spaces / "。" / "…" / "，"
                # into a single ".".
                translated_text = re.sub(r"[. 。…，]{20,}", ".", translated_text)

                self.post_translate_paragraph(
                    paragraph, tracker, translate_input, translated_text
                )
            except ContentFilterError as e:
                logger.warning(f"ContentFilterError: {e.message}")
                self.add_content_filter_hint(page, paragraph)
            except Exception as e:
                # Deliberately best-effort: log and move on to other paragraphs.
                logger.exception(
                    f"Error translating paragraph. Paragraph: {paragraph.debug_id} ({paragraph.unicode}). Error: {e}. ",
                )


================================================
FILE: babeldoc/format/pdf/document_il/midend/il_translator_llm_only.py
================================================
import copy
import json
import logging
import re
from pathlib import Path
from string import Template

import Levenshtein
import tiktoken
from tqdm import tqdm

from babeldoc.format.pdf.document_il import Document
from babeldoc.format.pdf.document_il import Page
from babeldoc.format.pdf.document_il import PdfFont
from babeldoc.format.pdf.document_il import PdfParagraph
from babeldoc.format.pdf.document_il.midend import il_translator
from babeldoc.format.pdf.document_il.midend.il_translator import (
    DocumentTranslateTracker,
)
from babeldoc.format.pdf.document_il.midend.il_translator import ILTranslator
from babeldoc.format.pdf.document_il.midend.il_translator import PageTranslateTracker
from babeldoc.format.pdf.document_il.midend.il_translator import (
    ParagraphTranslateTracker,
)
from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper
from babeldoc.format.pdf.document_il.utils.paragraph_helper import is_cid_paragraph
from babeldoc.format.pdf.document_il.utils.paragraph_helper import (
    is_placeholder_only_paragraph,
)
from babeldoc.format.pdf.document_il.utils.paragraph_helper import (
    is_pure_numeric_paragraph,
)
from babeldoc.format.pdf.translation_config import TranslationConfig
from babeldoc.translator.translator import BaseTranslator
from babeldoc.utils.priority_thread_pool_executor import PriorityThreadPoolExecutor

# Module-level logger.
logger = logging.getLogger(__name__)


# Prompt skeleton for translating a JSON array of paragraphs in one request.
# It is a string.Template with these placeholders: $role_block, $lang_out,
# $glossary_usage_rules_block, $contextual_hints_block, $glossary_tables_block
# and $json_input_str.
# NOTE: several lines of the literal end with two trailing spaces; they are
# part of the prompt text.
PROMPT_TEMPLATE = Template(
    """$role_block

## Structure Rules
1. Keep **the same number of paragraphs as the input**.
2. Input paragraphs may be **sliced pieces of the same original paragraph**.  
   → You MUST treat each input paragraph **as an independent, fixed unit**.  
   → Do NOT merge paragraphs, split paragraphs, or move content between paragraphs.
3. Inside each paragraph, you may adjust word order for fluency, but:
   - Do NOT change the meaning.
   - Do NOT move placeholders, tags, or code outside their paragraph.
4. Translate ALL human-readable content into $lang_out.

## Do NOT Modify
- Tags (e.g., <style>, <b>, <code>): keep them exactly the same.  
  *Translate tag-internal text except code blocks (<code>…</code>)*.
- Placeholders: `{v1}`, `{name}`, `%s`, `%d`, `[[...]]`, `%%...%%` — keep exactly unchanged.
- JSON keys or structure.

$glossary_usage_rules_block
## Output Format
Return a JSON array of the same length.  
For each item:
- Keep the same "id" and remove other fields like "input" and "layout_label".
- Add "output" with the translated text only.
- No extra text, no ```json blocks.

## Style
- Produce fluent, professional $lang_out.
- Preserve punctuation unless needed for target language fluency.

### Example
Input:
[
    {
    "id": 0,
    "input": "{v1}<style id='2'>hello</style>, world!",
    "layout_label": "text"
    }
]
Output:
[
    {
    "id": 0,
    "output": "{v1}<style id='2'>你好</style>，世界！"
    }
]

$contextual_hints_block

$glossary_tables_block

## Here is the input:

$json_input_str"""
)


class BatchParagraph:
    """A group of paragraphs translated together in one request.

    Attributes are stored as given; ``trackers`` holds one tracker per
    paragraph, each obtained from ``page_tracker.new_paragraph()``.
    """

    def __init__(
        self,
        paragraphs: list[PdfParagraph],
        pages: list[Page],
        page_tracker: PageTranslateTracker,
    ):
        """Store the batch and create one paragraph tracker per paragraph.

        Args:
            paragraphs: Paragraphs of the batch.
            pages: Pages handed in alongside ``paragraphs``.
            page_tracker: Tracker used to create the per-paragraph trackers.
        """
        self.paragraphs = paragraphs
        self.pages = pages
        # One tracker per paragraph, in paragraph order.
        self.trackers = [page_tracker.new_paragraph() for _paragraph in paragraphs]


class ILTranslatorLLMOnly:
    stage_name = "Translate Paragraphs"

    def __init__(
        self,
        translate_engine: BaseTranslator,
        translation_config: TranslationConfig,
        tokenizer=None,
    ):
        """Set up the LLM-only translator and its fallback translator.

        Args:
            translate_engine: Engine used for translation; it must implement
                ``do_llm_translate``.
            translation_config: Translation configuration.
            tokenizer: Tokenizer used for token counting; defaults to the
                tiktoken encoding for ``"gpt-4o"``.

        Raises:
            ValueError: If probing ``do_llm_translate`` raises
                ``NotImplementedError``.
        """
        self.translate_engine = translate_engine
        self.translation_config = translation_config
        self.font_mapper = FontMapper(translation_config)
        self.shared_context_cross_split_part = (
            translation_config.shared_context_cross_split_part
        )

        if tokenizer is None:
            self.tokenizer = tiktoken.encoding_for_model("gpt-4o")
        else:
            self.tokenizer = tokenizer

        # Cache glossaries at initialization
        self._cached_glossaries = (
            self.shared_context_cross_split_part.get_glossaries_for_translation(
                translation_config.auto_extract_glossary
            )
        )

        # Per-paragraph translator; flagged as being used as a fallback.
        self.il_translator = ILTranslator(
            translate_engine=translate_engine,
            translation_config=translation_config,
            tokenizer=self.tokenizer,
        )
        self.il_translator.use_as_fallback = True

        # Probe the engine: NotImplementedError means LLM translation is not
        # available.
        try:
            self.translate_engine.do_llm_translate(None)
        except NotImplementedError as e:
            raise ValueError("LLM translator not supported") from e

        self.ok_count = 0
        self.fallback_count = 0
        self.total_count = 0
        # Batch sequence number passed to translate_paragraph as ``mp_id``.
        # Created here so the process_cross_* methods, which increment it, also
        # work when called without going through translate().
        self.mid = 0

    def calc_token_count(self, text: str) -> int:
        """Return the number of tokens the tokenizer produces for ``text``.

        Best-effort: any exception raised while encoding or counting yields 0.
        """
        try:
            tokens = self.tokenizer.encode(text, disallowed_special=())
            return len(tokens)
        except Exception:
            return 0

    def find_title_paragraph(self, docs: Document) -> PdfParagraph | None:
        """Return the first paragraph labelled ``"title"``, scanning page by page.

        Args:
            docs: The document to search in.

        Returns:
            The first paragraph whose ``layout_label`` is ``"title"`` (it is
            also logged at info level), or ``None`` if there is none.
        """
        title_candidates = (
            candidate
            for page in docs.page
            for candidate in page.pdf_paragraph
            if candidate.layout_label == "title"
        )
        title = next(title_candidates, None)
        if title is None:
            return None
        logger.info(f"Found title paragraph: {title.unicode}")
        return title

    def translate(self, docs: Document) -> None:
        """Translate every paragraph of ``docs`` in place.

        Work is submitted in three passes that share one set of already handled
        paragraph ids: cross-page pairs for the whole document, then
        cross-column pairs page by page, then ``process_page`` for every page.
        Afterwards the tracking data is written to ``translate_tracking.json``
        when debug mode is on or a working dir is configured, and the counters
        are logged.

        Args:
            docs: Document to translate.
        """
        self.il_translator.docs = docs
        tracker = DocumentTranslateTracker()
        self.mid = 0

        shared_context = self.translation_config.shared_context_cross_split_part
        if not shared_context.first_paragraph:
            # No title remembered yet: look for the first title paragraph and
            # store independent copies of it in the shared context.
            title_paragraph = self.find_title_paragraph(docs)
            shared_context.first_paragraph = copy.deepcopy(title_paragraph)
            shared_context.recent_title_paragraph = copy.deepcopy(title_paragraph)
            if title_paragraph:
                logger.info(f"Found first title paragraph: {title_paragraph.unicode}")

        # Progress total: paragraphs having both a debug id and text.
        total = sum(
            1
            for page in docs.page
            for p in page.pdf_paragraph
            if p.debug_id is not None and p.unicode is not None
        )
        translated_ids = set()
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            total,
        ) as pbar:
            with PriorityThreadPoolExecutor(
                max_workers=self.translation_config.pool_max_workers,
            ) as executor2:
                with PriorityThreadPoolExecutor(
                    max_workers=self.translation_config.pool_max_workers,
                ) as executor:
                    self.process_cross_page_paragraph(
                        docs,
                        executor,
                        pbar=pbar,
                        tracker=tracker,
                        executor2=executor2,
                        translated_ids=translated_ids,
                    )
                    # Cross-column detection runs for all pages before any
                    # regular per-page processing starts.
                    for page in docs.page:
                        self.process_cross_column_paragraph(
                            page,
                            executor,
                            pbar=pbar,
                            tracker=tracker,
                            executor2=executor2,
                            translated_ids=translated_ids,
                        )
                    for page in docs.page:
                        self.process_page(
                            page,
                            executor,
                            pbar=pbar,
                            tracker=tracker.new_page(),
                            executor2=executor2,
                            translated_ids=translated_ids,
                        )

        path = self.translation_config.get_working_file_path("translate_tracking.json")

        dump_tracking = (
            self.translation_config.debug
            or self.translation_config.working_dir is not None
        )
        if dump_tracking:
            logger.debug(f"save translate tracking to {path}")
            with Path(path).open("w", encoding="utf-8") as f:
                f.write(tracker.to_json())
        logger.info(
            f"Translation completed. Total: {self.total_count}, Successful: {self.ok_count}, Fallback: {self.fallback_count}"
        )

    def _is_body_text_paragraph(self, paragraph: PdfParagraph) -> bool:
        """判断正文段落（当前仅 layout_label == 'text'）。

        Args:
            paragraph: PDF paragraph to check

        Returns:
            True if this is a body text paragraph, False otherwise
        """
        return paragraph.layout_label in (
            "text",
            "plain text",
            "paragraph_hybrid",
        )

    def _should_translate_paragraph(
        self,
        paragraph: PdfParagraph,
        translated_ids: set[int] | None = None,
        require_body_text: bool = False,
    ) -> bool:
        """Check if a paragraph should be translated based on common filtering criteria.

        Args:
            paragraph: PDF paragraph to check
            translated_ids: Set of already translated paragraph IDs
            require_body_text: Whether to additionally check if paragraph is body text

        Returns:
            True if paragraph should be translated, False otherwise
        """
        # Basic validation checks
        if paragraph.debug_id is None or paragraph.unicode is None:
            return False

        # Check if already translated
        if translated_ids is not None and id(paragraph) in translated_ids:
            return False

        # CID paragraph check
        if is_cid_paragraph(paragraph):
            return False

        # Minimum length check
        if len(paragraph.unicode) < self.translation_config.min_text_length:
            return False

        # Body text check if requested
        if require_body_text and not self._is_body_text_paragraph(paragraph):
            return False

        return True

    def _filter_paragraphs(
        self,
        page: Page,
        translated_ids: set[int] | None = None,
        require_body_text: bool = False,
    ) -> list[PdfParagraph]:
        """Get list of paragraphs that should be translated from a page.

        Args:
            page: Page to get paragraphs from
            translated_ids: Set of already translated paragraph IDs
            require_body_text: Whether to filter for body text paragraphs only

        Returns:
            List of paragraphs that should be translated
        """
        return [
            paragraph
            for paragraph in page.pdf_paragraph
            if self._should_translate_paragraph(
                paragraph, translated_ids, require_body_text
            )
        ]

    def _build_font_maps(
        self, page: Page
    ) -> tuple[dict[str, PdfFont], dict[int, dict[str, PdfFont]]]:
        """Build font maps for a page.

        Args:
            page: The page to build font maps for

        Returns:
            Tuple of (page_font_map, page_xobj_font_map)
        """
        page_font_map = {}
        for font in page.pdf_font:
            page_font_map[font.font_id] = font

        page_xobj_font_map = {}
        for xobj in page.pdf_xobject:
            page_xobj_font_map[xobj.xobj_id] = page_font_map.copy()
            for font in xobj.pdf_font:
                page_xobj_font_map[xobj.xobj_id][font.font_id] = font

        return page_font_map, page_xobj_font_map

    def process_cross_page_paragraph(
        self,
        docs: Document,
        executor: PriorityThreadPoolExecutor,
        pbar: tqdm | None = None,
        tracker: DocumentTranslateTracker | None = None,
        executor2: PriorityThreadPoolExecutor | None = None,
        translated_ids: set[int] | None = None,
    ):
        """Batch the paragraphs that straddle a page break and submit them.

        For every pair of adjacent pages, the last translatable body-text
        paragraph of the first page and the first one of the second page are
        submitted together as one translation task, regardless of token count,
        and their ``id()`` values are added to ``translated_ids``.

        Args:
            docs: Document containing pages to process.
            executor: Thread pool executor receiving the translation tasks.
            pbar: Progress bar forwarded to the task.
            tracker: Document tracker supplying a cross-page tracker per batch.
            executor2: Secondary executor forwarded to the task.
            translated_ids: ``id()`` values of paragraphs already handled.
        """
        self.translation_config.raise_if_cancelled()

        tracker = DocumentTranslateTracker() if tracker is None else tracker
        translated_ids = set() if translated_ids is None else translated_ids

        # Walk over adjacent page pairs.
        for page_curr, page_next in zip(docs.page, docs.page[1:]):
            curr_candidates = self._filter_paragraphs(
                page_curr, translated_ids, require_body_text=True
            )
            next_candidates = self._filter_paragraphs(
                page_next, translated_ids, require_body_text=True
            )
            if not (curr_candidates and next_candidates):
                continue

            tail_paragraph = curr_candidates[-1]
            head_paragraph = next_candidates[0]
            pair = [tail_paragraph, head_paragraph]

            # Never batch a paragraph that has been handled already.
            if any(id(member) in translated_ids for member in pair):
                continue

            # Fonts of both pages; entries of the next page win on key clashes.
            curr_fonts, curr_xobj_fonts = self._build_font_maps(page_curr)
            next_fonts, next_xobj_fonts = self._build_font_maps(page_next)
            merged_font_map = dict(curr_fonts)
            merged_font_map.update(next_fonts)
            merged_xobj_font_map = dict(curr_xobj_fonts)
            merged_xobj_font_map.update(next_xobj_fonts)

            total_token_count = sum(
                self.calc_token_count(member.unicode) for member in pair
            )

            batch_paragraph = BatchParagraph(
                pair, [page_curr, page_next], tracker.new_cross_page()
            )

            self.mid += 1
            shared_context = self.translation_config.shared_context_cross_split_part
            # Larger batches get a smaller priority value.
            executor.submit(
                self.translate_paragraph,
                batch_paragraph,
                pbar,
                merged_font_map,
                merged_xobj_font_map,
                shared_context.first_paragraph,
                shared_context.recent_title_paragraph,
                executor2,
                priority=1048576 - total_token_count,
                paragraph_token_count=total_token_count,
                mp_id=self.mid,
            )

            translated_ids.update(id(member) for member in pair)

    def process_cross_column_paragraph(
        self,
        page: Page,
        executor: PriorityThreadPoolExecutor,
        pbar: tqdm | None = None,
        tracker: DocumentTranslateTracker | None = None,
        executor2: PriorityThreadPoolExecutor | None = None,
        translated_ids: set[int] | None = None,
    ):
        """Batch body-text paragraphs that look split across columns of one page.

        Consecutive translatable body-text paragraphs are submitted together
        when the second paragraph's ``box.y2`` exceeds the first one's by more
        than 20 units. Both paragraphs are then recorded in ``translated_ids``,
        so a paragraph takes part in at most one pair.

        Args:
            page: Page to inspect.
            executor: Thread pool executor receiving the translation tasks.
            pbar: Progress bar forwarded to the task.
            tracker: Document tracker supplying a cross-column tracker per batch.
            executor2: Secondary executor forwarded to the task.
            translated_ids: ``id()`` values of paragraphs already handled.
        """
        self.translation_config.raise_if_cancelled()

        tracker = DocumentTranslateTracker() if tracker is None else tracker
        translated_ids = set() if translated_ids is None else translated_ids

        # Body-text paragraphs in their original page order.
        body_paragraphs = self._filter_paragraphs(
            page, translated_ids, require_body_text=True
        )
        if len(body_paragraphs) < 2:
            return

        # One set of font maps serves every pair of this page.
        page_font_map, page_xobj_font_map = self._build_font_maps(page)

        for first, second in zip(body_paragraphs, body_paragraphs[1:]):
            if id(first) in translated_ids or id(second) in translated_ids:
                continue

            # Both paragraphs need a box with a known y2.
            boxes_usable = (
                first.box
                and second.box
                and first.box.y2 is not None
                and second.box.y2 is not None
            )
            if not boxes_usable:
                continue

            if second.box.y2 - first.box.y2 <= 20:
                continue

            total_token_count = self.calc_token_count(
                first.unicode
            ) + self.calc_token_count(second.unicode)

            batch = BatchParagraph(
                [first, second], [page, page], tracker.new_cross_column()
            )
            self.mid += 1
            shared_context = self.translation_config.shared_context_cross_split_part
            executor.submit(
                self.translate_paragraph,
                batch,
                pbar,
                page_font_map,
                page_xobj_font_map,
                shared_context.first_paragraph,
                shared_context.recent_title_paragraph,
                executor2,
                priority=1048576 - total_token_count,
                paragraph_token_count=total_token_count,
                mp_id=self.mid,
            )

            translated_ids.update((id(first), id(second)))

    def process_page(
        self,
        page: Page,
        executor: PriorityThreadPoolExecutor,
        pbar: tqdm | None = None,
        tracker: PageTranslateTracker = None,
        executor2: PriorityThreadPoolExecutor | None = None,
        translated_ids: set | None = None,
    ):
        """Group the remaining paragraphs of a page into batches and submit them.

        Paragraphs are visited in page order. Paragraphs already listed in
        ``translated_ids``, or lacking ``debug_id``/``unicode``, are skipped
        silently. CID paragraphs, paragraphs shorter than
        ``translation_config.min_text_length``, purely numeric paragraphs and
        placeholder-only paragraphs are skipped as well, but the progress bar
        is advanced for them. Everything else is accumulated into a batch that
        is submitted to ``executor`` once it exceeds 200 tokens or holds more
        than 5 paragraphs; a final partial batch is submitted at the end.

        Args:
            page: Page whose paragraphs are translated.
            executor: Executor that receives the batch translation tasks.
            pbar: Optional progress bar.
            tracker: Page-level tracker handed to every ``BatchParagraph``.
            executor2: Executor forwarded to ``translate_paragraph``.
            translated_ids: ``id()`` values of paragraphs that are already
                handled. Updated in place; a fresh set is used when omitted.
        """
        self.translation_config.raise_if_cancelled()

        # The default is None, so make sure membership tests and add() work.
        if translated_ids is None:
            translated_ids = set()

        # Page-level fonts, plus one map per XObject in which the XObject's
        # own fonts override page fonts that share the same font_id.
        page_font_map = {font.font_id: font for font in page.pdf_font}
        page_xobj_font_map = {}
        for xobj in page.pdf_xobject:
            xobj_fonts = page_font_map.copy()
            for font in xobj.pdf_font:
                xobj_fonts[font.font_id] = font
            page_xobj_font_map[xobj.xobj_id] = xobj_fonts

        def submit_batch(batch: list, token_count: int) -> None:
            """Submit one batch; larger batches get a smaller priority value."""
            self.mid += 1
            executor.submit(
                self.translate_paragraph,
                BatchParagraph(batch, [page] * len(batch), tracker),
                pbar,
                page_font_map,
                page_xobj_font_map,
                self.translation_config.shared_context_cross_split_part.first_paragraph,
                self.translation_config.shared_context_cross_split_part.recent_title_paragraph,
                executor2,
                priority=1048576 - token_count,
                paragraph_token_count=token_count,
                mp_id=self.mid,
            )

        paragraphs = []
        total_token_count = 0
        for paragraph in page.pdf_paragraph:
            # Already handled, e.g. by the cross-column pass.
            if id(paragraph) in translated_ids:
                continue

            # Basic validation; the progress bar is not advanced for these.
            if paragraph.debug_id is None or paragraph.unicode is None:
                continue

            # Paragraphs that are filtered out here still advance the
            # progress bar.
            if (
                is_cid_paragraph(paragraph)
                or len(paragraph.unicode) < self.translation_config.min_text_length
                or is_pure_numeric_paragraph(paragraph)
                or is_placeholder_only_paragraph(paragraph)
            ):
                if pbar:
                    pbar.advance(1)
                continue

            total_token_count += self.calc_token_count(paragraph.unicode)
            paragraphs.append(paragraph)
            translated_ids.add(id(paragraph))
            if paragraph.layout_label == "title":
                # Remember a private copy of the most recent title paragraph.
                self.shared_context_cross_split_part.recent_title_paragraph = (
                    copy.deepcopy(paragraph)
                )

            if total_token_count > 200 or len(paragraphs) > 5:
                submit_batch(paragraphs, total_token_count)
                paragraphs = []
                total_token_count = 0

        # Flush whatever is left after the last full batch.
        if paragraphs:
            submit_batch(paragraphs, total_token_count)

    def translate_paragraph(
        self,
        batch_paragraph: BatchParagraph,
        pbar: tqdm | None = None,
        page_font_map: dict[str, PdfFont] = None,
        xobj_font_map: dict[int, dict[str, PdfFont]] = None,
        title_paragraph: PdfParagraph | None = None,
        local_title_paragraph: PdfParagraph | None = None,
        executor: PriorityThreadPoolExecutor | None = None,
        paragraph_token_count: int = 0,
        mp_id: int = 0,
    ):
        """Translate a batch of paragraphs with one JSON-mode LLM request.

        Every paragraph is pre-processed with
        ``il_translator.pre_translate_paragraph``; the resulting texts are sent
        to the LLM as a single JSON array. Each returned item is validated and,
        when accepted, applied with ``il_translator.post_translate_paragraph``.

        A single result is rejected when it is not a string, equals the input,
        has an implausible output/input token ratio, has a very small edit
        distance to the input, or raises while being applied. The affected
        paragraph is then re-submitted on its own to
        ``il_translator.translate_paragraph`` via ``executor``. If the request
        as a whole fails (LLM error, invalid JSON, result count mismatch, ...),
        every paragraph that was selected for translation is re-submitted the
        same way.

        Args:
            batch_paragraph: Paragraphs with their pages and trackers.
            pbar: Optional progress bar.
            page_font_map: Page-level font map for pre-processing.
            xobj_font_map: Per-XObject font maps for pre-processing.
            title_paragraph: First title of the document, used as prompt context.
            local_title_paragraph: Most recent title, used as prompt context.
            executor: Executor that receives the single-paragraph fallback tasks.
            paragraph_token_count: Token count of the batch, passed to the rate
                limiter of the translate engine.
            mp_id: Identifier of this multi-paragraph request, recorded on the
                trackers.
        """
        self.translation_config.raise_if_cancelled()
        # Indices into batch_paragraph.paragraphs of the paragraphs actually
        # sent to the LLM; entry k belongs to inputs[k].
        should_translate_paragraph = []
        inputs = []
        llm_translate_trackers = []
        # Original unicode of every entry of `inputs` (same order), used to
        # restore a paragraph before it goes to the fallback translator.
        paragraph_unicodes = []
        try:
            for i, paragraph in enumerate(batch_paragraph.paragraphs):
                tracker = batch_paragraph.trackers[i]
                text, translate_input = self.il_translator.pre_translate_paragraph(
                    paragraph, tracker, page_font_map, xobj_font_map
                )
                if text is None:
                    # pbar is optional; an unguarded call would raise and push
                    # the whole batch into the fallback path.
                    if pbar:
                        pbar.advance(1)
                    continue

                tracker.record_multi_paragraph_id(mp_id)

                llm_translate_tracker = tracker.new_llm_translate_tracker()
                should_translate_paragraph.append(i)
                llm_translate_trackers.append(llm_translate_tracker)
                inputs.append(
                    (
                        text,
                        translate_input,
                        paragraph,
                        tracker,
                        llm_translate_tracker,
                        paragraph_unicodes,
                    )
                )
                paragraph_unicodes.append(paragraph.unicode)
            if not inputs:
                return
            json_format_input = []

            for id_, input_text in enumerate(inputs):
                ti: il_translator.ILTranslator.TranslateInput = input_text[1]
                tracker: ParagraphTranslateTracker = input_text[3]
                tracker.record_multi_paragraph_index(id_)
                placeholders_hint = ti.get_placeholders_hint()
                obj = {
                    "id": id_,
                    "input": input_text[0],
                    "layout_label": input_text[2].layout_label,
                }
                if (
                    placeholders_hint
                    and self.translation_config.add_formula_placehold_hint
                ):
                    obj["formula_placeholders_hint"] = placeholders_hint
                json_format_input.append(obj)

            json_format_input_str = json.dumps(
                json_format_input, ensure_ascii=False, indent=2
            )

            batch_text_for_glossary_matching = "\n".join(
                item.get("input", "") for item in json_format_input
            )

            final_input = self._build_llm_prompt(
                json_input_str=json_format_input_str,
                title_paragraph=title_paragraph,
                local_title_paragraph=local_title_paragraph,
                batch_text_for_glossary_matching=batch_text_for_glossary_matching,
            )

            for llm_translate_tracker in llm_translate_trackers:
                llm_translate_tracker.set_input(final_input)
            llm_output = self.translate_engine.llm_translate(
                final_input,
                rate_limit_params={
                    "paragraph_token_count": paragraph_token_count,
                    "request_json_mode": True,
                },
            )
            for llm_translate_tracker in llm_translate_trackers:
                llm_translate_tracker.set_output(llm_output)
            llm_output = llm_output.strip()

            llm_output = self._clean_json_output(llm_output)

            parsed_output = json.loads(llm_output)

            # A single bare object is accepted as a one-element result list.
            if isinstance(parsed_output, dict) and parsed_output.get(
                "output", parsed_output.get("input", False)
            ):
                parsed_output = [parsed_output]

            translation_results = {
                item["id"]: item.get("output", item.get("input"))
                for item in parsed_output
            }

            if len(translation_results) != len(inputs):
                raise Exception(
                    f"Translation results length mismatch. Expected: {len(inputs)}, Got: {len(translation_results)}"
                )

            for id_, output in translation_results.items():
                should_fallback = True
                try:
                    if not isinstance(output, str):
                        logger.warning(
                            f"Translation result is not a string. Output: {output}"
                        )
                        continue

                    id_ = int(id_)  # Ensure id is an integer
                    if id_ >= len(inputs):
                        logger.warning(f"Invalid id {id_}, skipping")
                        continue

                    # Clean up any excessive punctuation in the translated text
                    translated_text = re.sub(r"[. 。…，]{20,}", ".", output)

                    # Get the original input for this translation
                    translate_input = inputs[id_][1]
                    llm_translate_tracker = inputs[id_][4]

                    input_unicode = inputs[id_][0]
                    output_unicode = translated_text

                    trimed_input = re.sub(r"[. 。…，]{20,}", ".", input_unicode)

                    input_token_count = self.calc_token_count(trimed_input)
                    output_token_count = self.calc_token_count(output_unicode)

                    same_as_input = trimed_input == output_unicode
                    if (
                        same_as_input
                        and input_token_count > 10
                        and not self.translation_config.disable_same_text_fallback
                    ):
                        llm_translate_tracker.set_error_message(
                            "Translation result is the same as input, fallback."
                        )
                        llm_translate_tracker.set_placeholder_full_match()
                        logger.warning(
                            "Translation result is the same as input, fallback."
                        )
                        continue

                    if not (0.3 < output_token_count / input_token_count < 3):
                        llm_translate_tracker.set_error_message(
                            f"Translation result is too long or too short. Input: {input_token_count}, Output: {output_token_count}"
                        )
                        logger.warning(
                            f"Translation result is too long or too short. Input: {input_token_count}, Output: {output_token_count}"
                        )
                        llm_translate_tracker.set_placeholder_full_match()
                        continue

                    if not self.translation_config.disable_same_text_fallback:
                        edit_distance = Levenshtein.distance(
                            input_unicode, output_unicode
                        )
                        if edit_distance < 5 and input_token_count > 20:
                            llm_translate_tracker.set_error_message(
                                f"Translation result edit distance is too small. distance: {edit_distance}, input: {input_unicode}, output: {output_unicode}"
                            )
                            logger.warning(
                                f"Translation result edit distance is too small. distance: {edit_distance}, input: {input_unicode}, output: {output_unicode}"
                            )
                            llm_translate_tracker.set_placeholder_full_match()
                            continue
                    # Apply the translation to the paragraph
                    self.il_translator.post_translate_paragraph(
                        inputs[id_][2],
                        inputs[id_][3],
                        translate_input,
                        translated_text,
                    )
                    should_fallback = False
                    if pbar:
                        pbar.advance(1)
                except Exception as e:
                    error_message = f"Error translating paragraph. Error: {e}."
                    logger.exception(error_message)
                    # Ignore error and continue
                    for llm_translate_tracker in llm_translate_trackers:
                        llm_translate_tracker.set_error_message(error_message)
                    continue
                finally:
                    # NOTE(review): when id_ is not a valid index into
                    # `inputs`, the lookups below raise inside this finally
                    # block and the outer handler falls back for the whole
                    # batch.
                    self.total_count += 1
                    if should_fallback:
                        self.fallback_count += 1
                        inputs[id_][4].set_fallback_to_translate()
                        logger.warning(
                            f"Fallback to simple translation. paragraph id: {inputs[id_][2].debug_id}"
                        )
                        fallback_token_count = self.calc_token_count(
                            inputs[id_][2].unicode
                        )
                        # Restore the original text before re-translating.
                        inputs[id_][2].unicode = inputs[id_][5][id_]
                        # id_ indexes `inputs`; map it back to the position in
                        # the batch, which differs when earlier paragraphs of
                        # the batch were skipped during pre-processing.
                        batch_index = should_translate_paragraph[id_]
                        executor.submit(
                            self.il_translator.translate_paragraph,
                            inputs[id_][2],
                            batch_paragraph.pages[batch_index],
                            pbar,
                            inputs[id_][3],
                            page_font_map,
                            xobj_font_map,
                            priority=1048576 - fallback_token_count,
                            paragraph_token_count=fallback_token_count,
                            title_paragraph=title_paragraph,
                            local_title_paragraph=local_title_paragraph,
                        )
                    else:
                        self.ok_count += 1

        except Exception as e:
            error_message = f"Error {e} during translation. try fallback"
            logger.warning(error_message)
            for llm_translate_tracker in llm_translate_trackers:
                llm_translate_tracker.set_error_message(error_message)
                llm_translate_tracker.set_fallback_to_translate()
            self.total_count += len(llm_translate_trackers)
            self.fallback_count += len(llm_translate_trackers)
            # Restore each paragraph's own original text. The sixth tuple
            # element is the shared list of all originals, so it must be
            # indexed rather than assigned as a whole.
            for input_, original_unicode in zip(inputs, paragraph_unicodes):
                input_[2].unicode = original_unicode
            if not should_translate_paragraph:
                should_translate_paragraph = list(
                    range(len(batch_paragraph.paragraphs))
                )
            for i in should_translate_paragraph:
                paragraph = batch_paragraph.paragraphs[i]
                tracker = batch_paragraph.trackers[i]
                if paragraph.debug_id is None:
                    continue
                fallback_token_count = self.calc_token_count(paragraph.unicode)
                executor.submit(
                    self.il_translator.translate_paragraph,
                    paragraph,
                    batch_paragraph.pages[i],
                    pbar,
                    tracker,
                    page_font_map,
                    xobj_font_map,
                    priority=1048576 - fallback_token_count,
                    paragraph_token_count=fallback_token_count,
                    title_paragraph=title_paragraph,
                    local_title_paragraph=local_title_paragraph,
                )

    def _build_llm_prompt(
        self,
        json_input_str: str,
        title_paragraph: PdfParagraph | None,
        local_title_paragraph: PdfParagraph | None,
        batch_text_for_glossary_matching: str,
    ) -> str:
        """Fill ``PROMPT_TEMPLATE`` with the role, hint, glossary and input blocks.

        Args:
            json_input_str: JSON text of the paragraphs to translate.
            title_paragraph: First title of the document, if any.
            local_title_paragraph: Most recent title, if any. It is left out
                when its ``debug_id`` equals that of ``title_paragraph``.
            batch_text_for_glossary_matching: Text used to select the glossary
                entries that are active for this batch.

        Returns:
            The substituted prompt string.
        """
        config = self.translation_config
        strict_rule = "Follow all rules strictly."

        # Role block: a custom system prompt wins over the built-in one, but
        # the strict-rules sentence is always present.
        custom_prompt = getattr(config, "custom_system_prompt", None)
        if not custom_prompt:
            role_block = (
                f"You are a professional {config.lang_out} native translator who needs to fluently translate text "
                f"into {config.lang_out}.\n\n"
                "Follow all rules strictly."
            )
        else:
            role_block = custom_prompt.strip()
            if strict_rule not in role_block:
                separator = "" if role_block.endswith("\n") else "\n"
                role_block = f"{role_block}{separator}{strict_rule}"

        # Contextual hints, numbered in the order in which they are added.
        hints: list[str] = []
        if title_paragraph:
            hints.append(
                f"{len(hints) + 1}. First title in full text: {title_paragraph.unicode}"
            )
        if local_title_paragraph:
            repeats_global_title = bool(title_paragraph) and (
                local_title_paragraph.debug_id == title_paragraph.debug_id
            )
            if not repeats_global_title:
                hints.append(
                    f"{len(hints) + 1}. The most recent title is: {local_title_paragraph.unicode}"
                )

        contextual_hints_block = ""
        if hints:
            contextual_hints_block = (
                "## Contextual Hints for Better Translation\n"
                + "\n".join(hints)
                + "\n"
            )

        # Collect, per glossary, the sorted entries that match this batch.
        active_by_glossary: dict[str, list[tuple[str, str]]] = {}
        for glossary in self._cached_glossaries or ():
            matched = glossary.get_active_entries_for_text(
                batch_text_for_glossary_matching
            )
            if matched:
                active_by_glossary[glossary.name] = sorted(matched)

        if not active_by_glossary:
            glossary_usage_rules_block = ""
            glossary_tables_block = ""
        else:
            glossary_usage_rules_block = (
                "## Glossary\n"
                "If a glossary is provided:\n"
                "- Always use the exact target term.\n"
                "- Apply glossary items even inside tags or when broken by hyphens/line breaks.\n"
                "- If glossary does NOT include a term, translate it naturally.\n\n"
            )

            # One markdown table per glossary, each followed by a blank line.
            table_lines: list[str] = ["## Glossary Tables", ""]
            for name, entries in active_by_glossary.items():
                table_lines.extend(
                    [
                        f"### Glossary: {name}",
                        "",
                        "| Source Term | Target Term |\n|-------------|-------------|",
                    ]
                )
                table_lines.extend(
                    f"| {source_term} | {target_term} |"
                    for source_term, target_term in entries
                )
                table_lines.append("")
            glossary_tables_block = "\n".join(table_lines)

        return PROMPT_TEMPLATE.substitute(
            role_block=role_block,
            glossary_usage_rules_block=glossary_usage_rules_block,
            contextual_hints_block=contextual_hints_block,
            json_input_str=json_input_str,
            glossary_tables_block=glossary_tables_block,
            lang_out=config.lang_out,
        )

    def _clean_json_output(self, llm_output: str) -> str:
        # Clean up JSON output by removing common wrapper tags
        llm_output = llm_output.strip()
        if llm_output.startswith("<json>"):
            llm_output = llm_output[6:]
        if llm_output.endswith("</json>"):
            llm_output = llm_output[:-7]
        if llm_output.startswith("```json"):
            llm_output = llm_output[7:]
        if llm_output.startswith("```"):
            llm_output = llm_output[3:]
        if llm_output.endswith("```"):
            llm_output = llm_output[:-3]
        return llm_output.strip()


================================================
FILE: babeldoc/format/pdf/document_il/midend/layout_parser.py
================================================
import logging
import math
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import numpy as np
from pymupdf import Document

import babeldoc.format.pdf.document_il.utils.extract_char
from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.format.pdf.document_il.utils.style_helper import GREEN
from babeldoc.format.pdf.translation_config import TranslationConfig

logger = logging.getLogger(__name__)


class LayoutParser:
    """Attach layout boxes to every page of a document.

    Layout boxes come from two sources: the layout model configured in the
    translation config, and one ``fallback_line`` box per text line that is
    derived from the characters of the page.
    """

    stage_name = "Parse Page Layout"

    def __init__(self, translation_config: TranslationConfig):
        """Store the config and take the layout model from it."""
        self.translation_config = translation_config
        self.model = translation_config.doc_layout_model

    def _save_debug_image(self, image: np.ndarray, layout, page_number: int):
        """Save debug image with drawn boxes if debug mode is enabled.

        The image is written as ``<page_number>.jpg`` into the working
        directory ``ocr-box-image``. Every box of ``layout`` is drawn in green
        together with its class name.
        """
        if not self.translation_config.debug:
            return

        debug_dir = Path(self.translation_config.get_working_file_path("ocr-box-image"))
        debug_dir.mkdir(parents=True, exist_ok=True)

        # Draw on a copy so the caller's image stays untouched.
        debug_image = image.copy()
        for box in layout.boxes:
            x0, y0, x1, y1 = box.xyxy
            cv2.rectangle(
                debug_image,
                (int(x0), int(y0)),
                (int(x1), int(y1)),
                (0, 255, 0),
                2,
            )
            # Add text label
            cv2.putText(
                debug_image,
                layout.names[box.cls],
                (int(x0), int(y0) - 5),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (0, 255, 0),
                1,
            )
        # Convert RGB to BGR before writing with OpenCV
        # (assumes the incoming image is RGB - TODO confirm).
        img_bgr = cv2.cvtColor(debug_image, cv2.COLOR_RGB2BGR)

        # Save the image
        output_path = debug_dir / f"{page_number}.jpg"
        cv2.imwrite(str(output_path), img_bgr)

    def _save_debug_box_to_page(self, page: il_version_1.Page):
        """Save debug boxes and text labels to the PDF page.

        Does nothing unless debug mode is enabled. For every layout of the
        page a green rectangle and a small paragraph holding the layout's
        class name are appended to the page. ``fallback_line`` layouts are
        drawn at one tenth of the normal line width and font size.
        """
        if not self.translation_config.debug:
            return

        color = GREEN

        for layout in page.page_layout:
            # Create a rectangle box
            scale_factor = 1
            if layout.class_name == "fallback_line":
                scale_factor = 0.1
            rect = il_version_1.PdfRectangle(
                box=il_version_1.Box(
                    x=layout.box.x,
                    y=layout.box.y,
                    x2=layout.box.x2,
                    y2=layout.box.y2,
                ),
                graphic_state=color,
                debug_info=True,
                line_width=0.4 * scale_factor,
            )
            page.pdf_rectangle.append(rect)

            # Create text label at top-left corner
            # Note: PDF coordinates are from bottom-left,
            # so we use y2 for top position
            style = il_version_1.PdfStyle(
                font_id="base",
                font_size=4 * scale_factor,
                graphic_state=color,
            )
            page.pdf_paragraph.append(
                il_version_1.PdfParagraph(
                    first_line_indent=False,
                    box=il_version_1.Box(
                        x=layout.box.x,
                        y=layout.box.y2,
                        x2=layout.box.x2,
                        y2=layout.box.y2 + 5,
                    ),
                    vertical=False,
                    pdf_style=style,
                    unicode=layout.class_name,
                    pdf_paragraph_composition=[
                        il_version_1.PdfParagraphComposition(
                            pdf_same_style_unicode_characters=il_version_1.PdfSameStyleUnicodeCharacters(
                                unicode=layout.class_name,
                                pdf_style=style,
                                debug_info=True,
                            ),
                        ),
                    ],
                    xobj_id=-1,
                ),
            )

    def process(self, docs: il_version_1.Document, mupdf_doc: Document):
        """Generate layouts for all pages that need to be translated.

        The stage reports ``2 * len(docs.page)`` progress steps: one per page
        yielded by the layout model and one per page for the fallback line
        layouts, which are generated in a thread pool.

        Args:
            docs: IL document whose pages receive ``page_layout``.
            mupdf_doc: PyMuPDF document used for the page media box sizes.

        Returns:
            The same ``docs`` object, updated in place.
        """
        total = len(docs.page)
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            total * 2,
        ) as progress:
            # Process predictions for each page
            for page, layouts in self.model.handle_document(
                docs.page,
                mupdf_doc,
                self.translation_config,
                self._save_debug_image,
            ):
                # The media box is the same for every box of the page, so it
                # is looked up once per page instead of once per layout box.
                mediabox = mupdf_doc[page.page_number].mediabox_size
                h = math.ceil(mediabox.y)
                w = math.ceil(mediabox.x)

                page_layouts = []
                for layout in layouts.boxes:
                    # Convert from the picture coordinate system to the il
                    # coordinate system: flip the y axis, grow the box by one
                    # unit on every side and clamp it to the page.
                    x0, y0, x1, y1 = layout.xyxy
                    x0, y0, x1, y1 = (
                        np.clip(int(x0 - 1), 0, w - 1),
                        np.clip(int(h - y1 - 1), 0, h - 1),
                        np.clip(int(x1 + 1), 0, w - 1),
                        np.clip(int(h - y0 + 1), 0, h - 1),
                    )
                    page_layout = il_version_1.PageLayout(
                        id=len(page_layouts) + 1,
                        box=il_version_1.Box(
                            x0.item(),
                            y0.item(),
                            x1.item(),
                            y1.item(),
                        ),
                        conf=layout.conf.item(),
                        class_name=layouts.names[layout.cls],
                    )
                    page_layouts.append(page_layout)

                page.page_layout = page_layouts
                progress.advance(1)
            with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
                submitted = [
                    (
                        page,
                        executor.submit(
                            self.generate_fallback_line_layout_for_page,
                            page,
                            progress,
                        ),
                    )
                    for page in docs.page
                ]
                # An exception raised in a worker is stored on its future and
                # would otherwise vanish silently. The step stays best-effort,
                # but failures are logged.
                for page, future in submitted:
                    error = future.exception()
                    if error is not None:
                        logger.warning(
                            "Failed to generate fallback line layouts for page %s",
                            page.page_number,
                            exc_info=error,
                        )
        return docs

    def generate_fallback_line_layout_for_page(self, page: il_version_1.Page, progress):
        """Append one ``fallback_line`` layout per detected text line of a page.

        The characters of the page are clustered into lines; the bounding box
        of every line is appended to ``page.page_layout`` with confidence 1.
        ``progress`` is advanced by one step in every case, including early
        return and failure.
        """
        try:
            exists_page_layouts = page.page_layout
            char_boxes = babeldoc.format.pdf.document_il.utils.extract_char.convert_page_to_char_boxes(
                page
            )
            if not char_boxes:
                return

            clusters = babeldoc.format.pdf.document_il.utils.extract_char.process_page_chars_to_lines(
                char_boxes
            )
            for cluster in clusters:
                # Bounding box that encloses every character of the line.
                boxes = [c[0] for c in cluster.chars]
                min_x = min(b.x for b in boxes)
                max_x = max(b.x2 for b in boxes)
                min_y = min(b.y for b in boxes)
                max_y = max(b.y2 for b in boxes)
                cluster.chars = il_version_1.Box(min_x, min_y, max_x, max_y)
                page_layout = il_version_1.PageLayout(
                    id=len(exists_page_layouts) + 1,
                    box=il_version_1.Box(
                        min_x,
                        min_y,
                        max_x,
                        max_y,
                    ),
                    conf=1,
                    class_name="fallback_line",
                )
                exists_page_layouts.append(page_layout)
            self._save_debug_box_to_page(page)
        finally:
            progress.advance(1)


================================================
FILE: babeldoc/format/pdf/document_il/midend/paragraph_finder.py
================================================
import logging
import random
import re

import numpy as np

from babeldoc.babeldoc_exception.BabelDOCException import ExtractTextError
from babeldoc.format.pdf.document_il import Box
from babeldoc.format.pdf.document_il import Document
from babeldoc.format.pdf.document_il import Page
from babeldoc.format.pdf.document_il import PdfCharacter
from babeldoc.format.pdf.document_il import PdfLine
from babeldoc.format.pdf.document_il import PdfParagraph
from babeldoc.format.pdf.document_il import PdfParagraphComposition
from babeldoc.format.pdf.document_il import PdfRectangle
from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper
from babeldoc.format.pdf.document_il.utils.formular_helper import (
    collect_page_formula_font_ids,
)
from babeldoc.format.pdf.document_il.utils.layout_helper import (
    HEIGHT_NOT_USFUL_CHAR_IN_CHAR,
)
from babeldoc.format.pdf.document_il.utils.layout_helper import SPACE_REGEX
from babeldoc.format.pdf.document_il.utils.layout_helper import Layout
from babeldoc.format.pdf.document_il.utils.layout_helper import add_space_dummy_chars
from babeldoc.format.pdf.document_il.utils.layout_helper import build_layout_index
from babeldoc.format.pdf.document_il.utils.layout_helper import calculate_iou_for_boxes
from babeldoc.format.pdf.document_il.utils.layout_helper import get_char_unicode_string
from babeldoc.format.pdf.document_il.utils.layout_helper import get_character_layout
from babeldoc.format.pdf.document_il.utils.layout_helper import is_bullet_point
from babeldoc.format.pdf.document_il.utils.layout_helper import (
    is_character_in_formula_layout,
)
from babeldoc.format.pdf.document_il.utils.layout_helper import is_text_layout
from babeldoc.format.pdf.document_il.utils.paragraph_helper import is_cid_paragraph
from babeldoc.format.pdf.document_il.utils.style_helper import INDIGO
from babeldoc.format.pdf.document_il.utils.style_helper import WHITE
from babeldoc.format.pdf.translation_config import TranslationConfig

logger = logging.getLogger(__name__)

# Base58 alphabet (Bitcoin style): digits and ASCII letters minus the
# look-alike characters 0, O, I and l.
BASE58_ALPHABET = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"


def generate_base58_id(length: int = 5) -> str:
    """Return a random ID made of ``length`` characters from BASE58_ALPHABET."""
    picked = []
    for _ in range(length):
        picked.append(random.choice(BASE58_ALPHABET))
    return "".join(picked)


class ParagraphFinder:
    stage_name = "Parse Paragraphs"

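    # Define the regular-expression patterns for bullet points.
    # NOTE(review): no pattern is defined below this comment — confirm whether it is stale.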

    def __init__(self, translation_config: TranslationConfig):
        """Store the translation config and build a FontMapper from it."""
        self.translation_config = translation_config
        mapper = FontMapper(translation_config)
        self.font_mapper = mapper

    def _preprocess_formula_layouts(self, page: Page):
        """
        Identifies 'formula' layouts that do not significantly overlap with any text layouts
        and re-labels them as 'isolate_formula'.
        """
        # Use a simplified Layout object for is_text_layout check
        text_layouts = [
            layout
            for layout in page.page_layout
            if is_text_layout(Layout(layout.id, layout.class_name))
        ]
        formula_layouts = [
            layout for layout in page.page_layout if layout.class_name == "formula"
        ]

        if not text_layouts or not formula_layouts:
            return

        for formula_layout in formula_layouts:
            is_isolated = True
            for text_layout in text_layouts:
                iou = calculate_iou_for_boxes(formula_layout.box, text_layout.box)
                if iou >= 0.5:
                    is_isolated = False
                    break

            if is_isolated:
                formula_layout.class_name = "isolate_formula"

    def add_text_fill_background(self, page: Page):
        layout_map = {layout.id: layout for layout in page.page_layout}
        for paragraph in page.pdf_paragraph:
            layout_id = paragraph.layout_id
            if layout_id is None:
                continue
            layout = layout_map[layout_id]
            if paragraph.box is None:
                continue
            x1, y1, x2, y2 = (
                paragraph.box.x,
                paragraph.box.y,
                paragraph.box.x2,
                paragraph.box.y2,
            )
            layout_box = layout.box
            if layout_box.x < x1:
                x1 = layout_box.x
            if layout_box.y < y1:
                y1 = layout_box.y
            if layout_box.x2 > x2:
                x2 = layout_box.x2
            if layout_box.y2 > y2:
                y2 = layout_box.y2
            assert x2 > x1 and y2 > y1
            page.pdf_rectangle.append(
                PdfRectangle(
                    box=Box(x1, y1, x2, y2),
                    fill_background=True,
                    graphic_state=WHITE,
                    debug_info=False,
                    xobj_id=paragraph.xobj_id,
                )
            )

    def update_paragraph_data(self, paragraph: PdfParagraph, update_unicode=False):
        if not paragraph.pdf_paragraph_composition:
            return

        chars = []
        for composition in paragraph.pdf_paragraph_composition:
            if composition.pdf_line:
                chars.extend(composition.pdf_line.pdf_character)
            elif composition.pdf_formula:
                chars.extend(composition.pdf_formula.pdf_character)
            elif composition.pdf_character:
                chars.append(composition.pdf_character)
            elif composition.pdf_same_style_unicode_characters:
                continue
            else:
                logger.error(
                    "Unexpected composition type"
                    " in PdfParagraphComposition. "
                    "This type only appears in the IL "
                    "after the translation is completed.",
                )
                continue

        if update_unicode and chars:
            paragraph.unicode = get_char_unicode_string(chars)
        if not chars:
            return
        # 更新边界框
        min_x = min(char.visual_bbox.box.x for char in chars)
        min_y = min(char.visual_bbox.box.y for char in chars)
        max_x = max(char.visual_bbox.box.x2 for char in chars)
        max_y = max(char.visual_bbox.box.y2 for char in chars)
        paragraph.box = Box(min_x, min_y, max_x, max_y)
        paragraph.vertical = chars[0].vertical
        paragraph.xobj_id = chars[0].xobj_id

        paragraph.first_line_indent = False
        if (
            paragraph.pdf_paragraph_composition
            and paragraph.pdf_paragraph_composition[0].pdf_line
            and paragraph.pdf_paragraph_composition[0]
            .pdf_line.pdf_character[0]
            .visual_bbox.box.x
            - paragraph.box.x
            > 1
        ):
            paragraph.first_line_indent = True

    def update_line_data(self, line: PdfLine):
        min_x = min(char.visual_bbox.box.x for char in line.pdf_character)
        min_y = min(char.visual_bbox.box.y for char in line.pdf_character)
        max_x = max(char.visual_bbox.box.x2 for char in line.pdf_character)
        max_y = max(char.visual_bbox.box.y2 for char in line.pdf_character)
        line.box = Box(min_x, min_y, max_x, max_y)

    def add_debug_info(self, page: Page):
        if not self.translation_config.debug:
            return
        for paragraph in page.pdf_paragraph:
            for composition in paragraph.pdf_paragraph_composition:
                if composition.pdf_line:
                    line = composition.pdf_line
                    page.pdf_rectangle.append(
                        PdfRectangle(
                            box=line.box,
                            fill_background=False,
                            graphic_state=INDIGO,
                            debug_info=True,
                            line_width=0.2,
                        )
                    )

    def process(self, document):
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            len(document.page),
        ) as pbar:
            if not document.page:
                return
            for page in document.page:
                self.translation_config.raise_if_cancelled()
                self.process_page(page)
                pbar.advance()

            total_paragraph_count = 0
            for page in document.page:
                total_paragraph_count += len(page.pdf_paragraph)
            if total_paragraph_count == 0:
                raise ExtractTextError("The document contains no paragraphs.")

            if self.check_cid_paragraph(document):
                raise ExtractTextError("The document contains too many CID paragraphs.")

    def check_cid_paragraph(self, doc: Document):
        cid_para_count = 0
        para_total = 0
        for page in doc.page:
            para_total += len(page.pdf_paragraph)
            for para in page.pdf_paragraph:
                if is_cid_paragraph(para):
                    cid_para_count += 1
        return cid_para_count / para_total > 0.8

    def bbox_overlap(self, bbox1: Box, bbox2: Box) -> bool:
        return (
            bbox1.x < bbox2.x2
            and bbox1.x2 > bbox2.x
            and bbox1.y < bbox2.y2
            and bbox1.y2 > bbox2.y
        )

    def process_page(self, page: Page):
        """
        Run the paragraph-detection pipeline on a single page.

        The page is modified in place: page.pdf_paragraph is replaced by the
        detected paragraphs, and page.pdf_character keeps only the characters
        that were not grouped into a paragraph (or is emptied entirely when
        the OCR workaround is enabled).
        """
        # NOTE(review): the layout index is built before formula layouts are
        # re-labelled below — confirm the index does not depend on class_name.
        layout_index, layout_map = build_layout_index(page)
        # Pre-process the labels of the formula layouts.
        self._preprocess_formula_layouts(page)

        # Step 1: create paragraphs based on the layouts.
        # During this step the grouped characters are removed from page.pdf_character.
        paragraphs = self._group_characters_into_paragraphs(
            page, layout_index, layout_map
        )
        page.pdf_paragraph = paragraphs

        page_level_formula_font_ids, xobj_specific_formula_font_ids = (
            collect_page_formula_font_ids(
                page, self.translation_config.formular_font_pattern
            )
        )

        # for para in paragraphs:
        #     if not para.debug_id:
        #         continue
        #     new_line = PdfLine(
        #         pdf_character=[x.pdf_character for x in para.pdf_paragraph_composition]
        #     )
        #     self.update_line_data(new_line)
        #     para.pdf_paragraph_composition = [
        #         PdfParagraphComposition(pdf_line=new_line)
        #     ]

        # Step 2: split the characters of each paragraph into lines.
        for paragraph in paragraphs:
            # Prefer the formula font ids of the paragraph's own XObject when
            # they exist; otherwise fall back to the page-level ids.
            if (
                paragraph.xobj_id
                and paragraph.xobj_id in xobj_specific_formula_font_ids
            ):
                current_formula_font_ids = xobj_specific_formula_font_ids[
                    paragraph.xobj_id
                ]
            else:
                current_formula_font_ids = page_level_formula_font_ids
            self._split_paragraph_into_lines(paragraph, current_formula_font_ids)

        # Step 3: handle the spaces inside the paragraphs.
        for paragraph in paragraphs:
            add_space_dummy_chars(paragraph)
            self.process_paragraph_spacing(paragraph)
            self.update_paragraph_data(paragraph)

        # Step 4: compute the median of all line widths.
        median_width = self.calculate_median_line_width(paragraphs)

        # Step 5: handle independent paragraphs.
        self.process_independent_paragraphs(paragraphs, median_width)

        # Post-processing: merge body paragraphs that alternate with line
        # numbers (a = body, b = line number, c = body -> merge a and c, keep b).
        # The step runs unless the config defines
        # merge_alternating_line_numbers with a falsy value.
        if getattr(self.translation_config, "merge_alternating_line_numbers", True):
            self.merge_alternating_line_number_paragraphs(paragraphs)

        for paragraph in paragraphs:
            self.update_paragraph_data(paragraph, update_unicode=True)

        if self.translation_config.ocr_workaround:
            self.add_text_fill_background(page)
            # since this is ocr file,
            # image characters are not needed
            page.pdf_character = []

        self.fix_overlapping_paragraphs(page)

        # Step 6: sort the characters of every line (currently disabled).
        # self._sort_characters_in_lines(page)

        self.add_debug_info(page)

        # Final stage: set each paragraph's render order to the smallest
        # render order found among its components.
        self._set_paragraph_render_order(page)

    def _set_paragraph_render_order(self, page: Page):
        """
        设置段落的 renderorder 为段落所有组成部分中 renderorder 最小的值
        """
        for paragraph in page.pdf_paragraph:
            min_render_order = 9999999999999999

            # 遍历段落的所有组成部分
            for composition in paragraph.pdf_paragraph_composition:
                # 检查 PdfLine 中的字符
                if composition.pdf_line:
                    for char in composition.pdf_line.pdf_character:
                        if (
                            hasattr(char, "render_order")
                            and char.render_order is not None
                        ):
                            min_render_order = min(min_render_order, char.render_order)

                # 检查单个字符
                elif composition.pdf_character:
                    char = composition.pdf_character
                    if hasattr(char, "render_order") and char.render_order is not None:
                        min_render_order = min(min_render_order, char.render_order)

                # 检查公式中的字符
                elif composition.pdf_formula:
                    for char in composition.pdf_formula.pdf_character:
                        if (
                            hasattr(char, "render_order")
                            and char.render_order is not None
                        ):
                            min_render_order = min(min_render_order, char.render_order)

            # 如果找到了有效的 renderorder，设置段落的 renderorder
            if min_render_order != 9999999999999999:
                paragraph.render_order = min_render_order

    def is_isolated_formula(self, char: PdfCharacter):
        return char.char_unicode in (
            "(cid:122)",
            "(cid:123)",
            "(cid:124)",
            "(cid:125)",
        )

    def _paragraph_text_ascii(self, p: PdfParagraph) -> str:
        parts: list[str] = []
        for comp in p.pdf_paragraph_composition or []:
            if comp.pdf_line:
                for ch in comp.pdf_line.pdf_character or []:
                    if ch.char_unicode is not None:
                        parts.append(ch.char_unicode)
            elif comp.pdf_character and comp.pdf_character.char_unicode is not None:
                parts.append(comp.pdf_character.char_unicode)
        return "".join(parts)

    def _is_ascii_digit_or_space_paragraph(self, p: PdfParagraph) -> bool:
        text = self._paragraph_text_ascii(p)
        if not text:
            return True
        has_digit = False
        for c in text:
            if c.isdigit() and ord(c) < 128:
                has_digit = True
                continue
            if c.isspace():
                continue
            return False
        return True if has_digit or text.strip() == "" else False

    @staticmethod
    def _same_layout_and_xobj(a: PdfParagraph, c: PdfParagraph) -> bool:
        return (
            a.layout_id is not None
            and c.layout_id is not None
            and a.layout_id == c.layout_id
            and a.xobj_id is not None
            and c.xobj_id is not None
            and a.xobj_id == c.xobj_id
        )

    def merge_alternating_line_number_paragraphs(self, paragraphs: list[PdfParagraph]):
        # a 代表正文
        # l 代表行号
        if not paragraphs or len(paragraphs) < 3:
            return
        i = 0
        while i < len(paragraphs) - 2:
            a = paragraphs[i]
            # 吞掉一个或多个连续的行号段 l
            j = i + 1
            saw_l = False
            while j < len(paragraphs) and self._is_ascii_digit_or_space_paragraph(
                paragraphs[j]
            ):
                saw_l = True
                j += 1
            # 现在 j 指向候选的 c
            if saw_l and j < len(paragraphs):
                c = paragraphs[j]
                if self._same_layout_and_xobj(a, c):
                    a.pdf_paragraph_composition.extend(c.pdf_paragraph_composition)
                    self.update_paragraph_data(a)
                    del paragraphs[j]
                    # 不移动 i，继续尝试把更多正文接到 a，实现 a l+ a l+ a ... 链式合并
                    continue
            i += 1

    def _group_characters_into_paragraphs(
        self, page: Page, layout_index, layout_map
    ) -> list[PdfParagraph]:
        """
        Group the page's characters into paragraphs according to their layouts.

        Characters are visited in the order of page.pdf_character. Each one is
        either appended to the current paragraph, starts a new paragraph, or
        is skipped (not in a text layout, or an isolated-formula marker).

        Side effects: page.pdf_paragraph is emptied (its previous entries are
        carried over at the front of the result), page.pdf_character is
        replaced by the skipped characters, and formula_layout_id is set on
        every character.

        Args:
            page: Page whose characters are grouped.
            layout_index: Layout index passed on to the layout helpers.
            layout_map: Layout map passed on to the layout helpers.

        Returns:
            The pre-existing paragraphs followed by the newly created ones.
        """
        paragraphs: list[PdfParagraph] = []
        # Keep paragraphs that already exist on the page.
        if page.pdf_paragraph:
            paragraphs.extend(page.pdf_paragraph)
            page.pdf_paragraph = []

        # Median area of the characters' visual boxes, used below to
        # recognise unusually small characters.
        char_areas = [
            (char.visual_bbox.box.x2 - char.visual_bbox.box.x)
            * (char.visual_bbox.box.y2 - char.visual_bbox.box.y)
            for char in page.pdf_character
        ]
        median_char_area = 0.0
        if char_areas:
            char_areas.sort()
            mid = len(char_areas) // 2
            median_char_area = (
                char_areas[mid]
                if len(char_areas) % 2 == 1
                else (char_areas[mid - 1] + char_areas[mid]) / 2
            )

        current_paragraph: PdfParagraph | None = None
        current_layout: Layout | None = None
        # Characters that do not become part of any paragraph.
        skip_chars = []

        for char in page.pdf_character:
            char_layout = get_character_layout(char, layout_index, layout_map)
            # Check if character is in any formula layout and set formula_layout_id
            char.formula_layout_id = is_character_in_formula_layout(
                char, page, layout_index, layout_map
            )

            if not is_text_layout(char_layout) or self.is_isolated_formula(char):
                skip_chars.append(char)
                continue

            char_box = char.visual_bbox.box
            # char_pdf_box = char.box
            # if calculate_iou_for_boxes(char_box, char_pdf_box) < 0.2:
            #     char_box = char_pdf_box
            char_area = (char_box.x2 - char_box.x) * (char_box.y2 - char_box.y)
            # "Small" means less than 5% of the median character area.
            is_small_char = char_area < median_char_area * 0.05

            is_new_paragraph = False
            if current_paragraph is None:
                # The first grouped character always opens a paragraph.
                is_new_paragraph = True
            elif (
                # A small character in the current layout never opens a new
                # paragraph, and neither does a character listed in
                # HEIGHT_NOT_USFUL_CHAR_IN_CHAR.
                not (
                    is_small_char
                    and current_paragraph.pdf_paragraph_composition
                    and char_layout.id == current_layout.id
                )
                and char.char_unicode not in HEIGHT_NOT_USFUL_CHAR_IN_CHAR
            ):
                if (
                    (
                        # The layout changed and the character does not match SPACE_REGEX.
                        char_layout.id != current_layout.id
                        and not SPACE_REGEX.match(char.char_unicode)
                    )
                    or (  # not same xobject
                        current_paragraph.pdf_paragraph_composition
                        and current_paragraph.pdf_paragraph_composition[
                            -1
                        ].pdf_character.xobj_id
                        != char.xobj_id
                    )
                    or (
                        # NOTE(review): every paragraph created in this loop
                        # receives a character right away, so this clause
                        # looks unreachable — confirm.
                        is_bullet_point(char)
                        and not current_paragraph.pdf_paragraph_composition
                    )
                ):
                    is_new_paragraph = True

            if is_new_paragraph:
                current_layout = char_layout
                current_paragraph = PdfParagraph(
                    pdf_paragraph_composition=[],
                    layout_id=current_layout.id,
                    debug_id=generate_base58_id(),
                    layout_label=current_layout.name,
                )
                paragraphs.append(current_paragraph)

            # At this stage every composition wraps a single character; lines
            # are built later by _split_paragraph_into_lines.
            current_paragraph.pdf_paragraph_composition.append(
                PdfParagraphComposition(pdf_character=char)
            )

        # Only the skipped characters stay on the page.
        page.pdf_character = skip_chars
        for para in paragraphs:
            self.update_paragraph_data(para)
        return paragraphs

    def _merge_overlapping_clusters(
        self, lines: dict[int, list[PdfCharacter]], char_height_average: float
    ) -> dict[int, list[PdfCharacter]]:
        """
        Merge clusters that have significant y-axis overlap.
        If y_intersection / min_height > 0.5 or the distance between y-midlines is less than char_height_average, merge the two clusters.
        """
        if len(lines) <= 1:
            return lines

        # Calculate y-axis ranges for each cluster
        cluster_ranges = {}
        cluster_midlines = {}
        for label, chars in lines.items():
            y_values = [char.visual_bbox.box.y for char in chars] + [
                char.visual_bbox.box.y2 for char in chars
            ]
            y_min, y_max = min(y_values), max(y_values)
            cluster_ranges[label] = (y_min, y_max)
            cluster_midlines[label] = (y_min + y_max) / 2

        # Keep merging until no more merges are possible
        changed = True
        while changed:
            changed = False
            labels_to_check = list(lines.keys())

            for i in range(len(labels_to_check)):
                if not changed:  # Only continue if no merge happened in this iteration
                    for j in range(i + 1, len(labels_to_check)):
                        label1, label2 = labels_to_check[i], labels_to_check[j]

                        # Skip if either label has been merged away
                        if label1 not in lines or label2 not in lines:
                            continue

                        y1_min, y1_max = cluster_ranges[label1]
                        y2_min, y2_max = cluster_ranges[label2]

                        # Calculate intersection
                        intersection_start = max(y1_min, y2_min)
                        intersection_end = min(y1_max, y2_max)

                        # Calculate midline distance
                        midline_distance = abs(
                            cluster_midlines[label1] - cluster_midlines[label2]
                        )

                        should_merge = False
                        if (
                            intersection_end > intersection_start
                        ):  # There is intersection
                            intersection_height = intersection_end - intersection_start
                            height1 = y1_max - y1_min
                            height2 = y2_max - y2_min
                            min_height = min(height1, height2)

                            # Check if intersection ratio exceeds threshold
                            if (
                                min_height > 0
                                and intersection_height / min_height > 0.3
                            ):
                                should_merge = True

                        # Check if midline distance is less than char_height_average
                        if midline_distance < char_height_average:
                            should_merge = True

                        if should_merge:
                            # Merge label2 into label1
                            lines[label1].extend(lines[label2])
                            del lines[label2]

                            # Update cluster range and midline for the merged cluster
                            new_y_min = min(y1_min, y2_min)
                            new_y_max = max(y1_max, y2_max)
                            cluster_ranges[label1] = (new_y_min, new_y_max)
                            cluster_midlines[label1] = (new_y_min + new_y_max) / 2
                            del cluster_ranges[label2]
                            del cluster_midlines[label2]

                            changed = True
                            break

        return lines

    def _get_effective_y_bounds(self, char: PdfCharacter) -> tuple[float, float]:
        """
        Determines the effective vertical boundaries (y1, y2) for a character.

        It prioritizes the visual bounding box if its Intersection over Union (IoU)
        with the PDF bounding box is high (>= 0.5), otherwise, it falls back to the
        PDF bounding box. This helps use more accurate layout information when available.
        """
        visual_box = char.visual_bbox.box
        return visual_box.y, visual_box.y2
        pdf_box = char.box
        if calculate_iou_for_boxes(visual_box, pdf_box) >= 0.5:
            return visual_box.y, visual_box.y2
        return pdf_box.y, pdf_box.y2

    @staticmethod
    def _compute_collision_counts_histogram(
        y1_arr: np.ndarray,
        y2_arr: np.ndarray,
        para_y_min: float,
        para_y_max: float,
        step: float,
    ) -> np.ndarray:
        """Compute overlap counts at each scan line using a difference-array histogram.

        Args:
            y1_arr: 1-D array with lower y bounds of characters (inclusive).
            y2_arr: 1-D array with upper y bounds of characters (exclusive).
            para_y_min: Minimum y of the paragraph.
            para_y_max: Maximum y of the paragraph.
            step: Scan step size.

        Returns:
            1-D NumPy int32 array where index i corresponds to y = para_y_max - i × step.
        """
        # Number of scan positions
        m = int(np.ceil((para_y_max - para_y_min) / step))
        if m <= 0:
            return np.array([], dtype=np.int32)

        # Map character bounds to discrete indices (top inclusive, bottom exclusive)
        starts = np.floor((para_y_max - y2_arr) / step).astype(np.int32)
        ends = np.floor((para_y_max - y1_arr) / step).astype(np.int32) + 1
        # Clip ends to the valid range [0, m]
        np.clip(ends, 0, m, out=ends)

        hist = np.zeros(m + 1, dtype=np.int32)
        np.add.at(hist, starts, 1)
        np.add.at(hist, ends, -1)

        return np.cumsum(hist[:-1])

    def _split_paragraph_into_lines(
        self, paragraph: PdfParagraph, formula_font_ids: set[str]
    ):
        """
        Splits a paragraph into lines using a "line-threading" method.

        This method works by scanning vertically across the paragraph's
        vertical range and counting how many characters intersect with a
        horizontal line at each y-coordinate. Runs of scan positions that no
        character covers (count below 1) are identified as gaps between
        lines. The characters are then partitioned into lines based on these
        identified gaps.

        The paragraph is modified in place: its single-character compositions
        are replaced by line compositions, followed by any other compositions
        it already contained. A paragraph without compositions or without
        single-character compositions is left unchanged.

        Args:
            paragraph: Paragraph whose compositions are rebuilt.
            formula_font_ids: Not used by this method.
        """
        if not paragraph.pdf_paragraph_composition:
            return

        # 1. Extract all characters and other compositions from the paragraph.
        all_chars: list[PdfCharacter] = []
        other_compositions: list[PdfParagraphComposition] = []
        for comp in paragraph.pdf_paragraph_composition:
            if comp.pdf_character:
                all_chars.append(comp.pdf_character)
            else:
                other_compositions.append(comp)

        if not all_chars:
            return

        # 2. Determine effective y-bounds for each character and the paragraph's total vertical range.
        # The inner single-element loop only unpacks the (y1, y2) tuple.
        char_y_bounds = [
            {"char": char, "y1": y1, "y2": y2}
            for char in all_chars
            for y1, y2 in [self._get_effective_y_bounds(char)]
        ]

        # NOTE(review): all_chars is non-empty here and each character yields
        # one entry, so this branch looks unreachable — confirm.
        if not char_y_bounds:
            paragraph.pdf_paragraph_composition = other_compositions
            self.update_paragraph_data(paragraph)
            return

        para_y_min = min(b["y1"] for b in char_y_bounds)
        para_y_max = max(b["y2"] for b in char_y_bounds)

        # If the paragraph is vertically flat, treat it as a single line.
        if (para_y_max - para_y_min) < 5:  # Using a small threshold
            # all_chars.sort(key=lambda c: c.visual_bbox.box.x)
            single_line_composition = self.create_line(all_chars)
            paragraph.pdf_paragraph_composition = [
                single_line_composition
            ] + other_compositions
            self.update_paragraph_data(paragraph)
            return

        # 3. Perform "threading" scan to create a collision histogram.
        # Scan from top (max y) to bottom (min y) with a step of 0.25.
        scan_y_min = para_y_min
        scan_y_max = para_y_max
        step = 0.25

        # y value of every scan position; index i is scan_y_max - i * step.
        y_coordinates = np.arange(scan_y_max, scan_y_min, -step)

        # Compute collision counts using NumPy histogram (O(m + n))
        y1_arr = np.array([b["y1"] for b in char_y_bounds], dtype=np.float32)
        y2_arr = np.array([b["y2"] for b in char_y_bounds], dtype=np.float32)
        collision_counts = self._compute_collision_counts_histogram(
            y1_arr,
            y2_arr,
            scan_y_min,
            scan_y_max,
            step,
        )

        # 4. Find gaps (runs of scan positions with a count below 1) in the histogram.
        # Each gap is stored as (first index, last index), both inclusive.
        gaps = []
        in_gap = False
        for i, count in enumerate(collision_counts):
            if count < 1 and not in_gap:
                in_gap = True
                gap_start_index = i
            elif count >= 1 and in_gap:
                in_gap = False
                gaps.append((gap_start_index, i - 1))
        if in_gap:
            # The histogram ended while still inside a gap.
            gaps.append((gap_start_index, len(collision_counts) - 1))

        # If no gaps are found, treat it as a single line.
        if not gaps:
            # all_chars.sort(key=lambda c: c.visual_bbox.box.x)
            single_line_composition = self.create_line(all_chars)
            paragraph.pdf_paragraph_composition = [
                single_line_composition
            ] + other_compositions
            self.update_paragraph_data(paragraph)
            return

        # 5. Assign characters to lines based on the identified gaps.
        # Each separator is the y-coordinate of its gap's first scan position
        # (the top of the gap); separators are ordered from top to bottom.
        separator_y_coords = sorted(
            [y_coordinates[start_idx] for start_idx, end_idx in gaps],
            reverse=True,
        )

        # One bucket per region between separators, ordered top to bottom.
        lines: list[list[PdfCharacter]] = [
            [] for _ in range(len(separator_y_coords) + 1)
        ]

        for b in char_y_bounds:
            char_y_center = (b["y1"] + b["y2"]) / 2
            line_idx = 0
            # Find which line bucket the character belongs to: the first
            # separator lying below the character's vertical centre.
            for sep_y in separator_y_coords:
                if char_y_center > sep_y:
                    break
                line_idx += 1
            lines[line_idx].append(b["char"])

        # 6. Rebuild the paragraph's composition list from the new lines.
        new_line_compositions = []
        for line_chars in lines:
            if line_chars:
                # Characters keep their original order within the line
                # (sorting by x-coordinate is disabled).
                # line_chars.sort(key=lambda c: c.visual_bbox.box.x)
                new_line_compositions.append(self.create_line(line_chars))

        # The lines are already sorted vertically due to the scanning process.
        paragraph.pdf_paragraph_composition = new_line_compositions + other_compositions
        self.update_paragraph_data(paragraph)

    def process_paragraph_spacing(self, paragraph: PdfParagraph):
        if not paragraph.pdf_paragraph_composition:
            return

        # 处理行级别的空格
        processed_lines = []
        for composition in paragraph.pdf_paragraph_composition:
            if not composition.pdf_line:
                processed_lines.append(composition)
                continue

            line = composition.pdf_line
            if not "".join(
                x.char_unicode for x in line.pdf_character
            ).strip():  # 跳过完全空白的行
                continue

            # 处理行内字符的尾随空格
            processed_chars = []
            for char in line.pdf_character:
                if not char.char_unicode.isspace():
                    processed_chars = processed_chars + [char]
                elif processed_chars:  # 只有在有非空格字符后才考虑保留空格
                    processed_chars.append(char)

            # 移除尾随空格
            while processed_chars and processed_chars[-1].char_unicode.isspace():
                processed_chars.pop()

            if processed_chars:  # 如果行内还有字符
                line = self.create_line(processed_chars)
                processed_lines.append(line)

        paragraph.pdf_paragraph_composition = processed_lines
        self.update_paragraph_data(paragraph)

    def create_line(self, chars: list[PdfCharacter]) -> PdfParagraphComposition:
        assert chars

        line = PdfLine(pdf_character=chars)
        self.update_line_data(line)
        return PdfParagraphComposition(pdf_line=line)

    def calculate_median_line_width(self, paragraphs: list[PdfParagraph]) -> float:
        # 收集所有行的宽度
        line_widths = []
        for paragraph in paragraphs:
            for composition in paragraph.pdf_paragraph_composition:
                if composition.pdf_line:
                    line = composition.pdf_line
                    line_widths.append(line.box.x2 - line.box.x)

        if not line_widths:
            return 0.0

        # 计算中位数
        line_widths.sort()
        mid = len(line_widths) // 2
        if len(line_widths) % 2 == 0:
            return (line_widths[mid - 1] + line_widths[mid]) / 2
        return line_widths[mid]

    def process_independent_paragraphs(
        self,
        paragraphs: list[PdfParagraph],
        median_width: float,
    ):
        """Split multi-line paragraphs in place where a line boundary looks like a break.

        Compositions of each paragraph are examined pairwise. When the previous
        composition is a line, the paragraph is cut before the current
        composition if any of the following holds (checked in this order):

        * the previous line's text contains a run of at least 20 dots, which
          is treated as a table-of-contents entry;
        * ``translation_config.split_short_lines`` is enabled and the previous
          line is narrower than
          ``median_width * translation_config.short_line_split_factor``;
        * the current composition is a line whose first character is a bullet
          point according to ``is_bullet_point``.

        The cut-off tail becomes a new paragraph inserted directly after the
        original one, so it is examined itself on the next outer iteration.

        Args:
            paragraphs: Paragraph list; modified in place.
            median_width: Reference line width for the short-line rule.
        """

        def split_off_tail(position: int, source: PdfParagraph, cut: int) -> None:
            # Move compositions [cut:] of `source` into a fresh paragraph that
            # inherits the layout information and is placed right after it.
            tail = PdfParagraph(
                box=Box(0, 0, 0, 0),  # placeholder; recomputed just below
                pdf_paragraph_composition=source.pdf_paragraph_composition[cut:],
                unicode="",
                debug_id=generate_base58_id(),
                layout_label=source.layout_label,
                layout_id=source.layout_id,
            )
            source.pdf_paragraph_composition = source.pdf_paragraph_composition[:cut]

            # Refresh the derived data of both halves.
            self.update_paragraph_data(source)
            self.update_paragraph_data(tail)

            paragraphs.insert(position + 1, tail)

        position = 0
        while position < len(paragraphs):
            current = paragraphs[position]
            comps = current.pdf_paragraph_composition

            # A paragraph with a single composition yields an empty range here.
            for cut in range(1, len(comps)):
                prev_line = comps[cut - 1].pdf_line
                if not prev_line:
                    continue

                prev_width = prev_line.box.x2 - prev_line.box.x
                prev_text = "".join(c.char_unicode for c in prev_line.pdf_character)

                # Rule 1: a run of 20+ dots marks a table-of-contents entry.
                split_here = bool(re.search(r"\.{20,}", prev_text))

                # Rule 2: the previous line is short compared to the median.
                if not split_here:
                    config = self.translation_config
                    split_here = bool(
                        config.split_short_lines
                        and prev_width < median_width * config.short_line_split_factor
                    )

                # Rule 3: the current line starts with a bullet point.
                if not split_here:
                    following = comps[cut]
                    first_chars = (
                        following
                        and following.pdf_line
                        and following.pdf_line.pdf_character
                    )
                    split_here = bool(
                        first_chars
                        and first_chars[0]
                        and is_bullet_point(first_chars[0])
                    )

                if split_here:
                    split_off_tail(position, current, cut)
                    break

            position += 1

    @staticmethod
    def is_bbox_contain_in_vertical(bbox1: Box, bbox2: Box) -> bool:
        """Check if one bounding box is completely contained within the other."""
        # Check if bbox1 is contained in bbox2
        bbox1_in_bbox2 = bbox1.y >= bbox2.y and bbox1.y2 <= bbox2.y2
        # Check if bbox2 is contained in bbox1
        bbox2_in_bbox1 = bbox2.y >= bbox1.y and bbox2.y2 <= bbox1.y2
        return bbox1_in_bbox2 or bbox2_in_bbox1

    def fix_overlapping_paragraphs(self, page: Page):
        """
        Pull apart paragraph boxes on a page that overlap each other.

        Every pair of paragraphs sharing the same ``xobj_id`` is inspected. A
        pair is adjusted when ``bbox_overlap`` reports an overlap, neither
        vertical extent is contained in the other, and the shared region is
        larger than 1e-6 in both directions. The boundary is placed around the
        midpoint of the vertical overlap: the upper box's ``y`` becomes
        ``mid + 1`` and the lower box's ``y2`` becomes ``mid - 1``.

        Passes over all pairs repeat until one pass finds no overlap, bounded
        by ``len(paragraphs) ** 2`` passes.
        """
        paragraphs = page.pdf_paragraph
        if not paragraphs or len(paragraphs) < 2:
            return

        pass_limit = len(paragraphs) * len(paragraphs)  # Safety break
        passes_done = 0

        while passes_done < pass_limit:
            passes_done += 1
            found_overlap = False

            for first_idx, para1 in enumerate(paragraphs):
                for para2 in paragraphs[first_idx + 1 :]:
                    box1, box2 = para1.box, para2.box
                    if box1 is None or box2 is None:
                        continue
                    if para1.xobj_id != para2.xobj_id:
                        continue
                    if not self.bbox_overlap(box1, box2):
                        continue
                    # Vertical containment is not handled by the midpoint strategy.
                    if self.is_bbox_contain_in_vertical(box1, box2):
                        continue

                    # Extent of the region shared by both boxes.
                    shared_y_start = max(box1.y, box2.y)
                    shared_height = min(box1.y2, box2.y2) - shared_y_start
                    shared_width = min(box1.x2, box2.x2) - max(box1.x, box2.x)
                    if not (shared_height > 1e-6 and shared_width > 1e-6):
                        continue

                    found_overlap = True

                    # para1 is the lower one when it starts below para2 and
                    # reaches into it, or otherwise when its top edge is lower.
                    para1_is_lower = (
                        box1.y2 > box2.y and box1.y < box2.y
                    ) or box1.y2 < box2.y2
                    if para1_is_lower:
                        lower, higher = para1, para2
                    else:
                        lower, higher = para2, para1

                    mid_y = shared_y_start + shared_height / 2

                    if higher.box.y < mid_y < lower.box.y2:
                        higher.box.y = mid_y + 1
                        lower.box.y2 = mid_y - 1
                    else:
                        # The midpoint does not fall strictly between the two
                        # edges being moved; leave the pair as it is and report.
                        logger.warning(
                            "Could not resolve overlap between paragraphs"
                            f" {higher.debug_id} and {lower.debug_id}"
                            " using simple midpoint strategy."
                            f" Midpoint: {mid_y},"
                            f" Higher Box: {higher.box},"
                            f" Lower Box: {lower.box}"
                        )

            # A pass without any overlap means there is nothing left to do.
            if not found_overlap:
                break

        if passes_done == pass_limit:
            logger.warning(
                f"Maximum iterations ({pass_limit}) reached in"
                f" fix_overlapping_paragraphs for page {page.page_number}."
                " Some overlaps might remain."
            )

    def _sort_characters_in_lines(self, page: Page):
        """Sort characters in each line from left to right, top to bottom."""
        for paragraph in page.pdf_paragraph:
            for composition in paragraph.pdf_paragraph_composition:
                if composition.pdf_line:
                    line = composition.pdf_line
                    line.pdf_character.sort(key=self._get_char_sort_key)

    def _get_char_sort_key(self, char: PdfCharacter):
        """Return the sort key ``(x, -y)`` used to order characters in a line.

        The key is taken from the character's visual bounding box when that
        box has an IoU of at least 0.1 with the character's own ``box``;
        otherwise the character's ``box`` is used.
        """
        visual = char.visual_bbox.box
        declared = char.box

        agrees_with_declared = calculate_iou_for_boxes(visual, declared) >= 0.1
        chosen = visual if agrees_with_declared else declared

        # Primary key is x (ascending). Ties are broken by -y, i.e. the
        # character with the larger y comes first.
        return (chosen.x, -chosen.y)


================================================
FILE: babeldoc/format/pdf/document_il/midend/remove_descent.py
================================================
import logging
from collections import Counter
from functools import cache

from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.format.pdf.translation_config import TranslationConfig

logger = logging.getLogger(__name__)


class RemoveDescent:
    """Midend stage that shifts character and paragraph boxes by the font descent.

    For every character whose font can be resolved, the scaled descent
    (``font.descent * font_size / 1000``) is subtracted from the box's y
    coordinates (horizontal text) or added to its x coordinates (vertical
    text). Each paragraph box is then shifted by the most frequent descent
    value among its characters.
    """

    stage_name = "Remove Char Descent"

    def __init__(self, translation_config: TranslationConfig):
        """Store the translation configuration used for progress and cancellation."""
        self.translation_config = translation_config

    def _remove_char_descent(
        self,
        char: il_version_1.PdfCharacter,
        font: il_version_1.PdfFont,
    ) -> float | None:
        """Remove descent from a single character and return the descent value.

        Args:
            char: The character to process; its box is modified in place.
            font: The font used by this character.

        Returns:
            The scaled descent that was applied, or None when the character
            has no usable box or the font carries no descent value.
        """
        box = char.box
        if not (box and box.y is not None and box.y2 is not None and font):
            return None

        # `hasattr` alone would let a font whose descent is None through and
        # fail on the multiplication below, so the value itself is checked.
        font_descent = getattr(font, "descent", None)
        if font_descent is None:
            return None

        descent = font_descent * char.pdf_style.font_size / 1000
        if char.vertical:
            # For vertical text, remove descent from x coordinates
            box.x += descent
            box.x2 += descent
        else:
            # For horizontal text, remove descent from y coordinates
            box.y -= descent
            box.y2 -= descent
        return descent

    @staticmethod
    def _composition_chars(comp) -> list:
        """Return the characters held by a composition.

        Mirrors the precedence direct character > line > formula >
        same-style characters; only the first populated member is used.
        """
        if comp.pdf_character:
            return [comp.pdf_character]
        for holder in (
            comp.pdf_line,
            comp.pdf_formula,
            comp.pdf_same_style_characters,
        ):
            if holder:
                return holder.pdf_character
        return []

    def process(self, document: il_version_1.Document):
        """Process the document to remove descent adjustments from character boxes.

        Cancellation is checked before each page and the progress stage is
        advanced after each page.

        Args:
            document: The document to process
        """
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            len(document.page),
        ) as pbar:
            for page in document.page:
                self.translation_config.raise_if_cancelled()
                self.process_page(page)
                pbar.advance()

    def process_page(self, page: il_version_1.Page):
        """Process a single page to remove descent adjustments.

        Args:
            page: The page to process; character and paragraph boxes are
                modified in place.
        """
        # Page-level fonts and per-xobject fonts live in separate maps so a
        # font id can never be shadowed by an xobject id (or vice versa).
        page_fonts = {f.font_id: f for f in page.pdf_font}
        xobj_fonts: dict = {}
        for xobj in page.pdf_xobject:
            # Fonts declared by the xobject override the page fonts.
            scoped = page_fonts.copy()
            scoped.update((f.font_id, f) for f in xobj.pdf_font)
            xobj_fonts[xobj.xobj_id] = scoped

        @cache
        def get_font(font_id, xobj_id=None):
            # Prefer the xobject's own font table, fall back to the page's.
            scoped = xobj_fonts.get(xobj_id) if xobj_id is not None else None
            if scoped is not None and font_id in scoped:
                return scoped[font_id]
            return page_fonts.get(font_id)

        # Process all standalone characters in the page
        for char in page.pdf_character:
            if font := get_font(char.pdf_style.font_id, char.xobj_id):
                self._remove_char_descent(char, font)

        # Process all paragraphs
        for paragraph in page.pdf_paragraph:
            descent_values = []
            vertical_chars = []

            for comp in paragraph.pdf_paragraph_composition:
                for char in self._composition_chars(comp):
                    font = get_font(char.pdf_style.font_id, char.xobj_id)
                    if not font:
                        continue
                    descent = self._remove_char_descent(char, font)
                    if descent is not None:
                        descent_values.append(descent)
                        vertical_chars.append(char.vertical)

            # Adjust paragraph box based on most common descent value
            if not (descent_values and paragraph.box):
                continue
            if paragraph.box.y is None or paragraph.box.y2 is None:
                continue

            most_common_descent = Counter(descent_values).most_common(1)[0][0]

            # The paragraph counts as vertical only if every adjusted
            # character is vertical.
            if all(vertical_chars):
                # For vertical paragraphs, adjust x coordinates
                paragraph.box.x += most_common_descent
                paragraph.box.x2 += most_common_descent
            else:
                # For horizontal paragraphs, adjust y coordinates
                paragraph.box.y -= most_common_descent
                paragraph.box.y2 -= most_common_descent


================================================
FILE: babeldoc/format/pdf/document_il/midend/styles_and_formulas.py
================================================
import math
import re

from babeldoc.format.pdf.document_il.il_version_1 import Box
from babeldoc.format.pdf.document_il.il_version_1 import Document
from babeldoc.format.pdf.document_il.il_version_1 import GraphicState
from babeldoc.format.pdf.document_il.il_version_1 import Page
from babeldoc.format.pdf.document_il.il_version_1 import PdfCharacter
from babeldoc.format.pdf.document_il.il_version_1 import PdfFormula
from babeldoc.format.pdf.document_il.il_version_1 import PdfLine
from babeldoc.format.pdf.document_il.il_version_1 import PdfParagraphComposition
from babeldoc.format.pdf.document_il.il_version_1 import PdfSameStyleCharacters
from babeldoc.format.pdf.document_il.il_version_1 import PdfStyle
from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper
from babeldoc.format.pdf.document_il.utils.formular_helper import (
    collect_page_formula_font_ids,
)
from babeldoc.format.pdf.document_il.utils.formular_helper import (
    is_formulas_middle_char,
)
from babeldoc.format.pdf.document_il.utils.formular_helper import is_formulas_start_char
from babeldoc.format.pdf.document_il.utils.formular_helper import update_formula_data
from babeldoc.format.pdf.document_il.utils.layout_helper import LEFT_BRACKET
from babeldoc.format.pdf.document_il.utils.layout_helper import RIGHT_BRACKET
from babeldoc.format.pdf.document_il.utils.layout_helper import build_layout_index
from babeldoc.format.pdf.document_il.utils.layout_helper import calculate_iou_for_boxes
from babeldoc.format.pdf.document_il.utils.layout_helper import (
    calculate_y_true_iou_for_boxes,
)
from babeldoc.format.pdf.document_il.utils.layout_helper import is_bullet_point
from babeldoc.format.pdf.document_il.utils.layout_helper import (
    is_curve_in_figure_table_layout,
)
from babeldoc.format.pdf.document_il.utils.layout_helper import (
    is_curve_overlapping_with_paragraphs,
)
from babeldoc.format.pdf.document_il.utils.layout_helper import is_same_style
from babeldoc.format.pdf.document_il.utils.spatial_analyzer import (
    is_element_contained_in_formula,
)
from babeldoc.format.pdf.translation_config import TranslationConfig


class StylesAndFormulas:
    stage_name = "Parse Formulas and Styles"

    def __init__(self, translation_config: TranslationConfig):
        """Keep the translation configuration and create a ``FontMapper`` from it."""
        self.translation_config = translation_config
        self.font_mapper = FontMapper(translation_config)

    def update_formula_data(self, formula: PdfFormula):
        """Delegate to the ``update_formula_data`` helper imported from ``formular_helper``."""
        # Inside the method body the bare name resolves to the imported
        # module-level function, not to this method.
        update_formula_data(formula)

    def process(self, document: Document):
        """Run ``process_page`` on every page of ``document`` inside a progress stage.

        Cancellation is checked through ``raise_if_cancelled`` before each
        page, and the progress stage is advanced after each page.
        """
        config = self.translation_config
        pages = document.page
        stage = config.progress_monitor.stage_start(self.stage_name, len(pages))
        with stage as progress:
            for current_page in pages:
                config.raise_if_cancelled()
                self.process_page(current_page)
                progress.advance()

    def update_all_formula_data(self, page: Page):
        """Call ``update_formula_data`` for every formula found in the page's paragraphs."""
        for paragraph in page.pdf_paragraph:
            for composition in paragraph.pdf_paragraph_composition:
                formula = composition.pdf_formula
                if not formula:
                    continue
                self.update_formula_data(formula)

    def _calculate_element_formula_iou(
        self, element_box: Box, formula_box: Box, tolerance: float = 2.0
    ) -> float:
        """Calculate precise IoU between an element and a formula with tolerance.

        Args:
            element_box: Bounding box of the element (curve/form)
            formula_box: Bounding box of the formula
            tolerance: Tolerance to expand formula box for containment check

        Returns:
            IoU value between element and expanded formula box
        """
        if element_box is None or formula_box is None:
            return 0.0

        # Expand formula box by tolerance for more lenient containment check
        expanded_formula_box = Box(
            x=formula_box.x - tolerance,
            y=formula_box.y - tolerance,
            x2=formula_box.x2 + tolerance,
            y2=formula_box.y2 + tolerance,
        )

        return calculate_iou_for_boxes(element_box, expanded_formula_box)

    def _is_element_contained_exact(
        self,
        element_box: Box,
        formula_box: Box,
        containment_threshold: float = 0.95,
    ) -> bool:
        """Check if an element is contained within a formula with zero tolerance.

        Args:
            element_box: Bounding box of the element (curve/form)
            formula_box: Bounding box of the formula
            containment_threshold: Minimum IoU ratio to consider as contained

        Returns:
            True if the element is contained within the formula (exact match)
        """
        if element_box is None or formula_box is None:
            return False

        # Use formula box without any tolerance expansion
        iou = calculate_iou_for_boxes(element_box, formula_box)
        return iou >= containment_threshold

    def _calculate_element_formula_distance(
        self, element_box: Box, formula_box: Box
    ) -> float:
        """Calculate the shortest distance between an element and a formula.

        Args:
            element_box: Bounding box of the element (curve/form)
            formula_box: Bounding box of the formula

        Returns:
            Shortest distance between the element and formula boxes
        """
        if element_box is None or formula_box is None:
            return float("inf")

        # Calculate horizontal distance
        if element_box.x2 < formula_box.x:
            # Element is to the left of formula
            dx = formula_box.x - element_box.x2
        elif element_box.x > formula_box.x2:
            # Element is to the right of formula
            dx = element_box.x - formula_box.x2
        else:
            # Horizontal overlap
            dx = 0.0

        # Calculate vertical distance
        if element_box.y2 < formula_box.y:
            # Element is above formula
            dy = formula_box.y - element_box.y2
        elif element_box.y > formula_box.y2:
            # Element is below formula
            dy = element_box.y - formula_box.y2
        else:
            # Vertical overlap
            dy = 0.0

        # Return Euclidean distance
        return (dx * dx + dy * dy) ** 0.5

    def _collect_element_formula_candidates(
        self, page: Page
    ) -> tuple[list, dict, dict]:
        """Collect all potential assignments of elements to formulas.

        Uses two-level IoU matching strategy:
        1. Exact IoU matching (zero tolerance) - highest priority
        2. Tolerant IoU matching (2.0 tolerance, distance-sorted) - second priority

        Returns:
            Tuple of (all_formulas, curve_candidates, form_candidates) where:
            - all_formulas: list of (formula, paragraph_xobj_id) tuples
            - curve_candidates: dict mapping curve index to (curve, candidates) tuples
            - form_candidates: dict mapping form index to (form, candidates) tuples
            where candidates is a list of (formula_index, score, match_type) tuples
        """
        curve_candidates = {}
        form_candidates = {}

        # Configuration parameters
        max_tolerant_distance = 100.0  # Maximum distance for tolerant matching scoring

        if not page.pdf_paragraph:
            return [], curve_candidates, form_candidates

        # Collect all formulas from all paragraphs with their index
        all_formulas = []
        for paragraph in page.pdf_paragraph:
            for composition in paragraph.pdf_paragraph_composition:
                if composition.pdf_formula:
                    all_formulas.append((composition.pdf_formula, paragraph.xobj_id))

        # Check each curve against all formulas
        for curve_idx, curve in enumerate(page.pdf_curve):
            if not curve.box:
                continue

            candidates = []
            for formula_idx, (formula, paragraph_xobj_id) in enumerate(all_formulas):
                if not formula.box:
                    continue

                # Check xobj_id compatibility
                if paragraph_xobj_id is not None and curve.xobj_id != paragraph_xobj_id:
                    continue

                # Level 1: Exact IoU matching (zero tolerance) - highest priority
                if self._is_element_contained_exact(curve.box, formula.box):
                    iou = calculate_iou_for_boxes(curve.box, formula.box)
                    candidates.append((formula_idx, iou, "iou_exact"))
                # Level 2: Tolerant IoU matching (with tolerance) - distance sorted
                elif is_element_contained_in_formula(curve.box, formula.box):
                    distance = self._calculate_element_formula_distance(
                        curve.box, formula.box
                    )
                    # Convert distance to score (closer = higher score)
                    # Score range: 0.5-0.9 to ensure lower than exact IoU
                    distance_factor = max(0.0, 1.0 - distance / max_tolerant_distance)
                    score = 0.5 + 0.4 * distance_factor
                    candidates.append((formula_idx, score, "iou_tolerant"))

            if candidates:
                curve_candidates[curve_idx] = (curve, candidates)

        # Check each form against all formulas
        for form_idx, form in enumerate(page.pdf_form):
            if not form.box:
                continue

            candidates = []
            for formula_idx, (formula, paragraph_xobj_id) in enumerate(all_formulas):
                if not formula.box:
                    continue

                # Check xobj_id compatibility
                if paragraph_xobj_id is not None and form.xobj_id != paragraph_xobj_id:
                    continue

                # Level 1: Exact IoU matching (zero tolerance) - highest priority
                if self._is_element_contained_exact(form.box, formula.box):
                    iou = calculate_iou_for_boxes(form.box, formula.box)
                    candidates.append((formula_idx, iou, "iou_exact"))
                # Level 2: Tolerant IoU matching (with tolerance) - distance sorted
                elif is_element_contained_in_formula(form.box, formula.box):
                    distance = self._calculate_element_formula_distance(
                        form.box, formula.box
                    )
                    # Convert distance to score (closer = higher score)
                    # Score range: 0.5-0.9 to ensure lower than exact IoU
                    distance_factor = max(0.0, 1.0 - distance / max_tolerant_distance)
                    score = 0.5 + 0.4 * distance_factor
                    candidates.append((formula_idx, score, "iou_tolerant"))

            if candidates:
                form_candidates[form_idx] = (form, candidates)

        return all_formulas, curve_candidates, form_candidates

    def _resolve_assignment_conflicts(
        self, curve_candidates: dict, form_candidates: dict
    ) -> tuple[dict, list, list]:
        """Resolve assignment conflicts using prioritized matching strategy.

        Args:
            curve_candidates: dict mapping curve index to (curve, candidates) tuples
            form_candidates: dict mapping form index to (form, candidates) tuples
            where candidates is a list of (formula_index, score, match_type) tuples

        Returns:
            Tuple of (formula_assignments, curves_to_remove, forms_to_remove) where:
            - formula_assignments: dict mapping formula_index to (curves, forms) tuples
            - curves_to_remove: list of curves to remove from page level
            - forms_to_remove: list of forms to remove from page level
        """
        formula_assignments = {}
        curves_to_remove = []
        forms_to_remove = []

        def _get_best_candidate(candidates):
            """Get the best candidate using priority: Exact IoU > Tolerant IoU, then by score."""
            if not candidates:
                return None

            # Sort by match_type priority and then by score (descending)
            def sort_key(candidate):
                formula_idx, score, match_type = candidate
                # Exact IoU matches get priority 1, tolerant IoU matches get priority 2
                priority = 1 if match_type == "iou_exact" else 2
                # Return tuple for sorting: (priority, -score) for descending score within priority
                return (priority, -score)

            sorted_candidates = sorted(candidates, key=sort_key)
            return sorted_candidates[0]

        # Resolve curve assignments
        for _curve_idx, (curve, candidates) in curve_candidates.items():
            if not candidates:
                continue

            best_candidate = _get_best_candidate(candidates)
            if best_candidate:
                best_formula_idx, best_score, match_type = best_candidate

                # Add to assignments
                if best_formula_idx not in formula_assignments:
                    formula_assignments[best_formula_idx] = ([], [])
                formula_assignments[best_formula_idx][0].append(curve)
                curves_to_remove.append(curve)

        # Resolve form assignments
        for _form_idx, (form, candidates) in form_candidates.items():
            if not candidates:
                continue

            best_candidate = _get_best_candidate(candidates)
            if best_candidate:
                best_formula_idx, best_score, match_type = best_candidate

                # Add to assignments
                if best_formula_idx not in formula_assignments:
                    formula_assignments[best_formula_idx] = ([], [])
                formula_assignments[best_formula_idx][1].append(form)
                forms_to_remove.append(form)

        return formula_assignments, curves_to_remove, forms_to_remove

    def collect_contained_elements(self, page: Page):
        """Move curves and forms that belong to a formula from the page into that formula.

        Works in two phases so that each element ends up in at most one
        formula: first all element/formula candidates are collected, then the
        best candidate per element is chosen. Assigned elements are appended
        to the formula's ``pdf_curve`` / ``pdf_form`` lists and removed from
        the page-level lists, which are modified in place.

        Does nothing when the page has no paragraphs.
        """
        if not page.pdf_paragraph:
            return

        # Phase 1: Collect all potential element-formula assignments
        all_formulas, curve_candidates, form_candidates = (
            self._collect_element_formula_candidates(page)
        )

        # Phase 2: Pick one formula per element
        formula_assignments, curves_to_remove, forms_to_remove = (
            self._resolve_assignment_conflicts(curve_candidates, form_candidates)
        )

        # Attach the chosen elements to their formulas.
        for formula_idx, (
            assigned_curves,
            assigned_forms,
        ) in formula_assignments.items():
            formula = all_formulas[formula_idx][0]  # Extract formula from tuple
            formula.pdf_curve.extend(assigned_curves)
            formula.pdf_form.extend(assigned_forms)

        # Remove assigned elements from page level. Filtering by object
        # identity in a single pass avoids the repeated `in` + `remove` list
        # scans (each of which compares elements by value); slice assignment
        # keeps the page's list objects themselves intact.
        if curves_to_remove:
            taken_curves = {id(curve) for curve in curves_to_remove}
            page.pdf_curve[:] = [
                curve for curve in page.pdf_curve if id(curve) not in taken_curves
            ]

        if forms_to_remove:
            taken_forms = {id(form) for form in forms_to_remove}
            page.pdf_form[:] = [
                form for form in page.pdf_form if id(form) not in taken_forms
            ]

    def process_page(self, page: Page):
        """Process one page, including formula recognition and offset calculation.

        The steps below run in a fixed order; several of them are switched on
        or off by flags on ``self.translation_config``.
        """
        self.process_page_formulas(page)
        # self.process_page_offsets(page)
        self.process_comma_formulas(page)
        self.merge_overlapping_formulas(page)
        # First offset pass, unless offset calculation is disabled.
        if not self.translation_config.skip_formula_offset_calculation:
            self.process_page_offsets(page)
        self.process_translatable_formulas(page)
        self.update_all_formula_data(page)
        # Curves/forms are attached to formulas only outside OCR-workaround mode.
        if not self.translation_config.ocr_workaround:
            self.collect_contained_elements(page)

        # Process remaining non-formula lines after formula assignment is complete
        if self.translation_config.remove_non_formula_lines:
            self.remove_non_formula_lines_from_paragraphs(page)

        # Second offset pass and formula-data refresh; presumably needed
        # because the steps above can change formula contents - TODO confirm.
        if not self.translation_config.skip_formula_offset_calculation:
            self.process_page_offsets(page)
        self.update_all_formula_data(page)
        self.process_page_styles(page)

    def update_line_data(self, line: PdfLine):
        min_x = min(char.visual_bbox.box.x for char in line.pdf_character)
        min_y = min(char.visual_bbox.box.y for char in line.pdf_character)
        max_x = max(char.visual_bbox.box.x2 for char in line.pdf_character)
        max_y = max(char.visual_bbox.box.y2 for char in line.pdf_character)
        line.box = Box(min_x, min_y, max_x, max_y)

    def _classify_characters_in_composition(
        self,
        composition: PdfParagraphComposition,
        formula_font_ids: set[int],
        first_is_bullet_so_far: bool,
        line_index: int,
    ) -> tuple[list[tuple[PdfCharacter, bool, bool]], bool]:
        """
        Phase 1: Classify every character in a composition as either formula or text.
        This preserves the original logic, including the sticky `first_is_bullet` flag.

        Args:
            composition: Composition whose `pdf_line` characters are classified.
            formula_font_ids: Font ids whose characters are always formula.
            first_is_bullet_so_far: Bullet flag carried in from earlier
                compositions; once true it stays true.
            line_index: Not used by this method.

        Returns:
            A pair `(tagged_chars, first_is_bullet)`. `tagged_chars` holds one
            `(char, is_formula, is_corner_mark)` tuple per character, and is
            empty when the composition has no line or no characters. The
            `is_formula` entry is built with `or`, so it is a truthy/falsy
            value (it may be the `formula_layout_id` itself) rather than
            necessarily a strict bool.
        """
        tagged_chars = []
        is_formula_tags = []

        line = composition.pdf_line
        if not line or not line.pdf_character:
            return [], first_is_bullet_so_far

        first_is_bullet = first_is_bullet_so_far
        in_formula_state = False
        in_corner_mark_state = False
        corner_mark_info = []

        # Determine the `is_formula` tag for each character
        for i, char in enumerate(line.pdf_character):
            # The original logic for `first_is_bullet`: it is set if any segment starts with a bullet.
            # A "segment" started when `current_chars` was empty.
            # We determine the start of a segment by looking at the previous char's tag.
            is_start_of_segment = i == 0 or (
                len(is_formula_tags) > 0 and is_formula_tags[-1] != in_formula_state
            )
            if not first_is_bullet and is_start_of_segment and is_bullet_point(char):
                first_is_bullet = True

            is_formula = (
                (  # Distinguish formula-start characters from formula-middle characters: mainly, a comma may not start a formula but may appear inside one.
                    char.formula_layout_id
                    or (
                        is_formulas_start_char(
                            char.char_unicode,
                            self.font_mapper,
                            self.translation_config,
                        )
                        and not in_formula_state
                    )
                    or (
                        is_formulas_middle_char(
                            char.char_unicode,
                            self.font_mapper,
                            self.translation_config,
                        )
                        and in_formula_state
                    )
                )  # formula character
                or char.pdf_style.font_id in formula_font_ids  # formula font
                or char.vertical  # vertical font
                or (
                    #   a dummy space added by the program
                    char.char_unicode is None and in_formula_state
                )
                or (
                    # If the character's visual box and its actual box do not overlap, treat it as a formula character
                    char.box.x > char.visual_bbox.box.x2
                    or char.box.x2 < char.visual_bbox.box.x
                    or char.box.y > char.visual_bbox.box.y2
                    or char.box.y2 < char.visual_bbox.box.y
                )
            )

            previous_char = line.pdf_character[i - 1] if i > 0 else None
            next_char = (
                line.pdf_character[i + 1] if i < len(line.pdf_character) - 1 else None
            )
            # Characters without a unicode value are never treated as whitespace here.
            isspace = char.char_unicode.isspace() if char.char_unicode else False
            prev_is_space = (
                previous_char.char_unicode.isspace()
                if previous_char and previous_char.char_unicode
                else False
            )

            is_corner_mark = (
                (
                    previous_char is not None
                    and not isspace
                    and not prev_is_space
                    and not first_is_bullet
                    # Corner-mark font size: corner marks occur at 0.76 and uppercase letters at 0.799, so 0.79 is used as a midpoint; this also accounts for enlarged initial letters
                    and char.pdf_style.font_size
                    < previous_char.pdf_style.font_size * 0.79
                    and not in_corner_mark_state
                )
                or (
                    previous_char is not None
                    and not isspace
                    and not prev_is_space
                    and not first_is_bullet
                    # Already inside a corner mark: the looser 1.1 factor relative to the previous character is used to stay in that state
                    and char.pdf_style.font_size
                    < previous_char.pdf_style.font_size * 1.1
                    and in_corner_mark_state
                )
                or (
                    # Corner mark at the very start: with no previous character, decide by looking at the next character
                    previous_char is None
                    and next_char is not None
                    and not isspace
                    and not prev_is_space
                    and not first_is_bullet
                    # The current character is clearly smaller than the next one, so it is treated as a corner mark
                    and char.pdf_style.font_size < next_char.pdf_style.font_size * 0.79
                    and not in_corner_mark_state
                )
            )

            # A corner mark always counts as formula content.
            is_formula = is_formula or is_corner_mark

            # A plain space simply inherits the state of what came before it.
            if char.char_unicode == " ":
                is_formula = in_formula_state

            # This simulates the state change for the next iteration
            if is_formula != in_formula_state:
                in_formula_state = is_formula

            in_corner_mark_state = is_corner_mark
            is_formula_tags.append(is_formula)
            corner_mark_info.append(is_corner_mark)

        # Zip the per-character results back together with the characters.
        for char, is_formula, is_corner_mark in zip(
            line.pdf_character, is_formula_tags, corner_mark_info, strict=False
        ):
            tagged_chars.append((char, is_formula, is_corner_mark))

        return tagged_chars, first_is_bullet

    def _group_classified_characters(
        self,
        tagged_chars: list[tuple[PdfCharacter, bool, bool]],
        line_index: int,
    ) -> list[PdfParagraphComposition]:
        """
        Phase 2: Group consecutive characters with the same tag into new compositions.
        """
        if not tagged_chars:
            return []

        new_compositions = []
        current_chars = []
        current_tag = tagged_chars[0][1]
        current_corner_mark_flags = []

        for char, is_formula_tag, is_corner_mark in tagged_chars:
            if is_formula_tag == current_tag:
                current_chars.append(char)
                current_corner_mark_flags.append(is_corner_mark)
            else:
                # Check if any character in current group is a corner mark
                has_corner_mark = any(current_corner_mark_flags)
                new_compositions.append(
                    self.create_composition(
                        current_chars, current_tag, line_index, has_corner_mark
                    ),
                )
                current_chars = [char]
                current_tag = is_formula_tag
                current_corner_mark_flags = [is_corner_mark]

        if current_chars:
            # Check if any character in final group is a corner mark
            has_corner_mark = any(current_corner_mark_flags)
            new_compositions.append(
                self.create_composition(
                    current_chars, current_tag, line_index, has_corner_mark
                ),
            )

        return new_compositions

    def process_page_formulas(self, page: Page):
        if not page.pdf_paragraph:
            return

        page_level_formula_font_ids, xobj_specific_formula_font_ids = (
            collect_page_formula_font_ids(
                page, self.translation_config.formular_font_pattern
            )
        )

        for paragraph in page.pdf_paragraph:
            if not paragraph.pdf_paragraph_composition:
                continue

            current_formula_font_ids: set[int]
            if (
                paragraph.xobj_id
                and paragraph.xobj_id in xobj_specific_formula_font_ids
            ):
                current_formula_font_ids = xobj_specific_formula_font_ids[
                    paragraph.xobj_id
                ]
            else:
                current_formula_font_ids = page_level_formula_font_ids

            new_paragraph_compositions = []
            # This flag is carried through all compositions in a paragraph, as in the original implementation.
            first_is_bullet = False

            for line_index, composition in enumerate(
                paragraph.pdf_paragraph_composition
            ):
                (
                    tagged_chars,
                    first_is_bullet,
                ) = self._classify_characters_in_composition(
                    composition,
                    current_formula_font_ids,
                    first_is_bullet,
                    line_index,
                )

                if not tagged_chars:
                    new_paragraph_compositions.append(composition)
                    continue

                grouped_compositions = self._group_classified_characters(
                    tagged_chars, line_index
                )
                new_paragraph_compositions.extend(grouped_compositions)

            paragraph.pdf_paragraph_composition = new_paragraph_compositions

    def process_translatable_formulas(self, page: Page):
        """将需要正常翻译的公式（如纯数字、数字加逗号等）转换为普通文本行"""
        if not page.pdf_paragraph:
            return

        for paragraph in page.pdf_paragraph:
            if not paragraph.pdf_paragraph_composition:
                continue

            new_compositions = []
            for composition in paragraph.pdf_paragraph_composition:
                if (
                    composition.pdf_formula is not None
                    and not composition.pdf_formula.is_corner_mark
                    and self.is_translatable_formula(
                        composition.pdf_formula,
                    )
                ):
                    # 将可翻译公式转换为普通文本行
                    new_line = PdfLine(
                        pdf_character=composition.pdf_formula.pdf_character,
                    )
                    self.update_line_data(new_line)
                    new_compositions.append(PdfParagraphComposition(pdf_line=new_line))
                else:
                    new_compositions.append(composition)

            paragraph.pdf_paragraph_composition = new_compositions

    def process_page_styles(self, page: Page):
        """处理页面中的文本样式，识别相同样式的文本"""
        if not page.pdf_paragraph:
            return

        for paragraph in page.pdf_paragraph:
            if not paragraph.pdf_paragraph_composition:
                continue

            # 计算基准样式（除公式外所有文字样式的交集）
            base_style = self._calculate_base_style(paragraph)
            paragraph.pdf_style = base_style

            # 重新组织段落中的文本，将相同样式的文本组合在一起
            new_compositions = []
            current_chars = []
            current_style = None

            for comp in paragraph.pdf_paragraph_composition:
                if comp.pdf_formula is not None:
                    if current_chars:
                        new_comp = self._create_same_style_composition(
                            current_chars,
                            current_style,
                        )
                        new_compositions.append(new_comp)
                        current_chars = []
                    new_compositions.append(comp)
                    continue

                if not comp.pdf_line:
                    new_compositions.append(comp)
                    continue

                for char in comp.pdf_line.pdf_character:
                    char_style = char.pdf_style
                    if current_style is None:
                        current_style = char_style
                        current_chars.append(char)
                    elif is_same_style(char_style, current_style):
                        current_chars.append(char)
                    else:
                        if current_chars:
                            new_comp = self._create_same_style_composition(
                                current_chars,
                                current_style,
                            )
                            new_compositions.append(new_comp)
                        current_chars = [char]
                        current_style = char_style

            if current_chars:
                new_comp = self._create_same_style_composition(
                    current_chars,
                    current_style,
                )
                new_compositions.append(new_comp)

            paragraph.pdf_paragraph_composition = new_compositions

    def _calculate_base_style(self, paragraph) -> PdfStyle:
        """计算段落的基准样式（除公式外所有文字样式的交集）"""
        styles = []
        for comp in paragraph.pdf_paragraph_composition:
            if isinstance(comp, PdfFormula):
                continue
            if not comp.pdf_line:
                continue
            for char in comp.pdf_line.pdf_character:
                styles.append(char.pdf_style)

        if not styles:
            return None

        # 返回所有样式的交集
        base_style = styles[0]
        for style in styles[1:]:
            # 更新基准样式为所有样式的交集
            base_style = self._merge_styles(base_style, style)

        # 如果 font_id 或 font_size 为 None，则使用众数
        if base_style.font_id is None:
            base_style.font_id = self._get_mode_value([s.font_id for s in styles])
        if base_style.font_size is None:
            base_style.font_size = self._get_mode_value([s.font_size for s in styles])

        return base_style

    def _get_mode_value(self, values):
        """计算列表中的众数"""
        if not values:
            return None
        from collections import Counter

        counter = Counter(values)
        return counter.most_common(1)[0][0]

    def _merge_styles(self, style1, style2):
        """合并两个样式，返回它们的交集"""
        if style1 is None or style1.font_size is None:
            return style2
        if style2 is None or style2.font_size is None:
            return style1

        return PdfStyle(
            font_id=style1.font_id if style1.font_id == style2.font_id else None,
            font_size=(
                style1.font_size
                if math.fabs(style1.font_size - style2.font_size) < 0.02
                else None
            ),
            graphic_state=self._merge_graphic_states(
                style1.graphic_state,
                style2.graphic_state,
            ),
        )

    def _merge_graphic_states(self, state1, state2):
        """合并两个 GraphicState，返回它们的交集"""
        if state1 is None:
            return state2
        if state2 is None:
            return state1

        return GraphicState(
            passthrough_per_char_instruction=(
                state1.passthrough_per_char_instruction
                if state1.passthrough_per_char_instruction
                == state2.passthrough_per_char_instruction
                else None
            ),
        )

    def _create_same_style_composition(
        self,
        chars: list[PdfCharacter],
        style,
    ) -> PdfParagraphComposition:
        """创建具有相同样式的文本组合"""
        if not chars:
            return None

        # 计算边界框
        min_x = min(char.visual_bbox.box.x for char in chars)
        min_y = min(char.visual_bbox.box.y for char in chars)
        max_x = max(char.visual_bbox.box.x2 for char in chars)
        max_y = max(char.visual_bbox.box.y2 for char in chars)
        box = Box(min_x, min_y, max_x, max_y)

        return PdfParagraphComposition(
            pdf_same_style_characters=PdfSameStyleCharacters(
                box=box,
                pdf_style=style,
                pdf_character=chars,
            ),
        )

    def process_page_offsets(self, page: Page):
        """Compute the x and y offsets of every formula on the page.

        For each formula composition, a reference character is looked for in
        the composition immediately before it and in the one immediately
        after it (only those two are examined). A character qualifies when it
        has a `pdf_character_id` and its y-IoU with the formula box exceeds
        0.6. The offsets are stored on the formula as `x_offset` and
        `y_offset`. Paragraphs without a `debug_id` or without compositions
        are skipped.

        Args:
            page: The page whose formulas are updated in place.
        """
        if not page.pdf_paragraph:
            return

        for paragraph in page.pdf_paragraph:
            if paragraph.debug_id is None:
                continue
            if not paragraph.pdf_paragraph_composition:
                continue

            # Compute the paragraph's line spacing and use 80% of it as the tolerance
            # line_spacing = self.calculate_line_spacing(paragraph)
            # y_tolerance = line_spacing * 0.8

            for i, composition in enumerate(paragraph.pdf_paragraph_composition):
                if not composition.pdf_formula:
                    continue

                formula = composition.pdf_formula
                left_char = None
                right_char = None

                left_iou = 0
                right_iou = 0

                # Look for the nearest text on the same line to the left
                for j in range(i - 1, -1, -1):
                    comp = paragraph.pdf_paragraph_composition[j]
                    if comp.pdf_line:
                        for char in reversed(comp.pdf_line.pdf_character):
                            if not char.pdf_character_id:
                                continue
                            # Check whether the y coordinates are close enough to be on the same line
                            left_iou = calculate_y_true_iou_for_boxes(
                                formula.box, char.box
                            )
                            if left_iou > 0.6:
                                left_char = char
                                break
                    # Unconditional: only the directly preceding composition is examined.
                    break

                # Look for the nearest text on the same line to the right
                for j in range(i + 1, len(paragraph.pdf_paragraph_composition)):
                    comp = paragraph.pdf_paragraph_composition[j]
                    if comp.pdf_line:
                        for char in comp.pdf_line.pdf_character:
                            if not char.pdf_character_id:
                                continue
                            # Check whether the y coordinates are close enough to be on the same line
                            right_iou = calculate_y_true_iou_for_boxes(
                                formula.box, char.box
                            )
                            if right_iou > 0.6:
                                right_char = char
                                break
                    # Unconditional: only the directly following composition is examined.
                    break

                # If both text segments exist, keep the one with higher IOU
                if left_char and right_char:
                    if left_iou < right_iou:
                        left_char = None
                    elif right_iou < left_iou:
                        right_char = None
                    # If IOUs are equal, keep both

                # Compute the x offset (relative to the text on the left)
                if left_char:
                    formula.x_offset = formula.box.x - left_char.box.x2
                else:
                    formula.x_offset = 0  # With no text on the left, x_offset should be 0
                # Tiny offsets and offsets outside (-5, 10] are reset to 0.
                if abs(formula.x_offset) < 0.1:
                    formula.x_offset = 0
                if formula.x_offset > 10:
                    formula.x_offset = 0
                # if formula.x_offset > 0:
                #     formula.x_offset = 0
                if formula.x_offset < -5:
                    formula.x_offset = 0

                # Compute the y offset
                if left_char:
                    # Use the box's y coordinate to compute the offset
                    formula.y_offset = formula.box.y - left_char.box.y
                elif right_char:
                    formula.y_offset = formula.box.y - right_char.box.y
                else:
                    formula.y_offset = 0

                if abs(formula.y_offset) < 0.1:
                    formula.y_offset = 0

                # Placeholder for a (disabled) diagnostic about large offsets.
                if max(abs(formula.y_offset), abs(formula.x_offset)) > 10:
                    pass
                    # logging.debug(
                    #     f"Offset of formula {formula.box} is too large: {formula.x_offset}, {formula.y_offset}"
                    # )

    def calculate_line_spacing(self, paragraph) -> float:
        """计算段落中的平均行间距"""
        if not paragraph.pdf_paragraph_composition:
            return 0.0

        # 收集所有文本行的 y 坐标
        line_y_positions = []
        for comp in paragraph.pdf_paragraph_composition:
            if comp.pdf_line:
                line_y_positions.append(comp.pdf_line.box.y)

        if len(line_y_positions) < 2:
            return 10.0  # 如果只有一行或没有行，返回一个默认值

        # 计算相邻行之间的 y 差值
        line_spacings = []
        for i in range(len(line_y_positions) - 1):
            spacing = abs(line_y_positions[i] - line_y_positions[i + 1])
            if spacing > 0:  # 忽略重叠的行
                line_spacings.append(spacing)

        if not line_spacings:
            return 10.0  # 如果没有有效的行间距，返回默认值

        # 使用中位数来避免异常值的影响
        median_spacing = sorted(line_spacings)[len(line_spacings) // 2]
        return median_spacing

    def create_composition(
        self,
        chars: list[PdfCharacter],
        is_formula: bool,
        line_index: int,
        is_corner_mark: bool = False,
    ) -> PdfParagraphComposition:
        """Wrap a run of characters as either a formula or a text line.

        Args:
            chars: Characters of the run.
            is_formula: Truthy to build a formula, falsy to build a line.
            line_index: Stored as the formula's ``line_id``; unused for lines.
            is_corner_mark: Stored on the formula; unused for lines.

        Returns:
            A composition holding the new formula or the new line.
        """
        if not is_formula:
            text_line = PdfLine(pdf_character=chars)
            self.update_line_data(text_line)
            return PdfParagraphComposition(pdf_line=text_line)

        new_formula = PdfFormula(pdf_character=chars, line_id=line_index)
        new_formula.is_corner_mark = is_corner_mark
        self.update_formula_data(new_formula)
        return PdfParagraphComposition(pdf_formula=new_formula)

    def is_translatable_formula(self, formula: PdfFormula) -> bool:
        """判断公式是否只包含需要正常翻译的字符（数字、空格和英文逗号）"""
        if all(char.formula_layout_id for char in formula.pdf_character):
            return False

        text = "".join(char.char_unicode for char in formula.pdf_character)
        if formula.y_offset > 0.1:
            return False
        return bool(re.match(r"^[0-9, .]+$", text))

    def should_split_formula(self, formula: PdfFormula) -> bool:
        """判断公式是否需要按逗号拆分（包含逗号且有其他特殊符号）"""

        if all(x.formula_layout_id for x in formula.pdf_character):
            return False

        text = "".join(char.char_unicode for char in formula.pdf_character)
        # 必须包含逗号
        if "," not in text:
            return False
        # 检查是否包含除了数字和 [] 之外的其他符号
        text_without_basic = re.sub(r"[0-9\[\],\s]", "", text)
        return bool(text_without_basic)

    def split_formula_by_comma(
        self,
        formula: PdfFormula,
    ) -> list[tuple[list[PdfCharacter], PdfCharacter]]:
        """按逗号拆分公式字符，返回 (字符组，逗号字符) 的列表，最后一组的逗号字符为 None。
        只有不在括号内的逗号才会被用作分隔符。支持的括号对包括：
        - (cid:8) 和 (cid:9)
        - ( 和 )
        - (cid:16) 和 (cid:17)
        """
        result = []
        current_chars = []
        bracket_level = 0  # 跟踪括号的层数

        for char in formula.pdf_character:
            # 检查是否是左括号
            if char.char_unicode in LEFT_BRACKET:
                bracket_level += 1
                current_chars.append(char)
            # 检查是否是右括号
            elif char.char_unicode in RIGHT_BRACKET:
                bracket_level = max(0, bracket_level - 1)  # 防止括号不匹配的情况
                current_chars.append(char)
            # 检查是否是逗号，且不在括号内
            elif char.char_unicode == "," and bracket_level == 0:
                if current_chars:
                    result.append((current_chars, char))
                    current_chars = []
            else:
                current_chars.append(char)

        if current_chars:
            result.append((current_chars, None))  # 最后一组没有逗号

        return result

    def merge_formulas(self, formula1: PdfFormula, formula2: PdfFormula) -> PdfFormula:
        """Combine two formulas into a new one.

        The characters of ``formula1`` come first, followed by those of
        ``formula2``; no re-sorting is done. The result takes the ``line_id``
        of ``formula1`` and has its data refreshed through
        ``update_formula_data``.
        """
        combined = PdfFormula(
            pdf_character=formula1.pdf_character + formula2.pdf_character,
            line_id=formula1.line_id,
        )
        self.update_formula_data(combined)
        return combined

    def is_x_axis_contained(self, box1: Box, box2: Box) -> bool:
        """判断 box1 的 x 轴是否完全包含在 box2 的 x 轴内，或反之"""
        return (box1.x >= box2.x and box1.x2 <= box2.x2) or (
            box2.x >= box1.x and box2.x2 <= box1.x2
        )

    def has_y_intersection(self, box1: Box, box2: Box) -> bool:
        """判断两个 box 的 y 轴是否有交集"""
        tolerance = 1.0
        return not (box1.y2 < box2.y - tolerance or box2.y2 < box1.y - tolerance)

    def is_x_axis_adjacent(self, box1: Box, box2: Box, tolerance: float = 2.0) -> bool:
        """判断两个 box 在 x 轴上是否相邻或有交集"""
        # 检查是否有交集
        has_intersection = not (box1.x2 < box2.x or box2.x2 < box1.x)

        # 检查 box1 是否在 box2 左边且相邻
        left_adjacent = abs(box1.x2 - box2.x) <= tolerance
        # 检查 box2 是否在 box1 左边且相邻
        right_adjacent = abs(box2.x2 - box1.x) <= tolerance

        return has_intersection or left_adjacent or right_adjacent

    def calculate_y_iou(self, box1: Box, box2: Box) -> float:
        """计算两个 box 在 y 轴上的 IOU (Intersection over Union)"""
        # 计算交集
        intersection_start = max(box1.y, box2.y)
        intersection_end = min(box1.y2, box2.y2)
        intersection_length = max(0, intersection_end - intersection_start)

        # 计算并集
        box1_height = box1.y2 - box1.y
        box2_height = box2.y2 - box2.y
        union_length = box1_height + box2_height - intersection_length

        # 避免除零错误
        if union_length <= 0:
            return 0.0

        return intersection_length / union_length

    def merge_overlapping_formulas(self, page: Page):
        """
        Merge formulas within each paragraph that belong together.

        Two formulas are merged only when they share the same `line_id` and
        at least one of the following holds:
        1. they are directly adjacent compositions whose x ranges contain one
           another and whose y ranges intersect, or
        2. they are directly adjacent compositions that are adjacent on the x
           axis with a y-axis IOU > 0.5, or
        3. all of their characters share the same single layout id, or
        4. the IOU of the two formula boxes is > 0.8 (checked in both
           argument orders).
        Corner marks may be recognised as separate formulas and need merging.

        After every merge the scan restarts from the beginning of the
        paragraph; it stops once a full pass produces no merge.

        Args:
            page: The page whose paragraphs are modified in place.
        """
        if not page.pdf_paragraph:
            return

        for paragraph in page.pdf_paragraph:
            if not paragraph.pdf_paragraph_composition:
                continue

            # Repeat the merge pass until no more formulas can be merged
            merged = True
            while merged:
                merged = False
                for i in range(len(paragraph.pdf_paragraph_composition)):
                    # The list was changed by a merge: abandon this pass and restart.
                    if merged:
                        break
                    comp1 = paragraph.pdf_paragraph_composition[i]
                    if comp1.pdf_formula is None:
                        continue

                    for j in range(i + 1, len(paragraph.pdf_paragraph_composition)):
                        comp2 = paragraph.pdf_paragraph_composition[j]
                        if comp2.pdf_formula is None:
                            continue

                        formula1 = comp1.pdf_formula
                        formula2 = comp2.pdf_formula

                        # Merge conditions:
                        # 0. both must be on the same line (same line_id), and
                        # 1. x ranges contained and y ranges intersecting, or
                        # 2. adjacent on the x axis with y-axis IOU > 0.5, or
                        # 3. all characters share the same layout id, or
                        # 4. the IOU of the two formulas is > 0.8

                        # Check whether both formulas are on the same line
                        same_line = formula1.line_id == formula2.line_id

                        should_merge = same_line and (
                            (
                                # Conditions 1 and 2 only apply to directly neighbouring compositions.
                                j == i + 1
                                and (
                                    (
                                        self.is_x_axis_contained(
                                            formula1.box, formula2.box
                                        )
                                        and self.has_y_intersection(
                                            formula1.box, formula2.box
                                        )
                                    )
                                    or (
                                        self.is_x_axis_adjacent(
                                            formula1.box, formula2.box
                                        )
                                        and self.calculate_y_iou(
                                            formula1.box, formula2.box
                                        )
                                        > 0.5
                                    )
                                )
                            )
                            or (self._have_same_layout_ids(formula1, formula2, page))
                            or (
                                calculate_iou_for_boxes(formula1.box, formula2.box)
                                > 0.8
                            )
                            or (
                                calculate_iou_for_boxes(formula2.box, formula1.box)
                                > 0.8
                            )
                        )

                        if should_merge:
                            # Merge the two formulas into the first slot
                            merged_formula = self.merge_formulas(formula1, formula2)
                            paragraph.pdf_paragraph_composition[i] = (
                                PdfParagraphComposition(
                                    pdf_formula=merged_formula,
                                )
                            )
                            # Remove the second formula
                            del paragraph.pdf_paragraph_composition[j]
                            merged = True
                            break

    def _have_same_layout_ids(
        self, formula1: PdfFormula, formula2: PdfFormula, page: Page
    ) -> bool:
        """检查两个公式的所有字符是否具有相同的 layout id"""
        # 获取 formula1 中所有字符的 layout id
        formula1_layout_ids = set()
        for char in formula1.pdf_character:
            if char.char_unicode == " ":
                continue
            layout = char.formula_layout_id
            if layout:
                formula1_layout_ids.add(layout)

        # 获取 formula2 中所有字符的 layout id
        formula2_layout_ids = set()
        for char in formula2.pdf_character:
            if char.char_unicode == " ":
                continue
            layout = char.formula_layout_id
            if layout:
                formula2_layout_ids.add(layout)

        # 如果任一公式没有有效的 layout id，则不合并
        if not (len(formula1_layout_ids) == len(formula2_layout_ids) == 1):
            return False

        # 检查两个公式的 layout id 集合是否相同
        return formula1_layout_ids == formula2_layout_ids

    def process_comma_formulas(self, page: Page):
        """处理包含逗号的复杂公式，将其按逗号拆分"""
        if not page.pdf_paragraph:
            return

        for paragraph in page.pdf_paragraph:
            if not paragraph.pdf_paragraph_composition:
                continue

            new_compositions = []
            for composition in paragraph.pdf_paragraph_composition:
                if composition.pdf_formula is not None and self.should_split_formula(
                    composition.pdf_formula,
                ):
                    # 按逗号拆分公式
                    char_groups = self.split_formula_by_comma(composition.pdf_formula)
                    for chars, comma in char_groups:
                        if chars:  # 忽略空组（连续的逗号）
                            # 继承原公式的行 ID
                            formula = PdfFormula(
                                pdf_character=chars,
                                line_id=composition.pdf_formula.line_id,
                            )
                            self.update_formula_data(formula)
                            new_compositions.append(
                                PdfParagraphComposition(pdf_formula=formula),
                            )

                            # 如果有逗号，添加为文本行
                            if comma:
                                comma_line = PdfLine(pdf_character=[comma])
                                self.update_line_data(comma_line)
                                new_compositions.append(
                                    PdfParagraphComposition(pdf_line=comma_line),
                                )
                else:
                    new_compositions.append(composition)

            paragraph.pdf_paragraph_composition = new_compositions

    def remove_non_formula_lines_from_paragraphs(self, page: Page):
        """Remove non-formula lines from paragraphs.

        This method processes curves that remain in page.pdf_curve after
        collect_contained_elements() has assigned formula-related curves to formulas.
        All remaining curves are non-formula lines, but we need to be careful
        not to remove lines from figure/table areas.

        Args:
            page: The page to process
        """
        if not page.pdf_curve:
            return

        # Build layout index for efficient spatial queries
        layout_index, layout_map = build_layout_index(page)

        curves_to_remove = []

        # Get configuration thresholds
        protection_threshold = getattr(
            self.translation_config, "figure_table_protection_threshold", 0.9
        )
        overlap_threshold = getattr(
            self.translation_config, "non_formula_line_iou_threshold", 0.9
        )

        for curve in page.pdf_curve:
            # Skip if curve is in figure/table layout areas
            if is_curve_in_figure_table_layout(
                curve, layout_index, layout_map, protection_threshold
            ):
                continue

            # Only remove if curve overlaps with text paragraph areas
            if is_curve_overlapping_with_paragraphs(
                curve, page.pdf_paragraph, overlap_threshold
            ):
                curves_to_remove.append(curve)

        # Remove identified curves
        removed_count = 0
        for curve in curves_to_remove:
            if curve in page.pdf_curve:
                page.pdf_curve.remove(curve)
                removed_count += 1

        if removed_count > 0:
            import logging

            logger = logging.getLogger(__name__)
            logger.debug(f"Removed {removed_count} non-formula lines from paragraphs")


================================================
FILE: babeldoc/format/pdf/document_il/midend/table_parser.py
================================================
import logging
from pathlib import Path

import cv2
import numpy as np
from pymupdf import Document

from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img
from babeldoc.format.pdf.document_il.utils.style_helper import GREEN
from babeldoc.format.pdf.translation_config import TranslationConfig

logger = logging.getLogger(__name__)


class TableParser:
    """Pipeline stage that runs the table model on pages containing a table.

    Pages whose layout already has a ``"table"`` entry are handed to
    ``translation_config.table_model``. Every box the model reports is
    converted to the IL coordinate system and appended to ``page.page_layout``.
    """

    stage_name = "Parse Table"

    def __init__(self, translation_config: TranslationConfig):
        """Store the configuration and the table model it carries."""
        self.translation_config = translation_config
        self.model = translation_config.table_model

    def _save_debug_image(self, image: np.ndarray, layouts, page_number: int):
        """Save debug image with drawn boxes if debug mode is enabled.

        Args:
            image: Page image the boxes refer to; it is copied before drawing.
            layouts: One prediction result or a list of them. Each must expose
                ``boxes`` (items with ``xyxy`` and ``cls``) and ``names``.
            page_number: Used as the output file name (``<page_number>.jpg``).
        """
        if not self.translation_config.debug:
            return

        if not isinstance(layouts, list):
            layouts = [layouts]
        debug_dir = Path(
            self.translation_config.get_working_file_path("table-ocr-box-image")
        )
        debug_dir.mkdir(parents=True, exist_ok=True)

        box_color = (0, 255, 0)

        # Draw boxes on a copy so the caller's image is left untouched
        debug_image = image.copy()
        for layout in layouts:
            for box in layout.boxes:
                x0, y0, x1, y1 = box.xyxy
                top_left = (int(x0), int(y0))
                cv2.rectangle(
                    debug_image,
                    top_left,
                    (int(x1), int(y1)),
                    box_color,
                    2,
                )
                # Class label just above the top-left corner
                cv2.putText(
                    debug_image,
                    layout.names[box.cls],
                    (top_left[0], top_left[1] - 5),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    box_color,
                    1,
                )

        # Save the image
        output_path = debug_dir / f"{page_number}.jpg"
        cv2.imwrite(str(output_path), debug_image)

    def _save_debug_box_to_page(self, page: il_version_1.Page):
        """Save debug boxes and text labels to the PDF page.

        Does nothing unless debug mode is enabled. One rectangle and one label
        paragraph are appended for every entry currently in
        ``page.page_layout``.
        """
        if not self.translation_config.debug:
            return

        color = GREEN

        for layout in page.page_layout:
            # Create a rectangle box
            rect = il_version_1.PdfRectangle(
                box=il_version_1.Box(
                    x=layout.box.x,
                    y=layout.box.y,
                    x2=layout.box.x2,
                    y2=layout.box.y2,
                ),
                graphic_state=color,
                debug_info=True,
            )
            page.pdf_rectangle.append(rect)

            # Create text label at top-left corner
            # Note: PDF coordinates are from bottom-left,
            # so we use y2 for top position
            style = il_version_1.PdfStyle(
                font_id="base",
                font_size=4,
                graphic_state=color,
            )
            page.pdf_paragraph.append(
                il_version_1.PdfParagraph(
                    first_line_indent=False,
                    box=il_version_1.Box(
                        x=layout.box.x,
                        y=layout.box.y2,
                        x2=layout.box.x2,
                        y2=layout.box.y2 + 5,
                    ),
                    vertical=False,
                    pdf_style=style,
                    unicode=layout.class_name,
                    pdf_paragraph_composition=[
                        il_version_1.PdfParagraphComposition(
                            pdf_same_style_unicode_characters=il_version_1.PdfSameStyleUnicodeCharacters(
                                unicode=layout.class_name,
                                pdf_style=style,
                                debug_info=True,
                            ),
                        ),
                    ],
                    xobj_id=-1,
                ),
            )

    def process(self, docs: il_version_1.Document, mupdf_doc: Document):
        """Run the table model on every page that has a ``"table"`` layout.

        Args:
            docs: IL document; matching pages get new ``PageLayout`` entries
                appended to ``page.page_layout``.
            mupdf_doc: The PyMuPDF document the pages are rendered from.

        Returns:
            The same ``docs`` object.
        """
        # Only pages that already carry a "table" layout are sent to the model
        have_table_pages = {
            page.page_number: page
            for page in docs.page
            if any(layout.class_name == "table" for layout in page.page_layout)
        }
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            len(have_table_pages),
        ) as progress:
            # Process predictions for each page
            for page, layouts in self.model.handle_document(
                have_table_pages.values(),
                mupdf_doc,
                self.translation_config,
                self._save_debug_image,
            ):
                page_layouts = []
                # The image size depends only on the page, so it is fetched
                # lazily once per page instead of once per detected box.
                image_size = None
                for layout in layouts.boxes:
                    if image_size is None:
                        pix = get_no_rotation_img(mupdf_doc[page.page_number])
                        image_size = (pix.height, pix.width)
                    h, w = image_size

                    # Convert from image coordinates to IL coordinates:
                    # flip the y axis (h - y), pad by one pixel and clamp to
                    # the image bounds.
                    x0, y0, x1, y1 = layout.xyxy
                    x0, y0, x1, y1 = (
                        np.clip(int(x0 - 1), 0, w - 1),
                        np.clip(int(h - y1 - 1), 0, h - 1),
                        np.clip(int(x1 + 1), 0, w - 1),
                        np.clip(int(h - y0 + 1), 0, h - 1),
                    )
                    page_layout = il_version_1.PageLayout(
                        id=len(page_layouts) + 1,
                        box=il_version_1.Box(
                            x0.item(),
                            y0.item(),
                            x1.item(),
                            y1.item(),
                        ),
                        conf=layout.conf.item(),
                        class_name=layouts.names[layout.cls],
                    )
                    page_layouts.append(page_layout)

                page.page_layout.extend(page_layouts)
                self._save_debug_box_to_page(page)
                progress.advance(1)

        return docs


================================================
FILE: babeldoc/format/pdf/document_il/midend/typesetting.py
================================================
from __future__ import annotations

import copy
import logging
import re
import statistics
import unicodedata
from functools import cache

import pymupdf
import regex
from rtree import index

from babeldoc.const import WATERMARK_VERSION
from babeldoc.format.pdf.document_il import Box
from babeldoc.format.pdf.document_il import PdfCharacter
from babeldoc.format.pdf.document_il import PdfCurve
from babeldoc.format.pdf.document_il import PdfForm
from babeldoc.format.pdf.document_il import PdfFormula
from babeldoc.format.pdf.document_il import PdfParagraphComposition
from babeldoc.format.pdf.document_il import PdfStyle
from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper
from babeldoc.format.pdf.document_il.utils.formular_helper import update_formula_data
from babeldoc.format.pdf.document_il.utils.layout_helper import box_to_tuple
from babeldoc.format.pdf.translation_config import TranslationConfig
from babeldoc.format.pdf.translation_config import WatermarkOutputMode

logger = logging.getLogger(__name__)

# Matches text made up solely of "word" characters, i.e. characters that must
# not be separated from their neighbours by a line break.
LINE_BREAK_REGEX = regex.compile(
    r"^["
    r"a-z"
    r"A-Z"
    r"0-9"
    r"\u00C0-\u00FF"  # Latin-1 Supplement
    r"\u0100-\u017F"  # Latin Extended A
    r"\u0180-\u024F"  # Latin Extended B
    r"\u1E00-\u1EFF"  # Latin Extended Additional
    r"\u2C60-\u2C7F"  # Latin Extended C
    r"\uA720-\uA7FF"  # Latin Extended D
    r"\uAB30-\uAB6F"  # Latin Extended E
    r"\u0250-\u02A0"  # IPA Extensions
    r"\u0400-\u04FF"  # Cyrillic
    r"\u0300-\u036F"  # Combining Diacritical Marks
    r"\u0500-\u052F"  # Cyrillic Supplement
    r"\u0370-\u03FF"  # Greek and Coptic
    r"\u2DE0-\u2DFF"  # Cyrillic Extended-A
    r"\uA650-\uA69F"  # Cyrillic Extended-B
    r"\u1200-\u137F"  # Ethiopic
    r"\u1380-\u139F"  # Ethiopic Supplement
    r"\u2D80-\u2DDF"  # Ethiopic Extended
    r"\uAB00-\uAB2F"  # Ethiopic Extended-A
    r"\U0001E7E0-\U0001E7FF"  # Ethiopic Extended-B
    r"\u0E80-\u0EFF"  # Lao
    r"\u0D00-\u0D7F"  # Malayalam
    r"\u0A80-\u0AFF"  # Gujarati
    r"\u0E00-\u0E7F"  # Thai
    r"\u1000-\u109F"  # Myanmar
    r"\uAA60-\uAA7F"  # Myanmar Extended-A
    r"\uA9E0-\uA9FF"  # Myanmar Extended-B
    r"\U000116D0-\U000116FF"  # Myanmar Extended-C
    r"\u0B80-\u0BFF"  # Tamil
    r"\u0C00-\u0C7F"  # Telugu
    r"\u0B00-\u0B7F"  # Oriya
    r"\u0530-\u058F"  # Armenian
    r"\u10A0-\u10FF"  # Georgian
    r"\u1C90-\u1CBF"  # Georgian Extended
    r"\u2D00-\u2D2F"  # Georgian Supplement
    r"\u1780-\u17FF"  # Khmer
    r"\u19E0-\u19FF"  # Khmer Symbols
    r"\U00010B00-\U00010B3F"  # Avestan
    r"\u1D00-\u1D7F"  # Phonetic Extensions
    r"\u1400-\u167F"  # Unified Canadian Aboriginal Syllabics
    r"\u0780-\u07BF"  # Thaana
    r"\U0001E900-\U0001E95F"  # Adlam
    r"\u1C80-\u1C8F"  # Cyrillic Extended-C
    r"\U0001E030-\U0001E08F"  # Cyrillic Extended-D
    r"\uA000-\uA48F"  # Yi Syllables
    r"\uA490-\uA4CF"  # Yi Radicals
    r"'"  # Apostrophe
    # The hyphen must be escaped: these fragments are concatenated, so a bare
    # "-" between "'" and "·" is parsed as the range U+0027-U+00B7, which also
    # swallows ASCII punctuation such as "(", ",", "." and "/".
    r"\-"  # Hyphen
    r"·"  # Middle Dot (U+00B7) For Català
    r"ʻ"  # Spacing Modifier Letters U+02BB
    r"]+$"
)


class TypesettingUnit:
    def __str__(self):
        return self.try_get_unicode() or ""

    def __init__(
        self,
        char: PdfCharacter | None = None,
        formular: PdfFormula | None = None,
        unicode: str | None = None,
        font: pymupdf.Font | None = None,
        original_font: il_version_1.PdfFont | None = None,
        font_size: float | None = None,
        style: PdfStyle | None = None,
        xobj_id: int | None = None,
        debug_info: bool = False,
    ):
        assert (char is not None) + (formular is not None) + (
            unicode is not None
        ) == 1, "Only one of chars and formular can be not None"
        self.char = char
        self.formular = formular
        self.unicode = unicode
        self.x = None
        self.y = None
        self.scale = None
        self.debug_info = debug_info

        # Cache variables
        self.box_cache: Box | None = None
        self.can_break_line_cache: bool | None = None
        self.is_cjk_char_cache: bool | None = None
        self.mixed_character_blacklist_cache: bool | None = None
        self.is_space_cache: bool | None = None
        self.is_hung_punctuation_cache: bool | None = None
        self.is_cannot_appear_in_line_end_punctuation_cache: bool | None = None
        self.can_passthrough_cache: bool | None = None
        self.width_cache: float | None = None
        self.height_cache: float | None = None

        self.font_size: float | None = None

        if unicode:
            assert font_size, "Font size must be provided when unicode is provided"
            assert style, "Style must be provided when unicode is provided"
            assert len(unicode) == 1, "Unicode must be a single character"
            assert xobj_id is not None, (
                "Xobj id must be provided when unicode is provided"
            )

            self.font = font
            if font is not None and hasattr(font, "font_id"):
                self.font_id = font.font_id
            else:
                self.font_id = "base"
            if original_font:
                self.original_font = original_font
            else:
                self.original_font = None

            self.font_size = font_size
            self.style = style
            self.xobj_id = xobj_id

    def try_resue_cache(self, old_tu: TypesettingUnit):
        if old_tu.is_cjk_char_cache is not None:
            self.is_cjk_char_cache = old_tu.is_cjk_char_cache

        if old_tu.can_break_line_cache is not None:
            self.can_break_line_cache = old_tu.can_break_line_cache

        if old_tu.is_space_cache is not None:
            self.is_space_cache = old_tu.is_space_cache

        if old_tu.is_hung_punctuation_cache is not None:
            self.is_hung_punctuation_cache = old_tu.is_hung_punctuation_cache

        if old_tu.is_cannot_appear_in_line_end_punctuation_cache is not None:
            self.is_cannot_appear_in_line_end_punctuation_cache = (
                old_tu.is_cannot_appear_in_line_end_punctuation_cache
            )

        if old_tu.can_passthrough_cache is not None:
            self.can_passthrough_cache = old_tu.can_passthrough_cache

        if old_tu.mixed_character_blacklist_cache is not None:
            self.mixed_character_blacklist_cache = (
                old_tu.mixed_character_blacklist_cache
            )

    def try_get_unicode(self) -> str | None:
        if self.char:
            return self.char.char_unicode
        elif self.formular:
            return None
        elif self.unicode:
            return self.unicode

    @property
    def mixed_character_blacklist(self):
        if self.mixed_character_blacklist_cache is None:
            self.mixed_character_blacklist_cache = self.calc_mixed_character_blacklist()

        return self.mixed_character_blacklist_cache

    def calc_mixed_character_blacklist(self):
        unicode = self.try_get_unicode()
        if unicode:
            return unicode in [
                "。",
                "，",
                "：",
                "？",
                "！",
            ]
        return False

    @property
    def can_break_line(self):
        if self.can_break_line_cache is None:
            self.can_break_line_cache = self.calc_can_break_line()

        return self.can_break_line_cache

    def calc_can_break_line(self):
        unicode = self.try_get_unicode()
        if not unicode:
            return True
        if LINE_BREAK_REGEX.match(unicode):
            return False
        return True

    @property
    def is_cjk_char(self):
        if self.is_cjk_char_cache is None:
            self.is_cjk_char_cache = self.calc_is_cjk_char()

        return self.is_cjk_char_cache

    def calc_is_cjk_char(self):
        if self.formular:
            return False
        unicode = self.try_get_unicode()
        if not unicode:
            return False
        if "(cid" in unicode:
            return False
        if len(unicode) > 1:
            return False
        assert len(unicode) == 1, "Unicode must be a single character"
        if unicode in [
            "（",
            "）",
            "【",
            "】",
            "《",
            "》",
            "〔",
            "〕",
            "〈",
            "〉",
            "〖",
            "〗",
            "「",
            "」",
            "『",
            "』",
            "、",
            "。",
            "：",
            "？",
            "！",
            "，",
        ]:
            return True
        if unicode:
            if re.match(
                r"^["
                r"\u3000-\u303f"  # CJK Symbols and Punctuation
                r"\u3040-\u309f"  # Hiragana
                r"\u30a0-\u30ff"  # Katakana
                r"\u3100-\u312f"  # Bopomofo
                r"\uac00-\ud7af"  # Hangul Syllables
                r"\u1100-\u11ff"  # Hangul Jamo
                r"\u3130-\u318f"  # Hangul Compatibility Jamo
                r"\ua960-\ua97f"  # Hangul Jamo Extended-A
                r"\ud7b0-\ud7ff"  # Hangul Jamo Extended-B
                r"\u3190-\u319f"  # Kanbun
                r"\u3200-\u32ff"  # Enclosed CJK Letters and Months
                r"\u3300-\u33ff"  # CJK Compatibility
                r"\ufe30-\ufe4f"  # CJK Compatibility Forms
                r"\u4e00-\u9fff"  # CJK Unified Ideographs
                r"\u2e80-\u2eff"  # CJK Radicals Supplement
                r"\u31c0-\u31ef"  # CJK Strokes
                r"\u2f00-\u2fdf"  # Kangxi Radicals
                r"\ufe10-\ufe1f"  # Vertical Forms
                r"]+$",
                unicode,
            ):
                return True
            try:
                unicodedata_name = unicodedata.name(unicode)
                return (
                    "CJK UNIFIED IDEOGRAPH" in unicodedata_name
                    or "FULLWIDTH" in unicodedata_name
                )
            except ValueError:
                return False
        return False

    @property
    def is_space(self):
        if self.is_space_cache is None:
            self.is_space_cache = self.calc_is_space()

        return self.is_space_cache

    def calc_is_space(self):
        if self.formular:
            return False
        unicode = self.try_get_unicode()
        return unicode == " "

    @property
    def is_hung_punctuation(self):
        if self.is_hung_punctuation_cache is None:
            self.is_hung_punctuation_cache = self.calc_is_hung_punctuation()

        return self.is_hung_punctuation_cache

    def calc_is_hung_punctuation(self):
        if self.formular:
            return False
        unicode = self.try_get_unicode()

        if unicode:
            return unicode in [
                # 英文标点
                ",",
                ".",
                ":",
                ";",
                "?",
                "!",
                # 中文点号
                "，",  # 逗号
                "。",  # 句号
                "．",  # 全角句号
                "、",  # 顿号
                "：",  # 冒号
                "；",  # 分号
                "！",  # 叹号
                "‼",  # 双叹号
                "？",  # 问号
                "⁇",  # 双问号
                # 结束引号
                "”",  # 右双引号
                "’",  # 右单引号
                "」",  # 右直角单引号
                "』",  # 右直角双引号
                # 结束括号
                ")",  # 右圆括号
                "]",  # 右方括号
                "}",  # 右花括号
                "）",  # 右圆括号
                "〕",  # 右龟甲括号
                "〉",  # 右单书名号
                "】",  # 右黑色方头括号
                "〗",  # 右空白方头括号
                "］",  # 全角右方括号
                "｝",  # 全角右花括号
                # 结束双书名号
                "》",  # 右双书名号
                # 连接号
                "～",  # 全角波浪号
                "-",  # 连字符减号
                "–",  # 短破折号 (EN DASH)
                "—",  # 长破折号 (EM DASH)
                # 间隔号
                "·",  # 中间点
                "・",  # 片假名中间点
                "‧",  # 连字点
                # 分隔号
                "/",  # 斜杠
                "／",  # 全角斜杠
                "⁄",  # 分数斜杠
            ]
        return False

    @property
    def is_cannot_appear_in_line_end_punctuation(self):
        if self.is_cannot_appear_in_line_end_punctuation_cache is None:
            self.is_cannot_appear_in_line_end_punctuation_cache = (
                self.calc_is_cannot_appear_in_line_end_punctuation()
            )

        return self.is_cannot_appear_in_line_end_punctuation_cache

    def calc_is_cannot_appear_in_line_end_punctuation(self):
        if self.formular:
            return False
        unicode = self.try_get_unicode()
        if not unicode:
            return False
        return unicode in [
            # 开始引号
            "“",  # 左双引号
            "‘",  # 左单引号
            "「",  # 左直角单引号
            "『",  # 左直角双引号
            # 开始括号
            "(",  # 左圆括号
            "[",  # 左方括号
            "{",  # 左花括号
            "（",  # 左圆括号
            "〔",  # 左龟甲括号
            "〈",  # 左单书名号
            "《",  # 左双书名号
            # 开始单双书名号
            "〖",  # 左空白方头括号
            "〘",  # 左黑色方头括号
            "〚",  # 左单书名号
        ]

    def passthrough(
        self,
    ) -> tuple[list[PdfCharacter], list[PdfCurve], list[PdfForm]]:
        if self.char:
            return [self.char], [], []
        elif self.formular:
            return (
                self.formular.pdf_character,
                self.formular.pdf_curve,
                self.formular.pdf_form,
            )
        elif self.unicode:
            logger.error(f"Cannot passthrough unicode. TypesettingUnit: {self}. ")
            logger.error(f"Cannot passthrough unicode. TypesettingUnit: {self}. ")
            return [], [], []

    @property
    def can_passthrough(self):
        if self.can_passthrough_cache is None:
            self.can_passthrough_cache = self.calc_can_passthrough()

        return self.can_passthrough_cache

    def calc_can_passthrough(self):
        return self.unicode is None

    def calculate_box(self):
        if self.char:
            box = copy.deepcopy(self.char.box)
            if self.char.visual_bbox and self.char.visual_bbox.box:
                box.y = self.char.visual_bbox.box.y
                box.y2 = self.char.visual_bbox.box.y2
                # return self.char.visual_bbox.box

            return box
        elif self.formular:
            return self.formular.box
            # if self.formular.x_offset <= 0.5:
            #     return self.formular.box
            # formular_box = copy.copy(self.formular.box)
            # formular_box.x2 += self.formular.x_advance
            # return formular_box
        elif self.unicode:
            char_width = self.font.char_lengths(self.unicode, self.font_size)[0]
            if self.x is None or self.y is None or self.scale is None:
                return Box(0, 0, char_width, self.font_size)
            return Box(self.x, self.y, self.x + char_width, self.y + self.font_size)

    @property
    def box(self):
        if not self.box_cache:
            self.box_cache = self.calculate_box()

        return self.box_cache

    @property
    def width(self):
        if self.width_cache is None:
            self.width_cache = self.calc_width()

        return self.width_cache

    def calc_width(self):
        box = self.box
        return box.x2 - box.x

    @property
    def height(self):
        if self.height_cache is None:
            self.height_cache = self.calc_height()

        return self.height_cache

    def calc_height(self):
        box = self.box
        return box.y2 - box.y

    def relocate(
        self,
        x: float,
        y: float,
        scale: float,
    ) -> TypesettingUnit:
        """Return a copy of this unit moved to ``(x, y)`` and scaled.

        Args:
            x: New x coordinate.
            y: New y coordinate.
            scale: Scale factor applied to sizes and offsets.

        Returns:
            A new typesetting unit; this unit is left unchanged.
        """
        if self.char:
            source = self.char
            source_style = source.pdf_style
            # Build a fresh character at the new position
            moved_char = PdfCharacter(
                pdf_character_id=source.pdf_character_id,
                char_unicode=source.char_unicode,
                box=Box(
                    x=x,
                    y=y,
                    x2=x + self.width * scale,
                    y2=y + self.height * scale,
                ),
                pdf_style=PdfStyle(
                    font_id=source_style.font_id,
                    font_size=source_style.font_size * scale,
                    graphic_state=source_style.graphic_state,
                ),
                scale=scale,
                vertical=source.vertical,
                advance=source.advance * scale if source.advance else None,
                debug_info=self.debug_info,
                xobj_id=source.xobj_id,
            )
            moved_unit = TypesettingUnit(char=moved_char)
            moved_unit.try_resue_cache(self)
            return moved_unit

        if self.formular:
            formula = self.formular
            origin_x = formula.box.x
            origin_y = formula.box.y
            x_off = formula.x_offset
            y_off = formula.y_offset

            def moved(inner: Box) -> Box:
                # Keep the box's position relative to the formula origin,
                # shift it by the formula offsets and scale the result.
                rel_x = inner.x - origin_x
                rel_y = inner.y - origin_y
                return Box(
                    x=x + (rel_x + x_off) * scale,
                    y=y + (rel_y + y_off) * scale,
                    x2=x + (rel_x + (inner.x2 - inner.x) + x_off) * scale,
                    y2=y + (rel_y + (inner.y2 - inner.y) + y_off) * scale,
                )

            # Rebuild every formula character at its relocated position
            new_chars = []
            for char in formula.pdf_character:
                visual_box = char.visual_bbox.box
                new_chars.append(
                    PdfCharacter(
                        pdf_character_id=char.pdf_character_id,
                        char_unicode=char.char_unicode,
                        box=moved(char.box),
                        visual_bbox=il_version_1.VisualBbox(box=moved(visual_box)),
                        pdf_style=PdfStyle(
                            font_id=char.pdf_style.font_id,
                            font_size=char.pdf_style.font_size * scale,
                            graphic_state=char.pdf_style.graphic_state,
                        ),
                        scale=scale,
                        vertical=char.vertical,
                        advance=char.advance * scale if char.advance else None,
                        xobj_id=char.xobj_id,
                    )
                )

            # Calculate bounding box from the visual boxes of new_chars
            new_formula = PdfFormula(
                box=Box(
                    x=min(char.visual_bbox.box.x for char in new_chars),
                    y=min(char.visual_bbox.box.y for char in new_chars),
                    x2=max(char.visual_bbox.box.x2 for char in new_chars),
                    y2=max(char.visual_bbox.box.y2 for char in new_chars),
                ),
                pdf_character=new_chars,
                x_offset=x_off * scale,
                y_offset=y_off * scale,
                x_advance=formula.x_advance * scale,
            )

            # Handle contained curves
            new_formula.pdf_curve = [
                self._transform_curve_for_relocation(
                    curve, origin_x, origin_y, x, y, scale
                )
                for curve in formula.pdf_curve
            ]

            # Handle contained forms
            new_formula.pdf_form = [
                self._transform_form_for_relocation(
                    form, origin_x, origin_y, x, y, scale
                )
                for form in formula.pdf_form
            ]

            update_formula_data(new_formula)

            moved_unit = TypesettingUnit(formular=new_formula)
            moved_unit.try_resue_cache(self)
            return moved_unit

        if self.unicode:
            # Raw text only records its new placement; the character object
            # is created later by `render`.
            moved_unit = TypesettingUnit(
                unicode=self.unicode,
                font=self.font,
                original_font=self.original_font,
                font_size=self.font_size * scale,
                style=self.style,
                xobj_id=self.xobj_id,
                debug_info=self.debug_info,
            )
            moved_unit.x = x
            moved_unit.y = y
            moved_unit.scale = scale
            moved_unit.try_resue_cache(self)
            return moved_unit

    def _transform_curve_for_relocation(
        self,
        curve,
        original_formula_x: float,
        original_formula_y: float,
        new_x: float,
        new_y: float,
        scale: float,
    ):
        """Return a deep copy of ``curve`` placed for a relocated formula.

        The copy's box (when present) is moved the same way formula characters
        are, and ``relocation_transform`` is set to a translate-and-scale
        matrix; the curve's own transform data is not modified.
        """
        x_off = self.formular.x_offset
        y_off = self.formular.y_offset

        moved_curve = copy.deepcopy(curve)

        old_box = moved_curve.box
        if old_box:
            # Position relative to the formula's original origin (same as chars)
            rel_x = old_box.x - original_formula_x
            rel_y = old_box.y - original_formula_y

            moved_curve.box = Box(
                x=new_x + (rel_x + x_off) * scale,
                y=new_y + (rel_y + y_off) * scale,
                x2=new_x + (rel_x + (old_box.x2 - old_box.x) + x_off) * scale,
                y2=new_y + (rel_y + (old_box.y2 - old_box.y) + y_off) * scale,
            )

        # Set relocation transform instead of modifying original CTM
        translation_x = new_x + x_off * scale - original_formula_x * scale
        translation_y = new_y + y_off * scale - original_formula_y * scale

        from babeldoc.format.pdf.document_il.utils.matrix_helper import (
            create_translation_and_scale_matrix,
        )

        moved_curve.relocation_transform = list(
            create_translation_and_scale_matrix(translation_x, translation_y, scale)
        )

        return moved_curve

    def _transform_form_for_relocation(
        self,
        form,
        original_formula_x: float,
        original_formula_y: float,
        new_x: float,
        new_y: float,
        scale: float,
    ):
        """Return a deep copy of ``form`` placed for a relocated formula.

        The copy's box (when present) is moved the same way formula characters
        are, and ``relocation_transform`` is set to a translate-and-scale
        matrix; the form's own matrices are not modified.
        """
        x_off = self.formular.x_offset
        y_off = self.formular.y_offset

        moved_form = copy.deepcopy(form)

        old_box = moved_form.box
        if old_box:
            # Position relative to the formula's original origin (same as chars)
            rel_x = old_box.x - original_formula_x
            rel_y = old_box.y - original_formula_y

            moved_form.box = Box(
                x=new_x + (rel_x + x_off) * scale,
                y=new_y + (rel_y + y_off) * scale,
                x2=new_x + (rel_x + (old_box.x2 - old_box.x) + x_off) * scale,
                y2=new_y + (rel_y + (old_box.y2 - old_box.y) + y_off) * scale,
            )

        # Set relocation transform instead of modifying original matrices
        translation_x = new_x + x_off * scale - original_formula_x * scale
        translation_y = new_y + y_off * scale - original_formula_y * scale

        from babeldoc.format.pdf.document_il.utils.matrix_helper import (
            create_translation_and_scale_matrix,
        )

        moved_form.relocation_transform = list(
            create_translation_and_scale_matrix(translation_x, translation_y, scale)
        )

        return moved_form

    def render(
        self,
    ) -> tuple[list[PdfCharacter], list[PdfCurve], list[PdfForm]]:
        """渲染排版单元为 PdfCharacter 列表

        Returns:
            PdfCharacter 列表
        """
        if self.can_passthrough:
            return self.passthrough()
        elif self.unicode:
            assert self.x is not None, (
                "x position must be set, should be set by `relocate`"
            )
            assert self.y is not None, (
                "y position must be set, should be set by `relocate`"
            )
            assert self.scale is not None, (
                "scale must be set, should be set by `relocate`"
            )
            x = self.x
            y = self.y
            # if self.original_font and self.font and hasattr(self.original_font, "descent") and hasattr(self.font, "descent_fontmap"):
            #     original_descent = self.original_font.descent
            #     new_descent = self.font.descent_fontmap
            #     y -= (original_descent - new_descent) * self.font_size / 1000

            # 计算字符宽度
            char_width = self.width

            new_char = PdfCharacter(
                pdf_character_id=self.font.has_glyph(ord(self.unicode)),
                char_unicode=self.unicode,
                box=Box(
                    x=x,  # 使用存储的位置
                    y=y,
                    x2=x + char_width,
                    y2=y + self.font_size,
                ),
                pdf_style=PdfStyle(
                    font_id=self.font_id,
                    font_size=self.font_size,
                    graphic_state=self.style.graphic_state,
                ),
                scale=self.scale,
                vertical=False,
                advance=char_width,
                xobj_id=self.xobj_id,
                debug_info=self.debug_info,
            )
            return [new_char], [], []
        else:
            logger.error(f"Unknown typesetting unit. TypesettingUnit: {self}. ")
            logger.error(f"Unknown typesetting unit. TypesettingUnit: {self}. ")
            return [], [], []


class Typesetting:
    stage_name = "Typesetting"

    def __init__(self, translation_config: TranslationConfig):
        """Set up font mapping and detect whether the output language is CJK.

        Args:
            translation_config: Configuration of the current translation run.
                Its ``lang_out`` value is upper-cased and kept as ``lang_code``.
        """
        self.font_mapper = FontMapper(translation_config)
        self.translation_config = translation_config
        self.lang_code = self.translation_config.lang_out.upper()

        # Why zh-CN/zh-HK/zh-TW here but not zh-Hans and so on?
        # See https://funstory-ai.github.io/BabelDOC/supported_languages/
        # NOTE(review): "KR" is a region tag; the ISO 639-1 language code for
        # Korean is "KO" -- confirm whether "KO" should be matched as well.
        cjk_markers = (
            "ZH",  # C
            "JA",
            "JP",  # J
            "KR",  # K
            "CN",
            "HK",
            "TW",
        )
        # Substring match: a marker appearing anywhere in the code counts.
        self.is_cjk = any(marker in self.lang_code for marker in cjk_markers)

    def preprocess_document(self, document: il_version_1.Document, pbar):
        """预处理文档，获取每个段落的最优缩放因子，不执行实际排版"""
        all_scales: list[float] = []
        all_paragraphs: list[il_version_1.PdfParagraph] = []

        for page in document.page:
            pbar.advance()
            # 准备字体信息（复制自 render_page 的逻辑）
            fonts: dict[
                str | int,
                il_version_1.PdfFont | dict[str, il_version_1.PdfFont],
            ] = {f.font_id: f for f in page.pdf_font if f.font_id}
            page_fonts = {f.font_id: f for f in page.pdf_font if f.font_id}
            for k, v in self.font_mapper.fontid2font.items():
                fonts[k] = v
            for xobj in page.pdf_xobject:
                if xobj.xobj_id is not None:
                    fonts[xobj.xobj_id] = page_fonts.copy()
                    for font in xobj.pdf_font:
                        if (
                            xobj.xobj_id in fonts
                            and isinstance(fonts[xobj.xobj_id], dict)
                            and font.font_id
                        ):
                            fonts[xobj.xobj_id][font.font_id] = font

            # 处理每个段落
            for paragraph in page.pdf_paragraph:
                all_paragraphs.append(paragraph)
                unit_count = 0
                try:
                    typesetting_units = self.create_typesetting_units(paragraph, fonts)
                    unit_count = len(typesetting_units)
                    for unit in typesetting_units:
                        if unit.formular:
                            unit_count += len(unit.formular.pdf_character) - 1

                    # 如果所有单元都可以直接传递，则 scale = 1.0
                    if all(unit.can_passthrough for unit in typesetting_units):
                        paragraph.optimal_scale = 1.0
                    else:
                        # 获取最优缩放因子
                        optimal_scale = self._get_optimal_scale(
                            paragraph, page, typesetting_units
                        )
                        paragraph.optimal_scale = optimal_scale
                except Exception as e:
                    # 如果预处理出错，默认使用 1.0 缩放因子
                    logger.warning(f"预处理段落时出错：{e}")
                    paragraph.optimal_scale = 1.0

                if paragraph.optimal_scale is not None:
                    all_scales.extend([paragraph.optimal_scale] * unit_count)

        # 获取缩放因子的众数
        if all_scales:
            try:
                modes = statistics.multimode(all_scales)
                mode_scale = min(modes)
            except statistics.StatisticsError:
                logger.warning(
                    "Could not find a mode for paragraph scales. Falling back to median."
                )
                mode_scale = statistics.median(all_scales)
            # 将所有大于众数的值修改为众数
            for paragraph in all_paragraphs:
                if (
                    paragraph.optimal_scale is not None
                    and paragraph.optimal_scale > mode_scale
                ):
                    paragraph.optimal_scale = mode_scale
        else:
            logger.error(
                "document_scales is empty, there seems no paragraph in this PDF"
            )

    def _find_optimal_scale_and_layout(
        self,
        paragraph: il_version_1.PdfParagraph,
        page: il_version_1.Page,
        typesetting_units: list[TypesettingUnit],
        initial_scale: float = 1.0,
        use_english_line_break: bool = True,
        apply_layout: bool = False,
    ) -> tuple[float, list[TypesettingUnit] | None]:
        """Search for a scale at which the units fit, optionally applying the layout.

        Starting from ``initial_scale`` the units are laid out inside the
        paragraph box. On failure the scale is reduced step by step; once it
        drops below 0.7 the box is first extended downwards and then to the
        right (each attempted once). If nothing fits down to the minimum scale
        of 0.1, the whole search is repeated once with English line-break rules
        disabled.

        Args:
            paragraph: Paragraph being typeset.
            page: Page that owns the paragraph.
            typesetting_units: Units to lay out.
            initial_scale: Scale the search starts from.
            use_english_line_break: Whether English line-break rules are used.
            apply_layout: When True the result is written back: ``paragraph.scale``,
                ``paragraph.pdf_paragraph_composition`` and (if the box was
                expanded) ``paragraph.box`` are updated, and rendered curves and
                forms are appended to ``page``.

        Returns:
            tuple[float, list[TypesettingUnit] | None]: the final scale and the
            laid-out units. The unit list is only non-None when ``apply_layout``
            is True and a fitting layout was found.
        """
        if not paragraph.box:
            return initial_scale, None

        box = paragraph.box
        scale = initial_scale
        # Line spacing factor: CJK text gets more leading than other scripts.
        line_skip = 1.50 if self.is_cjk else 1.3
        min_scale = 0.1
        # 0 = no expansion tried, 1 = downward tried, 2 = rightward tried too.
        expand_space_flag = 0
        final_typeset_units = None

        while scale >= min_scale:
            try:
                # Try to lay out the units at the current scale.
                typeset_units, all_units_fit = self._layout_typesetting_units(
                    typesetting_units,
                    box,
                    scale,
                    line_skip,
                    paragraph,
                    use_english_line_break,
                )

                # Everything fits at this scale.
                if all_units_fit:
                    if apply_layout:
                        # Write the layout back into the paragraph and page.
                        paragraph.scale = scale
                        paragraph.pdf_paragraph_composition = []
                        for unit in typeset_units:
                            chars, curves, forms = unit.render()
                            for char in chars:
                                paragraph.pdf_paragraph_composition.append(
                                    PdfParagraphComposition(pdf_character=char),
                                )
                            for curve in curves:
                                page.pdf_curve.append(curve)
                            for form in forms:
                                page.pdf_form.append(form)
                        final_typeset_units = typeset_units
                    return scale, final_typeset_units
            except Exception:
                # A failed layout attempt is treated like "does not fit":
                # the error is swallowed and the next scale is tried.
                pass

            # Same check as the original retypeset logic: a paragraph without a
            # truthy debug_id gives up after the first failed attempt.
            if not hasattr(paragraph, "debug_id") or not paragraph.debug_id:
                return scale, final_typeset_units

            # Shrink: 0.05 steps above 0.6, 0.1 steps at or below it.
            if scale > 0.6:
                scale -= 0.05
            else:
                scale -= 0.1

            if scale < 0.7:
                space_expanded = False  # whether this round enlarged the box

                if expand_space_flag == 0:
                    # Try to extend the box downwards.
                    try:
                        min_y = self.get_max_bottom_space(box, page) + 2
                        if min_y < box.y:
                            expanded_box = Box(x=box.x, y=min_y, x2=box.x2, y2=box.y2)
                            box = expanded_box
                            if apply_layout:
                                # Persist the enlarged bounding box.
                                paragraph.box = expanded_box
                            space_expanded = True
                    except Exception:
                        pass
                    expand_space_flag = 1

                    # Retry at the current (reduced) scale only if the box grew;
                    # otherwise fall through to the reset check below.
                    if space_expanded:
                        continue

                elif expand_space_flag == 1:
                    # Try to extend the box to the right.
                    try:
                        max_x = self.get_max_right_space(box, page) - 5
                        if max_x > box.x2:
                            expanded_box = Box(x=box.x, y=box.y, x2=max_x, y2=box.y2)
                            box = expanded_box
                            if apply_layout:
                                # Persist the enlarged bounding box.
                                paragraph.box = expanded_box
                            space_expanded = True
                    except Exception:
                        pass
                    expand_space_flag = 2

                    # Retry at the current (reduced) scale only if the box grew;
                    # otherwise fall through to the reset check below.
                    if space_expanded:
                        continue

                # Only a failed expansion during the expansion phase
                # (expand_space_flag < 2) resets the scale. Once both
                # directions have been tried (flag >= 2) the scale simply
                # keeps shrinking.
                if expand_space_flag < 2:
                    # Restart from full size so the next pass can try the
                    # remaining expansion direction.
                    scale = 1.0

        # Still does not fit: retry once without English line-break rules.
        if use_english_line_break:
            return self._find_optimal_scale_and_layout(
                paragraph,
                page,
                typesetting_units,
                initial_scale,
                use_english_line_break=False,
                apply_layout=apply_layout,
            )

        # Nothing fit at any scale: report the minimum scale.
        return min_scale, final_typeset_units

    def _get_optimal_scale(
        self,
        paragraph: il_version_1.PdfParagraph,
        page: il_version_1.Page,
        typesetting_units: list[TypesettingUnit],
        use_english_line_break: bool = True,
    ) -> float:
        """获取段落的最优缩放因子，不执行实际排版"""
        scale, _ = self._find_optimal_scale_and_layout(
            paragraph,
            page,
            typesetting_units,
            1.0,
            use_english_line_break,
            apply_layout=False,
        )
        return scale

    def retypeset_with_precomputed_scale(
        self,
        paragraph: il_version_1.PdfParagraph,
        page: il_version_1.Page,
        typesetting_units: list[TypesettingUnit],
        precomputed_scale: float,
        use_english_line_break: bool = True,
    ):
        """使用预计算的缩放因子进行排版"""
        if not paragraph.box:
            return

        # 使用通用方法进行排版，传入预计算的缩放因子作为初始值
        self._find_optimal_scale_and_layout(
            paragraph,
            page,
            typesetting_units,
            precomputed_scale,
            use_english_line_break,
            apply_layout=True,
        )

    def typesetting_document(self, document: il_version_1.Document):
        # 原有的排版逻辑
        if self.translation_config.progress_monitor:
            with self.translation_config.progress_monitor.stage_start(
                self.stage_name,
                len(document.page) * 2,
            ) as pbar:
                # 预处理：获取所有段落的最优缩放因子
                self.preprocess_document(document, pbar)

                for page in document.page:
                    self.translation_config.raise_if_cancelled()
                    self.render_page(page)
                    pbar.advance()
        else:
            for page in document.page:
                self.translation_config.raise_if_cancelled()
                self.render_page(page)

    def render_page(self, page: il_version_1.Page):
        """Typeset all paragraphs of one page.

        Steps: build the font lookup table, add the watermark paragraph on the
        first page when watermarking is enabled, raise the bottom edge of
        paragraphs that sit too close above another paragraph, then render
        every paragraph. Failures in the spacing adjustment are logged and do
        not prevent rendering.
        """
        # Font table: page fonts, mapper fonts, plus one nested table per XObject.
        page_level_fonts = {f.font_id: f for f in page.pdf_font if f.font_id}
        fonts: dict[
            str | int,
            il_version_1.PdfFont | dict[str, il_version_1.PdfFont],
        ] = dict(page_level_fonts)
        fonts.update(self.font_mapper.fontid2font.items())
        for xobj in page.pdf_xobject:
            if xobj.xobj_id is None:
                continue
            scoped_fonts = page_level_fonts.copy()
            fonts[xobj.xobj_id] = scoped_fonts
            for font in xobj.pdf_font:
                if font.font_id:
                    scoped_fonts[font.font_id] = font

        wants_watermark = (
            self.translation_config.watermark_output_mode
            == WatermarkOutputMode.Watermarked
        )
        if page.page_number == 0 and wants_watermark:
            self.add_watermark(page)

        try:
            spatial_index = index.Index()
            # Only paragraphs with a fully specified box take part.
            positioned = [
                p
                for p in page.pdf_paragraph
                if p.box
                and all(c is not None for c in [p.box.x, p.box.y, p.box.x2, p.box.y2])
            ]
            for para_id, para in enumerate(positioned):
                spatial_index.insert(para_id, box_to_tuple(para.box))

            for upper_id, upper in enumerate(positioned):
                upper_box = upper.box
                if not (upper_box and upper_box.y is not None):
                    continue

                # Short paragraphs need a smaller gap than tall ones.
                gap = 0.5 if upper_box.y2 - upper_box.y < 36 else 3

                # Thin strip directly below the paragraph's bottom edge.
                strip_below = il_version_1.Box(
                    x=upper_box.x,
                    y=upper_box.y - gap,
                    x2=upper_box.x2,
                    y2=upper_box.y,
                )
                hit_ids = list(spatial_index.intersection(box_to_tuple(strip_below)))

                blockers = []
                for other_id in hit_ids:
                    if other_id == upper_id:
                        continue
                    lower = positioned[other_id]
                    # Parenthesised exactly as Python groups the original
                    # and/or mix: (entirely left of upper) or (entirely right).
                    horizontally_apart = (
                        lower.box and upper.box and lower.box.x2 < upper.box.x
                    ) or lower.box.x > upper.box.x2
                    if not horizontally_apart:
                        blockers.append(lower)

                if blockers:
                    highest_top = max(
                        p.box.y2 for p in blockers if p.box and p.box.y2 is not None
                    )
                    raised_bottom = highest_top + gap
                    # Never raise the bottom edge past the paragraph's own top.
                    if upper.box and raised_bottom < upper.box.y2:
                        upper.box.y = raised_bottom
        except Exception as e:
            logger.warning(
                f"Failed to adjust paragraph positions on page {page.page_number}: {e}"
            )

        # Actual rendering.
        for paragraph in page.pdf_paragraph:
            self.render_paragraph(paragraph, page, fonts)

    def add_watermark(self, page: il_version_1.Page):
        """Append a small-print watermark paragraph to ``page``.

        The paragraph uses font ``"base"`` at size 6. Its box is inset by 5% of
        the page width on the left and 5% of the page height at the top of the
        crop box. In debug mode an extra notice line is added to the text.
        """
        crop = page.cropbox.box
        left_inset = (crop.x2 - crop.x) * 0.05
        top_inset = (crop.y2 - crop.y) * 0.05

        watermark_style = il_version_1.PdfStyle(
            font_id="base",
            font_size=6,
            graphic_state=il_version_1.GraphicState(),
        )

        text = f"本文档由 funstory.ai 的开源 PDF 翻译库 BabelDOC {WATERMARK_VERSION} (http://yadt.io) 翻译，本仓库正在积极的建设当中，欢迎 star 和关注。"
        if self.translation_config.debug:
            text += "\n 当前为 DEBUG 模式，将显示更多辅助信息。请注意，部分框的位置对应原文，但在译文中可能不正确。"

        watermark_box = il_version_1.Box(
            x=crop.x + left_inset,
            y=crop.y,
            x2=crop.x2,
            y2=crop.y2 - top_inset,
        )
        watermark_text = il_version_1.PdfSameStyleUnicodeCharacters(
            unicode=text,
            pdf_style=watermark_style,
        )
        watermark_paragraph = il_version_1.PdfParagraph(
            first_line_indent=False,
            box=watermark_box,
            vertical=False,
            pdf_style=watermark_style,
            pdf_paragraph_composition=[
                il_version_1.PdfParagraphComposition(
                    pdf_same_style_unicode_characters=watermark_text,
                ),
            ],
            xobj_id=-1,
        )
        page.pdf_paragraph.append(watermark_paragraph)

    def render_paragraph(
        self,
        paragraph: il_version_1.PdfParagraph,
        page: il_version_1.Page,
        fonts: dict[
            str | int,
            il_version_1.PdfFont | dict[str, il_version_1.PdfFont],
        ],
    ):
        typesetting_units = self.create_typesetting_units(paragraph, fonts)
        # 如果所有单元都可以直接传递，则直接传递
        if all(unit.can_passthrough for unit in typesetting_units):
            paragraph.scale = 1.0
            paragraph.pdf_paragraph_composition = self.create_passthrough_composition(
                typesetting_units,
            )
        else:
            # 使用预计算的缩放因子进行重排版
            precomputed_scale = (
                paragraph.optimal_scale if paragraph.optimal_scale is not None else 1.0
            )

            # 如果有单元无法直接传递，则进行重排版
            paragraph.pdf_paragraph_composition = []
            self.retypeset_with_precomputed_scale(
                paragraph, page, typesetting_units, precomputed_scale
            )

            # 重排版后，重新设置段落各字符的 render order
            self._update_paragraph_render_order(paragraph)

    def _get_width_before_next_break_point(
        self, typesetting_units: list[TypesettingUnit], scale: float
    ) -> float:
        if not typesetting_units:
            return 0
        if typesetting_units[0].can_break_line:
            return 0

        total_width = 0
        for unit in typesetting_units:
            if unit.can_break_line:
                return total_width * scale
            total_width += unit.width
        return total_width * scale

    def _layout_typesetting_units(
        self,
        typesetting_units: list[TypesettingUnit],
        box: Box,
        scale: float,
        line_skip: float,
        paragraph: il_version_1.PdfParagraph,
        use_english_line_break: bool = True,
    ) -> tuple[list[TypesettingUnit], bool]:
        """Place the typesetting units line by line inside ``box``.

        Units are placed left to right starting at the top of the box; a new
        line is started when a unit would cross the right edge. Layout goes on
        even after the bottom edge has been crossed, so the returned list is
        complete and the flag reports whether it actually fits.

        Args:
            typesetting_units: Units to lay out.
            box: Bounding box to lay out into.
            scale: Scale factor applied to unit widths and heights.
            line_skip: Multiplier applied to the most common unit height of a
                line to get the distance to the next line.
            paragraph: Paragraph being laid out; only ``first_line_indent`` is
                read here.
            use_english_line_break: When True, a unit is moved to the next line
                if the unbreakable run starting at it would not fit on the
                current line.

        Returns:
            tuple[list[TypesettingUnit], bool]: the relocated units and whether
            all of them fit inside the box. ``([], False)`` is returned when a
            line break is needed while the current line holds no non-space unit.

        Raises:
            statistics.StatisticsError: if no unit provides a font size
                (``statistics.mode`` is called on an empty list).
        """
        # Most common font size. The list is sorted first so that, among
        # equally common sizes, statistics.mode picks the smallest one.
        font_sizes = []
        for unit in typesetting_units:
            if unit.font_size:
                font_sizes.append(unit.font_size)
            if unit.char and unit.char.pdf_style and unit.char.pdf_style.font_size:
                font_sizes.append(unit.char.pdf_style.font_size)
        font_sizes.sort()
        font_size = statistics.mode(font_sizes)

        # Half of the first value char_lengths reports for "你" at the scaled
        # font size -- presumably that glyph's width; confirm against FontMapper.
        space_width = (
            self.font_mapper.base_font.char_lengths("你", font_size * scale)[0] * 0.5
        )

        # Typical unit height (mode), used to position the first line.
        unit_heights = (
            [unit.height for unit in typesetting_units] if typesetting_units else []
        )
        if not unit_heights:
            avg_height = 0
        elif len(unit_heights) == 1:
            avg_height = unit_heights[0] * scale
        else:
            try:
                avg_height = statistics.mode(unit_heights) * scale
            except statistics.StatisticsError:
                # Fallback to the arithmetic mean when no mode can be determined.
                avg_height = sum(unit_heights) / len(unit_heights) * scale

        # Start at the left edge, one typical line height below the top edge.
        current_x = box.x
        current_y = box.y2 - avg_height
        box = copy.deepcopy(box)
        # box.y -= avg_height * (line_spacing - 1.01)  # line_spacing has been replaced by line_skip
        line_height = 0
        current_line_heights = []  # heights of the non-space units on the current line

        # Units that have been placed so far.
        typeset_units = []
        all_units_fit = True
        last_unit: TypesettingUnit | None = None
        line_ys = [current_y]
        if paragraph.first_line_indent:
            current_x += space_width * 4
        # Walk through all units.
        for i, unit in enumerate(typesetting_units):
            # Size of this unit at the current scale.
            unit_width = unit.width * scale
            unit_height = unit.height * scale

            # Drop spaces at the start of a line.
            if current_x == box.x and unit.is_space:
                continue

            if (
                last_unit  # there is a previous unit
                and last_unit.is_cjk_char ^ unit.is_cjk_char  # CJK / non-CJK boundary
                and (
                    last_unit.box
                    and last_unit.box.y
                    and current_y - 0.1
                    <= last_unit.box.y2
                    <= current_y + line_height + 0.1
                )  # previous unit is on the same line (vertical overlap)
                and not last_unit.mixed_character_blacklist  # not excluded from mixed-script spacing
                and not unit.mixed_character_blacklist  # same for the current unit
                and current_x > box.x  # not at the start of a line
                and unit.try_get_unicode() != " "  # current unit is not a space
                and last_unit.try_get_unicode() != " "  # previous unit is not a space
                and last_unit.try_get_unicode()
                not in [
                    "。",
                    "！",
                    "？",
                    "；",
                    "：",
                    "，",
                ]
            ):
                # Insert a small gap between CJK and non-CJK text.
                current_x += space_width * 0.5
            if use_english_line_break:
                width_before_next_break_point = self._get_width_before_next_break_point(
                    typesetting_units[i:], scale
                )
            else:
                width_before_next_break_point = 0

            # Break the line if this unit does not fit on the current one.
            # Hung punctuation is exempt and may overhang the right edge.
            if not unit.is_hung_punctuation and (
                (current_x + unit_width > box.x2)
                or (
                    use_english_line_break
                    and current_x + unit_width + width_before_next_break_point > box.x2
                )
                or (
                    unit.is_cannot_appear_in_line_end_punctuation
                    and current_x + unit_width * 2 > box.x2
                )
            ):
                # Line break.
                current_x = box.x
                if not current_line_heights:
                    # Nothing was placed on this line, so the unit cannot fit
                    # at this scale at all.
                    return [], False
                max_height = max(current_line_heights)
                mode_height = statistics.mode(current_line_heights)

                # Move down by the larger of the spaced typical height and the
                # tallest unit plus 5%.
                current_y -= max(mode_height * line_skip, max_height * 1.05)
                line_ys.append(current_y)
                line_height = 0.0
                current_line_heights = []  # reset for the new line

                # Check whether the new line is below the bottom edge.
                # if current_y - unit_height < box.y:
                if current_y < box.y:
                    all_units_fit = False
                    # Do not break here; keep laying out the remaining units.

                if unit.is_space:
                    line_height = max(line_height, unit_height)
                    continue

            # Place the current unit.
            relocated_unit = unit.relocate(current_x, current_y, scale)
            typeset_units.append(relocated_unit)

            # Record the unit's height for the current line.
            if not unit.is_space:
                current_line_heights.append(unit_height)

            prev_x = current_x
            # Advance the x position to the right edge of the placed unit.
            current_x = relocated_unit.box.x2
            if prev_x > current_x:
                logger.warning(f"坐标回绕！！！TypesettingUnit: {unit.box}, ")

            last_unit = relocated_unit

        return typeset_units, all_units_fit

    def create_typesetting_units(
        self,
        paragraph: il_version_1.PdfParagraph,
        fonts: dict[str, il_version_1.PdfFont],
    ) -> list[TypesettingUnit]:
        if not paragraph.pdf_paragraph_composition:
            return []
        result = []

        @cache
        def get_font(font_id: str, xobj_id: int | None):
            if xobj_id in fonts:
                font = fonts[xobj_id][font_id]
            else:
                font = fonts[font_id]
            return font

        for composition in paragraph.pdf_paragraph_composition:
            if composition is None:
                continue
            if composition.pdf_line:
                result.extend(
                    [
                        TypesettingUnit(char=char)
                        for char in composition.pdf_line.pdf_character
                    ],
                )
            elif composition.pdf_character:
                result.append(
                    TypesettingUnit(
                        char=composition.pdf_character,
                        debug_info=paragraph.debug_info,
                    ),
                )
            elif composition.pdf_same_style_characters:
                result.extend(
                    [
                        TypesettingUnit(char=char)
                        for char in composition.pdf_same_style_characters.pdf_character
                    ],
                )
            elif composition.pdf_same_style_unicode_characters:
                style = composition.pdf_same_style_unicode_characters.pdf_style
                if style is None:
                    logger.warning(
                        f"Style is None. "
                        f"Composition: {composition}. "
                        f"Paragraph: {paragraph}. ",
                    )
                    continue
                font_id = style.font_id
                if font_id is None:
                    logger.warning(
                        f"Font ID is None. "
                        f"Composition: {composition}. "
                        f"Paragraph: {paragraph}. ",
                    )
                    continue
                font = get_font(font_id, paragraph.xobj_id)
                if composition.pdf_same_style_unicode_characters.unicode:
                    result.extend(
                        [
                            TypesettingUnit(
                                unicode=char_unicode,
                                font=self.font_mapper.map(
                                    font,
                                    char_unicode,
                                ),
                                original_font=font,
                                font_size=style.font_size,
                                style=style,
                                xobj_id=paragraph.xobj_id,
                                debug_info=composition.pdf_same_style_unicode_characters.debug_info
                                or False,
                            )
                            for char_unicode in composition.pdf_same_style_unicode_characters.unicode
                            if char_unicode not in ("\n",)
                        ],
                    )
            elif composition.pdf_formula:
                result.extend([TypesettingUnit(formular=composition.pdf_formula)])
            else:
                logger.error(
                    f"Unknown composition type. "
                    f"Composition: {composition}. "
                    f"Paragraph: {paragraph}. ",
                )
                continue
        result = list(
            filter(
                lambda x: x.unicode is None or x.font is not None,
                result,
            ),
        )

        if any(x.width < 0 for x in result):
            logger.warning("有排版单元宽度小于 0，请检查字体映射是否正确。")
        return result

    def create_passthrough_composition(
        self,
        typesetting_units: list[TypesettingUnit],
    ) -> list[PdfParagraphComposition]:
        """从排版单元创建直接传递的段落组合。

        Args:
            typesetting_units: 排版单元列表

        Returns:
            段落组合列表
        """
        composition = []
        for unit in typesetting_units:
            if unit.formular:
                # 对于公式单元，直接创建包含完整公式的组合
                composition.append(PdfParagraphComposition(pdf_formula=unit.formular))
            else:
                # 对于字符单元，使用原有逻辑
                chars, curves, forms = unit.passthrough()
                composition.extend(
                    [PdfParagraphComposition(pdf_character=char) for char in chars],
                )
        return composition

    def get_max_right_space(self, current_box: Box, page) -> float:
        """获取段落右侧最大可用空间

        Args:
            current_box: 当前段落的边界框
            page: 当前页面

        Returns:
            可以扩展到的最大 x 坐标
        """
        # 获取页面的裁剪框作为初始最大限制
        max_x = page.cropbox.box.x2 * 0.9

        # 检查所有可能的阻挡元素
        for para in page.pdf_paragraph:
            if para.box == current_box or para.box is None:  # 跳过当前段落
                continue
            # 只考虑在当前段落右侧且有垂直重叠的元素
            if para.box.x > current_box.x and not (
                para.box.y >= current_box.y2 or para.box.y2 <= current_box.y
            ):
                max_x = min(max_x, para.box.x)
        for char in page.pdf_character:
            if char.box.x > current_box.x and not (
                char.box.y >= current_box.y2 or char.box.y2 <= current_box.y
            ):
                max_x = min(max_x, char.box.x)
        # 检查图形
        for figure in page.pdf_figure:
            if figure.box.x > current_box.x and not (
                figure.box.y >= current_box.y2 or figure.box.y2 <= current_box.y
            ):
                max_x = min(max_x, figure.box.x)

        return max_x

    def get_max_bottom_space(self, current_box: Box, page: il_version_1.Page) -> float:
        """获取段落下方最大可用空间

        Args:
            current_box: 当前段落的边界框
            page: 当前页面

        Returns:
            可以扩展到的最小 y 坐标
        """
        # 获取页面的裁剪框作为初始最小限制
        min_y = page.cropbox.box.y * 1.1

        # 检查所有可能的阻挡元素
        for para in page.pdf_paragraph:
            if para.box == current_box or para.box is None:  # 跳过当前段落
                continue
            # 只考虑在当前段落下方且有水平重叠的元素
            if para.box.y2 < current_box.y and not (
                para.box.x >= current_box.x2 or para.box.x2 <= current_box.x
            ):
                min_y = max(min_y, para.box.y2)
        for char in page.pdf_character:
            if char.box.y2 < current_box.y and not (
                char.box.x >= current_box.x2 or char.box.x2 <= current_box.x
            ):
                min_y = max(min_y, char.box.y2)
        # 检查图形
        for figure in page.pdf_figure:
            if figure.box.y2 < current_box.y and not (
                figure.box.x >= current_box.x2 or figure.box.x2 <= current_box.x
            ):
                min_y = max(min_y, figure.box.y2)

        return min_y

    def _update_paragraph_render_order(self, paragraph: il_version_1.PdfParagraph):
        """
        重新设置段落各字符的 render order
        主 render order 等于 paragraph 的 renderorder，sub render order 从 1 开始自增
        """
        if not hasattr(paragraph, "render_order") or paragraph.render_order is None:
            return

        main_render_order = paragraph.render_order
        sub_render_order = 1

        # 遍历段落的所有组成部分
        for composition in paragraph.pdf_paragraph_composition:
            # 检查单个字符
            if composition.pdf_character:
                char = composition.pdf_character
                char.render_order = main_render_order
                char.sub_render_order = sub_render_order
                sub_render_order += 1


================================================
FILE: babeldoc/format/pdf/document_il/utils/__init__.py
================================================


================================================
FILE: babeldoc/format/pdf/document_il/utils/extract_char.py
================================================
import logging
import shutil
from collections import defaultdict
from pathlib import Path

import cv2
import numpy as np
import pymupdf
from rich.logging import RichHandler
from sklearn.cluster import DBSCAN

import babeldoc.format.pdf.high_level
import babeldoc.format.pdf.translation_config
from babeldoc.const import get_process_pool
from babeldoc.format.pdf.document_il import il_version_1

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# --- Algorithm Tuning Parameters ---
# All values below are unitless ratios / multipliers.

# --- Band Creation ---
# Minimum vertical overlap ratio for a character to be added to an existing band.
BAND_CREATION_OVERLAP_THRESHOLD = 0.5

# --- Line Clustering (within a band) ---
# Epsilon for DBSCAN, as a multiplier of the average character width/height.
LINE_CLUSTERING_EPS_MULTIPLIER = 3.5

# --- Line Splitting (for tall/wide lines) ---
# A line is considered for splitting if its height/width is > X times the max char size.
LINE_SPLIT_SIZE_RATIO_THRESHOLD = 1.5
# Epsilon for DBSCAN when splitting lines, as a multiplier of the max char size.
LINE_SPLIT_DBSCAN_EPS_MULTIPLIER = 0.5

# --- Space Insertion (in a finalized line) ---
# A space is inserted if the gap between chars is > X times the average char width.
SPACE_INSERTION_GAP_MULTIPLIER = 0.45

# --- Line Merging (across the page) ---
# -- Optimization --
# Maximum vertical gap to search for potential merges, as a multiplier of avg char height.
MERGE_VERTICAL_GAP_MULTIPLIER = 1.5
# -- Containment Merge --
# Intersection-over-area threshold to consider one line as contained within another.
# NOTE(review): the name says "IOU" while this comment says intersection-over-area;
# confirm against the code that uses it.
MERGE_CONTAINMENT_IOU_THRESHOLD = 0.6
# -- Adjacency Merge --
# Minimum vertical/horizontal overlap for adjacent lines to be considered for merging.
MERGE_ADJACENCY_OVERLAP_THRESHOLD = 0.7
# Maximum gap between adjacent lines to merge, as a multiplier of avg char size.
MERGE_ADJACENCY_GAP_MULTIPLIER = 1.5


# --- End of Parameters ---


def parse_pdf(pdf_path, page_ranges=None) -> il_version_1.Document:
    """Parse a PDF file into an intermediate-language (IL) document.

    A throw-away ``TranslationConfig`` is built (four ``None`` positional
    arguments and no layout model), the PDF is copied to the config's
    ``input.pdf`` working file, and the IL parser is run over that copy.

    Args:
        pdf_path: Path of the PDF to parse.
        page_ranges: Optional page range; when truthy it is stored on the
            config as a single-element list.

    Returns:
        The IL document produced by ``ILCreater.create_il()``.

    Raises:
        Any exception raised while copying, opening or parsing the PDF is
        propagated. The opened document is closed and the temporary files are
        cleaned up in every case.
    """
    translation_config = babeldoc.format.pdf.translation_config.TranslationConfig(
        *[None for _ in range(4)], doc_layout_model=None
    )
    if page_ranges:
        translation_config.page_ranges = [page_ranges]
    translation_config.progress_monitor = (
        babeldoc.format.pdf.high_level.ProgressMonitor(
            babeldoc.format.pdf.high_level.TRANSLATE_STAGES
        )
    )

    doc = None
    try:
        working_copy = translation_config.get_working_file_path("input.pdf")
        shutil.copy(pdf_path, working_copy)
        doc = pymupdf.open(pdf_path)
        il_creater = babeldoc.format.pdf.high_level.ILCreater(translation_config)
        il_creater.mupdf = doc
        with Path(working_copy).open("rb") as f:
            babeldoc.format.pdf.high_level.start_parse_il(
                f,
                doc_zh=doc,
                resfont="test_font",
                il_creater=il_creater,
                translation_config=translation_config,
            )
        return il_creater.create_il()
    finally:
        # Close the document even when parsing failed, and make sure the
        # temporary files are removed even if closing itself raises.
        try:
            if doc is not None:
                doc.close()
        finally:
            translation_config.cleanup_temp_files()


class Line:
    """A group of characters that is treated as one line of text."""

    def __init__(self, chars: list[tuple[il_version_1.Box, str, bool]]):
        # Character tuples of the form (box, char_unicode, vertical); the list
        # is stored by reference, not copied.
        self.chars = chars
        # Initial text is the plain concatenation of the characters, without
        # any inserted spaces.
        self.text = "".join(entry[1] for entry in chars)


def _recalculate_line_text_with_spacing(line, orientation):
    """Rebuild ``line.text`` from ``line.chars``, inserting spaces at wide gaps.

    The characters are read in their current order. A single space is added
    after a character when the distance to the next character along the main
    axis exceeds ``SPACE_INSERTION_GAP_MULTIPLIER`` times the average positive
    character size on that axis.

    Args:
        line: Object with a ``chars`` list of ``(box, char_unicode, vertical)``
            tuples; its ``text`` attribute is overwritten in place.
        orientation: ``"horizontal"`` measures along x; any other value is
            treated as vertical and measures along y.

    Returns:
        None. The result is stored in ``line.text``.
    """
    chars = line.chars
    if not chars:
        line.text = ""
        return

    # Select the accessors for the axis along which the text runs.
    if orientation == "horizontal":

        def get_main_start(c):
            return c[0].x

        def get_main_end(c):
            return c[0].x2

    else:  # vertical

        def get_main_start(c):
            return c[0].y

        def get_main_end(c):
            return c[0].y2

    # Only boxes with a positive extent contribute to the average size.
    positive_sizes = [
        size
        for size in (get_main_end(c) - get_main_start(c) for c in chars)
        if size > 0
    ]
    avg_size = np.mean(positive_sizes or [0])
    # Without a positive average size no gap can be judged; in that case the
    # characters are simply concatenated. Previously every character except
    # the last one was dropped in this situation.
    can_measure_gaps = avg_size > 0

    parts = []
    for current, following in zip(chars, chars[1:]):
        parts.append(current[1])
        if not can_measure_gaps:
            continue
        gap = get_main_start(following) - get_main_end(current)
        if gap > avg_size * SPACE_INSERTION_GAP_MULTIPLIER:
            parts.append(" ")
    parts.append(chars[-1][1])

    line.text = "".join(parts)


# Each character is represented as a tuple: (box, char_unicode, vertical).
# vertical is True for a vertical character and False for a horizontal one.
def extract_paragraph_line(
    pdf_path,
) -> dict[int, list[tuple[il_version_1.Box, str, bool]]]:
    """Parse a PDF and collect the character tuples of every page.

    Args:
        pdf_path: Path of the PDF to parse.

    Returns:
        A dict keyed by ``page.page_number`` whose values are the character
        tuples of that page, or ``None`` when parsing yields no document.
    """
    document = parse_pdf(pdf_path)
    if document is None:
        return None
    return {
        page.page_number: convert_page_to_char_boxes(page) for page in document.page
    }


def convert_page_to_char_boxes(
    page: il_version_1.Page,
) -> list[tuple[il_version_1.Box, str, bool]]:
    """Flatten the characters of an IL page into plain tuples.

    Args:
        page: IL page whose ``pdf_character`` entries are read.

    Returns:
        One ``(visual_bbox.box, char_unicode, vertical)`` tuple per character,
        in the order the characters appear on the page object.
    """
    char_boxes = []
    for character in page.pdf_character:
        entry = (
            character.visual_bbox.box,
            character.char_unicode,
            character.vertical,
        )
        char_boxes.append(entry)
    return char_boxes


def _cluster_by_axis(chars: list[tuple[il_version_1.Box, str, bool]], orientation: str):
    """
    Cluster characters into lines based on a main and a secondary axis.

    For ``orientation == "horizontal"`` the main axis is x and the secondary
    axis is y; for any other value (vertical text) the axes are swapped.

    The work is done in three steps:
      1. Characters are grouped into bands by their overlap on the secondary axis.
      2. Each band is split into lines with DBSCAN over the character centroids.
      3. Lines whose secondary-axis extent is much larger than their largest
         character are split again with DBSCAN on the secondary axis.
    Finally the text of every line is rebuilt with spaces inserted.

    Args:
        chars: Character tuples ``(box, char_unicode, vertical)``.
            NOTE: this list is sorted in place.
        orientation: ``"horizontal"`` or anything else for vertical.

    Returns:
        A list of ``Line`` objects; an empty list when ``chars`` is empty.
    """
    if not chars:
        return []

    # Define main and secondary axes based on orientation
    if orientation == "horizontal":

        def get_secondary_start(c):
            return c[0].y

        def get_secondary_end(c):
            return c[0].y2

        def get_main_start(c):
            return c[0].x

        def get_main_end(c):
            return c[0].x2

        def get_main_size(c):
            return c[0].x2 - c[0].x

    else:  # vertical

        def get_secondary_start(c):
            return c[0].x

        def get_secondary_end(c):
            return c[0].x2

        def get_main_start(c):
            return c[0].y

        def get_main_end(c):
            return c[0].y2

        def get_main_size(c):
            return c[0].y2 - c[0].y

    # Step 1: Group chars into bands along the secondary axis based on overlap.
    # This is an optimized version of the band clustering algorithm.
    # It avoids the O(N^2) complexity of the naive approach by making
    # assumptions based on the sorted order of characters.
    # The sort mutates the caller's list.
    chars.sort(key=get_secondary_start)

    # Each band is a tuple: (list_of_chars, min_secondary_coord, max_secondary_coord)
    bands_data: list[tuple[list, float, float]] = []

    for char in chars:
        char_secondary_start = get_secondary_start(char)
        char_secondary_end = get_secondary_end(char)
        char_secondary_size = char_secondary_end - char_secondary_start

        best_band_index = -1
        # Starts at the threshold, so only bands overlapping by strictly more
        # than the threshold can be selected; afterwards tracks the best ratio.
        max_overlap_ratio = (
            BAND_CREATION_OVERLAP_THRESHOLD  # Minimum overlap ratio to be considered
        )

        # Iterate backwards over bands, as recent bands are more likely to overlap.
        for i in range(len(bands_data) - 1, -1, -1):
            band_chars, band_secondary_start, band_secondary_end = bands_data[i]

            # Optimization: stop at the first band that ends before the current
            # char starts. This is a heuristic: it assumes that bands further
            # towards the front of the list cannot match either.
            if band_secondary_end < char_secondary_start:
                break

            overlap = max(
                0,
                min(char_secondary_end, band_secondary_end)
                - max(char_secondary_start, band_secondary_start),
            )

            # Chars with a zero (or negative) secondary size never join a band
            # here; they end up in a new band of their own below.
            if char_secondary_size > 0:
                overlap_ratio = overlap / char_secondary_size
                if overlap_ratio > max_overlap_ratio:
                    max_overlap_ratio = overlap_ratio
                    best_band_index = i

        if best_band_index != -1:
            # Add char to the best matching band and update its boundaries
            band_chars, band_start, band_end = bands_data[best_band_index]
            band_chars.append(char)
            updated_band = (
                band_chars,
                min(band_start, char_secondary_start),
                max(band_end, char_secondary_end),
            )
            bands_data[best_band_index] = updated_band
            # Move the updated band to the end to maintain rough locality
            bands_data.append(bands_data.pop(best_band_index))
        else:
            # No suitable band found, create a new one
            bands_data.append(([char], char_secondary_start, char_secondary_end))

    # Extract final bands from the data structure
    bands = [b[0] for b in bands_data]

    # Step 2: For each band, cluster the characters into lines using DBSCAN
    # over their 2-D centroids (Manhattan distance).
    final_lines = []
    for band in bands:
        # Defensive check; bands are always created with at least one char.
        if len(band) < 1:
            continue

        main_axis_sizes = [get_main_size(c) for c in band if get_main_size(c) > 0]
        # Falls back to 10 when no char in the band has a positive main-axis size.
        avg_main_size = np.mean(main_axis_sizes) if main_axis_sizes else 10

        # Epsilon is the average main-axis character size scaled by
        # LINE_CLUSTERING_EPS_MULTIPLIER.
        eps = avg_main_size * LINE_CLUSTERING_EPS_MULTIPLIER

        centroids = np.array(
            [((c[0].x + c[0].x2) / 2, (c[0].y + c[0].y2) / 2) for c in band]
        )

        if centroids.size > 0:
            db = DBSCAN(eps=eps, min_samples=1, metric="manhattan").fit(centroids)

            line_groups = defaultdict(list)
            for i, label in enumerate(db.labels_):
                # -1 is DBSCAN's noise label; with min_samples=1 it is not
                # expected to occur, so this is a safety check.
                if label != -1:
                    line_groups[label].append(band[i])

            for _, line in line_groups.items():
                # Order the characters of a line along the reading direction.
                line.sort(key=get_main_start)
                final_lines.append(Line(line))

    # Step 3: Split lines that are too tall/wide, which likely contain multiple distinct lines from different columns
    processed_lines = []
    for line in final_lines:
        if not line.chars:
            continue

        line_secondary_start = min(get_secondary_start(c) for c in line.chars)
        line_secondary_end = max(get_secondary_end(c) for c in line.chars)
        line_secondary_size = line_secondary_end - line_secondary_start

        char_secondary_sizes = [
            get_secondary_end(c) - get_secondary_start(c)
            for c in line.chars
            if get_secondary_end(c) - get_secondary_start(c) > 0
        ]
        # Without any positively sized char there is no reference size to
        # compare against, so the line is kept unchanged.
        if not char_secondary_sizes:
            processed_lines.append(line)
            continue

        max_char_secondary_size = np.max(char_secondary_sizes)

        if (
            line_secondary_size
            > max_char_secondary_size * LINE_SPLIT_SIZE_RATIO_THRESHOLD
            and len(line.chars) > 1
        ):
            # logger.debug(
            #     f"Splitting line '{line.text}' which seems to contain multiple lines."
            # )

            # Use DBSCAN on the secondary axis centers to split the line
            centers = np.array(
                [
                    [(get_secondary_start(c) + get_secondary_end(c)) / 2]
                    for c in line.chars
                ]
            )
            db = DBSCAN(
                eps=max_char_secondary_size * LINE_SPLIT_DBSCAN_EPS_MULTIPLIER,
                min_samples=1,
            ).fit(centers)

            sub_lines = defaultdict(list)
            for i, label in enumerate(db.labels_):
                sub_lines[label].append(line.chars[i])

            for _, sub_line_chars in sub_lines.items():
                sub_line_chars.sort(key=get_main_start)
                processed_lines.append(Line(sub_line_chars))
        else:
            processed_lines.append(line)
    final_lines = processed_lines

    # Rebuild the text of every line so that word gaps become spaces.
    for line in final_lines:
        _recalculate_line_text_with_spacing(line, orientation)

    return final_lines


def _merge_lines_on_page(page_lines: list[Line]) -> list[Line]:
    """
    Merge lines on a page that are either contained within or adjacent to each other.
    This function contains both containment and adjacency merge logic.

    Every line is compared with the lines that follow it in ``page_lines``.
    A following line is merged when one of the two is mostly contained in the
    other, or when both are aligned and separated by only a small gap.

    Args:
        page_lines: Lines of one page. The early-exit check below relies on the
            list being ordered top-to-bottom. NOTE: the ``Line`` objects are
            modified in place (their ``chars`` and ``text``), and entries of
            ``page_lines`` may be swapped.

    Returns:
        A new list with the surviving lines; lines that were merged into
        another line are left out.
    """
    if not page_lines:
        return []

    merged_lines = []
    # Indices of lines that have already been absorbed by another line.
    lines_to_skip = set()

    for i in range(len(page_lines)):
        if i in lines_to_skip:
            continue

        line1 = page_lines[i]
        if not line1.chars:
            merged_lines.append(line1)
            continue

        # Bounding box of line1 as (x_min, y_min, x_max, y_max).
        bbox1 = (
            min(c[0].x for c in line1.chars),
            min(c[0].y for c in line1.chars),
            max(c[0].x2 for c in line1.chars),
            max(c[0].y2 for c in line1.chars),
        )

        # Optimization: Calculate a vertical gap threshold to prune the search space.
        # Based on the vertical adjacency merge condition.
        # The threshold is computed once per line1 and is not refreshed after merges.
        line1_avg_char_height = np.mean(
            [c[0].y2 - c[0].y for c in line1.chars if c[0].y2 > c[0].y] or [0]
        )
        max_v_gap = line1_avg_char_height * MERGE_VERTICAL_GAP_MULTIPLIER

        merged = False
        for j in range(i + 1, len(page_lines)):
            if j in lines_to_skip:
                continue

            line2 = page_lines[j]
            if not line2.chars:
                continue

            # Bounding box of line2 as (x_min, y_min, x_max, y_max).
            bbox2 = (
                min(c[0].x for c in line2.chars),
                min(c[0].y for c in line2.chars),
                max(c[0].x2 for c in line2.chars),
                max(c[0].y2 for c in line2.chars),
            )

            # Optimization: if line2 is too far below line1, no more merges with line1 are possible.
            # The list is sorted top-to-bottom, so we can break early.
            v_gap = bbox1[1] - bbox2[3]  # y_min_1 - y_max_2
            if v_gap > max_v_gap:
                break

            # Check for "mostly contained" by checking intersection over area
            inter_x0 = max(bbox1[0], bbox2[0])
            inter_y0 = max(bbox1[1], bbox2[1])
            inter_x1 = min(bbox1[2], bbox2[2])
            inter_y1 = min(bbox1[3], bbox2[3])

            inter_area = max(0, inter_x1 - inter_x0) * max(0, inter_y1 - inter_y0)

            # Degenerate boxes (zero width or height) get an area of 0.
            area1 = (
                (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
                if (bbox1[2] > bbox1[0] and bbox1[3] > bbox1[1])
                else 0
            )
            area2 = (
                (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
                if (bbox2[2] > bbox2[0] and bbox2[3] > bbox2[1])
                else 0
            )

            # Heuristic for merging:
            # 1. By containment: if one line is mostly inside another.
            # 2. By adjacency: if two lines are close and aligned.
            if (
                area2 > 0
                and area1 >= area2
                and (inter_area / area2) > MERGE_CONTAINMENT_IOU_THRESHOLD
            ):
                # Case 1: Merge line2 (smaller) into line1 (larger) by containment
                # logger.debug(
                #     f"Merging line '{line2.text}' into '{line1.text}' (mostly contained)"
                # )
                line1.chars.extend(line2.chars)
                lines_to_skip.add(j)
                merged = True
                # Grow bbox1 to the union of both boxes.
                bbox1 = (
                    min(bbox1[0], bbox2[0]),
                    min(bbox1[1], bbox2[1]),
                    max(bbox1[2], bbox2[2]),
                    max(bbox1[3], bbox2[3]),
                )

            elif (
                area1 > 0
                and area2 > area1
                and (inter_area / area1) > MERGE_CONTAINMENT_IOU_THRESHOLD
            ):
                # Case 2: Merge line1 (smaller) into line2 (larger) by containment
                # logger.debug(
                #     f"Merging line '{line1.text}' into '{line2.text}' (mostly contained)"
                # )
                line2.chars.extend(line1.chars)
                # Swap the entries so the larger line takes over slot i and
                # continues as line1; the absorbed line now sits at j and is skipped.
                page_lines[i], page_lines[j] = page_lines[j], page_lines[i]
                line1 = page_lines[i]
                lines_to_skip.add(j)
                merged = True
                bbox1 = (
                    min(bbox1[0], bbox2[0]),
                    min(bbox1[1], bbox2[1]),
                    max(bbox1[2], bbox2[2]),
                    max(bbox1[3], bbox2[3]),
                )

            else:
                # Case 3: Merge by adjacency for lines that are close to each other
                # Orientation is taken from the vertical flag of line1's first char.
                orientation = "horizontal" if not line1.chars[0][2] else "vertical"
                if orientation == "horizontal":
                    height1 = bbox1[3] - bbox1[1]
                    height2 = bbox2[3] - bbox2[1]
                    if height1 > 0 and height2 > 0:
                        v_overlap = max(
                            0,
                            min(bbox1[3], bbox2[3]) - max(bbox1[1], bbox2[1]),
                        )
                        # Both lines must overlap vertically by more than the threshold.
                        if (
                            v_overlap / height1
                        ) > MERGE_ADJACENCY_OVERLAP_THRESHOLD and (
                            v_overlap / height2
                        ) > MERGE_ADJACENCY_OVERLAP_THRESHOLD:
                            # Horizontal distance between the boxes; negative
                            # when they overlap horizontally.
                            h_gap = max(bbox1[0], bbox2[0]) - min(bbox1[2], bbox2[2])
                            if h_gap >= 0:
                                avg_char_width = np.mean(
                                    [
                                        c[0].x2 - c[0].x
                                        for c in (line1.chars + line2.chars)
                                        if c[0].x2 > c[0].x
                                    ]
                                    or [0]
                                )
                                if (
                                    avg_char_width > 0
                                    and h_gap
                                    < avg_char_width * MERGE_ADJACENCY_GAP_MULTIPLIER
                                ):
                                    # logger.debug(
                                    #     f"Merging adjacent lines '{line1.text}' and '{line2.text}'"
                                    # )
                                    line1.chars.extend(line2.chars)
                                    lines_to_skip.add(j)
                                    merged = True
                                    bbox1 = (
                                        min(bbox1[0], bbox2[0]),
                                        min(bbox1[1], bbox2[1]),
                                        max(bbox1[2], bbox2[2]),
                                        max(bbox1[3], bbox2[3]),
                                    )
                else:  # Vertical
                    width1 = bbox1[2] - bbox1[0]
                    width2 = bbox2[2] - bbox2[0]
                    if width1 > 0 and width2 > 0:
                        h_overlap = max(
                            0,
                            min(bbox1[2], bbox2[2]) - max(bbox1[0], bbox2[0]),
                        )
                        # Both lines must overlap horizontally by more than the threshold.
                        if (
                            h_overlap / width1
                        ) > MERGE_ADJACENCY_OVERLAP_THRESHOLD and (
                            h_overlap / width2
                        ) > MERGE_ADJACENCY_OVERLAP_THRESHOLD:
                            # Vertical distance between the boxes; this reuses
                            # the name of the pruning gap computed above.
                            v_gap = max(bbox1[1], bbox2[1]) - min(bbox1[3], bbox2[3])
                            if v_gap >= 0:
                                avg_char_height = np.mean(
                                    [
                                        c[0].y2 - c[0].y
                                        for c in (line1.chars + line2.chars)
                                        if c[0].y2 > c[0].y
                                    ]
                                    or [0]
                                )
                                if (
                                    avg_char_height > 0
                                    and v_gap
                                    < avg_char_height * MERGE_ADJACENCY_GAP_MULTIPLIER
                                ):
                                    # logger.debug(
                                    #     f"Merging adjacent vertical lines '{line1.text}' and '{line2.text}'"
                                    # )
                                    line1.chars.extend(line2.chars)
                                    lines_to_skip.add(j)
                                    merged = True
                                    bbox1 = (
                                        min(bbox1[0], bbox2[0]),
                                        min(bbox1[1], bbox2[1]),
                                        max(bbox1[2], bbox2[2]),
                                        max(bbox1[3], bbox2[3]),
                                    )

        if merged:
            # Re-sort and recalculate text for the merged line
            orientation = (
                "horizontal" if not line1.chars[0][2] else "vertical"
            )  # Guess orientation from first char
            if orientation == "horizontal":
                line1.chars.sort(key=lambda c: c[0].x)
            else:  # vertical
                line1.chars.sort(key=lambda c: c[0].y)
            _recalculate_line_text_with_spacing(line1, orientation)

        merged_lines.append(line1)

    return merged_lines


def process_page_chars_to_lines(
    chars: list[tuple[il_version_1.Box, str, bool]],
) -> list[Line]:
    """Cluster the characters of one page into lines.

    When ``get_process_pool()`` provides a pool, the work is handed to it via
    ``apply`` and the result is awaited; otherwise it runs in this process.

    Args:
        chars: Character tuples ``(box, char_unicode, vertical)`` of one page.

    Returns:
        The lines produced by ``process_page_chars_to_lines_internal``.
    """
    worker_pool = get_process_pool()
    if worker_pool is not None:
        return worker_pool.apply(process_page_chars_to_lines_internal, (chars,))
    return process_page_chars_to_lines_internal(chars)


def process_page_chars_to_lines_internal(
    chars: list[tuple[il_version_1.Box, str, bool]],
) -> list[Line]:
    """
    Turn the characters of a single page into clustered, merged lines.

    Args:
        chars: List of character tuples (box, char_unicode, is_vertical)

    Returns:
        List of Line objects representing clustered and merged lines
    """
    if not chars:
        return []

    # Separate the characters by their vertical flag; horizontal ones are
    # clustered first so their lines come first before sorting.
    by_orientation = {"horizontal": [], "vertical": []}
    for entry in chars:
        by_orientation["vertical" if entry[2] else "horizontal"].append(entry)

    all_lines = []
    for orientation, members in by_orientation.items():
        all_lines.extend(_cluster_by_axis(members, orientation))

    def reading_order_key(line):
        """Sort key placing lines top-to-bottom, then left-to-right."""
        members = line.chars
        if not members:
            return (0, 0)
        center_y = np.mean([(c[0].y + c[0].y2) / 2 for c in members])
        center_x = np.mean([(c[0].x + c[0].x2) / 2 for c in members])
        # PDF coordinate system: Y increases upwards, so the vertical centre
        # is negated to obtain a top-to-bottom order.
        return (-center_y, center_x)

    all_lines.sort(key=reading_order_key)

    # Combine contained and adjacent lines before returning.
    return _merge_lines_on_page(all_lines)


def cluster_chars_to_lines(
    char_boxes: dict[int, list[tuple[il_version_1.Box, str, bool]]],
) -> dict[int, list[Line]]:
    """Cluster the characters of every page into lines.

    Args:
        char_boxes: Mapping of page number to that page's character tuples.

    Returns:
        Mapping of page number to the lines found on that page; an empty dict
        when ``char_boxes`` is empty or otherwise falsy.
    """
    if not char_boxes:
        return {}

    return {
        page_num: process_page_chars_to_lines(page_chars)
        for page_num, page_chars in char_boxes.items()
    }


def draw_clustered_lines_to_image(pdf_path, clustered_lines: dict[int, list[Line]]):
    """Render debug images showing the clustered lines of a PDF.

    Every page that has lines is rendered at 300 dpi. Each line gets a red
    bounding box with a ``line <index>: <text>`` label, and each of its
    characters a thin green box. The result is written to
    ``ocr-box-image-clustered/<pdf stem>/<page_number>_annotated.png``.

    Args:
        pdf_path: Path of the PDF to render.
        clustered_lines: Mapping of page number to the lines of that page.
            The page number is used directly as the index into the document.

    Returns:
        None. The images are written to disk.
    """
    doc = pymupdf.open(pdf_path)
    try:
        debug_dir = Path("ocr-box-image-clustered") / Path(pdf_path).stem
        debug_dir.mkdir(parents=True, exist_ok=True)

        for page_number, lines in clustered_lines.items():
            if not lines:
                continue

            page = doc[page_number]
            pixmap = page.get_pixmap(dpi=300)
            image_height = pixmap.height
            image_width = pixmap.width

            # Copy the samples into a bytearray to obtain a writable buffer.
            samples = bytearray(pixmap.samples)
            image_array = np.frombuffer(samples, dtype=np.uint8).reshape(
                image_height, image_width, pixmap.n
            )

            if pixmap.n in [3, 4]:
                image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)

            # cv2.imwrite(str(debug_dir / f"{page_number}.png"), image_array)

            annotated_image = image_array.copy()

            # Scale factors from PDF units to image pixels.
            page_rect = page.rect
            x_scale = image_width / page_rect.width
            y_scale = image_height / page_rect.height

            for i, line in enumerate(lines):
                # A Line object is always truthy, so its chars must be checked
                # explicitly; a line without chars has no box to draw and
                # would make min()/max() below fail.
                if not line or not line.chars:
                    continue

                # Draw the encompassing line box first (red)
                char_boxes_in_line = [item[0] for item in line.chars]
                min_x = min(b.x for b in char_boxes_in_line)
                min_y = min(b.y for b in char_boxes_in_line)
                max_x2 = max(b.x2 for b in char_boxes_in_line)
                max_y2 = max(b.y2 for b in char_boxes_in_line)

                # The y coordinates are flipped: PDF y grows upwards, image
                # rows grow downwards.
                img_x0_line = int(min_x * x_scale)
                img_y1_line = int(image_height - (max_y2 * y_scale))
                img_x1_line = int(max_x2 * x_scale)
                img_y0_line = int(image_height - (min_y * y_scale))

                cv2.rectangle(
                    annotated_image,
                    (img_x0_line, img_y1_line),
                    (img_x1_line, img_y0_line),
                    (0, 0, 255),  # Red for lines
                    2,
                )

                cv2.putText(
                    annotated_image,
                    f"line {i}: {line.text}",
                    (img_x0_line, img_y1_line - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.7,
                    (0, 0, 255),
                    2,
                )

                # Then, draw the individual character boxes on top (green)
                for char_box, _, _ in line.chars:
                    img_x0_char = int(char_box.x * x_scale)
                    img_x1_char = int(char_box.x2 * x_scale)
                    # Scale first, truncate, then flip vertically.
                    img_y0_char = image_height - int(char_box.y * y_scale)
                    img_y1_char = image_height - int(char_box.y2 * y_scale)

                    cv2.rectangle(
                        annotated_image,
                        (img_x0_char, img_y1_char),
                        (img_x1_char, img_y0_char),
                        (0, 255, 0),  # Green for characters
                        1,  # Thinner line
                    )

            cv2.imwrite(
                str(debug_dir / f"{page_number}_annotated.png"), annotated_image
            )
    finally:
        # Release the document even when rendering or drawing fails.
        doc.close()


def main():
    """Run the extraction, clustering and drawing steps on a fixed PDF list.

    The PDF names are hard-coded and resolved relative to the current working
    directory. PDFs that yield no character boxes are skipped with a warning.
    """
    logging.basicConfig(level=logging.INFO, handlers=[RichHandler()])

    sample_pdfs = (
        "2404.16109v1.pdf",
        "2022 - Bortoli_Valentin De, Mathieu_Emile - Riemannian Score-Based Generative Modelling.pdf",
        "2024 - Regev_Oded - On Lattices, Learning with Errors, Random Linear Codes, and Cryptography.pdf",
        "2024 - Yang_Tian-Le, Lee_Kuang-Yao - Functional Linear Non-Gaussian Acyclic Model for Causal Discovery.pdf",
    )

    for pdf_path in sample_pdfs:
        logger.info(f"Processing {pdf_path}")
        char_boxes = extract_paragraph_line(pdf_path)
        if char_boxes:
            char_count = sum(map(len, char_boxes.values()))
            logger.info(
                f"Extracted {char_count} characters. Clustering them into lines..."
            )
            lines = cluster_chars_to_lines(char_boxes)

            total_lines = sum(map(len, lines.values()))
            logger.info(f"Clustered into {total_lines} lines. Drawing boxes...")

            # logger.info("--- Clustered Lines Text ---")
            # for page_num, page_lines in lines.items():
            #     logger.info(f"Page {page_num}:")
            #     for i, line in enumerate(page_lines):
            #         logger.info(f"  Line {i}: {line.text}")
            # logger.info("----------------------------")

            draw_clustered_lines_to_image(pdf_path, lines)
            logger.info(
                "Annotated images saved in 'ocr-box-image-clustered' directory."
            )
        else:
            logger.warning(f"No character boxes extracted from {pdf_path}")


# Run the debug pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


================================================
FILE: babeldoc/format/pdf/document_il/utils/fontmap.py
================================================
import enum
import functools
import logging
import re
from pathlib import Path

import pymupdf

from babeldoc.assets import assets
from babeldoc.format.pdf.document_il import PdfFont
from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.format.pdf.translation_config import TranslationConfig

logger = logging.getLogger(__name__)


class PrimaryFontFamily(enum.IntEnum):
    """Primary font family choices, with ``NONE`` meaning no known family."""

    SERIF = 1
    SANS_SERIF = 2
    SCRIPT = 3
    NONE = 4

    @classmethod
    def from_str(cls, value: str):
        """Map a family name to its enum member.

        Args:
            value: ``"serif"``, ``"sans-serif"`` or ``"script"``; the
                comparison is exact and case-sensitive.

        Returns:
            The matching member, or ``NONE`` for any other value
            (including ``None``).
        """
        known_names = (
            ("serif", cls.SERIF),
            ("sans-serif", cls.SANS_SERIF),
            ("script", cls.SCRIPT),
        )
        for name, member in known_names:
            if value == name:
                return member
        return cls.NONE


class FontMapper:
    stage_name = "Add Fonts"

    def __init__(self, translation_config: TranslationConfig):
        """Load the font family for the output language and build lookup tables.

        Args:
            translation_config: Supplies ``primary_font_family`` (``None``,
                ``"serif"``, ``"sans-serif"`` or ``"script"``) and ``lang_out``,
                which selects the font family returned by
                ``assets.get_font_family``.
        """
        self.translation_config = translation_config
        assert translation_config.primary_font_family in [
            None,
            "serif",
            "sans-serif",
            "script",
        ]
        self.primary_font_family = PrimaryFontFamily.from_str(
            translation_config.primary_font_family,
        )

        # The family maps a category name to a list of font file names.
        font_family = assets.get_font_family(translation_config.lang_out)
        self.font_file_names = []
        for k in (
            "normal",
            "script",
            "fallback",
            "base",
        ):
            self.font_file_names.extend(font_family[k])

        self.fonts: dict[str, pymupdf.Font] = {}
        self.fontid2fontpath: dict[str, Path] = {}
        for font_file_name in self.font_file_names:
            # A file may be listed in several categories; load it only once.
            if font_file_name in self.fontid2fontpath:
                continue
            font_path, font_metadata = assets.get_font_and_metadata(font_file_name)
            pymupdf_font = pymupdf.Font(fontfile=str(font_path))
            # Wrap the per-font glyph queries in LRU caches.
            pymupdf_font.has_glyph = functools.lru_cache(maxsize=10240, typed=True)(
                pymupdf_font.has_glyph,
            )
            pymupdf_font.char_lengths = functools.lru_cache(maxsize=10240, typed=True)(
                pymupdf_font.char_lengths,
            )
            self.fonts[font_file_name] = pymupdf_font
            self.fontid2fontpath[font_file_name] = font_path
            # Attach extra attributes to the pymupdf font object; the font file
            # name doubles as the font id.
            self.fonts[font_file_name].font_id = font_file_name
            self.fonts[font_file_name].font_path = font_path
            self.fonts[font_file_name].ascent_fontmap = font_metadata["ascent"]
            self.fonts[font_file_name].descent_fontmap = font_metadata["descent"]
            self.fonts[font_file_name].encoding_length = font_metadata[
                "encoding_length"
            ]

        self.normal_font_ids: list[str] = font_family["normal"]
        self.script_font_ids: list[str] = font_family["script"]
        self.fallback_font_ids: list[str] = font_family["fallback"]
        self.base_font_ids: list[str] = font_family["base"]
        # "base" is an alias for the first font of the base category.
        self.fontid2fontpath["base"] = self.fontid2fontpath[font_family["base"][0]]

        self.fontid2font: dict[str, pymupdf.Font] = {
            f.font_id: f for f in self.fonts.values()
        }

        self.fontid2font["base"] = self.fontid2font[self.base_font_ids[0]]

        self.normal_fonts: list[pymupdf.Font] = [
            self.fontid2font[font_id] for font_id in self.normal_font_ids
        ]
        self.script_fonts: list[pymupdf.Font] = [
            self.fontid2font[font_id] for font_id in self.script_font_ids
        ]
        self.fallback_fonts: list[pymupdf.Font] = [
            self.fontid2font[font_id] for font_id in self.fallback_font_ids
        ]

        self.base_font = self.fontid2font["base"]

        # Category name -> candidate fonts, as consumed by map_in_type().
        self.type2font: dict[str, list[pymupdf.Font]] = {
            "normal": self.normal_fonts,
            "script": self.script_fonts,
            "fallback": self.fallback_fonts,
            "base": [self.base_font],
        }

        # Shadow the bound methods with per-instance LRU-cached wrappers.
        self.has_char = functools.lru_cache(maxsize=10240, typed=True)(self.has_char)
        self.map_in_type = functools.lru_cache(maxsize=10240, typed=True)(
            self.map_in_type
        )

    def has_char(self, char_unicode: str):
        if len(char_unicode) != 1:
            return False
        current_char = ord(char_unicode)
        for font in self.fonts.values():
            if font.has_glyph(current_char):
                return True
        return False

    def map_in_type(
        self,
        bold: bool,
        italic: bool,
        monospaced: bool,
        serif: bool,
        char_unicode: str,
        font_type: str,
    ):
        if font_type == "script" and not italic:
            return None
        current_char = ord(char_unicode)
        for font in self.type2font[font_type]:
            if not font.has_glyph(current_char):
                continue
            if bool(bold) != bool(font.is_bold):
                continue
            # 不知道什么原因，思源黑体的 serif 属性为 1，先 workaround
            if bool(serif) and "serif" not in font.font_id.lower():
                continue
            if not bool(serif) and "serif" in font.font_id.lower():
                continue
            return font

        return None

    def map(self, original_font: PdfFont, char_unicode: str):
        """Choose a replacement font able to render ``char_unicode``.

        The style flags (bold, italic, monospaced, serif) are read from
        ``original_font``, which may be a ``pymupdf.Font`` or a ``PdfFont``,
        and then overridden by the configured primary font family. Candidates
        are tried in this order: style-matched script font, any script font
        (italic text only), style-matched normal font, style-matched fallback
        font, any fallback font with the glyph.

        Returns:
            The chosen font, or ``None`` when the font type is unknown or no
            font has the glyph (an error or warning is logged respectively).
        """
        code_point = ord(char_unicode)

        if isinstance(original_font, pymupdf.Font):
            bold, italic, monospaced, serif = (
                original_font.is_bold,
                original_font.is_italic,
                original_font.is_monospaced,
                original_font.is_serif,
            )
        elif isinstance(original_font, PdfFont):
            bold, italic, monospaced, serif = (
                original_font.bold,
                original_font.italic,
                original_font.monospace,
                original_font.serif,
            )
        else:
            logger.error(
                f"Unknown font type: {type(original_font)}. "
                f"Original font: {original_font}. "
                f"Char unicode: {char_unicode}. ",
            )
            return None

        # The configured primary family overrides what the source font says.
        family = self.primary_font_family
        if family == PrimaryFontFamily.SERIF:
            serif = True
        elif family == PrimaryFontFamily.SANS_SERIF:
            serif = False
        elif family == PrimaryFontFamily.SCRIPT:
            serif = False
            italic = True

        script_match = self.map_in_type(
            bold, italic, monospaced, serif, char_unicode, "script"
        )
        if script_match:
            return script_match

        # Italic text may use any script font that has the glyph, even when
        # the other style flags did not match above.
        if italic:
            for script_font in self.script_fonts:
                if script_font.has_glyph(code_point):
                    return script_font

        for category in ("normal", "fallback"):
            styled_match = self.map_in_type(
                bold, italic, monospaced, serif, char_unicode, category
            )
            if styled_match is not None:
                return styled_match

        # Last resort: ignore style entirely.
        for fallback_font in self.fallback_fonts:
            if fallback_font.has_glyph(code_point):
                return fallback_font

        logger.warning(
            f"Can't find font for {char_unicode}({code_point}). "
            f"Original font: {original_font.name}[{original_font.font_id}]. "
            f"Char unicode: {char_unicode}. ",
        )
        return None

    def get_used_font_ids(self, il: il_version_1.Document) -> set[str]:
        result = set()
        for page in il.page:
            for char in page.pdf_character:
                if char.pdf_style and char.pdf_style.font_id:
                    result.add(char.pdf_style.font_id)
            for para in page.pdf_paragraph:
                for comp in para.pdf_paragraph_composition:
                    if char := comp.pdf_character:
                        if char.pdf_style and char.pdf_style.font_id:
                            result.add(char.pdf_style.font_id)
        return result

    def add_font(self, doc_zh: pymupdf.Document, il: il_version_1.Document):
        """Embed the fonts used by ``il`` into ``doc_zh`` and register them.

        Steps, all reported through the progress monitor stage
        ``self.stage_name``:

        1. Insert every used font into the first page of ``doc_zh``.
        2. Scan every xref; wherever a font resource dictionary is found
           (``Resources/Font`` or a bare ``Font`` key, inline or indirect),
           add references to the inserted fonts that are not present yet.
        3. Append a ``PdfFont`` entry for each font to every IL page and to
           each of its XObjects.

        Args:
            doc_zh: The PyMuPDF document that receives the fonts.
            il: The intermediate-language document whose characters decide
                which fonts are needed.
        """
        used_font_ids = self.get_used_font_ids(il)
        font_list = [
            (k, v) for k, v in self.fontid2fontpath.items() if k in used_font_ids
        ]

        font_id = {}
        xreflen = doc_zh.xref_length()
        # One tick per font insert, per scanned xref, per PdfFont and per page.
        total = xreflen - 1 + len(font_list) + len(il.page) + len(font_list)
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            total,
        ) as pbar:
            if not il.page:
                pbar.advance(total)
                return
            for font in font_list:
                if font[0] in font_id:
                    continue
                font_id[font[0]] = doc_zh[0].insert_font(font[0], font[1])
                pbar.advance(1)
            for xref in range(1, xreflen):
                pbar.advance(1)
                # The second label covers objects that are themselves a
                # resource dictionary (e.g. XObject-based resources).
                for label in ["Resources/", ""]:
                    # Use a separate variable for the object that is written
                    # to, so an indirect font dictionary found for one label
                    # does not change which object the next label inspects.
                    target_xref = xref
                    try:  # xref reads/writes may fail on odd objects
                        font_res = doc_zh.xref_get_key(target_xref, f"{label}Font")
                        if font_res is None:
                            continue
                        target_key_prefix = f"{label}Font/"
                        if font_res[0] == "xref":
                            # Indirect font dictionary: follow the reference
                            # and write the keys directly into that object.
                            resource_xref_id = re.search(
                                "(\\d+) 0 R",
                                font_res[1],
                            ).group(1)
                            target_xref = int(resource_xref_id)
                            font_res = ("dict", doc_zh.xref_object(target_xref))
                            target_key_prefix = ""
                        if font_res[0] == "dict":
                            for font in font_list:
                                target_key = f"{target_key_prefix}{font[0]}"
                                font_exist = doc_zh.xref_get_key(
                                    target_xref, target_key
                                )
                                if font_exist[0] == "null":
                                    doc_zh.xref_set_key(
                                        target_xref,
                                        target_key,
                                        f"{font_id[font[0]]} 0 R",
                                    )
                    except Exception:
                        # Best effort: keep going, but leave a trace for
                        # debugging instead of failing silently.
                        logger.debug(
                            "Failed to add fonts to xref %s (label %r)",
                            xref,
                            label,
                            exc_info=True,
                        )

            # Build all PdfFont objects up front.
            pdf_fonts = []
            for font_name, _ in font_list:
                assert font_name in self.fontid2font, f"Font {font_name} not found"
                mupdf_font = self.fontid2font[font_name]
                descent_fontmap = mupdf_font.descent_fontmap
                ascent_fontmap = mupdf_font.ascent_fontmap
                encoding_length = mupdf_font.encoding_length

                pdf_fonts.append(
                    il_version_1.PdfFont(
                        name=font_name,
                        xref_id=font_id[font_name],
                        font_id=font_name,
                        encoding_length=encoding_length,
                        bold=mupdf_font.is_bold,
                        italic=mupdf_font.is_italic,
                        monospace=mupdf_font.is_monospaced,
                        serif=mupdf_font.is_serif,
                        descent=descent_fontmap,
                        ascent=ascent_fontmap,
                    ),
                )
                pbar.advance(1)

            # Attach the fonts to every page and to each of its XObjects.
            for page in il.page:
                page.pdf_font.extend(pdf_fonts)
                for xobj in page.pdf_xobject:
                    xobj.pdf_font.extend(pdf_fonts)
                pbar.advance(1)


================================================
FILE: babeldoc/format/pdf/document_il/utils/formular_helper.py
================================================
import base64
import functools
import re
import unicodedata

from babeldoc.format.pdf.document_il.il_version_1 import Box
from babeldoc.format.pdf.document_il.il_version_1 import Page
from babeldoc.format.pdf.document_il.il_version_1 import PdfFormula
from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper
from babeldoc.format.pdf.document_il.utils.layout_helper import (
    formular_height_ignore_char,
)
from babeldoc.format.pdf.translation_config import TranslationConfig


def is_formulas_start_char(
    char: str,
    font_mapper: FontMapper,
    translation_config: TranslationConfig,
) -> bool:
    """Decide whether ``char`` may begin a formula.

    A character qualifies when it is an unresolved ``(cid:N)`` glyph, cannot
    be rendered by any mapped font, matches the configured
    ``formular_char_pattern``, belongs to a modifier / math / separator /
    private-use Unicode category, lies in the Greek block, or is a digit,
    square bracket or bullet.
    """
    if not char:
        return False
    if "(cid:" in char:
        return True

    if not font_mapper.has_char(char):
        # A multi-character string whose pieces are all renderable is
        # ordinary text; anything else unrenderable is treated as formula.
        renderable_piecewise = len(char) > 1 and all(
            font_mapper.has_char(piece) for piece in char
        )
        return not renderable_piecewise

    custom_pattern = translation_config.formular_char_pattern
    if custom_pattern and re.match(custom_pattern, char):
        return True

    if char != " ":
        lead = char[0]
        # Modifier marks, math symbols and separators, plus private use.
        formula_categories = ("Mn", "Sk", "Sm", "Zl", "Zp", "Zs", "Co")
        if unicodedata.category(lead) in formula_categories:
            return True
        # Greek letters.
        if 0x370 <= ord(lead) < 0x400:
            return True

    return re.match("[0-9\\[\\]•]", char) is not None


def is_formulas_middle_char(
    char: str,
    font_mapper: FontMapper,
    translation_config: TranslationConfig,
) -> bool:
    """Decide whether ``char`` may appear inside an already started formula.

    Everything that can start a formula qualifies, and so does text that
    begins with a comma.
    """
    return is_formulas_start_char(char, font_mapper, translation_config) or bool(
        re.match(",", char)
    )


def collect_page_formula_font_ids(
    page: Page, formular_font_pattern: str | None
) -> tuple[set[int], dict[str, set[int]]]:
    """Gather the ids of formula fonts for a page and for each of its XObjects.

    Args:
        page: The page whose ``pdf_font`` and ``pdf_xobject`` lists are read.
        formular_font_pattern: Optional regex passed to ``is_formulas_font``
            to override the default broad formula-font pattern.

    Returns:
        A pair ``(page_ids, xobj_ids)``. ``page_ids`` holds the font ids that
        ``is_formulas_font`` accepts at page level. ``xobj_ids`` maps each
        ``xobj_id`` to the page-level set adjusted by that XObject's own
        fonts: formula fonts are added, non-formula fonts are removed.
    """
    page_level_ids = {
        font.font_id
        for font in (page.pdf_font or ())
        if is_formulas_font(font.name, formular_font_pattern)
    }

    per_xobject_ids = {}
    for xobj in page.pdf_xobject or ():
        # Each XObject inherits the page-level set and then overrides it.
        inherited = page_level_ids.copy()
        for font in xobj.pdf_font or ():
            if is_formulas_font(font.name, formular_font_pattern):
                inherited.add(font.font_id)
            else:
                # A font redefined inside the XObject as a non-formula font
                # shadows the page-level entry with the same id.
                inherited.discard(font.font_id)
        per_xobject_ids[xobj.xobj_id] = inherited

    return page_level_ids, per_xobject_ids


@functools.cache
def is_formulas_font(font_name: str, formular_font_pattern: str | None) -> bool:
    pattern_text = (
        r"^("
        r"|BLKFort.*"
        r"|Cambria.*"
        r"|EUAlbertina.*"
        r"|NimbusRomNo9L.*"
        r"|GlosaMath.*"
        r"|URWPalladioL.*"
        r"|CMSS.+"
        r"|Arial.*"
        r"|TimesNewRoman.*"
        r"|SegoeUI.*"
        r"|CMTT9.*"
        r"|CMSL10.*"
        r"|CMTI10.*"
        r"|CMTT10.*"
        r"|CMTI12.*"
        r"|CMR12.*"
        r"|MeridienLTStd.*"
        r"|Calibri.*"
        r"|STIXMathJax_Main.*"
        r"|.*NewBaskerville.*"
        r"|.*FranklinGothic.*"
        r"|.*AGaramondPro.*"
        r"|.*PalatinoItalCOR.*"
        r"|.*ITCSymbolStd.*"
        r"|.*PlantinStd.*"
        r"|.*DJ5EscrowCond.*"
        r"|.*ExchangeBook.*"
        r"|.*DJ5Exchange.*"
        r"|.*Times.*"
        r"|.*PalatinoLTStd.*"
        r"|.*Times New Roman,Italic.*"
        r"|.*EhrhardtMT.*"
        r"|.*GillSansMTStd.*"
        r"|.*MedicineSymbols3.*"
        r"|.*HardingText.*"
        r"|.*GraphikNaturel.*"
        r"|.*HelveticaNeue.*"
        r"|.*GoudyOldStyleT.*"
        r"|.*Symbol.*"
        r"|.*ScalaSansLF.*"
        r"|.*ScalaLF.*"
        r"|.*ScalaSansPro.*"
        r"|.*PetersburgC.*"
        r"|.*ColiseumC.*"
        r"|.*Gantari.*"
        r"|.*OptimaLTStd.*"
        r"|.*CronosPro.*"
        r"|.*ACaslon.*"
        r"|.*Frutiger.*"
        r"|.*BrandonGrotesque.*"
        r"|.*FairfieldLH.*"
        r"|.*CaeciliaLTStd.*"
        r"|.*Whitney.*"
        r"|.*Mercury.*"
        r"|.*SabonLTStd.*"
        r"|.*AnonymousPro.*"
        r"|.*SabonLTPro.*"
        r"|.*ArnoPro.*"
        r"|.*CharisSIL.*"
        r"|.*MSReference.*"
        r"|.*CMUSerif-Roman.*"
        r"|.*CourierNewPS.*"
        r"|.*XCharter.*"
        r"|.*GillSans.*"
        r"|.*Perpetua.*"
        r"|.*GEInspira.*"
        r"|.*AGaramond.*"
        r"|.*BMath.*"
        r"|.*MSTT.*"
        r"|.*Bookinsanity.*"
        r"|.*ScalySans.*"
        r"|.*Code2000.*"
        r"|.*Minion.*"
        r"|.*JansonTextLT.*"
        r"|.*MathPack.*"
        r"|.*Macmillan.*"
        r"|.*NimbusSan.*"
        r"|.*Mincho.*"
        r"|.*Amerigo.*"
        r"|.*MSGloriolaIIStd.*"
        r"|.*CMU.+"
        r"|.*LinLibertine.*"
        r"|.*txsys.*"
        r")$"
    )
    precise_formula_font_pattern = (
        r"^("
        # r"|.*CambriaMath.*"
        # r"|.*Cambria Math.*"
        r"|.*Asana.*"
        r"|.*MiriamMonoCLM-BookOblique.*"
        r"|.*Miriam Mono CLM.*"
        r"|.*Logix.*"
        r"|.*AeBonum.*"
        r"|.*AeMRoman.*"
        r"|.*AePagella.*"
        r"|.*AeSchola.*"
        r"|.*Concrete.*"
        r"|.*LatinModernMathCompanion.*"
        r"|.*Latin Modern Math Companion.*"
        r"|.*RalphSmithsFormalScriptCompanion.*"
        r"|.*Ralph Smiths Formal Script Companion.*"
        r"|.*TeXGyreBonumMathCompanion.*"
        r"|.*TeX Gyre Bonum Companion.*"
        r"|.*TeXGyrePagellaMathCompanion.*"
        r"|.*TeX Gyre Pagella Math Companion.*"
        r"|.*TeXGyreTermesMathCompanion.*"
        r"|.*TeX Gyre Termes Math Companion.*"
        r"|.*XITSMathCompanion.*"
        r"|.*XITS Math Companion.*"
        r"|.*Erewhon.*"
        r"|.*Euler-Math.*"
        r"|.*Euler Math.*"
        r"|.*FiraMath-Regular.*"
        r"|.*Fira Math.*"
        r"|.*Garamond-Math.*"
        r"|.*GFSNeohellenicMath.*"
        r"|.*KpMath.*"
        r"|.*Lete Sans Math.*"
        r"|.*LeteSansMath.*"
        # r"|.*LinLibertineO.*"
        r"|.*Linux Libertine O.*"
        r"|.*LibertinusMath-Regular.*"
        r"|.*Libertinus Math.*"
        r"|.*LatinModernMath-Regular.*"
        r"|.*Latin Modern Math.*"
        r"|.*Luciole.*"
        r"|.*NewCM.*"
        r"|.*NewComputerModern.*"
        r"|.*OldStandard-Math.*"
        r"|.*STIXMath-Regular.*"
        r"|.*STIX Math.*"
        r"|.*STIXTwoMath-Regular.*"
        r"|.*STIX Two Math.*"
        r"|.*TeXGyreBonumMath.*"
        r"|.*TeX Gyre Bonum Math.*"
        r"|.*TeXGyreDejaVuMath.*"
        r"|.*TeX Gyre DejaVu Math.*"
        r"|.*TeXGyrePagellaMath.*"
        r"|.*TeX Gyre Pagella Math.*"
        r"|.*TeXGyreScholaMath.*"
        r"|.*TeX Gyre Schola Math.*"
        r"|.*TeXGyreTermesMath.*"
        r"|.*TeX Gyre Termes Math.*"
        r"|.*XCharter-Math.*"
        r"|.*XCharter Math.*"
        r"|.*XITSMath-Bold.*"
        r"|.*XITS Math.*"
        r"|.*XITSMath.*"
        r"|.*IBMPlexMath.*"
        r"|.*IBM Plex Math.*"
        r")$"
    )
    if formular_font_pattern:
        broad_formula_font_pattern = formular_font_pattern
    else:
        broad_formula_font_pattern = (
            r"(CM[^RB]"
            r"|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]"
            r"|LINE"
            r"|LCIRCLE"
            r"|TeX-"
            r"|rsfs"
            r"|txsy"
            r"|wasy"
            r"|stmary"
            r"|.*Mono"
            r"|.*Code"
            # r"|.*Ital"
            r"|.*Sym"
            r"|.*Math"
            r"|AdvP4C4E74"
            r"|AdvPSSym"
            r"|AdvP4C4E59"
            r")"
        )

    if font_name.startswith("BASE64:"):
        font_name_bytes = base64.b64decode(font_name[7:])
        font = font_name_bytes.split(b"+")[-1]
        pattern_text = pattern_text.encode()
        broad_formula_font_pattern = broad_formula_font_pattern.encode()
    else:
        font = font_name.split("+")[-1]

    if not font:
        return False

    if re.match(precise_formula_font_pattern, font):
        return True
    elif re.match(pattern_text, font):
        return False
    elif re.match(broad_formula_font_pattern, font):
        return True

    return False


def update_formula_data(formula: PdfFormula):
    """Recompute a formula's bounding box and default its offsets.

    The horizontal extent covers every character's visual bbox. The vertical
    extent covers only characters that ``formular_height_ignore_char`` does
    not exclude, unless all of them are excluded, in which case every
    character is used. Falsy ``y_offset``, ``x_offset`` and ``x_advance`` are
    set to 0.
    """
    members = formula.pdf_character
    left = min(member.visual_bbox.box.x for member in members)
    right = max(member.visual_bbox.box.x2 for member in members)

    height_members = [
        member for member in members if not formular_height_ignore_char(member)
    ]
    if not height_members:
        # Every character is height-ignorable: fall back to all of them.
        height_members = members
    bottom = min(member.visual_bbox.box.y for member in height_members)
    top = max(member.visual_bbox.box.y2 for member in height_members)

    formula.box = Box(left, bottom, right, top)
    for attribute in ("y_offset", "x_offset", "x_advance"):
        if not getattr(formula, attribute):
            setattr(formula, attribute, 0)


================================================
FILE: babeldoc/format/pdf/document_il/utils/layout_helper.py
================================================
import logging
import math
import re
import unicodedata
from typing import Literal

import regex
from pymupdf import Font

from babeldoc.format.pdf.document_il import GraphicState
from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.format.pdf.document_il.il_version_1 import Box
from babeldoc.format.pdf.document_il.il_version_1 import PdfCharacter
from babeldoc.format.pdf.document_il.il_version_1 import PdfParagraph
from babeldoc.format.pdf.document_il.il_version_1 import PdfParagraphComposition

logger = logging.getLogger(__name__)
# HEIGHT_NOT_USFUL_CHAR_IN_CHAR = (
#     "∑︁",
#     # 暂时假设 cid:17 和 cid 16 是特殊情况
#     # 来源于 arXiv:2310.18608v2 第九页公式大括号
#     "(cid:17)",
#     "(cid:16)",
#     # arXiv:2411.19509v2 第四页 []
#     "(cid:104)",
#     "(cid:105)",
#     # arXiv:2411.19509v2 第四页 公式的 | 竖线
#     "(cid:13)",
#     "∑︁",
#     # arXiv:2412.05265 27 页 累加号
#     "(cid:88)",
#     # arXiv:2412.05265 16 页 累乘号
#     "(cid:89)",
#     # arXiv:2412.05265 27 页 积分
#     "(cid:90)",
#     # arXiv:2412.05265 32 页 公式左右的中括号
#     "(cid:2)",
#     "(cid:3)",
#     "·",
#     "√",
# )

# A bbox-based analysis mechanism exists now, so the hard-coded character list
# above is no longer needed. The tuple holds only None, so no real character
# string is a member of it.
HEIGHT_NOT_USFUL_CHAR_IN_CHAR = (None,)


# Opening / closing bracket glyphs, including unresolved "(cid:N)" codes.
LEFT_BRACKET = ("(cid:8)", "(", "(cid:16)", "{", "[", "(cid:104)", "(cid:2)")
RIGHT_BRACKET = ("(cid:9)", ")", "(cid:17)", "}", "]", "(cid:105)", "(cid:3)")

# Single-character class of bullet-like marks: geometric bullets, footnote
# daggers, superscript/subscript digits and letters, and similar symbols.
BULLET_POINT_PATTERN = re.compile(
    r"[■•⚫⬤◆◇○●◦‣⁃▪▫∗†‡¹²³⁴⁵⁶⁷⁸⁹⁰₁₂₃₄₅₆₇₈₉₀ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻ¶※⁑⁂⁕⁎⁜❧☙⁋‖‽·]"
)


def is_bullet_point(char: PdfCharacter) -> bool:
    """Tell whether a character's text starts with a bullet-like mark.

    Args:
        char: The character to inspect.

    Returns:
        bool: True when ``char.char_unicode`` matches ``BULLET_POINT_PATTERN``
        at its beginning.
    """
    return BULLET_POINT_PATTERN.match(char.char_unicode) is not None


def calculate_box_iou(box1: Box, box2: Box) -> float:
    """Compute the Intersection over Union (IoU) of two boxes.

    Args:
        box1: First box (``x``, ``y``, ``x2``, ``y2``), or None.
        box2: Second box, or None.

    Returns:
        float: Overlap area divided by union area; 0.0 when either box is
        None, the boxes do not overlap, or the union area is not positive.
    """
    if box1 is None or box2 is None:
        return 0.0

    # Corners of the overlapping rectangle.
    left, top = max(box1.x, box2.x), max(box1.y, box2.y)
    right, bottom = min(box1.x2, box2.x2), min(box1.y2, box2.y2)
    if left >= right or top >= bottom:
        return 0.0

    overlap = (right - left) * (bottom - top)
    first_area = (box1.x2 - box1.x) * (box1.y2 - box1.y)
    second_area = (box2.x2 - box2.x) * (box2.y2 - box2.y)
    union = first_area + second_area - overlap

    # Guard against division by zero.
    return 0.0 if union <= 0 else overlap / union


def formular_height_ignore_char(char: PdfCharacter):
    """Tell whether a character should be left out of formula height math.

    That is the case when the character has no ``pdf_character_id`` or its
    text is listed in ``HEIGHT_NOT_USFUL_CHAR_IN_CHAR``.
    """
    if char.pdf_character_id is None:
        return True
    return char.char_unicode in HEIGHT_NOT_USFUL_CHAR_IN_CHAR


def box_to_tuple(box: Box) -> tuple[float, float, float, float]:
    """Return ``(x, y, x2, y2)`` for a box, or all zeros when it is None."""
    return (0, 0, 0, 0) if box is None else (box.x, box.y, box.x2, box.y2)


class Layout:
    """A layout region identified by an id and a name."""

    def __init__(self, layout_id, name):
        self.id = layout_id
        self.name = name

    @staticmethod
    def is_newline(prev_char: PdfCharacter | None, curr_char: PdfCharacter) -> bool:
        """Tell whether ``curr_char`` starts a new line after ``prev_char``.

        A line break is assumed when the current character's ``y2`` is below
        the previous character's ``y``, or when it ends more than ten
        character widths to the left of where the previous character starts.
        A break is never reported when there is no previous character, or
        when either character is excluded by ``formular_height_ignore_char``.
        """
        if prev_char is None:
            return False

        # Width of the wider of the two characters, used as the unit for the
        # "jumped far back to the left" test.
        char_width = max(
            curr_char.box.x2 - curr_char.box.x,
            prev_char.box.x2 - prev_char.box.x,
        )
        moved_below = curr_char.box.y2 < prev_char.box.y
        jumped_back = curr_char.box.x2 < prev_char.box.x - char_width * 10
        if not (moved_below or jumped_back):
            return False

        # Characters whose height is not meaningful cannot signal a new line.
        return not (
            formular_height_ignore_char(curr_char)
            or formular_height_ignore_char(prev_char)
        )


def get_paragraph_length_except(
    paragraph: PdfParagraph,
    except_chars: str,
    font: Font,
) -> float:
    """Sum the horizontal extent of a paragraph's content.

    Args:
        paragraph: Paragraph whose compositions are measured.
        except_chars: Characters skipped inside lines, same-style runs and
            unicode runs (not applied to single characters or formulas).
        font: Font used to measure unicode-only runs through
            ``font.char_lengths(char, font_size)``.

    Returns:
        The accumulated width. Compositions of an unknown kind are logged and
        contribute nothing.
    """
    length = 0
    for composition in paragraph.pdf_paragraph_composition:
        if composition.pdf_character:
            # Other code in this module treats pdf_character as a single
            # character, so do not index it blindly; a sequence is still
            # accepted for backward compatibility.
            single = composition.pdf_character
            if isinstance(single, (list, tuple)):
                single = single[0]
            length += single.box.x2 - single.box.x
        elif composition.pdf_same_style_characters:
            for pdf_char in composition.pdf_same_style_characters.pdf_character:
                if pdf_char.char_unicode in except_chars:
                    continue
                length += pdf_char.box.x2 - pdf_char.box.x
        elif composition.pdf_same_style_unicode_characters:
            unicode_run = composition.pdf_same_style_unicode_characters
            for char_unicode in unicode_run.unicode:
                if char_unicode in except_chars:
                    continue
                length += font.char_lengths(
                    char_unicode,
                    unicode_run.pdf_style.font_size,
                )[0]
        elif composition.pdf_line:
            for pdf_char in composition.pdf_line.pdf_character:
                if pdf_char.char_unicode in except_chars:
                    continue
                length += pdf_char.box.x2 - pdf_char.box.x
        elif composition.pdf_formula:
            length += composition.pdf_formula.box.x2 - composition.pdf_formula.box.x
        else:
            logger.error(
                f"Unknown composition type. "
                f"Composition: {composition}. "
                f"Paragraph: {paragraph}. ",
            )
            continue
    return length


def get_paragraph_unicode(paragraph: PdfParagraph) -> str:
    """Render a paragraph's compositions as one Unicode string.

    Characters (or raw unicode text) are gathered from every composition in
    order and handed to ``get_char_unicode_string``. Compositions of an
    unknown kind are logged and skipped.
    """
    collected = []
    for composition in paragraph.pdf_paragraph_composition:
        if line := composition.pdf_line:
            collected.extend(line.pdf_character)
        elif same_style := composition.pdf_same_style_characters:
            collected.extend(same_style.pdf_character)
        elif unicode_run := composition.pdf_same_style_unicode_characters:
            # Extending with a str adds its characters one by one.
            collected.extend(unicode_run.unicode)
        elif formula := composition.pdf_formula:
            collected.extend(formula.pdf_character)
        elif single := composition.pdf_character:
            collected.append(single)
        else:
            logger.error(
                f"Unknown composition type. "
                f"Composition: {composition}. "
                f"Paragraph: {paragraph}. ",
            )
    return get_char_unicode_string(collected)


# Matches one or more whitespace characters; used to collapse whitespace runs
# into a single space.
SPACE_REGEX = regex.compile(r"\s+", regex.UNICODE)


def get_char_unicode_string(chars: list[PdfCharacter | str]) -> str:
    """
    Convert a list of characters into a Unicode string, inserting spaces
    where the gap between neighbouring characters suggests one. Some PDFs do
    not encode spaces explicitly, so they have to be inferred from spacing.

    Args:
        chars: Characters, each either a PdfCharacter or a plain string.

    Returns:
        str: NFKC-normalized text with whitespace runs collapsed to a single
        space and surrounding whitespace stripped.
    """
    # Collect the distinct forward gaps (> 1) between adjacent PdfCharacters.
    gaps = set()
    for left, right in zip(chars, chars[1:]):
        if isinstance(left, PdfCharacter) and isinstance(right, PdfCharacter):
            gap = right.box.x - left.box.x2
            if gap > 1:
                gaps.add(gap)

    # Threshold for inserting a space: the second-smallest distinct gap, the
    # only gap when there is just one, or 1 when there is none.
    ordered_gaps = sorted(gaps)
    if not ordered_gaps:
        space_threshold = 1
    else:
        space_threshold = ordered_gaps[min(1, len(ordered_gaps) - 1)]

    pieces = []
    last_index = len(chars) - 1
    for index, item in enumerate(chars):
        # Plain strings are copied through untouched.
        if not isinstance(item, PdfCharacter):
            pieces.append(item)
            continue

        # Normalize the character and turn any whitespace into " ".
        pieces.append(
            regex.sub(
                r"\s+",
                " ",
                unicodedata.normalize("NFKC", item.char_unicode),
            )
        )

        # An explicit space never needs an inferred one after it.
        if item.char_unicode == " ":
            continue

        if index < last_index and isinstance(chars[index + 1], PdfCharacter):
            follower = chars[index + 1]
            gap = follower.box.x - item.box.x2
            # Wide gap or a line break: insert a space.
            if gap >= space_threshold or Layout.is_newline(item, follower):
                pieces.append(" ")

    joined = "".join(pieces)
    normalized = unicodedata.normalize("NFKC", joined)
    return SPACE_REGEX.sub(" ", normalized).strip()


def get_paragraph_max_height(paragraph: PdfParagraph) -> float:
    """
    Return the height of the tallest typesetting unit in a paragraph.

    Character heights come from their boxes, formula heights from the formula
    box, and unicode-only runs are estimated by their style's font size.
    ``None`` compositions are skipped; unknown kinds are logged and skipped.

    Args:
        paragraph: The PDF paragraph to measure.

    Returns:
        float: The maximum height found, or 0.0 when there is nothing to
        measure.
    """
    max_height = 0.0
    for composition in paragraph.pdf_paragraph_composition:
        if composition is None:
            continue
        if composition.pdf_character:
            # Other code in this module treats pdf_character as a single
            # character, so do not index it blindly; a sequence is still
            # accepted for backward compatibility.
            single = composition.pdf_character
            if isinstance(single, (list, tuple)):
                single = single[0]
            max_height = max(max_height, single.box.y2 - single.box.y)
        elif composition.pdf_same_style_characters:
            for pdf_char in composition.pdf_same_style_characters.pdf_character:
                char_height = pdf_char.box.y2 - pdf_char.box.y
                max_height = max(max_height, char_height)
        elif composition.pdf_same_style_unicode_characters:
            # Pure unicode runs have no boxes; use the font size as estimate.
            font_size = (
                composition.pdf_same_style_unicode_characters.pdf_style.font_size
            )
            max_height = max(max_height, font_size)
        elif composition.pdf_line:
            for pdf_char in composition.pdf_line.pdf_character:
                char_height = pdf_char.box.y2 - pdf_char.box.y
                max_height = max(max_height, char_height)
        elif composition.pdf_formula:
            formula_height = (
                composition.pdf_formula.box.y2 - composition.pdf_formula.box.y
            )
            max_height = max(max_height, formula_height)
        else:
            logger.error(
                f"Unknown composition type. "
                f"Composition: {composition}. "
                f"Paragraph: {paragraph}. ",
            )
            continue
    return max_height


def is_same_style(style1, style2) -> bool:
    """Return whether two styles share font, font size, and graphic state.

    When either style is ``None`` the result is ``True`` only if both are
    ``None``. Font sizes count as equal when they differ by less than 0.02.
    """
    if style1 is None or style2 is None:
        return style1 is style2

    if style1.font_id != style2.font_id:
        return False

    size_gap = math.fabs(style1.font_size - style2.font_size)
    if not size_gap < 0.02:
        return False

    return is_same_graphic_state(style1.graphic_state, style2.graphic_state)


def is_same_style_except_size(style1, style2) -> bool:
    """Return whether two styles match while tolerating a font-size difference.

    The styles must share the font id and graphic state. Their font sizes only
    need to be roughly comparable: the ratio ``style1 / style2`` must lie
    strictly between 0.7 and 1.3 in absolute value.

    When either style is ``None`` the result is ``True`` only if both are
    ``None``.
    """
    if style1 is None or style2 is None:
        return style1 is style2

    if style1.font_id != style2.font_id:
        return False

    if style2.font_size == 0:
        # The size ratio is undefined for a zero divisor. Treat two zero-sized
        # styles as size-compatible and anything else as a mismatch instead of
        # raising ZeroDivisionError.
        if style1.font_size != 0:
            return False
    elif not 0.7 < math.fabs(style1.font_size / style2.font_size) < 1.3:
        return False

    return is_same_graphic_state(style1.graphic_state, style2.graphic_state)


def is_same_style_except_font(style1, style2) -> bool:
    """Return whether two styles match in font size and graphic state.

    The font id is ignored. When either style is ``None`` the result is
    ``True`` only if both are ``None``. Font sizes count as equal when they
    differ by less than 0.02.
    """
    if style1 is None or style2 is None:
        return style1 is style2

    size_gap = math.fabs(style1.font_size - style2.font_size)
    if not size_gap < 0.02:
        return False

    return is_same_graphic_state(style1.graphic_state, style2.graphic_state)


def is_same_graphic_state(state1: GraphicState, state2: GraphicState) -> bool:
    """Return whether two graphic states are considered the same.

    Only ``passthrough_per_char_instruction`` is compared. When either state
    is ``None`` the result is ``True`` only if both are ``None``.
    """
    if state1 is None or state2 is None:
        return state1 is state2

    first_instruction = state1.passthrough_per_char_instruction
    second_instruction = state2.passthrough_per_char_instruction
    return first_instruction == second_instruction


def add_space_dummy_chars(paragraph: PdfParagraph) -> None:
    """Insert dummy space characters into a PDF paragraph, in place.

    Two passes are made over the paragraph's compositions:

    1. Spaces are added inside each composition's character list.
    2. Spaces are added between neighbouring compositions when the horizontal
       gap between them is larger than 1.

    Args:
        paragraph: The PDF paragraph to modify.
    """
    compositions = paragraph.pdf_paragraph_composition

    # Pass 1: spaces inside each composition.
    for comp in compositions:
        if comp.pdf_line:
            _add_space_dummy_chars_to_list(comp.pdf_line.pdf_character)
        elif comp.pdf_same_style_characters:
            _add_space_dummy_chars_to_list(
                comp.pdf_same_style_characters.pdf_character
            )
        elif comp.pdf_same_style_unicode_characters:
            # Unicode runs need no processing; per the original note they only
            # appear in already-translated output.
            continue
        elif comp.pdf_formula:
            _add_space_dummy_chars_to_list(comp.pdf_formula.pdf_character)

    # Pass 2: spaces between adjacent compositions.
    for curr_comp, next_comp in zip(compositions, compositions[1:]):
        left_char = _get_last_char_from_composition(curr_comp)
        if not left_char:
            continue

        right_char = _get_first_char_from_composition(next_comp)
        if not right_char:
            continue

        # Only forward gaps wider than 1 are turned into a space.
        gap = right_char.box.x - left_char.box.x2
        if not gap > 1:
            continue

        # The dummy space fills the gap and borrows the vertical extent,
        # style and scale of the character on its left.
        space_box = Box(
            x=left_char.box.x2,
            y=left_char.box.y,
            x2=left_char.box.x2 + gap,
            y2=left_char.box.y2,
        )
        space_char = PdfCharacter(
            pdf_style=left_char.pdf_style,
            box=space_box,
            char_unicode=" ",
            scale=left_char.scale,
            advance=space_box.x2 - space_box.x,
            visual_bbox=il_version_1.VisualBbox(box=space_box),
        )

        # Append the space to the end of the current composition. A standalone
        # ``pdf_character`` composition has no list to append to, so the space
        # is dropped in that case.
        for holder in (
            curr_comp.pdf_line,
            curr_comp.pdf_same_style_characters,
            curr_comp.pdf_formula,
        ):
            if holder:
                holder.pdf_character.append(space_char)
                break


def _get_first_char_from_composition(
    comp: PdfParagraphComposition,
) -> PdfCharacter | None:
    """Return the first character of a composition, or ``None`` if it has none.

    Containers are checked in the order line, same-style characters, formula;
    a standalone ``pdf_character`` is used as the last resort.
    """
    containers = (comp.pdf_line, comp.pdf_same_style_characters, comp.pdf_formula)
    for container in containers:
        if container and container.pdf_character:
            return container.pdf_character[0]

    if comp.pdf_character:
        return comp.pdf_character
    return None


def _get_last_char_from_composition(
    comp: PdfParagraphComposition,
) -> PdfCharacter | None:
    """Return the last character of a composition, or ``None`` if it has none.

    Containers are checked in the order line, same-style characters, formula;
    a standalone ``pdf_character`` is used as the last resort.
    """
    containers = (comp.pdf_line, comp.pdf_same_style_characters, comp.pdf_formula)
    for container in containers:
        if container and container.pdf_character:
            return container.pdf_character[-1]

    if comp.pdf_character:
        return comp.pdf_character
    return None


def _add_space_dummy_chars_to_list(chars: list[PdfCharacter]) -> None:
    """Insert dummy space characters into a character list, in place.

    A space is inserted after a character when the horizontal gap to the next
    character reaches a threshold, or when ``Layout.is_newline`` reports a
    line break between the two.

    Args:
        chars: The list of ``PdfCharacter`` objects to modify.
    """
    if not chars:
        return

    # Distinct forward gaps (> 1) between consecutive characters, ascending.
    distinct_gaps = sorted(
        {
            following.box.x - current.box.x2
            for current, following in zip(chars, chars[1:])
            if following.box.x - current.box.x2 > 1
        }
    )

    # The threshold is the second-smallest distinct gap, falling back to the
    # only gap when there is just one, and to 1 when there is none.
    if not distinct_gaps:
        space_threshold = 1
    else:
        space_threshold = distinct_gaps[min(1, len(distinct_gaps) - 1)]

    position = 0
    while position < len(chars) - 1:
        left_char = chars[position]
        right_char = chars[position + 1]

        gap = right_char.box.x - left_char.box.x2
        if not (gap >= space_threshold or Layout.is_newline(left_char, right_char)):
            position += 1
            continue

        # A line break can yield a negative gap; use its magnitude.
        if gap < 0:
            gap = -gap

        # The dummy space starts at the right edge of the left character and
        # is never wider than the threshold.
        space_box = Box(
            x=left_char.box.x2,
            y=left_char.box.y,
            x2=left_char.box.x2 + min(gap, space_threshold),
            y2=left_char.box.y2,
        )
        space_char = PdfCharacter(
            pdf_style=left_char.pdf_style,
            box=space_box,
            char_unicode=" ",
            scale=left_char.scale,
            advance=space_box.x2 - space_box.x,
            visual_bbox=il_version_1.VisualBbox(box=space_box),
        )

        chars.insert(position + 1, space_char)
        # Step over the space that was just inserted.
        position += 2


def build_layout_index(page):
    """Build an R-tree index over the layouts of a page.

    Args:
        page: Object whose ``page_layout`` attribute lists the layouts.

    Returns:
        A ``(layout_index, layout_map)`` pair. ``layout_map`` maps the position
        of every layout in ``page.page_layout`` to the layout itself;
        ``layout_index`` holds only the layouts that have a box, keyed by the
        same position.
    """
    from rtree import index

    layout_map = dict(enumerate(page.page_layout))
    layout_index = index.Index()
    for layout_id, layout in layout_map.items():
        if layout.box:
            layout_index.insert(layout_id, box_to_tuple(layout.box))
    return layout_index, layout_map


def calculate_iou_for_boxes(box1: Box, box2: Box) -> float:
    """Return the fraction of ``box1``'s area that ``box2`` covers.

    Despite the name this is not a symmetric intersection-over-union: the
    intersection area is divided by the area of the first box only.

    Returns:
        ``0.0`` when the boxes do not overlap or ``box1`` has no positive
        area, otherwise ``intersection_area / box1_area``.
    """
    left, right = max(box1.x, box2.x), min(box1.x2, box2.x2)
    bottom, top = max(box1.y, box2.y), min(box1.y2, box2.y2)

    if right <= left or top <= bottom:
        return 0.0

    # Guard the division below against a non-positive reference area.
    reference_area = (box1.x2 - box1.x) * (box1.y2 - box1.y)
    if reference_area <= 0:
        return 0.0

    overlap_area = (right - left) * (top - bottom)
    return overlap_area / reference_area


def calculate_y_iou_for_boxes(box1: Box, box2: Box) -> float:
    """Return the fraction of ``box1``'s height that overlaps ``box2`` vertically.

    Args:
        box1: Reference box; its height is the denominator.
        box2: Box compared against ``box1``.

    Returns:
        ``0.0`` when the boxes do not overlap on the y-axis or ``box1`` has no
        positive height, otherwise ``overlap_height / box1_height``.
    """
    bottom = max(box1.y, box2.y)
    top = min(box1.y2, box2.y2)
    if top <= bottom:
        return 0.0

    # Guard the division below against a non-positive reference height.
    reference_height = box1.y2 - box1.y
    if reference_height <= 0:
        return 0.0

    overlap_height = top - bottom
    return overlap_height / reference_height


def calculate_y_true_iou_for_boxes(box1: Box, box2: Box) -> float:
    """Return the vertical overlap relative to the shorter of the two boxes.

    Unlike ``calculate_y_iou_for_boxes`` this measure is symmetric: the
    overlap height is divided by the smaller of the two box heights.

    Args:
        box1: First box.
        box2: Second box.

    Returns:
        ``0.0`` when the boxes do not overlap on the y-axis, otherwise
        ``overlap_height / min(box1_height, box2_height)``.
    """
    y_bottom = max(box1.y, box2.y)
    y_top = min(box1.y2, box2.y2)

    if y_top <= y_bottom:
        return 0.0

    intersection_height = y_top - y_bottom

    # Guard the value that is actually used as the divisor.
    min_height = min(box1.y2 - box1.y, box2.y2 - box2.y)
    if min_height <= 0:
        return 0.0

    return intersection_height / min_height


# Default layout class priority, highest priority first. A few names appear
# more than once; only the first occurrence of a name determines its rank.
_DEFAULT_LAYOUT_PRIORITY = (
    "number",
    "reference",
    "reference_content",
    "algorithm",
    "formula_caption",
    "isolate_formula",
    "table_footnote",
    "table_caption",
    "figure_caption",
    "figure_title",
    "chart_title",
    "table_title",
    "table_cell_hybrid",
    "table_text",
    "wireless_table_cell",
    "wired_table_cell",
    "abandon",
    "title",
    "abstract",
    "paragraph_title",
    "content",
    "doc_title",
    "footnote",
    "header",
    "footer",
    "seal",
    "plain text",
    "tiny text",
    "author_info_hybrid",
    "list_item_hybrid",
    "text",
    "paragraph_hybrid",
    "paragraph",
    "table_cell",
    "figure_text",
    "list_item",
    "title",
    "caption",
    "footnote_hybrid",
    "footnote",
    "formula",
    "formula_hybrid",
    "page_header",
    "page_footer",
    # --- hybrid labels ---
    "reference_hybrid",
    "document_hybrid",
    "academic_paper_hybrid",
    "form_or_table_hybrid",
    "presentation_slide_hybrid",
    "webpage_screenshot_hybrid",
    "manga_or_comic_hybrid",
    "advertisement_hybrid",
    "magazine_or_newspaper_hybrid",
    "other_hybrid",
    "table_cell_hybrid",
    "figure_text_hybrid",
    "title_hybrid",
    "caption_hybrid",
    "code_algo_hybrid",
    "line_number_hybrid",
    "page_header_hybrid",
    "page_footer_hybrid",
    "page_number_hybrid",
    "unknown_hybrid",
    "fallback_line",
    "table",
    "figure",
    "image",
)


def _build_layout_priority_rank(layout_priority) -> dict:
    """Map each class name to the index of its first occurrence."""
    ranks = {}
    for position, class_name in enumerate(layout_priority):
        ranks.setdefault(class_name, position)
    return ranks


# Precomputed once so the per-character lookup does not rebuild or scan a list.
_DEFAULT_LAYOUT_PRIORITY_RANK = _build_layout_priority_rank(_DEFAULT_LAYOUT_PRIORITY)


def get_character_layout(
    char,
    layout_index,
    layout_map,
    layout_priority=None,
    _bbox_mode: Literal["auto", "visual", "box"] = "auto",
):
    """Pick the layout a character belongs to, by class priority then overlap.

    Every layout intersecting the character's visual bounding box is a
    candidate. Candidates are ranked by the position of their class name in
    ``layout_priority`` (lower index wins; unknown names rank after all listed
    ones) and, within the same rank, by the larger share of the character's
    area they cover. Ties keep the candidate returned first by the index.

    Args:
        char: Character whose ``visual_bbox.box`` is matched against layouts.
        layout_index: Spatial index exposing ``intersection(bounds)``.
        layout_map: Mapping from index ids to layout objects.
        layout_priority: Optional sequence of class names, highest priority
            first. Defaults to the module's built-in ordering.
        _bbox_mode: Currently unused; kept for interface compatibility.

    Returns:
        A ``Layout`` built from the winning layout's id and class name, or
        ``None`` when the character has no positive area or nothing overlaps.
    """
    if layout_priority is None:
        priority_rank = _DEFAULT_LAYOUT_PRIORITY_RANK
        unknown_rank = len(_DEFAULT_LAYOUT_PRIORITY)
    else:
        priority_rank = _build_layout_priority_rank(layout_priority)
        unknown_rank = len(layout_priority)

    char_box = char.visual_bbox.box

    # The character area does not depend on the candidate, so compute it once.
    # A character without positive area can never match a layout.
    char_area = (char_box.x2 - char_box.x) * (char_box.y2 - char_box.y)
    if not char_area > 0:
        return None

    best_layout = None
    best_key = None
    for candidate_id in layout_index.intersection(box_to_tuple(char_box)):
        layout = layout_map[candidate_id]

        intersection_area = max(
            0, min(char_box.x2, layout.box.x2) - max(char_box.x, layout.box.x)
        ) * max(0, min(char_box.y2, layout.box.y2) - max(char_box.y, layout.box.y))
        iou = intersection_area / char_area
        if not iou > 0:
            continue

        # Ascending priority rank first, then descending overlap. The strict
        # comparison keeps the earliest candidate on ties, matching a stable
        # sort over the candidates.
        key = (priority_rank.get(layout.class_name, unknown_rank), -iou)
        if best_key is None or key < best_key:
            best_key = key
            best_layout = layout

    if best_layout is None:
        return None

    return Layout(best_layout.id, best_layout.class_name)


# Layout names treated as text. "reference" is currently not part of this set.
_TEXT_LAYOUT_NAMES = frozenset(
    {
        "plain text",
        "tiny text",
        "title",
        "abandon",
        "figure_caption",
        "table_caption",
        "table_text",
        "table_footnote",
        "paragraph_title",
        "abstract",
        "content",
        "figure_title",
        "table_title",
        "doc_title",
        "footnote",
        "header",
        "footer",
        "seal",
        "text",
        "chart_title",
        "paragraph",
        "table_cell",
        "figure_text",
        "list_item",
        "caption",
        "page_header",
        "page_footer",
        "wired_table_cell",
        "wireless_table_cell",
        "paragraph_hybrid",
        "table_cell_hybrid",
        "caption_hybrid",
        "unknown_hybrid",
        "figure_text_hybrid",
        "list_item_hybrid",
        "title_hybrid",
        "fallback_line",
        "author_info_hybrid",
        "page_header_hybrid",
        "page_footer_hybrid",
        "footnote_hybrid",
    }
)


def is_text_layout(layout: Layout):
    """Return whether a layout is one of the known text layout types.

    Args:
        layout: Layout to test; ``None`` is allowed.

    Returns:
        ``True`` when ``layout`` is not ``None`` and its ``name`` is a known
        text layout name, otherwise ``False``.
    """
    return layout is not None and layout.name in _TEXT_LAYOUT_NAMES


def is_character_in_formula_layout(
    char: il_version_1.PdfCharacter,
    _page: il_version_1.Page,
    layout_index,
    layout_map,
) -> int | None:
    """Return the id of a formula layout that covers a character, if any.

    Args:
        char: Character to test.
        _page: Unused.
        layout_index: Spatial index exposing ``intersection(bounds)``.
        layout_map: Mapping from index ids to layout objects.

    Returns:
        The ``id`` of the first intersecting ``"formula"`` layout that covers
        more than 40% of the character's box, or ``None`` if there is none.
    """
    formula_layout_types = {"formula"}

    # Use the visual bounding box unless it barely overlaps the regular box,
    # in which case the regular box is used instead.
    probe_box = char.visual_bbox.box
    if calculate_iou_for_boxes(probe_box, char.box) < 0.2:
        probe_box = char.box

    # All layouts whose indexed bounds intersect the probe box.
    candidates: list[il_version_1.PageLayout] = [
        layout_map[candidate_id]
        for candidate_id in layout_index.intersection(box_to_tuple(probe_box))
    ]

    for layout in candidates:
        if layout.class_name not in formula_layout_types:
            continue
        if calculate_iou_for_boxes(probe_box, layout.box) > 0.4:
            return layout.id

    return None


def is_curve_in_figure_table_layout(
    curve, layout_index, layout_map, protection_threshold: float = 0.3
) -> bool:
    """Return whether a curve lies inside a figure- or table-related layout.

    Args:
        curve: Curve object; a curve without a ``box`` never matches.
        layout_index: Spatial index exposing ``intersection(bounds)``.
        layout_map: Mapping from index ids to layout objects.
        protection_threshold: Minimum share of the curve's box that a layout
            must cover (exclusive) for the curve to count as inside it.

    Returns:
        ``True`` if any intersecting figure/table layout covers more than
        ``protection_threshold`` of the curve's box, otherwise ``False``.
    """
    if not curve.box:
        return False

    # Layout class names that count as figure/table territory.
    figure_table_layouts = {
        "figure",
        "table",
        "figure_text",
        "table_text",
        "figure_caption",
        "table_caption",
        "figure_title",
        "table_title",
        "chart_title",
        "table_cell",
        "table_cell_hybrid",
        "wired_table_cell",
        "wireless_table_cell",
        "table_footnote",
    }

    candidates = [
        layout_map[candidate_id]
        for candidate_id in layout_index.intersection(box_to_tuple(curve.box))
    ]

    return any(
        layout.class_name in figure_table_layouts
        and calculate_iou_for_boxes(curve.box, layout.box) > protection_threshold
        for layout in candidates
    )


def is_curve_overlapping_with_paragraphs(
    curve, paragraphs: list, overlap_threshold: float = 0.2
) -> bool:
    """Return whether a curve overlaps the area of any text paragraph.

    Args:
        curve: Curve object; a curve without a ``box`` never overlaps.
        paragraphs: Paragraph objects to compare against.
        overlap_threshold: Minimum share of the curve's box that a paragraph's
            bounding box must cover (exclusive) to count as an overlap.

    Returns:
        ``True`` as soon as one paragraph exceeds the threshold, otherwise
        ``False``.
    """
    if not curve.box:
        return False

    for paragraph in paragraphs:
        para_box = get_paragraph_bounding_box(paragraph)
        if not para_box:
            # Paragraphs without a usable bounding box cannot overlap.
            continue
        if calculate_iou_for_boxes(curve.box, para_box) > overlap_threshold:
            return True

    return False


def get_paragraph_bounding_box(paragraph) -> Box | None:
    """Compute the bounding box of a paragraph from its compositions.

    Each composition contributes the box of its line, formula, or same-style
    character run (first one that is present and has a box). A composition
    holding only ``pdf_character`` contributes the visual bounding boxes of
    its character(s).

    Args:
        paragraph: The paragraph object.

    Returns:
        A ``Box`` enclosing every contributing box, or ``None`` when the
        paragraph has no compositions or none of them yields a box.
    """
    compositions = paragraph.pdf_paragraph_composition
    if not compositions:
        return None

    boxes = []
    for composition in compositions:
        if composition.pdf_line and composition.pdf_line.box:
            boxes.append(composition.pdf_line.box)
        elif composition.pdf_formula and composition.pdf_formula.box:
            boxes.append(composition.pdf_formula.box)
        elif (
            composition.pdf_same_style_characters
            and composition.pdf_same_style_characters.box
        ):
            boxes.append(composition.pdf_same_style_characters.box)
        elif composition.pdf_character:
            # Other helpers in this module treat ``pdf_character`` as a single
            # character object, on which ``len()`` and iteration would raise.
            # Accept both a single character and a sequence of characters.
            loose_chars = composition.pdf_character
            if not isinstance(loose_chars, (list, tuple)):
                loose_chars = [loose_chars]
            boxes.extend(
                char.visual_bbox.box
                for char in loose_chars
                if char.visual_bbox and char.visual_bbox.box
            )

    if not boxes:
        return None

    return Box(
        min(box.x for box in boxes),
        min(box.y for box in boxes),
        max(box.x2 for box in boxes),
        max(box.y2 for box in boxes),
    )


================================================
FILE: babeldoc/format/pdf/document_il/utils/matrix_helper.py
================================================
"""Matrix helper utilities for CTM decomposition and composition.

This module provides functions to:
- Decompose a PDF CTM into translation, rotation, scale, and shear
- Compose a CTM back from translation, rotation, scale, and shear

All comments and docstrings are in English per project guidelines.
"""

from __future__ import annotations

import math

from babeldoc.format.pdf.document_il.il_version_1 import PdfAffineTransform
from babeldoc.format.pdf.document_il.il_version_1 import PdfMatrix

# Local type aliases to avoid importing from pdfminer
Point = tuple[float, float]
Matrix = tuple[float, float, float, float, float, float]


def decompose_ctm(m: Matrix | PdfMatrix) -> PdfAffineTransform:
    """Decompose a PDF CTM into a PdfAffineTransform.

    The PDF current transformation matrix (CTM) is represented as
    ``(a, b, c, d, e, f)`` corresponding to the affine matrix:
    ``[[a, c, e], [b, d, f], [0, 0, 1]]``.

    This function decomposes it into:
    - translation: (tx, ty)
    - rotation: angle in radians (counter-clockwise) of the first column
    - scale: (sx, sy); ``sy`` is negative when the matrix contains a
      reflection (negative determinant)
    - shear: component of the second column along the rotated x-axis, i.e.
      the ``shear`` term of the model ``shear * r0 + sy * r1`` used by
      ``compose_ctm``

    The decomposition is the inverse of ``compose_ctm`` for non-degenerate
    matrices. If the first column is (numerically) zero, rotation and shear
    fall back to 0 and ``sy`` is the length of the second column.

    Args:
        m: CTM as ``(a, b, c, d, e, f)`` or a ``PdfMatrix``.

    Returns:
        A ``PdfAffineTransform`` instance with fields populated.
    """
    if isinstance(m, PdfMatrix):
        a = m.a
        b = m.b
        c = m.c
        d = m.d
        e = m.e
        f = m.f
        assert a is not None
        assert b is not None
        assert c is not None
        assert d is not None
        assert e is not None
        assert f is not None
    else:
        (a, b, c, d, e, f) = m

    tx, ty = e, f

    # Linear part
    m00, m01 = a, c
    m10, m11 = b, d

    # Scale X is the length of the first column
    sx = math.hypot(m00, m10)

    eps = 1e-12
    if sx < eps:
        # Degenerate first column. Choose rotation = 0, shear = 0, sx = 0.
        rotation = 0.0
        shear = 0.0
        # Then sy is the length of the second column
        sy = math.hypot(m01, m11)
        # Handle reflection
        det = m00 * m11 - m01 * m10
        if det < 0:
            sy = -sy if sy != 0 else -0.0
        return PdfAffineTransform(
            translation_x=tx,
            translation_y=ty,
            rotation=rotation,
            scale_x=sx,
            scale_y=sy,
            shear=shear,
        )

    # Normalize first column to get rotation axis
    r0x = m00 / sx
    r0y = m10 / sx

    # Shear is the projection of the second column onto the first column
    shear = r0x * m01 + r0y * m11

    # Remove the shear component from the second column
    m01_ortho = m01 - shear * r0x
    m11_ortho = m11 - shear * r0y

    # Scale Y is the length of the orthogonalized second column
    sy = math.hypot(m01_ortho, m11_ortho)

    # A negative determinant means the orthogonal part of the second column
    # points along -r1, which is expressed as a negative sy. The shear term is
    # measured along r0 and is unaffected by the reflection, so it must keep
    # its sign for compose_ctm to reproduce the original matrix.
    det = m00 * m11 - m01 * m10
    if det < 0:
        sy = -sy if sy != 0 else -0.0

    # Rotation is the angle of the first column
    rotation = math.atan2(m10, m00)

    return PdfAffineTransform(
        translation_x=tx,
        translation_y=ty,
        rotation=rotation,
        scale_x=sx,
        scale_y=sy,
        shear=shear,
    )


def compose_ctm(transform: PdfAffineTransform) -> Matrix:
    """Build a PDF CTM from a PdfAffineTransform.

    With ``r0 = (cos(theta), sin(theta))`` and its orthogonal unit vector
    ``r1 = (-sin(theta), cos(theta))``, the linear part is:
    - first column ``(a, b)``: ``sx * r0``
    - second column ``(c, d)``: ``shear * r0 + sy * r1``

    The translation ``(tx, ty)`` becomes ``(e, f)``. Missing (``None``) fields
    default to 0 for translation, rotation and shear, and to 1 for the scales.

    Args:
        transform: Source of translation, rotation, scale, and shear.

    Returns:
        The CTM matrix ``(a, b, c, d, e, f)`` as floats.
    """

    def _as_float(raw, fallback):
        # Substitute the fallback for a missing field, then coerce to float.
        return float(raw if raw is not None else fallback)

    tx = _as_float(transform.translation_x, 0.0)
    ty = _as_float(transform.translation_y, 0.0)
    theta = _as_float(transform.rotation, 0.0)
    sx = _as_float(transform.scale_x, 1.0)
    sy = _as_float(transform.scale_y, 1.0)
    shear = _as_float(transform.shear, 0.0)

    cos_t, sin_t = math.cos(theta), math.sin(theta)

    return (
        sx * cos_t,
        sx * sin_t,
        shear * cos_t + sy * -sin_t,
        shear * sin_t + sy * cos_t,
        tx,
        ty,
    )


def scale_and_set_translation(
    m: Matrix | PdfMatrix, scale_factor: float, tx: float, ty: float
) -> Matrix | PdfMatrix:
    """Uniformly scale a CTM and replace its translation.

    The four linear components ``a, b, c, d`` are multiplied by
    ``scale_factor`` and the translation components are set to ``(tx, ty)``;
    the input's own translation is discarded. The input type is preserved: a
    ``PdfMatrix`` yields a new ``PdfMatrix``, anything else is unpacked as a
    six-element sequence and yields a tuple.

    Args:
        m: Input CTM as ``(a, b, c, d, e, f)`` or ``PdfMatrix``.
        scale_factor: Scale factor. ``1.0`` keeps size unchanged, ``0.5``
            halves it, ``2.0`` doubles it.
        tx: New translation X.
        ty: New translation Y.

    Returns:
        A CTM of the same kind as the input, scaled and with translation set.
    """
    returns_pdf_matrix = isinstance(m, PdfMatrix)

    if returns_pdf_matrix:
        linear = (m.a, m.b, m.c, m.d)
        for component in linear:
            assert component is not None
    else:
        a, b, c, d, _, _ = m
        linear = (a, b, c, d)

    scaled_a, scaled_b, scaled_c, scaled_d = (
        component * scale_factor for component in linear
    )
    new_tx, new_ty = float(tx), float(ty)

    if returns_pdf_matrix:
        return PdfMatrix(
            a=scaled_a,
            b=scaled_b,
            c=scaled_c,
            d=scaled_d,
            e=new_tx,
            f=new_ty,
        )
    return (scaled_a, scaled_b, scaled_c, scaled_d, new_tx, new_ty)


def create_translation_and_scale_matrix(
    translation_x: float, translation_y: float, scale_factor: float
) -> Matrix:
    """Create a CTM that scales uniformly and then translates.

    The resulting affine matrix is::

        [scale  0      tx]
        [0      scale  ty]
        [0      0      1 ]

    Args:
        translation_x: Translation in X direction.
        translation_y: Translation in Y direction.
        scale_factor: Uniform scale factor for both X and Y.

    Returns:
        The CTM ``(scale_factor, 0.0, 0.0, scale_factor, translation_x,
        translation_y)``.
    """
    scale_x = scale_y = scale_factor
    no_skew = 0.0
    return (scale_x, no_skew, no_skew, scale_y, translation_x, translation_y)


def multiply_matrices(m1: Matrix | PdfMatrix, m2: Matrix | PdfMatrix) -> Matrix:
    """Multiply two affine transformation matrices (``m1 * m2``).

    Each matrix ``(a, b, c, d, e, f)`` stands for
    ``[[a, c, e], [b, d, f], [0, 0, 1]]``, so the product is::

        [a1 c1 e1]   [a2 c2 e2]   [a1*a2+c1*b2  a1*c2+c1*d2  a1*e2+c1*f2+e1]
        [b1 d1 f1] * [b2 d2 f2] = [b1*a2+d1*b2  b1*c2+d1*d2  b1*e2+d1*f2+f1]
        [0  0  1 ]   [0  0  1 ]   [0            0            1              ]

    Args:
        m1: Left matrix in the multiplication.
        m2: Right matrix in the multiplication.

    Returns:
        Result matrix as tuple ``(a, b, c, d, e, f)``.
    """

    def _components(matrix):
        # Return the six components of a PdfMatrix or a six-element sequence.
        if isinstance(matrix, PdfMatrix):
            parts = (matrix.a, matrix.b, matrix.c, matrix.d, matrix.e, matrix.f)
            assert all(x is not None for x in parts)
            return parts
        a, b, c, d, e, f = matrix
        return a, b, c, d, e, f

    a1, b1, c1, d1, e1, f1 = _components(m1)
    a2, b2, c2, d2, e2, f2 = _components(m2)

    return (
        a1 * a2 + c1 * b2,
        b1 * a2 + d1 * b2,
        a1 * c2 + c1 * d2,
        b1 * c2 + d1 * d2,
        a1 * e2 + c1 * f2 + e1,
        b1 * e2 + d1 * f2 + f1,
    )


def apply_transform_to_ctm(
    existing_ctm: list[object],
    translation_x: float,
    translation_y: float,
    scale_factor: float,
) -> list[object]:
    """Apply a translation-and-scale transform on top of an existing CTM.

    The transform is left-multiplied: ``new_ctm = transform * existing_ctm``.

    Args:
        existing_ctm: Existing CTM, expected to hold six numeric values.
        translation_x: Translation in X direction.
        translation_y: Translation in Y direction.
        scale_factor: Uniform scale factor.

    Returns:
        The new CTM as a list of six values. If ``existing_ctm`` does not have
        exactly six entries the transform itself is returned; if its entries
        cannot be converted to ``float`` it is treated as the identity matrix.
    """
    transform_matrix = create_translation_and_scale_matrix(
        translation_x, translation_y, scale_factor
    )

    # A CTM of the wrong length is unusable: fall back to the bare transform.
    if len(existing_ctm) != 6:
        return list(transform_matrix)

    try:
        base_matrix = tuple(float(value) for value in existing_ctm)
    except (ValueError, TypeError):
        # Non-numeric entries: substitute the identity matrix.
        base_matrix = (1.0, 0.0, 0.0, 1.0, 0.0, 0.0)

    return list(multiply_matrices(transform_matrix, base_matrix))


def matrix_to_bytes(m: Matrix | PdfMatrix) -> bytes:
    """Serialize a matrix as the operands of a PDF ``cm`` operator.

    Args:
        m: A ``PdfMatrix`` or an indexable holding at least six numbers.

    Returns:
        Bytes of the form ``b" a b c d e f cm "`` with every operand formatted
        to six decimal places and a space at both ends.
    """
    if isinstance(m, PdfMatrix):
        operands = (m.a, m.b, m.c, m.d, m.e, m.f)
    else:
        operands = (m[0], m[1], m[2], m[3], m[4], m[5])

    formatted = " ".join(f"{value:.6f}" for value in operands)
    return f" {formatted} cm ".encode()


================================================
FILE: babeldoc/format/pdf/document_il/utils/mupdf_helper.py
================================================
import numpy as np
import pymupdf

from babeldoc.const import get_process_pool


def get_no_rotation_img(page: pymupdf.Page, dpi: int = 72) -> pymupdf.Pixmap:
    """Render a page with its rotation temporarily reset to 0.

    Args:
        page: Page to render. Its rotation is set to 0 for the duration of the
            rendering and restored afterwards.
        dpi: Resolution passed to ``page.get_pixmap``.

    Returns:
        The pixmap rendered from the unrotated page.
    """
    original_rotation = page.rotation
    page.set_rotation(0)
    try:
        return page.get_pixmap(dpi=dpi)
    finally:
        # Restore the caller's rotation even when rendering fails, so the
        # page is never left modified.
        page.set_rotation(original_rotation)


def get_no_rotation_img_multiprocess_internal(
    pdf_bytes: str, pagenum: int, dpi: int = 72
) -> np.ndarray:
    """Open a document and render one page with its rotation reset to 0.

    Args:
        pdf_bytes: Passed straight to ``pymupdf.open``. NOTE(review): annotated
            as ``str``, so presumably a file path despite the name - confirm.
        pagenum: Index of the page to render.
        dpi: Resolution passed to ``page.get_pixmap``.

    Returns:
        An array of shape ``(height, width, 3)`` with the order of the last
        axis reversed. NOTE(review): presumably RGB to BGR - confirm. The
        reshape assumes three samples per pixel.
    """
    document = pymupdf.open(pdf_bytes)
    try:
        target_page = document[pagenum]
        saved_rotation = target_page.rotation
        target_page.set_rotation(0)
        pixmap = target_page.get_pixmap(dpi=dpi)
        target_page.set_rotation(saved_rotation)

        flat_samples = np.frombuffer(pixmap.samples, np.uint8)
        image = flat_samples.reshape(pixmap.height, pixmap.width, 3)
        return image[:, :, ::-1]
    finally:
        # The document is opened here, so it is always closed here.
        document.close()


def get_no_rotation_img_multiprocess(pdf_bytes: str, pagenum: int, dpi: int = 72):
    """Render a page un-rotated, using the shared process pool when there is one.

    Args:
        pdf_bytes: Forwarded unchanged to the internal renderer.
        pagenum: Index of the page to render.
        dpi: Rendering resolution.

    Returns:
        Whatever ``get_no_rotation_img_multiprocess_internal`` returns, computed
        either in a pool worker or, when ``get_process_pool()`` yields ``None``,
        directly in the calling process.
    """
    render_args = (pdf_bytes, pagenum, dpi)
    worker_pool = get_process_pool()
    if worker_pool is not None:
        return worker_pool.apply(
            get_no_rotation_img_multiprocess_internal, render_args
        )
    return get_no_rotation_img_multiprocess_internal(*render_args)


================================================
FILE: babeldoc/format/pdf/document_il/utils/paragraph_helper.py
================================================
import logging
import re

from babeldoc.format.pdf.document_il import il_version_1

logger = logging.getLogger(__name__)


def is_cid_paragraph(paragraph: il_version_1.PdfParagraph):
    """Tell whether more than 80% of a paragraph's characters are ``(cid:N)``.

    Characters are gathered from line, same-style-character, formula and
    single-character compositions. Same-style *unicode* compositions contribute
    nothing, and unknown compositions are logged and ignored.

    Args:
        paragraph: Paragraph whose ``pdf_paragraph_composition`` is inspected.

    Returns:
        True when the number of characters whose ``char_unicode`` is exactly of
        the form ``(cid:<digits>)`` is strictly greater than 80% of all
        gathered characters.
    """
    gathered: list[il_version_1.PdfCharacter] = []
    for composition in paragraph.pdf_paragraph_composition:
        if composition.pdf_line:
            gathered += composition.pdf_line.pdf_character
        elif composition.pdf_same_style_characters:
            gathered += composition.pdf_same_style_characters.pdf_character
        elif composition.pdf_same_style_unicode_characters:
            # Holds plain unicode text rather than character objects; skipped.
            pass
        elif composition.pdf_formula:
            gathered += composition.pdf_formula.pdf_character
        elif composition.pdf_character:
            gathered.append(composition.pdf_character)
        else:
            logger.error(
                f"Unknown composition type. "
                f"Composition: {composition}. "
                f"Paragraph: {paragraph}. ",
            )

    unresolved = sum(
        1 for item in gathered if re.match(r"^\(cid:\d+\)$", item.char_unicode)
    )
    return unresolved > len(gathered) * 0.8


NUMERIC_PATTERN = re.compile(r"^-?\d+(\.\d+)?$")


def is_pure_numeric_paragraph(paragraph) -> bool:
    """Check whether a paragraph's text is nothing but a single number.

    Integers, decimals and a leading minus sign are accepted; surrounding
    whitespace is ignored.

    Args:
        paragraph: Any object; its ``unicode`` attribute (if present) is the
            text that gets examined.

    Returns:
        True only when the stripped text matches ``NUMERIC_PATTERN``. A falsy
        paragraph, a missing or empty ``unicode`` attribute, or whitespace-only
        text all yield False.
    """
    raw_text = getattr(paragraph, "unicode", None) if paragraph else None
    if not raw_text:
        return False

    stripped = raw_text.strip()
    return bool(stripped) and NUMERIC_PATTERN.match(stripped) is not None


def is_placeholder_only_paragraph(paragraph: il_version_1.PdfParagraph) -> bool:
    """Report whether a paragraph holds nothing besides formulas and whitespace.

    Args:
        paragraph: PDF paragraph to inspect.

    Returns:
        False for a falsy paragraph or one with empty ``unicode``. Otherwise
        True only if every composition is either a formula or consists solely
        of whitespace text; any composition of an unrecognised kind makes the
        result False.
    """
    if not paragraph or not paragraph.unicode:
        return False

    for composition in paragraph.pdf_paragraph_composition:
        if composition.pdf_formula:
            # Formulas are placeholders by definition; nothing to inspect.
            continue

        # Collect the text fragments of this composition, whatever its kind.
        if composition.pdf_character:
            fragments = [composition.pdf_character.char_unicode]
        elif composition.pdf_line:
            fragments = [
                item.char_unicode for item in composition.pdf_line.pdf_character
            ]
        elif composition.pdf_same_style_characters:
            fragments = [
                item.char_unicode
                for item in composition.pdf_same_style_characters.pdf_character
            ]
        elif composition.pdf_same_style_unicode_characters:
            fragments = [composition.pdf_same_style_unicode_characters.unicode]
        else:
            # Unknown composition kind: be conservative.
            return False

        if not all(fragment.isspace() for fragment in fragments):
            return False

    return True


================================================
FILE: babeldoc/format/pdf/document_il/utils/spatial_analyzer.py
================================================
"""Spatial relationship analyzer for PDF elements.

This module provides functions to analyze spatial relationships between PDF elements,
particularly for detecting containment relationships between formulas and other elements
like curves and forms.

All comments and docstrings are in English per project guidelines.
"""

from __future__ import annotations

from babeldoc.format.pdf.document_il.il_version_1 import Box
from babeldoc.format.pdf.document_il.il_version_1 import Page
from babeldoc.format.pdf.document_il.il_version_1 import PdfCurve
from babeldoc.format.pdf.document_il.il_version_1 import PdfForm
from babeldoc.format.pdf.document_il.il_version_1 import PdfFormula
from babeldoc.format.pdf.document_il.utils.layout_helper import calculate_iou_for_boxes


def is_element_contained_in_formula(
    element_box: Box,
    formula_box: Box,
    containment_threshold: float = 0.95,
    tolerance: float = 2.0,
) -> bool:
    """Decide whether an element counts as lying inside a formula.

    The formula box is grown by ``tolerance`` on every side, then the score
    returned by ``calculate_iou_for_boxes(element_box, grown_box)`` is compared
    with ``containment_threshold``.

    Args:
        element_box: Bounding box of the candidate element.
        formula_box: Bounding box of the formula.
        containment_threshold: Minimum score for the element to count as
            contained (default: 0.95).
        tolerance: Amount, in box units, by which the formula box is expanded
            on each side before scoring (default: 2.0).

    Returns:
        False when either box is ``None``; otherwise whether the score reaches
        the threshold.
    """
    if element_box is None or formula_box is None:
        return False

    # Pad the formula box so elements slightly overhanging still qualify.
    padded_formula_box = Box(
        x=formula_box.x - tolerance,
        y=formula_box.y - tolerance,
        x2=formula_box.x2 + tolerance,
        y2=formula_box.y2 + tolerance,
    )

    score = calculate_iou_for_boxes(element_box, padded_formula_box)
    return score >= containment_threshold


def find_contained_curves(
    formula: PdfFormula, page: Page, paragraph_xobj_id: int | None = None
) -> list[PdfCurve]:
    """Collect the page's curves that sit inside the given formula.

    Args:
        formula: Formula whose box defines the search area.
        page: Page whose ``pdf_curve`` entries are examined.
        paragraph_xobj_id: xobj_id of the paragraph owning the formula. When it
            is not ``None``, curves with a different ``xobj_id`` are left out.

    Returns:
        The matching curves in page order; an empty list when the formula has
        no box or the page has no curves.
    """
    if not formula.box or not page.pdf_curve:
        return []

    return [
        curve
        for curve in page.pdf_curve
        if curve.box
        and is_element_contained_in_formula(curve.box, formula.box)
        and (paragraph_xobj_id is None or curve.xobj_id == paragraph_xobj_id)
    ]


def find_contained_forms(
    formula: PdfFormula, page: Page, paragraph_xobj_id: int | None = None
) -> list[PdfForm]:
    """Collect the page's forms that sit inside the given formula.

    Args:
        formula: Formula whose box defines the search area.
        page: Page whose ``pdf_form`` entries are examined.
        paragraph_xobj_id: xobj_id of the paragraph owning the formula. When it
            is not ``None``, forms with a different ``xobj_id`` are left out.

    Returns:
        The matching forms in page order; an empty list when the formula has
        no box or the page has no forms.
    """
    if not formula.box or not page.pdf_form:
        return []

    return [
        form
        for form in page.pdf_form
        if form.box
        and is_element_contained_in_formula(form.box, formula.box)
        and (paragraph_xobj_id is None or form.xobj_id == paragraph_xobj_id)
    ]


def find_all_contained_elements(
    formula: PdfFormula, page: Page, paragraph_xobj_id: int | None = None
) -> tuple[list[PdfCurve], list[PdfForm]]:
    """Gather both the curves and the forms contained in a formula.

    Args:
        formula: Formula to search around.
        page: Page holding the candidate curves and forms.
        paragraph_xobj_id: xobj_id of the paragraph owning the formula; passed
            through to both lookups to restrict results to that xobj_id.

    Returns:
        A ``(curves, forms)`` pair, as produced by ``find_contained_curves``
        and ``find_contained_forms`` respectively.
    """
    return (
        find_contained_curves(formula, page, paragraph_xobj_id),
        find_contained_forms(formula, page, paragraph_xobj_id),
    )


def calculate_translation_and_scale(
    old_box: Box, new_box: Box
) -> tuple[float, float, float]:
    """Derive the offset and uniform scale that relate two boxes.

    Args:
        old_box: The original bounding box.
        new_box: The new bounding box.

    Returns:
        ``(translation_x, translation_y, scale_factor)``. The translation is
        the difference between the boxes' ``(x, y)`` corners. The scale is the
        width ratio; if the old width is not positive the height ratio is used,
        and if that is not positive either the scale is 1.0. When either box is
        ``None`` the result is ``(0.0, 0.0, 1.0)``.
    """
    if old_box is None or new_box is None:
        return 0.0, 0.0, 1.0

    shift_x = new_box.x - old_box.x
    shift_y = new_box.y - old_box.y

    # Prefer the horizontal extent as the scale reference.
    source_width = old_box.x2 - old_box.x
    if source_width > 0:
        return shift_x, shift_y, (new_box.x2 - new_box.x) / source_width

    # Degenerate width: fall back to the vertical extent.
    source_height = old_box.y2 - old_box.y
    if source_height > 0:
        return shift_x, shift_y, (new_box.y2 - new_box.y) / source_height

    return shift_x, shift_y, 1.0


================================================
FILE: babeldoc/format/pdf/document_il/utils/style_helper.py
================================================
from babeldoc.format.pdf.document_il import il_version_1


def create_pdf_style(r, g, b, font_id="base", font_size=6):
    """
    Build a PdfStyle whose graphic state is an ``rg`` instruction for a colour.

    Args:
        r: Red component in range 0-255
        g: Green component in range 0-255
        b: Blue component in range 0-255
        font_id: Font identifier
        font_size: Font size

    Returns:
        PdfStyle carrying the given font settings and a GraphicState whose
        per-character instruction is the three components scaled to 0-1,
        printed with ten decimals and followed by ``rg``
    """
    red, green, blue = (channel / 255.0 for channel in (r, g, b))
    color_state = il_version_1.GraphicState(
        passthrough_per_char_instruction=f"{red:.10f} {green:.10f} {blue:.10f} rg",
    )
    return il_version_1.PdfStyle(
        font_id=font_id,
        font_size=font_size,
        graphic_state=color_state,
    )


# Predefined GraphicState constants, one per named colour.
# Every instruction string repeats the same operands twice: once with the
# lowercase operator (``g`` / ``rg``) and once with the uppercase one
# (``G`` / ``RG``). In PDF content streams these are the non-stroking and
# stroking colour operators respectively, so both are set to the same colour.

# Grayscale levels: a single operand from 0 (black) to 1 (white).
BLACK = il_version_1.GraphicState(passthrough_per_char_instruction="0 g 0 G")

WHITE = il_version_1.GraphicState(passthrough_per_char_instruction="1 g 1 G")

GRAY80 = il_version_1.GraphicState(passthrough_per_char_instruction="0.80 g 0.80 G")
GRAY67 = il_version_1.GraphicState(passthrough_per_char_instruction="0.67 g 0.67 G")
GRAY33 = il_version_1.GraphicState(passthrough_per_char_instruction="0.33 g 0.33 G")

# RGB colours: three operands in the 0-1 range, written with ten decimals
# (the same precision create_pdf_style uses; e.g. 0.2313725490 is 59 / 255).
RED = il_version_1.GraphicState(
    passthrough_per_char_instruction="1.0000000000 0.2313725490 0.1882352941 rg "
    "1.0000000000 0.2313725490 0.1882352941 RG",
)

ORANGE = il_version_1.GraphicState(
    passthrough_per_char_instruction="1.0000000000 0.5843137255 0.0000000000 rg "
    "1.0000000000 0.5843137255 0.0000000000 RG",
)
YELLOW = il_version_1.GraphicState(
    passthrough_per_char_instruction="1.0000000000 0.8000000000 0.0000000000 rg "
    "1.0000000000 0.8000000000 0.0000000000 RG",
)

GREEN = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.2039215686 0.7803921569 0.3490196078 rg "
    "0.2039215686 0.7803921569 0.3490196078 RG",
)

MINT = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.0000000000 0.7803921569 0.7450980392 rg "
    "0.0000000000 0.7803921569 0.7450980392 RG",
)

TEAL = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.1882352941 0.6901960784 0.7803921569 rg "
    "0.1882352941 0.6901960784 0.7803921569 RG",
)

CYAN = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.1960784314 0.6784313725 0.9019607843 rg "
    "0.1960784314 0.6784313725 0.9019607843 RG",
)

BLUE = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.0000000000 0.4784313725 1.0000000000 rg "
    "0.0000000000 0.4784313725 1.0000000000 RG",
)

INDIGO = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.3450980392 0.3372549020 0.8392156863 rg "
    "0.3450980392 0.3372549020 0.8392156863 RG",
)

PURPLE = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.6862745098 0.3215686275 0.8705882353 rg "
    "0.6862745098 0.3215686275 0.8705882353 RG",
)

PINK = il_version_1.GraphicState(
    passthrough_per_char_instruction="1.0000000000 0.1764705882 0.3333333333 rg "
    "1.0000000000 0.1764705882 0.3333333333 RG",
)

BROWN = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.6352941176 0.5176470588 0.3686274510 rg "
    "0.6352941176 0.5176470588 0.3686274510 RG",
)


================================================
FILE: babeldoc/format/pdf/document_il/utils/zstd_helper.py
================================================
import base64

import pyzstd


def zstd_compress(data) -> str:
    """Compress ``data`` with zstd and return it as Base85 text.

    Args:
        data: ``str`` (encoded with the default encoding first) or ``bytes``.

    Returns:
        The Base85 encoding of the compressed payload, as ``str``.

    Raises:
        TypeError: If ``data`` is neither ``str`` nor ``bytes``.
    """
    if isinstance(data, str):
        payload = data.encode()
    elif isinstance(data, bytes):
        payload = data
    else:
        raise TypeError(f"data must be str or bytes, not {type(data)}")

    compressed = pyzstd.compress(payload)
    return base64.b85encode(compressed).decode()


def zstd_decompress(data) -> str:
    """Decode Base85 text, decompress it with zstd and return the text.

    Args:
        data: Base85 payload as ``str`` (encoded with the default encoding
            first) or ``bytes``.

    Returns:
        The decompressed bytes decoded to ``str`` with the default encoding.

    Raises:
        TypeError: If ``data`` is neither ``str`` nor ``bytes``.
    """
    if isinstance(data, str):
        encoded = data.encode()
    elif isinstance(data, bytes):
        encoded = data
    else:
        raise TypeError(f"data must be str or bytes, not {type(data)}")

    compressed = base64.b85decode(encoded)
    return pyzstd.decompress(compressed).decode()


================================================
FILE: babeldoc/format/pdf/document_il/xml_converter.py
================================================
import copy
from pathlib import Path

import orjson
from xsdata.formats.dataclass.context import XmlContext
from xsdata.formats.dataclass.parsers import XmlParser
from xsdata.formats.dataclass.serializers import XmlSerializer
from xsdata.formats.dataclass.serializers.config import SerializerConfig

from babeldoc.format.pdf.document_il import il_version_1


class XMLConverter:
    """Convert IL ``Document`` objects to and from XML, and dump them as JSON."""

    def __init__(self):
        """Create the xsdata parser and a serializer that indents by two spaces."""
        self.parser = XmlParser()
        self.serializer = XmlSerializer(
            config=SerializerConfig(indent="  "),
            context=XmlContext(),
        )

    def write_xml(self, document: il_version_1.Document, path: str):
        """Serialize ``document`` to XML and write it to ``path`` as UTF-8."""
        with Path(path).open("w", encoding="utf-8") as sink:
            sink.write(self.to_xml(document))

    def read_xml(self, path: str) -> il_version_1.Document:
        """Read the UTF-8 XML file at ``path`` and parse it into a Document."""
        xml_text = Path(path).read_text(encoding="utf-8")
        return self.from_xml(xml_text)

    def to_xml(self, document: il_version_1.Document) -> str:
        """Return the XML rendering of ``document``."""
        return self.serializer.render(document)

    def from_xml(self, xml: str) -> il_version_1.Document:
        """Parse an XML string into an ``il_version_1.Document``."""
        return self.parser.from_string(xml, il_version_1.Document)

    def deepcopy(self, document: il_version_1.Document) -> il_version_1.Document:
        """Return an independent deep copy of ``document``."""
        # An XML round trip, from_xml(to_xml(document)), is the alternative way
        # to clone; copy.deepcopy is what is used here.
        return copy.deepcopy(document)

    def to_json(self, document: il_version_1.Document) -> str:
        """Return ``document`` as indented, key-sorted JSON ending in a newline."""
        json_options = (
            orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS
        )
        return orjson.dumps(document, option=json_options).decode()

    def write_json(self, document: il_version_1.Document, path: str):
        """Serialize ``document`` to JSON and write it to ``path`` as UTF-8."""
        with Path(path).open("w", encoding="utf-8") as sink:
            sink.write(self.to_json(document))


================================================
FILE: babeldoc/format/pdf/high_level.py
================================================
import asyncio
import copy
import hashlib
import io
import logging
import pathlib
import re
import shutil
import threading
import time
from asyncio import CancelledError
from pathlib import Path
from typing import Any
from typing import BinaryIO

import pymupdf
from pymupdf import Document
from pymupdf import Font

from babeldoc import asynchronize
from babeldoc.assets.assets import warmup
from babeldoc.babeldoc_exception.BabelDOCException import ExtractTextError
from babeldoc.babeldoc_exception.BabelDOCException import (
    InputFileGeneratedByBabelDOCError,
)
from babeldoc.const import CACHE_FOLDER
from babeldoc.const import WATERMARK_VERSION
from babeldoc.const import close_process_pool
from babeldoc.format.pdf.converter import TranslateConverter
from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.format.pdf.document_il.backend.pdf_creater import SAVE_PDF_STAGE_NAME
from babeldoc.format.pdf.document_il.backend.pdf_creater import SUBSET_FONT_STAGE_NAME
from babeldoc.format.pdf.document_il.backend.pdf_creater import PDFCreater
from babeldoc.format.pdf.document_il.backend.pdf_creater import reproduce_cmap
from babeldoc.format.pdf.document_il.frontend.il_creater import ILCreater
from babeldoc.format.pdf.document_il.midend.add_debug_information import (
    AddDebugInformation,
)
from babeldoc.format.pdf.document_il.midend.automatic_term_extractor import (
    AutomaticTermExtractor,
)
from babeldoc.format.pdf.document_il.midend.detect_scanned_file import DetectScannedFile
from babeldoc.format.pdf.document_il.midend.il_translator import ILTranslator
from babeldoc.format.pdf.document_il.midend.il_translator_llm_only import (
    ILTranslatorLLMOnly,
)
from babeldoc.format.pdf.document_il.midend.layout_parser import LayoutParser
from babeldoc.format.pdf.document_il.midend.paragraph_finder import ParagraphFinder
from babeldoc.format.pdf.document_il.midend.styles_and_formulas import StylesAndFormulas
from babeldoc.format.pdf.document_il.midend.table_parser import TableParser
from babeldoc.format.pdf.document_il.midend.typesetting import Typesetting
from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper
from babeldoc.format.pdf.document_il.xml_converter import XMLConverter
from babeldoc.format.pdf.pdfinterp import PDFPageInterpreterEx
from babeldoc.format.pdf.result_merger import ResultMerger
from babeldoc.format.pdf.split_manager import SplitManager
from babeldoc.format.pdf.translation_config import TranslateResult
from babeldoc.format.pdf.translation_config import TranslationConfig
from babeldoc.format.pdf.translation_config import WatermarkOutputMode
from babeldoc.pdfminer.pdfdocument import PDFDocument
from babeldoc.pdfminer.pdfinterp import PDFResourceManager
from babeldoc.pdfminer.pdfpage import PDFPage
from babeldoc.pdfminer.pdfparser import PDFParser
from babeldoc.progress_monitor import ProgressMonitor
from babeldoc.utils import memory

logger = logging.getLogger(__name__)

# Ordered list of (stage name, number) pairs describing the full pipeline.
# get_translation_stage() copies this list, drops the stages that the current
# configuration disables, and hands the remainder to ProgressMonitor.
# NOTE(review): the number is presumably the stage's relative weight in the
# overall progress figure - confirm against ProgressMonitor.
TRANSLATE_STAGES = [
    (ILCreater.stage_name, 14.12),  # Parse PDF and Create IR
    (DetectScannedFile.stage_name, 2.45),  # DetectScannedFile
    (LayoutParser.stage_name, 14.03),  # Parse Page Layout
    (TableParser.stage_name, 1.0),  # Parse Table
    (ParagraphFinder.stage_name, 6.26),  # Parse Paragraphs
    (StylesAndFormulas.stage_name, 1.66),  # Parse Formulas and Styles
    # (RemoveDescent.stage_name, 0.15),  # Remove Char Descent
    (AutomaticTermExtractor.stage_name, 30.0),  # Extract Terms
    (ILTranslator.stage_name, 46.96),  # Translate Paragraphs
    (Typesetting.stage_name, 4.71),  # Typesetting
    (FontMapper.stage_name, 0.61),  # Add Fonts
    (PDFCreater.stage_name, 1.96),  # Generate drawing instructions
    (SUBSET_FONT_STAGE_NAME, 0.92),  # Subset font
    (SAVE_PDF_STAGE_NAME, 6.34),  # Save PDF
]

# Lower-case language code -> font name for Chinese, Japanese and Korean.
# NOTE(review): the values look like built-in CJK font names of the PDF
# library - confirm where this table is consumed.
resfont_map = {
    "zh-cn": "china-ss",
    "zh-tw": "china-ts",
    "zh-hans": "china-ss",
    "zh-hant": "china-ts",
    "zh": "china-ss",
    "ja": "japan-s",
    "ko": "korea-s",
}


def safe_save(doc, *args, **kwargs):
    """Save ``doc``, falling back to ``doc.ez_save`` if ``doc.save`` fails.

    Args:
        doc: Object exposing ``save`` and ``ez_save``.
        *args: Positional arguments forwarded unchanged to whichever save
            method is called.
        **kwargs: Keyword arguments forwarded unchanged as well.

    Raises:
        Exception: Whatever ``ez_save`` raises if the fallback fails too; the
            error from the first attempt is not re-raised on its own.
    """
    try:
        # first try, saving without options
        doc.save(*args, **kwargs)
    except Exception:
        # second try, saving with 'garbage=3' for object missing
        # (the fallback is deliberately broad: any failure triggers the retry)
        doc.ez_save(*args, **kwargs)


def check_metadata(pdf: Document):
    """Refuse input PDFs that carry BabelDOC's own producer stamp.

    Args:
        pdf: Document whose ``metadata`` mapping is inspected.

    Raises:
        InputFileGeneratedByBabelDOCError: If the ``producer`` entry contains
            both ``BabelDOC`` and the AI-translation marker text that
            ``add_metadata`` writes.
    """
    metadata = pdf.metadata
    if not metadata:
        return

    producer = metadata.get("producer", None)
    if not producer:
        return

    stamp_markers = (
        "BabelDOC",
        "Translation_generated_by_AI,please_carefully_discern",
    )
    if all(marker in producer for marker in stamp_markers):
        raise InputFileGeneratedByBabelDOCError(
            "Input file is generated by BabelDOC, Cannot translate files that have already been translated."
        )


def add_metadata(
    translate_result: TranslateResult, translate_config: TranslationConfig
):
    """Stamp every produced PDF with BabelDOC producer/creator metadata.

    Each distinct, non-empty output path of ``translate_result`` is opened,
    its metadata rewritten, saved to a working file and moved back over the
    original path.

    The previous ``producer`` is appended to (or becomes) ``creator``, and
    ``producer`` is replaced by a BabelDOC marker string containing the
    watermark version, the current timestamp and, if configured,
    ``translate_config.metadata_extra_data``.

    Args:
        translate_result: Result object providing the four ``*_pdf_path``
            attributes.
        translate_config: Supplies the working-file location and the optional
            extra metadata suffix.
    """
    processed = []
    for attr in (
        "mono_pdf_path",
        "dual_pdf_path",
        "no_watermark_mono_pdf_path",
        "no_watermark_dual_pdf_path",
    ):
        path = getattr(translate_result, attr)
        # Several attributes may point at the same file; handle each file once.
        if not path or path in processed:
            continue
        processed.append(path)

        temp_path = translate_config.get_working_file_path(f"{path.stem}.cmap.pdf")
        pdf = pymupdf.open(path)
        try:
            meta = pdf.metadata
            if not meta:
                meta = {}
            creator = meta.get("creator", None)
            producer = meta.get("producer", None)
            if producer:
                # Keep the original producer visible by folding it into creator.
                if not creator:
                    creator = producer
                else:
                    creator += f", {producer}"

            translated_by = f"BabelDOC{WATERMARK_VERSION}_{time.time()}_Translation_generated_by_AI,please_carefully_discern"
            if translate_config.metadata_extra_data:
                translated_by += f"_{translate_config.metadata_extra_data}"
            meta["producer"] = translated_by
            meta["creator"] = creator

            for k, v in meta.items():
                if v:
                    # Strip characters in the surrogate range with a regex.
                    meta[k] = re.sub(r"[\uD800-\uDFFF]", "", v)

            pdf.set_metadata(meta)
            safe_save(pdf, temp_path)
        finally:
            # Release the handle on ``path`` before replacing the file; the
            # document was previously left open for the rest of the process.
            pdf.close()
        shutil.move(temp_path, path)


def fix_cmap(translate_result: TranslateResult, translate_config: TranslationConfig):
    """Run ``reproduce_cmap`` over every produced PDF and rewrite it in place.

    Each distinct, non-empty output path of ``translate_result`` is opened,
    passed to ``reproduce_cmap``, saved to a working file and moved back over
    the original path.

    Args:
        translate_result: Result object providing the four ``*_pdf_path``
            attributes.
        translate_config: Supplies the working-file location.
    """
    processed = []
    for attr in (
        "mono_pdf_path",
        "dual_pdf_path",
        "no_watermark_mono_pdf_path",
        "no_watermark_dual_pdf_path",
    ):
        path = getattr(translate_result, attr)
        # Several attributes may point at the same file; handle each file once.
        if not path or path in processed:
            continue
        processed.append(path)

        temp_path = translate_config.get_working_file_path(f"{path.stem}.cmap.pdf")
        pdf = pymupdf.open(path)
        try:
            reproduce_cmap(pdf)
            safe_save(pdf, temp_path)
        finally:
            # Release the handle on ``path`` before replacing the file; the
            # document was previously left open for the rest of the process.
            pdf.close()
        shutil.move(temp_path, path)


def verify_file_hash(file_path: str, expected_hash: str) -> bool:
    """Compare a file's SHA256 hex digest with an expected value.

    Args:
        file_path: Path of the file to hash.
        expected_hash: Hex digest the file is expected to have.

    Returns:
        True when the computed hex digest equals ``expected_hash`` exactly.
    """
    digest = hashlib.sha256()
    with Path(file_path).open("rb") as stream:
        # Feed the file in 4 KiB pieces so large files are never fully loaded.
        while chunk := stream.read(4096):
            digest.update(chunk)
    return digest.hexdigest() == expected_hash


def translator_supports_llm(translator) -> bool:
    """Probe whether a translator implements ``do_llm_translate``.

    The method is called once with ``None`` as its argument; support is
    assumed when that call returns without raising.

    Args:
        translator: Translator object to probe; may be ``None``.

    Returns:
        False for a falsy translator, one without ``do_llm_translate``, or one
        whose probe call raises (``NotImplementedError`` silently, any other
        exception with a debug log entry). True otherwise.
    """
    if not translator or not hasattr(translator, "do_llm_translate"):
        return False

    try:
        translator.do_llm_translate(None)
    except NotImplementedError:
        return False
    except Exception as exc:  # pragma: no cover - defensive logging
        logger.debug("translator %s failed llm detection: %s", translator, exc)
        return False
    return True


def start_parse_il(
    inf: BinaryIO,
    pages: list[int] | None = None,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    doc_zh: Document = None,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    resfont: str = "",
    noto: Font = None,
    cancellation_event: asyncio.Event = None,
    il_creater: ILCreater = None,
    translation_config: TranslationConfig = None,
    **kwarg: Any,
) -> None:
    """Interpret the pages of a PDF stream and feed them to ``il_creater``.

    A ``TranslateConverter`` device and a ``PDFPageInterpreterEx`` are built,
    the page count is reported through ``il_creater.on_total_pages``, and each
    selected page is processed with ``interpreter.process_page``. The result
    of every page goes to ``il_creater.on_page_base_operation`` followed by
    ``il_creater.on_page_end``; ``il_creater.on_finish`` is called once at the
    end and the device is closed.

    Args:
        inf: Binary stream of the PDF handed to ``PDFParser``.
        pages: Zero-based page indices to process; a falsy value means all
            pages.
        vfont, vchar, thread, lang_in, lang_out, service, resfont, noto:
            Forwarded unchanged to ``TranslateConverter``.
        doc_zh: Document used only for ``page_count`` when ``pages`` is falsy.
        cancellation_event: When set, processing stops with ``CancelledError``
            before the next page.
        il_creater: Receiver of the parsing callbacks; must not be ``None``.
        translation_config: Decides per page (one-based number) whether it is
            translated and is asked to ``raise_if_cancelled``; must not be
            ``None``.
        **kwarg: Only ``envs`` and ``prompt`` are read and forwarded to
            ``TranslateConverter``.

    Raises:
        AssertionError: If ``il_creater`` or ``translation_config`` is ``None``.
        CancelledError: If ``cancellation_event`` is set when a page is reached.
    """
    rsrcmgr = PDFResourceManager()
    layout = {}
    device = TranslateConverter(
        rsrcmgr,
        vfont,
        vchar,
        thread,
        layout,
        lang_in,
        lang_out,
        service,
        resfont,
        noto,
        kwarg.get("envs", {}),
        kwarg.get("prompt", []),
        il_creater=il_creater,
    )
    # model = DocLayoutModel.load_available()

    assert device is not None
    assert il_creater is not None
    assert translation_config is not None
    obj_patch = {}
    interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch, il_creater)
    # Total reported to the IL creater: the explicit selection if given,
    # otherwise every page of doc_zh.
    if pages:
        total_pages = len(pages)
    else:
        total_pages = doc_zh.page_count

    il_creater.on_total_pages(total_pages)

    parser = PDFParser(inf)
    doc = PDFDocument(parser)

    for pageno, page in enumerate(PDFPage.create_pages(doc)):
        if cancellation_event and cancellation_event.is_set():
            raise CancelledError("task cancelled")
        # `pages` is matched against the zero-based index ...
        if pages and (pageno not in pages):
            continue
        page.pageno = pageno

        # ... while the config is asked with the one-based page number.
        if not translation_config.should_translate_page(pageno + 1):
            continue

        height, width = (
            page.cropbox[3] - page.cropbox[1],
            page.cropbox[2] - page.cropbox[0],
        )
        # Oversized pages are only reported; they are still processed.
        if height > 1200 or width > 2000:
            logger.warning(f"page {pageno + 1} is too large, maybe unable to translate")
            # continue

        translation_config.raise_if_cancelled()
        # The current program no longer relies on
        # the following layout recognition results,
        # but in order to facilitate the migration of pdf2zh,
        # the relevant code is temporarily retained.
        # pix = doc_zh[page.pageno].get_pixmap()
        # image = np.frombuffer(pix.samples, np.uint8).reshape(
        #     pix.height, pix.width, 3
        # )[:, :, ::-1]
        # page_layout = model.predict(
        #     image, imgsz=int(pix.height / 32) * 32)[0]
        # # A k-d tree is out of the question here; rendering straight to an
        # # image instead trades space for time
        # box = np.ones((pix.height, pix.width))
        # h, w = box.shape
        # vcls = ["abandon", "figure", "table",
        #         "isolate_formula", "formula_caption"]
        # for i, d in enumerate(page_layout.boxes):
        #     if page_layout.names[int(d.cls)] not in vcls:
        #         x0, y0, x1, y1 = d.xyxy.squeeze()
        #         x0, y0, x1, y1 = (
        #             np.clip(int(x0 - 1), 0, w - 1),
        #             np.clip(int(h - y1 - 1), 0, h - 1),
        #             np.clip(int(x1 + 1), 0, w - 1),
        #             np.clip(int(h - y0 + 1), 0, h - 1),
        #         )
        #         box[y0:y1, x0:x1] = i + 2
        # for i, d in enumerate(page_layout.boxes):
        #     if page_layout.names[int(d.cls)] in vcls:
        #         x0, y0, x1, y1 = d.xyxy.squeeze()
        #         x0, y0, x1, y1 = (
        #             np.clip(int(x0 - 1), 0, w - 1),
        #             np.clip(int(h - y1 - 1), 0, h - 1),
        #             np.clip(int(x1 + 1), 0, w - 1),
        #             np.clip(int(h - y0 + 1), 0, h - 1),
        #         )
        #         box[y0:y1, x0:x1] = 0
        # layout[page.pageno] = box
        # Create a new xref to hold the new instruction stream
        # page.page_xref = doc_zh.get_new_xref()  # hack: new xref inserted for the page
        # doc_zh.update_object(page.page_xref, "<<>>")
        # doc_zh.update_stream(page.page_xref, b"")
        # doc_zh[page.pageno].set_contents(page.page_xref)
        ops_base = interpreter.process_page(page)
        il_creater.on_page_base_operation(ops_base)
        il_creater.on_page_end()
    il_creater.on_finish()
    device.close()


def translate(translation_config: TranslationConfig) -> TranslateResult:
    """Run a translation synchronously under a ``ProgressMonitor``.

    Args:
        translation_config: Configuration for the translation process.

    Returns:
        The ``TranslateResult`` produced by ``do_translate``.
    """
    active_stages = get_translation_stage(translation_config)
    with ProgressMonitor(active_stages) as monitor:
        return do_translate(monitor, translation_config)


def get_translation_stage(
    translation_config: TranslationConfig,
) -> list[tuple[str, float]]:
    """Return the entries of ``TRANSLATE_STAGES`` that apply to a configuration.

    Args:
        translation_config: Configuration whose flags decide which stages run.

    Returns:
        A new list, in ``TRANSLATE_STAGES`` order, without the disabled stages.
    """
    if translation_config.only_parse_generate_pdf:
        # Parse-and-regenerate mode: nothing translation-related runs.
        skipped = {
            DetectScannedFile.stage_name,
            LayoutParser.stage_name,
            TableParser.stage_name,
            ParagraphFinder.stage_name,
            StylesAndFormulas.stage_name,
            AutomaticTermExtractor.stage_name,
            ILTranslator.stage_name,
            Typesetting.stage_name,
        }
    else:
        # Normal mode: drop only the stages switched off individually.
        skipped = set()
        if not translation_config.table_model:
            skipped.add(TableParser.stage_name)
        if translation_config.skip_scanned_detection:
            skipped.add(DetectScannedFile.stage_name)
        if not translation_config.auto_extract_glossary:
            skipped.add(AutomaticTermExtractor.stage_name)
        if translation_config.skip_translation:
            skipped.add(ILTranslator.stage_name)

    return [
        stage for stage in copy.deepcopy(TRANSLATE_STAGES) if stage[0] not in skipped
    ]


async def async_translate(translation_config: TranslationConfig):
    """Asynchronously translate a PDF file with real-time progress reporting.

    This function yields progress events that can be used to update progress bars
    or other UI elements. The events are dictionaries with the following structure:

    - progress_start: {
        "type": "progress_start",
        "stage": str,              # Stage name
        "stage_progress": float,   # Always 0.0
        "stage_current": int,      # Current count (0)
        "stage_total": int         # Total items in stage
    }
    - progress_update: {
        "type": "progress_update",
        "stage": str,              # Stage name
        "stage_progress": float,   # Stage progress (0-100)
        "stage_current": int,      # Current items processed
        "stage_total": int,        # Total items in stage
        "overall_progress": float  # Overall progress (0-100)
    }
    - progress_end: {
        "type": "progress_end",
        "stage": str,              # Stage name
        "stage_progress": float,   # Always 100.0
        "stage_current": int,      # Equal to stage_total
        "stage_total": int,        # Total items processed
        "overall_progress": float  # Overall progress (0-100)
    }
    - finish: {
        "type": "finish",
        "translate_result": TranslateResult
    }
    - error: {
        "type": "error",
        "error": str
    }

    Iteration stops after an "error" event has been yielded.

    Args:
        translation_config: Configuration for the translation process

    Yields:
        dict: Progress events during translation

    Note:
        A CancelledError or KeyboardInterrupt raised while events are being
        consumed is caught here: the cancel event is set, cancellation of the
        executor future is requested, and the generator then waits for the
        finish event instead of re-raising.
    """
    loop = asyncio.get_running_loop()
    callback = asynchronize.AsyncCallback()

    # finish_event is awaited on the event loop at the very end; cancel_event
    # is a threading.Event because the work itself runs in an executor thread.
    finish_event = asyncio.Event()
    cancel_event = threading.Event()
    with ProgressMonitor(
        get_translation_stage(translation_config),
        progress_change_callback=callback.step_callback,
        finish_callback=callback.finished_callback,
        finish_event=finish_event,
        cancel_event=cancel_event,
        loop=loop,
        report_interval=translation_config.report_interval,
    ) as pm:
        # Run the blocking pipeline in the loop's default executor.
        future = loop.run_in_executor(None, do_translate, pm, translation_config)
        try:
            async for event in callback:
                event = event.kwargs
                yield event
                # An error event is terminal: hand it to the caller, then stop.
                if event["type"] == "error":
                    break
        except CancelledError:
            cancel_event.set()
        except KeyboardInterrupt:
            logger.info("Translation cancelled by user through keyboard interrupt")
            cancel_event.set()
    if cancel_event.is_set():
        future.cancel()
    logger.info("Waiting for translation to finish...")
    # Do not return before the monitor has signalled completion.
    await finish_event.wait()

class MemoryMonitor:
    """Context manager tracking peak memory of this process and its children.

    While the ``with`` block is active a daemon thread samples memory usage
    every ``interval`` seconds; the largest sample (in MB) is stored in
    ``peak_memory_usage`` and logged when the block exits.
    """

    def __init__(self, interval=0.1):
        """Create an idle monitor; nothing runs until ``__enter__``.

        Args:
            interval: Seconds slept between two samples, 0.1s (100ms) by
                default.
        """
        self.interval = interval
        self.peak_memory_usage = 0
        self.monitor_thread = None
        self.stop_event = None
        self.last_pss_check_time = None

    def __enter__(self):
        """Spawn the sampling thread and hand back the monitor itself."""
        self.stop_event = threading.Event()
        sampler = threading.Thread(target=self._monitor_memory_usage, daemon=True)
        self.monitor_thread = sampler
        sampler.start()
        logger.debug("Memory monitoring started")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Signal the sampler to stop, wait briefly for it, and log the peak."""
        if self.monitor_thread:
            self.stop_event.set()
            # Bounded wait: never block the caller longer than two seconds.
            self.monitor_thread.join(timeout=2.0)
            logger.info(f"Peak memory usage: {self.peak_memory_usage:.2f} MB")

    def _monitor_memory_usage(self):
        """Sampling loop executed on the background thread."""
        bytes_per_mb = 1024 * 1024
        while not self.stop_event.is_set():
            try:
                # The helper is asked to throttle PSS checks to one per 2 seconds;
                # it returns the usage plus the updated throttle timestamp.
                usage_bytes, checked_at = memory.get_memory_usage_with_throttle(
                    include_children=True,
                    prefer_pss=True,
                    last_pss_check_time=self.last_pss_check_time,
                    pss_throttle_seconds=2.0,
                )
                self.last_pss_check_time = checked_at

                usage_mb = usage_bytes / bytes_per_mb
                if usage_mb > self.peak_memory_usage:
                    self.peak_memory_usage = usage_mb
            except Exception as e:
                # A failed sample must not kill the monitor; report and retry.
                logger.warning(f"Error monitoring memory: {e}")

            time.sleep(self.interval)

    def get_peek_memory_psutil(self):
        """Return ``memory.get_memory_usage_bytes`` for this process tree.

        Kept for backwards compatibility; children are included and PSS is
        preferred.
        """
        return memory.get_memory_usage_bytes(include_children=True, prefer_pss=True)


def fix_null_page_content(doc: Document) -> list[int]:
    """Replace pages whose page object is ``null`` with fresh pages.

    Args:
        doc: Document to repair in place.

    Returns:
        The 0-based indices of the pages that were replaced.
    """
    invalid_page = [
        page_no
        for page_no in range(len(doc))
        if doc.xref_object(doc[page_no].xref) == "null"
    ]
    # Delete then insert at the same index so the page count is unchanged.
    for page_no in invalid_page:
        doc.delete_page(page_no)
        doc.insert_page(page_no)
    return invalid_page


def fix_null_xref(doc: Document) -> None:
    """Walk every xref of ``doc`` and patch objects that are known to cause trouble.

    * ``null`` objects are replaced with an empty array.
    * Objects mentioning ``/ASCII85Decode`` or ``/LZWDecode`` get their stream
      read and written back (done to keep pdfminer happy).
    * Otherwise, objects mentioning ``/Annots`` get that key set to ``null``.
    * Any object that raises while being inspected is replaced with ``[]``.

    Args:
        doc: PyMuPDF Document object to fix
    """
    for xref in range(1, doc.xref_length()):
        try:
            source = doc.xref_object(xref)
            if source == "null":
                doc.update_object(xref, "[]")
                continue
            if not source:
                continue
            if "/ASCII85Decode" in source or "/LZWDecode" in source:
                # make pdfminer happy
                doc.update_stream(xref, doc.xref_stream(xref))
            elif "/Annots" in source:
                doc.xref_set_key(xref, "Annots", "null")
        except Exception:
            doc.update_object(xref, "[]")


def fix_filter(doc):
    """Normalise the content streams of every page in ``doc``.

    Two passes run over the document:

    1. Every page content stream whose ``Filter`` key is reported by
       ``xref_get_key`` with type ``"xref"`` is read with ``xref_stream`` and
       written straight back with ``update_stream``.
    2. Every page with more than one content stream gets a new object whose
       stream is the space-joined concatenation of the old streams, and the
       page's ``Contents`` key is pointed at that object.

    The rotation handling that follows the early ``return`` is deliberately
    disabled and never executes.
    """
    # Pass 1: rewrite streams whose Filter entry has type "xref".
    content_xrefs = [xref for page in doc for xref in page.get_contents()]
    for content_xref in content_xrefs:
        if doc.xref_get_key(content_xref, "Filter")[0] == "xref":
            doc.update_stream(content_xref, doc.xref_stream(content_xref))

    # Pass 2: collapse multi-stream pages into a single content stream.
    for page in doc:
        pieces = page.get_contents()
        if len(pieces) <= 1:
            continue
        merged = b" ".join([doc.xref_stream(piece) for piece in pieces])
        merged_xref = doc.get_new_xref()
        doc.update_object(merged_xref, "<<>>")
        doc.update_stream(merged_xref, merged)
        doc.xref_set_key(page.xref, "Contents", f"{merged_xref} 0 R")
    return
    # skip rotate for now
    for page in doc:
        contents = page.get_contents()
        t, v = doc.xref_get_key(page.xref, "Rotate")
        rotate = -int(v) if t == "int" else 0
        if len(contents) > 1 or rotate:
            page_streams = [doc.xref_stream(i) for i in contents]
            r = doc.get_new_xref()
            page_prefix = b""
            page_suffix = b""
            if rotate:
                m0 = pymupdf.Matrix(rotate)
                b0 = page.mediabox * m0
                m1 = m0 * pymupdf.Matrix(1, 0, 0, 1, b0.x0, -b0.y0)
                page_prefix = (
                    f" {m1.a} {m1.b} {m1.c} {m1.d} {m1.e} {m1.f} cm q ".encode()
                )
                page_suffix = b" Q "
                update_page_bbox(doc, page, page.cropbox * m1, "CropBox")
                update_page_bbox(doc, page, page.artbox * m1, "ArtBox")
                update_page_bbox(doc, page, page.bleedbox * m1, "BleedBox")
                update_page_bbox(doc, page, page.mediabox * m1, "MediaBox")
                doc.xref_set_key(page.xref, "Rotate", "0")
            doc.update_object(r, "<<>>")
            doc.update_stream(r, page_prefix + b" ".join(page_streams) + page_suffix)
            doc.xref_set_key(page.xref, "Contents", f"{r} 0 R")


def update_page_bbox(doc, page, box, key):
    """Overwrite the page box ``key`` with ``box`` if the page already has it.

    The key is only rewritten when ``xref_get_key`` reports it as an
    ``"array"`` on the page object; otherwise nothing is changed.
    """
    page_xref = page.xref
    if doc.xref_get_key(page_xref, key)[0] != "array":
        return
    coords = f"[{box.x0} {box.y0} {box.x1} {box.y1}]"
    doc.xref_set_key(page_xref, key, coords)


def do_translate(
    pm: ProgressMonitor, translation_config: TranslationConfig
) -> TranslateResult:
    """Translate the configured input PDF, optionally in several split parts.

    The function attaches ``pm`` to the config, rejects inputs that
    ``check_metadata`` identifies as BabelDOC output, and then runs the
    translation under a ``MemoryMonitor``. Without a split strategy (or when
    splitting yields zero or one part) the whole file goes through
    ``_do_translate_single``; otherwise each part is written to its own
    temporary PDF, translated serially and the results are merged with
    ``ResultMerger``. Afterwards timing, text statistics and peak memory are
    stored on the result, ``fix_cmap``/``add_metadata``/``migrate_toc`` are
    applied and ``pm.translate_done`` is called.

    Args:
        pm: Progress monitor that receives done/error/finish notifications.
        translation_config: Configuration for the translation process.

    Returns:
        The (possibly merged) translation result.

    Raises:
        InputFileGeneratedByBabelDOCError: If the input was produced by BabelDOC.
        Exception: Any other error is reported through ``pm.translate_error``
            and re-raised.
    """
    try:
        translation_config.progress_monitor = pm
        original_pdf_path = translation_config.input_file
        logger.info(f"start to translate: {original_pdf_path}")
        try:
            check_metadata(Document(original_pdf_path))
        except InputFileGeneratedByBabelDOCError as e:
            logger.error(
                f"input file {original_pdf_path} is generated by BabelDOC, Cannot translate files that have already been translated."
            )
            raise e
        except Exception as e:
            # Any other metadata failure is non-fatal: log it and carry on.
            logger.warning(f"Error in check metadata, continue: {e}")
        start_time = time.time()
        peak_memory_usage = 0
        with MemoryMonitor() as memory_monitor:
            # Check if split translation is enabled
            if not translation_config.split_strategy:
                result = _do_translate_single(pm, translation_config)
            else:
                # Initialize split manager and determine split points
                split_manager = SplitManager(translation_config)
                split_points = split_manager.determine_split_points(translation_config)

                if not split_points:
                    logger.warning(
                        "No split points determined, falling back to single translation"
                    )
                    result = _do_translate_single(pm, translation_config)
                else:
                    logger.info(f"Split points determined: {len(split_points)} parts")

                    if len(split_points) == 1:
                        logger.info("Only one part, use single translation")
                        result = _do_translate_single(pm, translation_config)
                    else:
                        pm.total_parts = len(split_points)

                        # Process parts serially
                        results: dict[int, TranslateResult | None] = {}
                        original_watermark_mode = (
                            translation_config.watermark_output_mode
                        )
                        original_doc = Document(original_pdf_path)
                        for i, split_point in enumerate(split_points):
                            try:
                                # Create a copy of config for this part
                                part_config = copy.copy(translation_config)
                                part_config.skip_clean = True
                                # Convert the document-wide page selection into
                                # 1-based page numbers relative to this part.
                                should_translate_pages = []
                                for page in range(
                                    split_point.start_page, split_point.end_page + 1
                                ):
                                    if translation_config.should_translate_page(
                                        page + 1
                                    ):
                                        should_translate_pages.append(
                                            page - split_point.start_page + 1
                                        )
                                part_config.pages = None
                                part_config.page_ranges = [
                                    (x, x) for x in should_translate_pages
                                ]
                                # Nothing to translate in this part and untranslated
                                # pages are not wanted: record an empty result.
                                if (
                                    translation_config.only_include_translated_page
                                    and not should_translate_pages
                                ):
                                    results[i] = None
                                    continue

                                # Only first part should do scanned detection if enabled
                                if i > 0:
                                    part_config.skip_scanned_detection = True

                                part_config.working_dir = (
                                    translation_config.get_part_working_dir(i)
                                )
                                part_config.output_dir = (
                                    translation_config.get_part_output_dir(i)
                                )

                                # copy.copy is shallow, so every part must still see
                                # the very same shared-context object.
                                assert id(
                                    part_config.shared_context_cross_split_part
                                ) == id(
                                    translation_config.shared_context_cross_split_part
                                ), "shared_context_cross_split_part must be the same"

                                part_temp_input_path = (
                                    part_config.get_working_file_path(
                                        f"input.part{i}.pdf"
                                    )
                                )
                                part_config.input_file = part_temp_input_path

                                temp_doc = Document()
                                # Null out the Annots key of each page in this part
                                # before the pages are copied into the part document.
                                for x in range(
                                    split_point.start_page, split_point.end_page + 1
                                ):
                                    xref = original_doc[x].xref
                                    if (
                                        original_doc.xref_get_key(xref, "Annots")[0]
                                        != "null"
                                    ):
                                        original_doc.xref_set_key(
                                            xref, "Annots", "null"
                                        )
                                temp_doc.insert_pdf(
                                    original_doc,
                                    from_page=split_point.start_page,
                                    to_page=split_point.end_page,
                                )
                                safe_save(temp_doc, part_temp_input_path)
                                assert (
                                    temp_doc.page_count
                                    == split_point.end_page - split_point.start_page + 1
                                )

                                # Only first part should have watermark
                                if i > 0:
                                    part_config.watermark_output_mode = (
                                        WatermarkOutputMode.NoWatermark
                                    )

                                # Create progress monitor for this part
                                part_monitor = pm.create_part_monitor(
                                    i, len(split_points)
                                )

                                # Process this part
                                result = _do_translate_single(
                                    part_monitor,
                                    part_config,
                                )
                                results[i] = result

                            except Exception as e:
                                logger.error(f"Error in part {i}: {e}")
                                pm.translate_error(e)
                                raise
                            finally:
                                # Clean up part working directory
                                translation_config.cleanup_part_working_dir(i)

                        # Restore original watermark mode
                        translation_config.watermark_output_mode = (
                            original_watermark_mode
                        )

                        # Merge results
                        merger = ResultMerger(translation_config)
                        logger.info("start merge results")
                        result = merger.merge_results(results)
                        logger.info("finish merge results")
            # Read the peak while still inside the monitor's context.
            peak_memory_usage = memory_monitor.peak_memory_usage

        finish_time = time.time()
        result.total_seconds = finish_time - start_time

        logger.info(
            f"finish translate: {original_pdf_path}, cost: {finish_time - start_time} s",
        )
        # Populate aggregate valid text statistics into result
        try:
            sc = translation_config.shared_context_cross_split_part
            result.total_valid_character_count = getattr(
                sc, "valid_char_count_total", 0
            )
            token_total = getattr(sc, "total_valid_text_token_count", None)
            result.total_valid_text_token_count = (
                token_total if isinstance(token_total, int) else 0
            )
        except Exception as e:
            logger.warning("Failed to populate valid text statistics: %s", e)
            # Best effort: fall back to zeros, and ignore even that failing.
            try:
                result.total_valid_character_count = 0
                result.total_valid_text_token_count = 0
            except Exception:
                pass
        result.original_pdf_path = translation_config.input_file
        result.peak_memory_usage = peak_memory_usage

        fix_cmap(result, translation_config)
        add_metadata(result, translation_config)
        # TOC migration is optional; a failure is logged but does not abort.
        try:
            migrate_toc(translation_config, result)
        except Exception as e:
            logger.error(
                f"Failed to migrate TOC from {translation_config.input_file}: {e}"
            )
        pm.translate_done(result)
        return result

    except Exception as e:
        if translation_config.debug:
            logger.exception("translate error:")
        else:
            logger.error(f"translate error: {e}")
        # Clear the disable flag before reporting the error through the monitor.
        pm.disable = False
        pm.translate_error(e)
        raise
    finally:
        # Runs on success and failure alike.
        logger.debug("do_translate finally")
        pm.on_finish()
        translation_config.cleanup_temp_files()


def migrate_toc(
    translation_config: TranslationConfig, translate_result: TranslateResult
):
    """Copy the table of contents of the input PDF onto the dual output PDFs.

    Nothing is done in ``use_alternating_pages_dual`` mode, when the input
    document is empty, or when it has no TOC. When
    ``only_include_translated_page`` is set, TOC page numbers are remapped to
    the positions of the kept pages; entries that pointed at a dropped page
    keep their place in the hierarchy but get page ``-1`` (no target).

    Args:
        translation_config: Configuration of the finished translation.
        translate_result: Result whose ``dual_pdf_path`` and
            ``no_watermark_dual_pdf_path`` files are rewritten in place.
    """
    if translation_config.use_alternating_pages_dual:
        logger.info('skipping TOC migration for "use_alternating_pages_dual" mode')
        return
    old_doc = Document(translation_config.input_file)
    if not old_doc:
        return
    try:
        fix_filter(old_doc)
        fix_null_xref(old_doc)
    except Exception:
        logger.exception("auto fix failed, please check the pdf file")

    toc_data = old_doc.get_toc()

    if not toc_data:
        logger.info("No TOC found in the original PDF, skipping migration.")
        return

    if translation_config.only_include_translated_page:
        # NOTE(review): assumes the output keeps exactly the pages for which
        # should_translate_page() is true, in their original order - confirm.
        kept_pages = [
            i
            for i in range(len(old_doc))
            if translation_config.should_translate_page(i + 1)
        ]
        # Map 1-based page numbers of the input to 1-based output numbers.
        new_page_number = {old + 1: new + 1 for new, old in enumerate(kept_pages)}
        # Retarget instead of dropping entries so hierarchy levels stay valid.
        toc_data = [
            [*entry[:2], new_page_number.get(entry[2], -1), *entry[3:]]
            for entry in toc_data
        ]

    files = {
        translate_result.dual_pdf_path,
        # translate_result.mono_pdf_path,
        translate_result.no_watermark_dual_pdf_path,
        # translate_result.no_watermark_mono_pdf_path
    }

    for f in files:
        if not f:
            continue
        # Work on a copy so the output file can be overwritten while saving.
        mig_toc_temp_input = translation_config.get_working_file_path(
            "mig_toc_temp.pdf"
        )
        shutil.copy(f, mig_toc_temp_input)
        new_doc = Document(mig_toc_temp_input.as_posix())
        if not new_doc:
            continue

        new_doc.set_toc(toc_data)
        PDFCreater.save_pdf_with_timeout(
            new_doc,
            f.as_posix(),
            translation_config=translation_config,
            clean=not translation_config.skip_clean,
            tag="mig_toc",
        )


# mediabox -> '[0 nul 792]'
def fix_media_box(doc: Document) -> dict[int, dict[str, str]]:
    """Normalise page boxes and remember their original values.

    For every object whose ``Type`` is ``/Page`` or ``/Pages``:

    * an array ``MediaBox`` is rewritten as ``[0 0 x1 y1]`` using its last two
      coordinates;
    * ``CropBox``, ``BleedBox``, ``TrimBox`` and ``ArtBox`` are set to
      ``null`` when present.

    Args:
        doc: Document to modify in place.

    Returns:
        Mapping of xref number to the original box strings that were replaced
        or removed, keyed by box name. Objects with no change are omitted.
    """
    mediabox_data = {}
    for x in range(1, doc.xref_length()):
        t = doc.xref_get_key(x, "Type")
        box_set = {}
        if t[1] in ("/Pages", "/Page"):
            mediabox = doc.xref_get_key(x, "MediaBox")
            if mediabox[0] == "array":
                try:
                    # split() without an argument tolerates repeated or padded
                    # whitespace, e.g. "[ 0  0 612 792 ]".
                    _, _, x1, y1 = (
                        mediabox[1].replace("[", " ").replace("]", " ").split()
                    )
                    doc.xref_set_key(x, "MediaBox", f"[0 0 {x1} {y1}]")
                    box_set["MediaBox"] = mediabox[1]
                except Exception:
                    logger.warning(
                        "Attempt to fix media box failed; some pages may not have been processed correctly."
                    )
            for k in ("CropBox", "BleedBox", "TrimBox", "ArtBox"):
                box = doc.xref_get_key(x, k)
                if box[0] != "null":
                    box_set[k] = box[1]
                    doc.xref_set_key(x, k, "null")
        if box_set:
            mediabox_data[x] = box_set
    return mediabox_data


def check_cid_char(il: il_version_1.Document):
    """Tell whether more than 80% of the document's characters are raw CIDs.

    A character counts as a raw CID when its ``char_unicode`` is exactly of
    the form ``(cid:<digits>)``.

    Returns:
        ``True`` if the CID characters exceed 80% of all characters on all
        pages, ``False`` otherwise (including for a document without
        characters).
    """
    all_chars = [char for page in il.page for char in page.pdf_character]
    cid_total = sum(
        1 for char in all_chars if re.match(r"^\(cid:\d+\)$", char.char_unicode)
    )
    threshold = len(all_chars) * 0.8
    return cid_total > threshold


def _do_translate_single(
    pm: ProgressMonitor,
    translation_config: TranslationConfig,
) -> TranslateResult | None:
    """Run the full translation pipeline on one document (or one split part).

    Steps, in order: repair the input PDF and save a working copy, parse it
    into the intermediate representation, optionally detect scanned input,
    run layout / table / paragraph / style analysis, optionally extract terms,
    translate, typeset, and write the output PDFs. When both watermarked and
    unwatermarked outputs are requested, a watermarked first page is rendered
    separately and merged into the final files.

    Args:
        pm: Progress monitor to attach to ``translation_config``.
        translation_config: Configuration for this document or part.

    Returns:
        The translation result, or ``None`` when
        ``only_include_translated_page`` is set and no page was parsed.

    Raises:
        ExtractTextError: If more than 80% of the characters are raw CIDs.
    """
    translation_config.progress_monitor = pm

    # An earlier part may have switched the OCR workaround on for the whole job.
    if translation_config.shared_context_cross_split_part.auto_enabled_ocr_workaround:
        translation_config.ocr_workaround = True
        translation_config.skip_scanned_detection = True

    original_pdf_path = translation_config.input_file
    if translation_config.debug:
        doc_input = Document(original_pdf_path)
        logger.debug("debug mode, save decompressed input pdf")
        output_path = translation_config.get_working_file_path(
            "input.decompressed.pdf",
        )
        # Fix null xref in PDF file
        try:
            _ = fix_null_page_content(doc_input)
            fix_filter(doc_input)
            fix_null_xref(doc_input)
        except Exception:
            logger.exception("auto fix failed, please check the pdf file")
        safe_save(doc_input, output_path, expand=True, pretty=True)
        del doc_input

    # Continue with original processing
    temp_pdf_path = translation_config.get_working_file_path("input.pdf")
    doc_pdf2zh = Document(original_pdf_path)
    safe_save(doc_pdf2zh, temp_pdf_path)

    # Fix null xref in PDF file
    invalid_pages = []
    try:
        invalid_pages = fix_null_page_content(doc_pdf2zh)
        fix_filter(doc_pdf2zh)
        fix_null_xref(doc_pdf2zh)
    except Exception:
        logger.exception("auto fix failed, please check the pdf file")

    mediabox_data = fix_media_box(doc_pdf2zh)

    # for page in doc_pdf2zh:
    #     page.insert_font(resfont, None)

    resfont = None
    # Save again so the working copy contains the repairs made above.
    safe_save(doc_pdf2zh, temp_pdf_path)

    # if not translation_config.skip_scanned_detection and DetectScannedFile(
    #     translation_config
    # ).fast_check(doc_pdf2zh):
    #     if translation_config.auto_enable_ocr_workaround:
    #         logger.warning(
    #             "Fast scanned check hit, Turning on OCR workaround.",
    #         )
    #         translation_config.shared_context_cross_split_part.auto_enabled_ocr_workaround = True
    #         translation_config.ocr_workaround = True
    #         translation_config.skip_scanned_detection = True
    #     else:
    #         logger.warning(
    #             "Fast scanned check hit, Please check the input PDF file.",
    #         )
    #         raise ScannedPDFError("Scanned PDF detected.")

    il_creater = ILCreater(translation_config)
    il_creater.mupdf = doc_pdf2zh
    xml_converter = XMLConverter()
    logger.debug(f"start parse il from {temp_pdf_path}")
    with Path(temp_pdf_path).open("rb") as f:
        start_parse_il(
            f,
            doc_zh=doc_pdf2zh,
            resfont=resfont,
            il_creater=il_creater,
            translation_config=translation_config,
        )
    logger.debug(f"finish parse il from {temp_pdf_path}")
    docs = il_creater.create_il()
    logger.debug(f"finish create il from {temp_pdf_path}")
    del il_creater
    # No parsed page and untranslated pages are not wanted: nothing to output.
    if translation_config.only_include_translated_page and not docs.page:
        return None

    if translation_config.debug:
        xml_converter.write_json(
            docs,
            translation_config.get_working_file_path("create_il.debug.json"),
        )

    if check_cid_char(docs):
        raise ExtractTextError("The document contains too many CID chars.")

    # Skip all translation processing if only_parse_generate_pdf is enabled
    if translation_config.only_parse_generate_pdf:
        logger.debug("only_parse_generate_pdf enabled, skipping translation processing")
        # Skip directly to PDF generation
        pdf_creater = PDFCreater(temp_pdf_path, docs, translation_config, mediabox_data)
        result = pdf_creater.write(translation_config)
        result.original_pdf_path = translation_config.input_file
        return result

    # Detect whether the input is a scanned file
    if translation_config.skip_scanned_detection:
        logger.debug("skipping scanned file detection")
    else:
        logger.debug("start detect scanned file")
        DetectScannedFile(translation_config).process(
            docs, temp_pdf_path, mediabox_data
        )
        logger.debug("finish detect scanned file")
        if translation_config.debug:
            xml_converter.write_json(
                docs,
                translation_config.get_working_file_path("detect_scanned_file.json"),
            )

    # Generate layouts for all pages
    logger.debug("start generating layouts")
    docs = LayoutParser(translation_config).process(docs, doc_pdf2zh)
    logger.debug("finish generating layouts")
    close_process_pool()
    if translation_config.debug:
        xml_converter.write_json(
            docs,
            translation_config.get_working_file_path("layout_generator.json"),
        )

    if translation_config.table_model:
        docs = TableParser(translation_config).process(docs, doc_pdf2zh)
        logger.debug("finish table parser")
        if translation_config.debug:
            xml_converter.write_json(
                docs,
                translation_config.get_working_file_path("table_parser.json"),
            )
    ParagraphFinder(translation_config).process(docs)
    logger.debug(f"finish paragraph finder from {temp_pdf_path}")
    if translation_config.debug:
        xml_converter.write_json(
            docs,
            translation_config.get_working_file_path("paragraph_finder.json"),
        )
    StylesAndFormulas(translation_config).process(docs)
    logger.debug(f"finish styles and formulas from {temp_pdf_path}")
    if translation_config.debug:
        xml_converter.write_json(
            docs,
            translation_config.get_working_file_path("styles_and_formulas.json"),
        )

    translate_engine = translation_config.translator
    term_extraction_engine = translation_config.get_term_extraction_translator()

    support_llm_translate = translator_supports_llm(translate_engine)
    support_llm_term_extraction = translator_supports_llm(term_extraction_engine)

    if support_llm_term_extraction and translation_config.auto_extract_glossary:
        AutomaticTermExtractor(term_extraction_engine, translation_config).procress(
            docs
        )

    if not translation_config.skip_translation:
        if support_llm_translate:
            il_translator = ILTranslatorLLMOnly(translate_engine, translation_config)
        else:
            il_translator = ILTranslator(translate_engine, translation_config)

        il_translator.translate(docs)
        del il_translator
        logger.debug(f"finish ILTranslator from {temp_pdf_path}")
    else:
        logger.info("skip ILTranslator")

    if translation_config.debug:
        xml_converter.write_json(
            docs,
            translation_config.get_working_file_path("il_translated.json"),
        )

    if translation_config.debug:
        AddDebugInformation(translation_config).process(docs)
        xml_converter.write_json(
            docs,
            translation_config.get_working_file_path("add_debug_information.json"),
        )
    mono_watermark_first_page_doc_bytes = None
    dual_watermark_first_page_doc_bytes = None
    try:
        if translation_config.watermark_output_mode == WatermarkOutputMode.Both:
            mono_watermark_first_page_doc_bytes, dual_watermark_first_page_doc_bytes = (
                generate_first_page_with_watermark(
                    doc_pdf2zh, translation_config, docs, mediabox_data
                )
            )
    except Exception:
        # Watermarking is best effort: fall back to unwatermarked output.
        logger.warning(
            "Failed to generate watermark for first page, using no watermark"
        )
        translation_config.watermark_output_mode = WatermarkOutputMode.NoWatermark
        mono_watermark_first_page_doc_bytes = None
        dual_watermark_first_page_doc_bytes = None

    # Use the same (project-spelled) entry point as the other call site in this
    # file, generate_first_page_with_watermark: "typsetting_document".
    Typesetting(translation_config).typsetting_document(docs)
    logger.debug(f"finish typsetting from {temp_pdf_path}")
    if translation_config.debug:
        xml_converter.write_json(
            docs,
            translation_config.get_working_file_path("typsetting.json"),
        )

    pdf_creater = PDFCreater(temp_pdf_path, docs, translation_config, mediabox_data)
    result = pdf_creater.write(translation_config)
    # Swap in the watermarked first page; on failure keep the unwatermarked file.
    try:
        if mono_watermark_first_page_doc_bytes:
            mono_watermark_pdf = merge_watermark_doc(
                result.mono_pdf_path,
                mono_watermark_first_page_doc_bytes,
                translation_config,
            )
            result.mono_pdf_path = mono_watermark_pdf
    except Exception:
        result.mono_pdf_path = result.no_watermark_mono_pdf_path
    try:
        if dual_watermark_first_page_doc_bytes:
            dual_watermark_pdf = merge_watermark_doc(
                result.dual_pdf_path,
                dual_watermark_first_page_doc_bytes,
                translation_config,
            )
            result.dual_pdf_path = dual_watermark_pdf
    except Exception:
        result.dual_pdf_path = result.no_watermark_dual_pdf_path

    result.original_pdf_path = translation_config.input_file

    return result


def generate_first_page_with_watermark(
    mupdf: Document,
    translation_config: TranslationConfig,
    doc_il: il_version_1.Document,
    mediabox_data: dict[int, Any] | None = None,
) -> tuple[io.BytesIO | None, io.BytesIO | None]:
    """Render only the first page in watermarked mode and return it in memory.

    A one-page copy of ``mupdf`` and a deep copy of the first IL page are
    typeset and written with a shallow copy of the config whose watermark mode
    is forced to ``Watermarked``. The generated files are read into memory and
    deleted. Progress reporting is disabled for the duration of the call and
    re-enabled afterwards, even on failure.

    Args:
        mupdf: Source document; only page 0 is used.
        translation_config: Base configuration (not modified, apart from the
            shared progress monitor's ``disable`` flag being toggled).
        doc_il: Intermediate representation; only ``page[0]`` is used.
        mediabox_data: Original page box data passed through to ``PDFCreater``.

    Returns:
        ``(mono, dual)`` buffers positioned at offset 0; an element is ``None``
        when the corresponding output file was not produced.
    """
    first_page_doc = Document()
    first_page_doc.insert_pdf(mupdf, from_page=0, to_page=0)

    il_only_first_page_doc = il_version_1.Document()
    il_only_first_page_doc.total_pages = 1
    # Deep copy: typesetting below must not touch the caller's first page.
    il_only_first_page_doc.page = [copy.deepcopy(doc_il.page[0])]

    watermarked_config = copy.copy(translation_config)
    watermarked_config.watermark_output_mode = WatermarkOutputMode.Watermarked
    try:
        watermarked_config.progress_monitor.disable = True
        watermarked_temp_pdf_path = watermarked_config.get_working_file_path(
            "watermarked_temp_input.pdf"
        )
        safe_save(first_page_doc, watermarked_temp_pdf_path)

        Typesetting(watermarked_config).typsetting_document(il_only_first_page_doc)
        pdf_creater = PDFCreater(
            watermarked_temp_pdf_path.as_posix(),
            il_only_first_page_doc,
            watermarked_config,
            mediabox_data,
        )
        result = pdf_creater.write(watermarked_config)

        mono_pdf_bytes = None
        dual_pdf_bytes = None
        if result.mono_pdf_path:
            # BytesIO built from bytes starts at offset 0, ready for reading.
            mono_pdf_bytes = io.BytesIO(Path(result.mono_pdf_path).read_bytes())
            result.mono_pdf_path.unlink()

        if result.dual_pdf_path:
            dual_pdf_bytes = io.BytesIO(Path(result.dual_pdf_path).read_bytes())
            result.dual_pdf_path.unlink()

        return mono_pdf_bytes, dual_pdf_bytes
    finally:
        watermarked_config.progress_monitor.disable = False

def merge_watermark_doc(
    no_watermark_pdf_path: pathlib.PosixPath,
    watermark_first_page_pdf_bytes: io.BytesIO,
    translation_config: TranslationConfig,
) -> pathlib.PosixPath:
    """Build the watermarked PDF by swapping in a watermarked first page.

    Page 0 of the unwatermarked PDF is replaced with page 0 of the in-memory
    watermarked document, and the result is saved next to the input under the
    same file name with ``.no_watermark`` removed.

    Args:
        no_watermark_pdf_path: Existing unwatermarked output file.
        watermark_first_page_pdf_bytes: PDF data holding the watermarked page.
        translation_config: Supplies the save options.

    Returns:
        Path of the newly written file.

    Raises:
        FileNotFoundError: If the unwatermarked file does not exist or no
            watermarked data was supplied.
    """
    if not no_watermark_pdf_path.exists():
        raise FileNotFoundError(
            f"no_watermark_pdf_path not found: {no_watermark_pdf_path}"
        )
    if not watermark_first_page_pdf_bytes:
        raise FileNotFoundError(
            f"watermark_first_page_pdf_bytes not found: {watermark_first_page_pdf_bytes}"
        )

    merged_doc = Document(no_watermark_pdf_path.as_posix())
    merged_doc.delete_page(0)

    watermarked_first_page = Document("pdf", watermark_first_page_pdf_bytes)
    merged_doc.insert_pdf(watermarked_first_page, from_page=0, to_page=0, start_at=0)

    target_name = no_watermark_pdf_path.name.replace(".no_watermark", "")
    new_save_path = no_watermark_pdf_path.with_name(target_name)

    PDFCreater.save_pdf_with_timeout(
        merged_doc,
        new_save_path.as_posix(),
        translation_config=translation_config,
        clean=not translation_config.skip_clean,
    )
    return new_save_path


def download_font_assets():
    """Delegate to ``warmup()``.

    NOTE(review): judging by this function's name, ``warmup`` presumably
    fetches the font assets - confirm against its definition.
    """
    warmup()


def create_cache_folder():
    """Ensure ``CACHE_FOLDER`` exists, creating missing parents as needed.

    Raises:
        SystemExit: With exit status 1 when the folder cannot be created; the
            failure is logged at critical level with its traceback first.
    """
    try:
        logger.debug(f"create cache folder at {CACHE_FOLDER}")
        Path(CACHE_FOLDER).mkdir(parents=True, exist_ok=True)
    except OSError:
        logger.critical(
            f"Failed to create cache folder at {CACHE_FOLDER}",
            exc_info=True,
        )
        # Raise SystemExit directly: the ``exit`` builtin is injected by the
        # ``site`` module for interactive use and may be missing (python -S,
        # frozen applications).
        raise SystemExit(1)


def init():
    """Perform one-time setup; currently this only creates the cache folder."""
    create_cache_folder()


================================================
FILE: babeldoc/format/pdf/pdfinterp.py
================================================
import logging
from collections.abc import Sequence
from typing import Any
from typing import cast

import numpy as np

from babeldoc.format.pdf.babelpdf.utils import guarded_bbox
from babeldoc.format.pdf.document_il.frontend.il_creater import ILCreater
from babeldoc.pdfminer import settings
from babeldoc.pdfminer.pdfcolor import PREDEFINED_COLORSPACE
from babeldoc.pdfminer.pdfcolor import PDFColorSpace
from babeldoc.pdfminer.pdfdevice import PDFDevice
from babeldoc.pdfminer.pdfdevice import PDFTextSeq
from babeldoc.pdfminer.pdffont import PDFFont
from babeldoc.pdfminer.pdfinterp import LITERAL_FORM
from babeldoc.pdfminer.pdfinterp import LITERAL_IMAGE
from babeldoc.pdfminer.pdfinterp import Color
from babeldoc.pdfminer.pdfinterp import PDFContentParser
from babeldoc.pdfminer.pdfinterp import PDFInterpreterError
from babeldoc.pdfminer.pdfinterp import PDFPageInterpreter
from babeldoc.pdfminer.pdfinterp import PDFResourceManager
from babeldoc.pdfminer.pdfinterp import PDFStackT
from babeldoc.pdfminer.pdfpage import PDFPage
from babeldoc.pdfminer.pdftypes import LITERALS_ASCII85_DECODE
from babeldoc.pdfminer.pdftypes import PDFObjRef
from babeldoc.pdfminer.pdftypes import PDFStream
from babeldoc.pdfminer.pdftypes import dict_value
from babeldoc.pdfminer.pdftypes import list_value
from babeldoc.pdfminer.pdftypes import resolve1
from babeldoc.pdfminer.pdftypes import stream_value
from babeldoc.pdfminer.psexceptions import PSEOF
from babeldoc.pdfminer.psexceptions import PSTypeError
from babeldoc.pdfminer.psparser import PSKeyword
from babeldoc.pdfminer.psparser import PSLiteral
from babeldoc.pdfminer.psparser import keyword_name
from babeldoc.pdfminer.psparser import literal_name
from babeldoc.pdfminer.utils import MATRIX_IDENTITY
from babeldoc.pdfminer.utils import Matrix
from babeldoc.pdfminer.utils import Rect
from babeldoc.pdfminer.utils import apply_matrix_pt
from babeldoc.pdfminer.utils import choplist
from babeldoc.pdfminer.utils import mult_matrix

log = logging.getLogger(__name__)


def safe_float(o: Any) -> float | None:
    try:
        return float(o)
    except (TypeError, ValueError):
        return None


class PDFContentParserEx(PDFContentParser):
    """Content-stream parser with custom handling of the BI / ID keywords.

    ``BI`` opens an "inline" collection. ``ID`` closes it, turns the collected
    operands into a dictionary, reads the raw image data and pushes it as a
    ``PDFStream``. Every other keyword is pushed unchanged.
    """

    def __init__(self, streams: Sequence[object]) -> None:
        super().__init__(streams)

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle one keyword token found at offset *pos*.

        A ``PSTypeError`` raised while building the inline image is swallowed
        unless ``settings.STRICT`` is set.
        """
        if token is self.KEYWORD_BI:
            # inline image within a content stream
            self.start_type(pos, "inline")
            return
        if token is not self.KEYWORD_ID:
            self.push((pos, token))
            return

        try:
            _, operands = self.end_type("inline")
            if len(operands) % 2:
                raise PSTypeError(f"Invalid dictionary construct: {operands!r}")
            image_dict = {
                literal_name(key): resolve1(value)
                for key, value in choplist(2, operands)
            }
            terminator = b"EI"
            filters = image_dict.get("F")
            if filters:
                if isinstance(filters, PSLiteral):
                    filters = [filters]
                if filters[0] in LITERALS_ASCII85_DECODE:
                    terminator = b"~>"
            ends_with_ei = terminator == b"EI"
            pos, data = self.get_inline_data(pos + len(b"ID "), target=terminator)
            if not ends_with_ei:
                # the terminator may be necessary for decoding
                data += terminator
            self.push((pos, PDFStream(image_dict, data)))
            if ends_with_ei:
                # otherwise the EI keyword is still in the stream
                self.push((pos, self.KEYWORD_EI))
        except PSTypeError:
            if settings.STRICT:
                raise


class PDFPageInterpreterEx(PDFPageInterpreter):
    """Processor for the content of a PDF page

    Reference: PDF Reference, Appendix A, Operator Summary
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        device: PDFDevice,
        obj_patch,
        il_creater: ILCreater,
    ) -> None:
        """Store the collaborators used while interpreting content streams.

        The base-class initializer is not called here; only these four
        attributes are set.

        Args:
            rsrcmgr: Resource manager used to look up fonts and procsets.
            device: Device that receives figure/page/text rendering calls.
            obj_patch: Mapping written by ``do_Do`` (object id -> replacement
                content string).
            il_creater: Receiver of the intermediate-representation callbacks.
        """
        self.rsrcmgr = rsrcmgr
        self.device = device
        self.obj_patch = obj_patch
        self.il_creater = il_creater

    def dup(self) -> "PDFPageInterpreterEx":
        """Return a fresh interpreter of the same class sharing this one's collaborators."""
        interpreter_cls = type(self)
        shared = (self.rsrcmgr, self.device, self.obj_patch, self.il_creater)
        return interpreter_cls(*shared)

    def init_resources(self, resources: dict[object, object]) -> None:
        """Prepare the fonts and XObjects listed in the Resource attribute.

        Overridden so that ``self.fontid`` (font -> resource name) is filled,
        every font is reported to ``il_creater`` and each font's ``descent``
        is forced to 0.
        """
        self.resources = resources
        self.fontmap: dict[object, PDFFont] = {}
        self.fontid: dict[PDFFont, object] = {}
        self.xobjmap = {}
        self.csmap: dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec: object) -> PDFColorSpace | None:
            """Resolve a colour-space spec (name or array) to a PDFColorSpace."""
            is_array = isinstance(spec, list)
            name = literal_name(spec[0] if is_array else spec)
            has_params = is_array and len(spec) >= 2
            if name == "ICCBased" and has_params:
                profile = stream_value(spec[1])
                if "N" in profile:
                    return PDFColorSpace(name, profile["N"])
                if "Alternate" in profile:
                    return PREDEFINED_COLORSPACE[profile["Alternate"].name]
                return None
            if name == "DeviceN" and has_params:
                return PDFColorSpace(name, len(list_value(spec[1])))
            return PREDEFINED_COLORSPACE.get(name)

        for category, entries in dict_value(resources).items():
            if category == "Font":
                for font_name, font_spec in dict_value(entries).items():
                    objid = (
                        font_spec.objid if isinstance(font_spec, PDFObjRef) else None
                    )
                    font = self.rsrcmgr.get_font(objid, dict_value(font_spec))
                    font.xobj_id = objid
                    self.il_creater.on_page_resource_font(font, objid, font_name)
                    self.fontmap[font_name] = font
                    font.descent = 0  # hack fix descent
                    self.fontid[font] = font_name
            elif category == "ColorSpace":
                for cs_name, cs_spec in dict_value(entries).items():
                    resolved = get_colorspace(resolve1(cs_spec))
                    if resolved is not None:
                        self.csmap[cs_name] = resolved
            elif category == "ProcSet":
                self.rsrcmgr.get_procset(list_value(entries))
            elif category == "XObject":
                self.xobjmap.update(dict_value(entries))

    def do_CS(self, name: PDFStackT) -> None:
        """Set color space for stroking operations

        Introduced in PDF 1.1

        The colour-space name is reported to ``il_creater`` before it is looked
        up; an unknown name only raises when ``settings.STRICT`` is set.
        """
        cs_name = literal_name(name)
        try:
            self.il_creater.on_stroking_color_space(cs_name)
            self.scs = self.csmap[cs_name]
        except KeyError:
            if not settings.STRICT:
                return
            raise PDFInterpreterError(f"Undefined ColorSpace: {name!r}") from None

    def do_cs(self, name: PDFStackT) -> None:
        """Set color space for nonstroking operations

        The colour-space name is reported to ``il_creater`` before it is looked
        up; an unknown name only raises when ``settings.STRICT`` is set.
        """
        cs_name = literal_name(name)
        try:
            self.il_creater.on_non_stroking_color_space(cs_name)
            self.ncs = self.csmap[cs_name]
        except KeyError:
            if not settings.STRICT:
                return
            raise PDFInterpreterError(f"Undefined ColorSpace: {name!r}") from None

    ############################################################
    # Overridden to return the call arguments (SCN)
    def do_SCN(self) -> list[PDFStackT]:
        """Set color for stroking operations.

        Every operand currently on the argument stack is consumed (the colour
        space's component count is not used), recorded with ``il_creater`` as
        an ``SCN`` passthrough instruction and stored as the stroking colour.

        Returns:
            The popped operands, so that ``execute`` can re-emit the operator.

        Raises:
            PDFInterpreterError: In strict mode when no stroking colour space
                has been selected.
        """
        if not self.scs and settings.STRICT:
            raise PDFInterpreterError("No colorspace specified!")
        args = self.pop(len(self.argstack))
        self.il_creater.on_passthrough_per_char("SCN", args)
        self.graphicstate.scolor = cast(Color, args)
        return args

    def do_scn(self) -> list[PDFStackT]:
        """Set color for nonstroking operations.

        Every operand currently on the argument stack is consumed (the colour
        space's component count is not used), recorded with ``il_creater`` as
        an ``scn`` passthrough instruction and stored as the nonstroking colour.

        Returns:
            The popped operands, so that ``execute`` can re-emit the operator.

        Raises:
            PDFInterpreterError: In strict mode when no nonstroking colour
                space has been selected.
        """
        if not self.ncs and settings.STRICT:
            raise PDFInterpreterError("No colorspace specified!")
        args = self.pop(len(self.argstack))
        self.il_creater.on_passthrough_per_char("scn", args)
        self.graphicstate.ncolor = cast(Color, args)
        return args

    def do_SC(self) -> list[PDFStackT]:
        """Set color for stroking operations

        Delegates to ``do_SCN`` and then replaces the passthrough instruction
        that call recorded with one named ``SC``. Returns the operands that
        ``do_SCN`` returned.
        """
        args = self.do_SCN()
        # do_SCN recorded the operands under "SCN"; re-record them under "SC".
        self.il_creater.remove_latest_passthrough_per_char_instruction()
        self.il_creater.on_passthrough_per_char("SC", args)
        return args

    def do_sc(self) -> list[PDFStackT]:
        """Set color for nonstroking operations

        Delegates to ``do_scn`` and then replaces the passthrough instruction
        that call recorded with one named ``sc``. Returns the operands that
        ``do_scn`` returned.
        """
        args = self.do_scn()
        # do_scn recorded the operands under "scn"; re-record them under "sc".
        self.il_creater.remove_latest_passthrough_per_char_instruction()
        self.il_creater.on_passthrough_per_char("sc", args)
        return args

    # Ensure the BBox has four numbers; otherwise treat the XObject as invalid.
    # For example, some forms have a BBox like '[ null -.00487 1.00412 .99393 ]'.
    def do_Do(self, xobjid_arg: PDFStackT) -> None:
        """Invoke named XObject.

        Overridden to report XObjects to ``il_creater`` and to record a
        replacement content string for form XObjects in ``self.obj_patch``.

        * Form XObjects (with a ``BBox``) are rendered by a duplicate
          interpreter. The operators it returns, followed by a ``cm`` that
          applies the inverse of the form's CTM and the operators returned by
          ``device.end_figure``, are stored in ``obj_patch`` under the
          XObject's object id.
        * Image XObjects (with ``Width`` and ``Height``) are passed to
          ``device.render_image`` inside a unit-square figure.
        * Anything else is ignored.

        Raises:
            PDFInterpreterError: In strict mode when the XObject is undefined.
        """
        xobjid = literal_name(xobjid_arg)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError(f"Undefined xobject id: {xobjid!r}") from None
            return
        subtype = xobj.get("Subtype")
        if subtype is LITERAL_FORM and "BBox" in xobj:
            interpreter = self.dup()

            # In extremely rare cases, a none might be mixed in the bbox, for example
            # /BBox [ 0 3.052 null 274.9 157.3 ]
            bbox = [v for v in list_value(xobj["BBox"]) if v is not None]
            if len(bbox) < 4:
                return

            matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
            # According to PDF reference 1.7 section 4.9.1, XObjects in
            # earlier PDFs (prior to v1.2) use the page's Resources entry
            # instead of having their own Resources entry.
            xobjres = xobj.get("Resources")
            if xobjres:
                resources = dict_value(xobjres)
            else:
                resources = self.resources.copy()

            self.il_creater.on_xobj_form(
                self.ctm,
                self.il_creater.xobj_id,
                xobj.objid,
                "form",
                xobjid,
                bbox,
                matrix,
            )

            self.device.begin_figure(xobjid, bbox, matrix)
            ctm = mult_matrix(matrix, self.ctm)
            (x, y, x2, y2) = guarded_bbox(bbox)
            (x, y) = apply_matrix_pt(ctm, (x, y))
            (x2, y2) = apply_matrix_pt(ctm, (x2, y2))
            x_id = self.il_creater.on_xobj_begin((x, y, x2, y2), xobj.objid)
            try:
                ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2))
            except Exception:
                # Non-invertible CTM: close the xobj record and give up.
                self.il_creater.on_xobj_end(x_id, " ")
                return
            # Translation part of the inverse transform: -(e, f) @ inv(linear).
            # Plain ndarrays give the same numbers as the former
            # np.mat / np.asmatrix code on both NumPy 1.x and 2.x, so no
            # version sniffing is needed.
            pos_inv = (-np.asarray(ctm[4:])) @ ctm_inv
            a, b, c, d = ctm_inv.reshape(4).tolist()
            e, f = pos_inv.tolist()
            inverse_cm = f"{a:.6f} {b:.6f} {c:.6f} {d:.6f} {e:.6f} {f:.6f} cm "
            ops_base = interpreter.render_contents(
                resources,
                [xobj],
                ctm=ctm,
            )
            # Colour spaces selected inside the form carry over to this interpreter.
            self.ncs = interpreter.ncs
            self.scs = interpreter.scs
            self.il_creater.on_xobj_end(x_id, inverse_cm)
            try:  # sometimes the form's fonts cannot be added and this step fails
                self.device.fontid = interpreter.fontid
                self.device.fontmap = interpreter.fontmap
                ops_new = self.device.end_figure(xobjid)
                self.obj_patch[self.xobjmap[xobjid].objid] = (
                    f"q {ops_base}Q {inverse_cm}{ops_new}"
                )
            except Exception:
                # Best effort: the patch is skipped, but leave a trace for debugging.
                log.debug(
                    "Failed to build obj_patch for form xobject %r",
                    xobjid,
                    exc_info=True,
                )
        elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
            self.il_creater.on_xobj_form(
                self.ctm,
                self.il_creater.xobj_id,
                xobj.objid,
                "image",
                xobjid,
                (0, 0, 1, 1),
                MATRIX_IDENTITY,
            )
            self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(xobjid, xobj)
            self.device.end_figure(xobjid)
        else:
            # unsupported xobject type.
            pass

    def do_W(self) -> None:
        """Set clipping path using nonzero winding number rule

        Forwards to ``handle_w`` with ``evenodd=False``.
        """
        self.handle_w(False)

    def do_W_a(self) -> None:
        """Set clipping path using even-odd rule

        Handles the ``W*`` operator (``execute`` maps ``*`` to ``_a``) by
        forwarding to ``handle_w`` with ``evenodd=True``.
        """
        self.handle_w(True)

    def handle_w(self, evenodd: bool) -> None:
        """Report the current path as a clip path to ``il_creater``.

        Args:
            evenodd: ``True`` for the even-odd rule (``W*``), ``False`` for the
                nonzero winding rule (``W``).
        """
        path = self.curpath
        # The current CTM is passed along with the path.
        self.il_creater.on_pdf_clip_path(path, evenodd, self.ctm)

    def process_page(self, page: PDFPage) -> str:
        """Interpret one page and return its base operator stream.

        Overridden to notify ``il_creater`` about the page and its crop box
        and to return the content instead of ``None``.

        Args:
            page: The page to interpret.

        Returns:
            ``"q <ops> Q <cm>"`` where ``<ops>`` are the operators returned by
            ``render_contents`` and ``<cm>`` is a ``cm`` instruction carrying
            the translation ``(1, 0, 0, 1, -x0, -y0)`` derived from the crop box.
        """
        (x0, y0, x1, y1) = page.cropbox
        # Only the crop-box translation is used as CTM. The rotation-specific
        # matrices that used to be computed here were always overwritten by
        # this value, so they have been removed.
        # NOTE(review): page.rotate therefore only affects the crop box reported
        # below -- confirm this is intended.
        ctm = (1, 0, 0, 1, -x0, -y0)
        ctm_for_ops = ctm
        if page.rotate in (90, 270):
            (x0, y0, x1, y1) = (y0, x1, y1, x0)
        self.il_creater.on_page_start()
        self.il_creater.on_page_crop_box(x0, y0, x1, y1)
        self.device.begin_page(page, ctm)
        ops_base = self.render_contents(page.resources, page.contents, ctm=ctm)
        if ops_base is None:
            # Never format a missing result as the literal text "None".
            ops_base = ""
        self.device.fontid = self.fontid
        self.device.fontmap = self.fontmap
        self.device.end_page(page)
        return f"q {ops_base} Q {' '.join(f'{x:f}' for x in ctm_for_ops)} cm"

    def render_contents(
        self,
        resources: dict[object, object],
        streams: Sequence[object],
        ctm: Matrix = MATRIX_IDENTITY,
    ) -> str | None:
        # Overridden to return the instruction stream
        """Render the content streams.

        This method may be called recursively.

        Initialises the resources and the interpreter state for *ctm*, then
        returns whatever ``execute`` returns for *streams*.
        """
        # log.debug(
        #     "render_contents: resources=%r, streams=%r, ctm=%r",
        #     resources,
        #     streams,
        #     ctm,
        # )
        self.init_resources(resources)
        self.init_state(ctm)
        return self.execute(list_value(streams))

    def do_q(self) -> None:
        """Save graphics state

        Pushes the current state onto ``gstack`` and tells ``il_creater`` to
        push its passthrough-per-char instruction state as well.
        """
        self.gstack.append(self.get_current_state())
        self.il_creater.push_passthrough_per_char_instruction()
        return

    def do_Q(self) -> None:
        """Restore graphics state

        Restores the most recently saved state when ``gstack`` is not empty.
        ``il_creater`` is asked to pop its passthrough-per-char instruction
        state in either case.
        """
        if self.gstack:
            self.set_current_state(self.gstack.pop())
        self.il_creater.pop_passthrough_per_char_instruction()
        return

    def do_TJ(self, seq: PDFStackT) -> None:
        """Show text, allowing individual glyph positioning

        A copy of the graphics state, carrying a copy of the current
        passthrough-per-char instructions, is handed to the device together
        with the text. A bare number operand is wrapped in a list first.
        """
        font_missing = self.textstate.font is None
        if font_missing and settings.STRICT:
            raise PDFInterpreterError("No font specified!")
        if font_missing or isinstance(seq, PSLiteral):
            return
        assert self.ncs is not None
        state = self.graphicstate.copy()
        state.passthrough_instruction = (
            self.il_creater.passthrough_per_char_instruction.copy()
        )
        text_seq = [seq] if isinstance(seq, (int, float)) else seq
        self.device.render_string(
            self.textstate, cast(PDFTextSeq, text_seq), self.ncs, state
        )

    def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None:
        """Set line dash pattern

        Stores ``(dash, phase)`` on the graphics state and reports both values
        to ``il_creater``.
        """
        self.graphicstate.dash = (dash, phase)
        self.il_creater.on_line_dash(dash, phase)

    def do_BI(self) -> None:
        """Begin inline image object

        Only notifies ``il_creater`` that an inline image starts.
        """
        self.il_creater.on_inline_image_begin()

    def do_ID(self) -> None:
        """Begin inline image data

        Intentionally a no-op: the inline image data is consumed by
        ``PDFContentParserEx.do_keyword``.
        """
        pass  # Handled by PDFContentParserEx

    def do_EI(self, obj: PDFStackT) -> None:
        """End inline image object

        When the operand is a ``PDFStream`` it is forwarded to ``il_creater``
        together with the current CTM; any other operand is ignored.
        """
        if isinstance(obj, PDFStream):
            self.il_creater.on_inline_image_end(obj, self.ctm)

    # Run the operators found in the content streams.
    # Each do_xxx method executes the corresponding content-stream operator.
    def execute(self, streams: Sequence[object]) -> str:
        """Run every operator in *streams* and return the retained operators.

        Overridden to return the instruction stream. Each stream is parsed
        separately with ``PDFContentParserEx``. Non-keyword objects are pushed
        as operands. A keyword is dispatched to ``do_<name>`` (with ``*``,
        ``"`` and ``'`` mapped to ``_a``, ``_w`` and ``_q``), and the number of
        operands to pop is taken from that method's positional argument count.

        Operators that ``il_creater`` classifies as graphic operations, the
        text operators (names starting with ``T``, plus ``"`` and ``'``) and
        the inline-image / marked-content operators listed below are not
        copied to the returned string.

        Args:
            streams: The content streams to interpret, in order.

        Returns:
            The retained operators, each serialized as ``"<operands> <name> "``.

        Raises:
            PDFInterpreterError: In strict mode for an unknown operator.
        """

        def format_operands(operands) -> str:
            """Serialize operands: fixed-point floats, quotes stripped from the rest."""
            return " ".join(
                f"{x:f}" if isinstance(x, float) else str(x).replace("'", "")
                for x in operands
            )

        ops: list[str] = []
        for stream in streams:
            self.il_creater.on_new_stream()
            try:
                parser = PDFContentParserEx([stream])
            except PSEOF:
                # Empty stream: skip it instead of returning None, which would
                # discard the operators collected so far and abort the
                # remaining streams.
                continue
            while True:
                try:
                    (_, obj) = parser.nextobject()
                except PSEOF:
                    break
                if not isinstance(obj, PSKeyword):
                    self.push(obj)
                    continue
                name = keyword_name(obj)
                act_name = name.replace("*", "_a").replace('"', "_w").replace("'", "_q")
                method = f"do_{act_name}"
                if not hasattr(self, method):
                    if settings.STRICT:
                        error_msg = f"Unknown operator: {name!r}"
                        raise PDFInterpreterError(error_msg)
                    continue
                func = getattr(self, method)
                # Number of operands = positional parameters minus ``self``.
                nargs = func.__code__.co_argcount - 1
                if nargs:
                    args = self.pop(nargs)
                    if len(args) != nargs:
                        # Too few operands on the stack: the operator is dropped.
                        continue
                    func(*args)
                    if self.il_creater.is_passthrough_per_char_operation(name):
                        self.il_creater.on_passthrough_per_char(name, args)
                    if self.il_creater.is_graphic_operation(name):
                        continue
                    if name == "d":
                        # The dash array must be re-serialized in bracket syntax.
                        arg0 = f"[{' '.join(f'{arg}' for arg in args[0])}]"
                        arg1 = args[1]
                        ops.append(f"{arg0} {arg1} {name} ")
                    elif not (
                        name[0] == "T"
                        or name in ['"', "'", "EI", "MP", "DP", "BMC", "BDC"]
                    ):
                        # Filter the T* text operators; EI is filtered too because
                        # its operand is an object (only used in a few documents
                        # to draw horizontal lines); the marked-content
                        # operators are filtered as well.
                        ops.append(f"{format_operands(args)} {name} ")
                else:
                    targs = func()
                    if targs is None:
                        targs = []
                    if self.il_creater.is_graphic_operation(name):
                        continue
                    if not (name[0] == "T" or name in ["BI", "ID", "EMC"]):
                        ops.append(f"{format_operands(targs)} {name} ")
        return "".join(ops)


================================================
FILE: babeldoc/format/pdf/result_merger.py
================================================
import logging
from pathlib import Path

from pymupdf import Document

from babeldoc.format.pdf.document_il.backend.pdf_creater import PDFCreater
from babeldoc.format.pdf.translation_config import TranslateResult
from babeldoc.format.pdf.translation_config import TranslationConfig

logger = logging.getLogger(__name__)


class ResultMerger:
    """Handles merging of split translation results"""

    def __init__(self, translation_config: TranslationConfig):
        """Keep the translation config used for output names, paths and options."""
        self.config = translation_config

    def merge_results(
        self, results: dict[int, TranslateResult | None]
    ) -> TranslateResult:
        """Merge multiple translation results into one"""
        if not results:
            raise ValueError("No results to merge")

        basename = Path(self.config.input_file).stem
        debug_suffix = ".debug" if self.config.debug else ""

        mono_file_name = f"{basename}{debug_suffix}.{self.config.lang_out}.mono.pdf"
        dual_file_name = f"{basename}{debug_suffix}.{self.config.lang_out}.dual.pdf"

        debug_suffix += ".no_watermark"

        mono_file_name_no_watermark = (
            f"{basename}{debug_suffix}.{self.config.lang_out}.mono.pdf"
        )
        dual_file_name_no_watermark = (
            f"{basename}{debug_suffix}.{self.config.lang_out}.dual.pdf"
        )
        results = {k: v for k, v in results.items() if v is not None}
        # Sort results by part index
        sorted_results = dict(sorted(results.items()))
        first_result = next(iter(sorted_results.values()))

        # Initialize paths for merged files
        merged_mono_path = None
        merged_dual_path = None
        merged_no_watermark_mono_path = None
        merged_no_watermark_dual_path = None
        try:
            # Merge monolingual PDFs if they exist
            if (
                any(r.mono_pdf_path for r in results.values())
                and not self.config.no_mono
            ):
                merged_mono_path = self._merge_pdfs(
                    [
                        r.mono_pdf_path
                        for r in sorted_results.values()
                        if r.mono_pdf_path
                    ],
                    mono_file_name,
                    tag="merged_mono",
                )
        except Exception as e:
            logger.error(f"Error merging monolingual PDFs: {e}")
            merged_mono_path = None

        try:
            # Merge dual-language PDFs if they exist
            if (
                any(r.dual_pdf_path for r in results.values())
                and not self.config.no_dual
            ):
                merged_dual_path = self._merge_pdfs(
                    [
                        r.dual_pdf_path
                        for r in sorted_results.values()
                        if r.dual_pdf_path
                    ],
                    dual_file_name,
                    tag="merged_dual",
                )
        except Exception as e:
            logger.error(f"Error merging dual-language PDFs: {e}")
            merged_dual_path = None

        if any(
            r.dual_pdf_path != r.no_watermark_dual_pdf_path
            or r.mono_pdf_path != r.no_watermark_mono_pdf_path
            for r in results.values()
        ):
            try:
                # Merge no-watermark PDFs if they exist
                if (
                    any(r.no_watermark_mono_pdf_path for r in results.values())
                    and not self.config.no_mono
                ):
                    merged_no_watermark_mono_path = self._merge_pdfs(
                        [
                            r.no_watermark_mono_pdf_path
                            for r in sorted_results.values()
                            if r.no_watermark_mono_pdf_path
                        ],
                        mono_file_name_no_watermark,
                        tag="merged_no_watermark_mono",
                    )
            except Exception as e:
                logger.error(f"Error merging no-watermark PDFs: {e}")
                merged_no_watermark_mono_path = None

            try:
                if (
                    any(r.no_watermark_dual_pdf_path for r in results.values())
                    and not self.config.no_dual
                ):
                    merged_no_watermark_dual_path = self._merge_pdfs(
                        [
                            r.no_watermark_dual_pdf_path
                            for r in sorted_results.values()
                            if r.no_watermark_dual_pdf_path
                        ],
                        "merged_no_watermark_dual.pdf",
                        tag="merged_no_watermark_dual",
                    )
            except Exception as e:
                logger.error(f"Error merging no-watermark PDFs: {e}")
                merged_no_watermark_dual_path = None

        auto_extracted_glossary_path = None
        if (
            self.config.save_auto_extracted_glossary
            and self.config.shared_context_cross_split_part.auto_extracted_glossary
        ):
            auto_extracted_glossary_path = self.config.get_output_file_path(
                f"{basename}{debug_suffix}.{self.config.lang_out}.glossary.csv"
            )
            with auto_extracted_glossary_path.open("w", encoding="utf-8") as f:
                logger.info(
                    f"save auto extracted glossary to {auto_extracted_glossary_path}"
                )
                f.write(
                    self.config.shared_context_cross_split_part.auto_extracted_glossary.to_csv()
                )

        # Create merged result
        merged_result = TranslateResult(
            mono_pdf_path=merged_mono_path,
            dual_pdf_path=merged_dual_path,
            auto_extracted_glossary_path=auto_extracted_glossary_path,
        )
        merged_result.no_watermark_mono_pdf_path = merged_no_watermark_mono_path
        merged_result.no_watermark_dual_pdf_path = merged_no_watermark_dual_path

        if merged_result.no_watermark_mono_pdf_path is None:
            merged_result.no_watermark_mono_pdf_path = merged_mono_path
        elif merged_result.mono_pdf_path is None:
            merged_result.mono_pdf_path = merged_no_watermark_mono_path

        if merged_result.no_watermark_dual_pdf_path is None:
            merged_result.no_watermark_dual_pdf_path = merged_dual_path
        elif merged_result.dual_pdf_path is None:
            merged_result.dual_pdf_path = merged_no_watermark_dual_path

        # Calculate total time
        total_time = sum(
            r.total_seconds for r in results.values() if hasattr(r, "total_seconds")
        )
        merged_result.total_seconds = total_time

        return merged_result

    def _merge_pdfs(
        self, pdf_paths: list[str | Path], output_name: str, tag: str
    ) -> Path:
        """Merge multiple PDFs into one"""
        if not pdf_paths:
            return None

        output_path = self.config.get_output_file_path(output_name)
        merged_doc = Document()

        for pdf_path in pdf_paths:
            doc = Document(str(pdf_path))
            merged_doc.insert_pdf(doc)

        merged_doc = PDFCreater.subset_fonts_in_subprocess(
            merged_doc, self.config, tag=tag
        )
        PDFCreater.save_pdf_with_timeout(
            merged_doc, str(output_path), translation_config=self.config
        )

        return output_path


================================================
FILE: babeldoc/format/pdf/split_manager.py
================================================
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class SplitPoint:
    """Represents a point where the document should be split"""

    # Index of the first page of the part (PageCountStrategy starts at 0).
    start_page: int
    # Index of the last page of the part, inclusive.
    end_page: int
    # Per-page weight; SplitManager.estimate_part_complexity multiplies the
    # page count by this value.
    estimated_complexity: float = 1.0
    # Optional title for the part; not set by the code visible here.
    chapter_title: str | None = None


class BaseSplitStrategy:
    """Base class for split strategies"""

    def determine_split_points(self, config) -> list[SplitPoint]:
        """Return the parts the document described by *config* is split into.

        Subclasses must override this; the base implementation always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError


class PageCountStrategy(BaseSplitStrategy):
    """Split document based on page count"""

    def __init__(self, max_pages_per_part: int = 20):
        """Remember the maximum number of pages a single part may contain."""
        self.max_pages_per_part = max_pages_per_part

    def determine_split_points(self, config) -> list[SplitPoint]:
        """Split ``config.input_file`` into consecutive fixed-size page ranges.

        Args:
            config: Object whose ``input_file`` attribute is the PDF to split.

        Returns:
            One ``SplitPoint`` per part, using 0-based page indices with an
            inclusive ``end_page``. The last part may be shorter, and an empty
            document yields an empty list.

        Raises:
            ValueError: If ``max_pages_per_part`` is smaller than 1 (the
                splitting loop would otherwise never terminate).
        """
        from pymupdf import Document

        pages_per_part = self.max_pages_per_part
        if pages_per_part < 1:
            raise ValueError(
                f"max_pages_per_part must be at least 1, got {pages_per_part}"
            )

        doc = Document(str(config.input_file))
        try:
            total_pages = doc.page_count
        finally:
            # Only the page count is needed; do not keep the file open.
            doc.close()

        return [
            SplitPoint(
                start_page=start,
                # end_page is inclusive
                end_page=min(start + pages_per_part, total_pages) - 1,
            )
            for start in range(0, total_pages, pages_per_part)
        ]


class SplitManager:
    """Manages document splitting process"""

    def __init__(self, config=None):
        """Take the split strategy from *config*.

        Args:
            config: Object exposing ``split_strategy``. May be ``None``, in
                which case no strategy is configured.
        """
        # ``config`` defaults to None, so it must not be dereferenced blindly.
        self.strategy = config.split_strategy if config is not None else None

    def determine_split_points(self, config) -> list[SplitPoint]:
        """Determine where to split the document

        Raises:
            ValueError: If no split strategy is configured.
        """
        if self.strategy is None:
            raise ValueError("No split strategy configured")
        return self.strategy.determine_split_points(config)

    def estimate_part_complexity(self, split_point: SplitPoint) -> float:
        """Estimate the complexity of a document part

        Returns the number of pages in the part (``end_page`` is inclusive)
        multiplied by the part's ``estimated_complexity``.
        """
        # Simple estimation based on page count for now
        page_count = split_point.end_page - split_point.start_page + 1
        return page_count * split_point.estimated_complexity


================================================
FILE: babeldoc/format/pdf/translation_config.py
================================================
import enum
import logging
import shutil
import tempfile
import threading
from collections import Counter
from pathlib import Path

from babeldoc.const import CACHE_FOLDER
from babeldoc.format.pdf.split_manager import BaseSplitStrategy
from babeldoc.format.pdf.split_manager import PageCountStrategy
from babeldoc.glossary import Glossary
from babeldoc.glossary import GlossaryEntry
from babeldoc.progress_monitor import ProgressMonitor
from babeldoc.translator.translator import BaseTranslator

logger = logging.getLogger(__name__)


class WatermarkOutputMode(enum.Enum):
    """Watermark option for generated output.

    NOTE(review): how each member is interpreted is decided by code outside
    this view; the names suggest watermarked only, no watermark only, or
    both variants -- confirm.
    """

    Watermarked = "watermarked"
    NoWatermark = "no_watermark"
    Both = "both"


class SharedContextCrossSplitPart:
    """Mutable state shared by all split parts of one translation run.

    It stores the user-supplied glossaries, the automatically extracted
    glossary (once finalized), the raw ``(source, target)`` term pairs that
    feed it, and running character/token statistics. Every public method
    that reads or writes this state does so while holding ``self._lock``.
    """

    def __init__(self):
        # Only initialised here; presumably filled in by the translation
        # pipeline — verify against callers.
        self.first_paragraph = None
        self.recent_title_paragraph = None
        self._lock = threading.Lock()
        self.user_glossaries: list[Glossary] = []
        self.auto_extracted_glossary: Glossary | None = None
        self.raw_extracted_terms: list[tuple[str, str]] = []
        self.auto_enabled_ocr_workaround = False
        # Statistics for valid characters/text across the whole file
        self.valid_char_count_total: int = 0
        self.total_valid_text_token_count: int = 0
        # Defined up front so contains_term() and
        # finalize_auto_extracted_glossary() are usable even when
        # initialize_glossaries() has not been called yet.
        self.norm_terms: set[str] = set()
        self.unique_name: str = self._generate_unique_auto_glossary_name()

    def initialize_glossaries(self, initial_glossaries: list[Glossary] | None):
        """Reset the shared state and install the user glossaries.

        Args:
            initial_glossaries: Glossaries supplied by the user; ``None`` or
                an empty list means there are none. The list is copied.
        """
        with self._lock:
            self.user_glossaries = (
                list(initial_glossaries) if initial_glossaries else []
            )
            self.auto_extracted_glossary = None
            self.raw_extracted_terms = []
            self.unique_name = self._generate_unique_auto_glossary_name()
            # Keys of every user glossary's normalized_lookup, merged.
            self.norm_terms = {
                term
                for glossary in self.user_glossaries
                for term in glossary.normalized_lookup
            }
            # reset statistics buffer when initializing
            self.valid_char_count_total = 0
            self.total_valid_text_token_count = 0

    def add_raw_extracted_term_pair(self, src: str, tgt: str):
        """Record one extracted ``(source, target)`` term pair."""
        with self._lock:
            self.raw_extracted_terms.append((src, tgt))

    def _generate_unique_auto_glossary_name(self) -> str:
        """Return a glossary name that no user glossary already uses.

        Starts from ``auto_extracted_glossary`` and appends ``#1``, ``#2``,
        ... until the name is free. Does not take the lock itself; callers
        that need it already hold it.
        """
        base_name = "auto_extracted_glossary"
        existing_names = {g.name for g in self.user_glossaries}

        current_name = base_name
        suffix = 0
        while current_name in existing_names:
            suffix += 1
            current_name = f"{base_name}#{suffix}"
        return current_name

    def contains_term(self, term: str) -> bool:
        """Return whether ``term`` is a normalized key of a user glossary.

        The lookup is an exact set-membership test; ``term`` is not
        normalized here.
        """
        with self._lock:
            try:
                return term in self.norm_terms
            except TypeError:
                # An unhashable value can never be a key of the set.
                return False

    def finalize_auto_extracted_glossary(self):
        """Build the auto-extracted glossary from the recorded term pairs.

        For every source term the most frequently recorded target is kept.
        When no pairs were recorded, the auto-extracted glossary stays
        ``None``.
        """
        with self._lock:
            self.auto_extracted_glossary = None

            if not self.raw_extracted_terms:
                self.raw_extracted_terms = []
                return

            # Group all recorded targets by their source term.
            term_translations: dict[str, list[str]] = {}
            for src, tgt in self.raw_extracted_terms:
                term_translations.setdefault(src, []).append(tgt)

            final_entries: list[GlossaryEntry] = [
                GlossaryEntry(src, Counter(tgts).most_common(1)[0][0])
                for src, tgts in term_translations.items()
            ]

            if final_entries:
                self.auto_extracted_glossary = Glossary(
                    name=self.unique_name, entries=final_entries
                )

    def get_glossaries(self) -> list[Glossary]:
        """Return the user glossaries followed by the auto one, if any."""
        with self._lock:
            all_glossaries = list(self.user_glossaries)
            if self.auto_extracted_glossary:
                all_glossaries.append(self.auto_extracted_glossary)
            return all_glossaries

    def get_glossaries_for_translation(
        self, auto_extract_enabled: bool
    ) -> list[Glossary]:
        """Return the glossaries to use while translating.

        When ``auto_extract_enabled`` is true and an auto-extracted glossary
        exists, only that glossary is returned. Otherwise the result is the
        same as :meth:`get_glossaries`.
        """
        with self._lock:
            if auto_extract_enabled and self.auto_extracted_glossary:
                return [self.auto_extracted_glossary]

            all_glossaries = list(self.user_glossaries)
            if self.auto_extracted_glossary:
                all_glossaries.append(self.auto_extracted_glossary)
            return all_glossaries

    def add_valid_counts(self, char_count: int, token_count: int):
        """Accumulate valid character and token counts in a threadsafe way.

        Non-positive values are ignored individually.
        """
        if char_count <= 0 and token_count <= 0:
            return
        with self._lock:
            if char_count > 0:
                self.valid_char_count_total += char_count
            if token_count > 0:
                self.total_valid_text_token_count += token_count


class TranslationConfig:
    """All options that control one PDF translation run.

    Besides storing the constructor arguments, ``__init__`` resolves and
    creates the working and output directories, loads a default layout model
    when none is given, and prepares the context shared across split parts.
    Several flags override each other; see the comments in ``__init__``.
    """

    @staticmethod
    def create_max_pages_per_part_split_strategy(max_pages_per_part: int):
        """Return a ``PageCountStrategy`` built from ``max_pages_per_part``."""
        return PageCountStrategy(max_pages_per_part)

    # for backward compatibility,
    # new parameters should be added at the end of the function.
    def __init__(
        self,
        translator: BaseTranslator,
        input_file: str | Path,
        lang_in: str,
        lang_out: str,
        doc_layout_model,  # DocLayoutModel
        # for backward compatibility
        font: str | Path | None = None,
        pages: str | None = None,
        output_dir: str | Path | None = None,
        debug: bool = False,
        working_dir: str | Path | None = None,
        no_dual: bool = False,
        no_mono: bool = False,
        formular_font_pattern: str | None = None,
        formular_char_pattern: str | None = None,
        qps: int = 1,
        split_short_lines: bool = False,
        short_line_split_factor: float = 0.8,
        use_rich_pbar: bool = True,
        progress_monitor: ProgressMonitor | None = None,
        skip_clean: bool = False,
        dual_translate_first: bool = False,
        disable_rich_text_translate: bool = False,
        enhance_compatibility: bool = False,
        report_interval: float = 0.1,
        min_text_length: int = 5,
        # Deprecated: whether to produce a side-by-side dual-language PDF
        # (original and translation shown next to each other). Kept only for
        # backward compatibility; no longer in use.
        use_side_by_side_dual: bool = True,
        use_alternating_pages_dual: bool = False,
        watermark_output_mode: WatermarkOutputMode = WatermarkOutputMode.Watermarked,
        # Add split-related parameters
        split_strategy: BaseSplitStrategy | None = None,
        table_model=None,
        show_char_box: bool = False,
        skip_scanned_detection: bool = False,
        ocr_workaround: bool = False,
        custom_system_prompt: str | None = None,
        add_formula_placehold_hint: bool = False,
        glossaries: list[Glossary] | None = None,
        pool_max_workers: int | None = None,
        auto_extract_glossary: bool = True,
        auto_enable_ocr_workaround: bool = False,
        primary_font_family: str | None = None,
        only_include_translated_page: bool | None = False,
        save_auto_extracted_glossary: bool = True,
        enable_graphic_element_process: bool = True,
        merge_alternating_line_numbers: bool = True,
        skip_translation: bool = False,
        skip_form_render: bool = False,
        skip_curve_render: bool = False,
        only_parse_generate_pdf: bool = False,
        remove_non_formula_lines: bool = False,
        non_formula_line_iou_threshold: float = 0.9,
        figure_table_protection_threshold: float = 0.9,
        skip_formula_offset_calculation: bool = False,
        term_extraction_translator: BaseTranslator | None = None,
        metadata_extra_data: str | None = None,
        term_pool_max_workers: int | None = None,
        disable_same_text_fallback: bool = False,
    ):
        """Store the options and prepare directories and shared state.

        Side effects: creates the working directory (a fresh temporary
        directory when neither ``working_dir`` nor ``debug`` is given) and the
        output directory (the current directory by default), and assigns a
        ``threading.Event`` to ``progress_monitor.cancel_event`` when the
        monitor has none.

        Raises:
            AssertionError: If ``primary_font_family`` is not one of ``None``,
                ``"serif"``, ``"sans-serif"`` or ``"script"``.
            ValueError: If ``pages`` cannot be parsed (see ``parse_pages``).
        """
        self.translator = translator
        # Term extraction falls back to the main translator.
        self.term_extraction_translator = term_extraction_translator or translator
        initial_user_glossaries = list(glossaries) if glossaries else []

        self.input_file = input_file
        self.lang_in = lang_in
        self.lang_out = lang_out
        # just ignore font
        self.font = None

        self.pages = pages
        self.page_ranges = self.parse_pages(pages) if pages else None
        self.debug = debug
        self.watermark_output_mode = watermark_output_mode

        # Provisional values; both directories are resolved further below.
        self.output_dir = output_dir
        self.working_dir = working_dir
        self.no_dual = no_dual
        self.no_mono = no_mono

        self.formular_font_pattern = formular_font_pattern
        self.formular_char_pattern = formular_char_pattern
        self.qps = qps
        # Set pool_max_workers with default value from qps
        self.pool_max_workers = (
            pool_max_workers if pool_max_workers is not None else qps
        )
        # Set term_pool_max_workers for automatic term extraction.
        # If not provided, default to pool_max_workers.
        self.term_pool_max_workers = (
            term_pool_max_workers
            if term_pool_max_workers is not None
            else self.pool_max_workers
        )
        self.split_short_lines = split_short_lines

        self.short_line_split_factor = short_line_split_factor
        self.use_rich_pbar = use_rich_pbar
        self.progress_monitor = progress_monitor
        self.doc_layout_model = doc_layout_model

        # enhance_compatibility implies skip_clean, dual_translate_first and
        # disable_rich_text_translate.
        self.skip_clean = skip_clean or enhance_compatibility
        self.skip_scanned_detection = skip_scanned_detection

        self.dual_translate_first = dual_translate_first or enhance_compatibility
        self.disable_rich_text_translate = (
            disable_rich_text_translate or enhance_compatibility
        )

        self.report_interval = report_interval
        self.min_text_length = min_text_length
        self.use_alternating_pages_dual = use_alternating_pages_dual
        self.ocr_workaround = ocr_workaround
        self.merge_alternating_line_numbers = merge_alternating_line_numbers

        # The OCR workaround forces these two options on.
        if self.ocr_workaround:
            self.skip_scanned_detection = True
            self.disable_rich_text_translate = True

        # for backward compatibility
        if use_side_by_side_dual is False and use_alternating_pages_dual is False:
            self.use_alternating_pages_dual = True

        if progress_monitor and progress_monitor.cancel_event is None:
            progress_monitor.cancel_event = threading.Event()

        # Resolve the working directory; only a directory created through
        # mkdtemp() is flagged for removal in cleanup_temp_files().
        if working_dir is None:
            if debug:
                working_dir = Path(CACHE_FOLDER) / "working" / Path(input_file).stem
                self._is_temp_dir = False
            else:
                working_dir = tempfile.mkdtemp()
                self._is_temp_dir = True
        else:
            working_dir = Path(working_dir) / Path(input_file).stem
            self._is_temp_dir = False

        self.working_dir = working_dir

        Path(working_dir).mkdir(parents=True, exist_ok=True)

        if output_dir is None:
            output_dir = Path.cwd()
        self.output_dir = output_dir

        Path(output_dir).mkdir(parents=True, exist_ok=True)

        # Imported lazily, only when no layout model was supplied.
        if not doc_layout_model:
            from babeldoc.docvision.doclayout import DocLayoutModel

            doc_layout_model = DocLayoutModel.load_available()
        self.doc_layout_model = doc_layout_model

        self.shared_context_cross_split_part = SharedContextCrossSplitPart()
        self.shared_context_cross_split_part.initialize_glossaries(
            initial_user_glossaries
        )

        # Initialize split-related attributes
        self.split_strategy = split_strategy

        # Create a unique working directory for each part
        self._part_working_dirs: dict[int, Path] = {}
        self._part_output_dirs: dict[int, Path] = {}

        self.table_model = table_model
        self.show_char_box = show_char_box
        self.custom_system_prompt = custom_system_prompt
        self.add_formula_placehold_hint = add_formula_placehold_hint
        self.auto_extract_glossary = auto_extract_glossary
        self.auto_enable_ocr_workaround = auto_enable_ocr_workaround
        self.skip_translation = skip_translation
        self.only_parse_generate_pdf = only_parse_generate_pdf

        # Glossary extraction is switched off when nothing is translated.
        if self.skip_translation or self.only_parse_generate_pdf:
            self.auto_extract_glossary = False

        # With automatic enabling, the explicit OCR switches are cleared.
        if auto_enable_ocr_workaround:
            self.ocr_workaround = False
            self.skip_scanned_detection = False

        # NOTE(review): validation via assert is stripped under ``python -O``.
        assert primary_font_family in [
            None,
            "serif",
            "sans-serif",
            "script",
        ]
        self.primary_font_family = primary_font_family

        if only_include_translated_page is None:
            only_include_translated_page = False

        self.only_include_translated_page = only_include_translated_page

        self.save_auto_extracted_glossary = save_auto_extracted_glossary

        # force disable table translate until the new model is ready
        self.table_model = None
        self.enable_graphic_element_process = enable_graphic_element_process
        self.skip_form_render = skip_form_render
        self.skip_curve_render = skip_curve_render
        self.remove_non_formula_lines = remove_non_formula_lines
        self.non_formula_line_iou_threshold = non_formula_line_iou_threshold
        self.figure_table_protection_threshold = figure_table_protection_threshold
        self.skip_formula_offset_calculation = skip_formula_offset_calculation

        self.metadata_extra_data = metadata_extra_data

        self.term_extraction_token_usage: dict[str, int] = {
            "total_tokens": 0,
            "prompt_tokens": 0,
            "completion_tokens": 0,
            "cache_hit_prompt_tokens": 0,
        }
        self.disable_same_text_fallback = disable_same_text_fallback

        if self.ocr_workaround:
            self.remove_non_formula_lines = False

    def parse_pages(self, pages_str: str | None) -> list[tuple[int, int]] | None:
        """Parse a page-selection string into a list of page ranges.

        Args:
            pages_str: Comma-separated selection such as ``"1-,2,-3,4"``.
                Each item is a single page or a ``start-end`` range where
                either side may be omitted. Blank items (for example from a
                trailing comma) are ignored.

        Returns:
            A list of ``(start, end)`` tuples, where a missing start becomes
            ``1`` and a missing end becomes ``-1`` (no upper limit), or
            ``None`` when the string selects nothing.

        Raises:
            ValueError: If an item is not a valid page or range.
        """
        if not pages_str:
            return None

        ranges: list[tuple[int, int]] = []
        for token in pages_str.split(","):
            token = token.strip()
            if not token:
                # Tolerate stray separators such as "1,2," or "1,,2".
                continue
            start, dash, end = token.partition("-")
            if not dash:
                page = int(token)
                ranges.append((page, page))
                continue
            start, end = start.strip(), end.strip()
            ranges.append((int(start) if start else 1, int(end) if end else -1))
        # A string made only of separators restricts nothing.
        return ranges or None

    def should_translate_page(self, page_number: int) -> bool:
        """Decide whether the given page needs to be translated.

        Args:
            page_number: Page number to check against ``self.page_ranges``.

        Returns:
            ``False`` when ``page_ranges`` is an empty list, ``True`` when it
            is unset, otherwise whether any range contains the page (an end
            of ``-1`` means no upper limit).
        """
        if isinstance(self.page_ranges, list) and len(self.page_ranges) == 0:
            return False
        if not self.page_ranges:
            return True

        for start, end in self.page_ranges:
            if start <= page_number and (end == -1 or page_number <= end):
                return True
        return False

    def get_output_file_path(self, filename: str) -> Path:
        """Return ``filename`` located inside the output directory."""
        return Path(self.output_dir) / filename

    def get_working_file_path(self, filename: str) -> Path:
        """Return ``filename`` located inside the working directory."""
        return Path(self.working_dir) / filename

    def get_part_working_dir(self, part_index: int) -> Path:
        """Get working directory for a specific part.

        The directory is created on first request and cached per index.
        """
        if part_index not in self._part_working_dirs:
            if self.working_dir:
                part_dir = Path(self.working_dir) / f"part_{part_index}"
            else:
                part_dir = Path(tempfile.mkdtemp()) / f"part_{part_index}"
            part_dir.mkdir(parents=True, exist_ok=True)
            self._part_working_dirs[part_index] = part_dir
        return self._part_working_dirs[part_index]

    def get_part_output_dir(self, part_index: int) -> Path:
        """Get output directory for a specific part.

        The directory lives under the working directory, is created on first
        request and cached per index.
        """
        if part_index not in self._part_output_dirs:
            part_dir = Path(self.working_dir) / f"part_{part_index}_output"
            part_dir.mkdir(parents=True, exist_ok=True)
            self._part_output_dirs[part_index] = part_dir
        return self._part_output_dirs[part_index]

    def cleanup_part_output_dir(self, part_index: int):
        """Clean up output directory for a specific part.

        Unknown indexes are ignored. Removal errors propagate.
        """
        if part_index in self._part_output_dirs:
            part_dir = self._part_output_dirs[part_index]
            if part_dir.exists():
                shutil.rmtree(part_dir)
            del self._part_output_dirs[part_index]

    def cleanup_part_working_dir(self, part_index: int):
        """Clean up working directory for a specific part.

        Unknown indexes are ignored. Removal errors are ignored.
        """
        if part_index in self._part_working_dirs:
            part_dir = self._part_working_dirs[part_index]
            if part_dir.exists():
                shutil.rmtree(part_dir, ignore_errors=True)
            del self._part_working_dirs[part_index]

    def cleanup_temp_files(self):
        """Clean up all temporary files including part working directories.

        The main working directory is removed only when it was created as a
        temporary directory. Any exception is logged instead of raised.
        """
        try:
            # Iterate over a copy: the cleanup deletes keys from the dict.
            for part_index in list(self._part_working_dirs.keys()):
                self.cleanup_part_working_dir(part_index)
            if self._is_temp_dir:
                logger.info(f"cleanup temp files: {self.working_dir}")
                shutil.rmtree(self.working_dir, ignore_errors=True)
        except Exception:
            logger.exception("Error cleaning up temporary files")

    def raise_if_cancelled(self):
        """Delegate to the progress monitor's ``raise_if_cancelled``, if any."""
        if self.progress_monitor is not None:
            self.progress_monitor.raise_if_cancelled()

    def cancel_translation(self):
        """Delegate to the progress monitor's ``cancel``, if any."""
        if self.progress_monitor is not None:
            self.progress_monitor.cancel()

    def get_term_extraction_translator(self) -> BaseTranslator:
        """Return the translator to use for automatic term extraction."""
        return self.term_extraction_translator

    def record_term_extraction_usage(
        self,
        total_tokens: int,
        prompt_tokens: int,
        completion_tokens: int,
        cache_hit_prompt_tokens: int,
    ) -> None:
        """Accumulate token usage for automatic term extraction.

        Only positive amounts are added to ``term_extraction_token_usage``.
        """
        increments = {
            "total_tokens": total_tokens,
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "cache_hit_prompt_tokens": cache_hit_prompt_tokens,
        }
        for key, amount in increments.items():
            if amount > 0:
                self.term_extraction_token_usage[key] += amount


class TranslateResult:
    """Paths and statistics describing the outcome of one translation.

    ``original_pdf_path``, ``total_seconds`` and ``peak_memory_usage`` are
    declared but not assigned by ``__init__``; ``__str__`` therefore treats
    them as optional.
    """

    original_pdf_path: str
    total_seconds: float
    mono_pdf_path: Path | None
    dual_pdf_path: Path | None
    no_watermark_mono_pdf_path: Path | None
    no_watermark_dual_pdf_path: Path | None
    peak_memory_usage: int | None
    auto_extracted_glossary_path: Path | None
    total_valid_character_count: int | None
    total_valid_text_token_count: int | None

    def __init__(
        self,
        mono_pdf_path: Path | None,
        dual_pdf_path: Path | None,
        auto_extracted_glossary_path: Path | None = None,
    ):
        """Store the output paths; statistics start out as ``None``."""
        # For compatibility considerations, if only a non-watermarked PDF is
        # generated, the watermarked and no-watermark paths are the same.
        self.mono_pdf_path = self.no_watermark_mono_pdf_path = mono_pdf_path
        self.dual_pdf_path = self.no_watermark_dual_pdf_path = dual_pdf_path

        self.auto_extracted_glossary_path = auto_extracted_glossary_path
        self.total_valid_character_count = None
        self.total_valid_text_token_count = None

    def __str__(self):
        """Return a human-readable, multi-line summary of the result."""
        lines: list[str] = []

        original = getattr(self, "original_pdf_path", None)
        if original:
            lines.append(f"\tOriginal PDF: {original}")

        elapsed = getattr(self, "total_seconds", None)
        if elapsed:
            lines.append(f"\tTotal time: {elapsed:.2f} seconds")

        if self.mono_pdf_path:
            lines.append(f"\tMonolingual PDF: {self.mono_pdf_path}")

        if self.dual_pdf_path:
            lines.append(f"\tDual-language PDF: {self.dual_pdf_path}")

        # The no-watermark paths are listed only when they differ from the
        # regular ones.
        plain_mono = getattr(self, "no_watermark_mono_pdf_path", None)
        if plain_mono and plain_mono != self.mono_pdf_path:
            lines.append(f"\tNo-watermark Monolingual PDF: {plain_mono}")

        plain_dual = getattr(self, "no_watermark_dual_pdf_path", None)
        if plain_dual and plain_dual != self.dual_pdf_path:
            lines.append(f"\tNo-watermark Dual-language PDF: {plain_dual}")

        glossary_path = getattr(self, "auto_extracted_glossary_path", None)
        if glossary_path:
            lines.append(f"\tAuto-extracted glossary: {glossary_path}")

        peak_memory = getattr(self, "peak_memory_usage", None)
        if peak_memory:
            lines.append(f"\tPeak memory usage: {peak_memory} MB")

        # Counts are shown whenever they are integers, zero included.
        char_count = getattr(self, "total_valid_character_count", None)
        if isinstance(char_count, int):
            lines.append(f"\tTotal valid character count: {char_count}")

        token_count = getattr(self, "total_valid_text_token_count", None)
        if isinstance(token_count, int):
            lines.append(f"\tTotal valid text token count (gpt-4o): {token_count}")

        if not lines:
            return "No translation results available"
        return "\n".join(["Translation results:", *lines])


================================================
FILE: babeldoc/glossary.py
================================================
import csv
import io
import itertools
import logging
import re
import time
from pathlib import Path

import chardet
import hyperscan
import regex

logger = logging.getLogger(__name__)


class GlossaryEntry:
    """One glossary term: a source text, its target text and an optional
    target-language tag."""

    def __init__(self, source: str, target: str, target_language: str | None = None):
        self.source, self.target = source, target
        self.target_language = target_language

    def __repr__(self):
        # Every field is rendered inside single quotes, None included.
        fields = (
            f"source='{self.source}'",
            f"target='{self.target}'",
            f"target_language='{self.target_language}'",
        )
        return f"GlossaryEntry({', '.join(fields)})"


def batched(iterable, n, *, strict=False):
    """Yield successive tuples of up to ``n`` items from ``iterable``.

    ``batched('ABCDEFG', 3)`` yields ``ABC``, ``DEF``, ``G``. With
    ``strict=True`` a shorter final batch raises ``ValueError`` instead of
    being yielded. As this is a generator, the ``n < 1`` check also fires
    only once iteration starts.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    source = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(source, n))
        if not chunk:
            return
        if strict and len(chunk) != n:
            raise ValueError("batched(): incomplete batch")
        yield chunk


# Matches any run of whitespace. The flag must come from ``re`` itself since
# the pattern is compiled by ``re.compile``; a flag constant from another
# regex package only works if its numeric value happens to coincide.
TERM_NORM_PATTERN = re.compile(r"\s+", re.UNICODE)


class Glossary:
    """A named list of glossary entries with Hyperscan-based term lookup.

    Entries are deduplicated by normalized source (the first occurrence
    wins). For non-empty glossaries one Hyperscan database is compiled per
    chunk of entries, so the terms occurring in a text can be found with
    ``get_active_entries_for_text``.
    """

    def __init__(self, name: str, entries: list[GlossaryEntry]):
        """Deduplicate ``entries`` and build the lookup structures."""
        self.name = name

        # Deduplicate entries based on normalized source
        unique_entries = []
        seen_normalized_sources = set()
        for entry in entries:
            normalized_source = self.normalize_source(entry.source)
            if normalized_source not in seen_normalized_sources:
                unique_entries.append(entry)
                seen_normalized_sources.add(normalized_source)
        self.entries = unique_entries

        # normalized source -> (original_source, original_target)
        self.normalized_lookup: dict[str, tuple[str, str]] = {}
        # pattern id (index into self.entries) -> (source, target)
        self.id_lookup: list[tuple[str, str]] = []
        self.hs_dbs: list[hyperscan.Database] | None = None
        self._build_regex_and_lookup()

    @staticmethod
    def normalize_source(source_term: str) -> str:
        """Normalizes a source term by lowercasing and standardizing whitespace."""
        term = source_term.lower()
        term = TERM_NORM_PATTERN.sub(
            " ", term
        )  # Replace multiple whitespace with single space
        return term.strip()

    def _build_regex_and_lookup(self):
        """Build the lookup tables and the Hyperscan databases.

        Fills ``normalized_lookup`` (normalized source to
        ``(original_source, original_target)``) and ``id_lookup`` (pattern id
        to the same tuple), then compiles the escaped source terms into one
        case-insensitive Hyperscan database per chunk of 20000 entries.
        For an empty glossary ``hs_dbs`` stays ``None``.
        """
        logger.debug(
            f"start build regex for glossary {self.name} with {len(self.entries)} entries"
        )
        # Reset every derived structure so that a rebuild cannot leave stale
        # pattern ids behind.
        self.normalized_lookup = {}
        self.id_lookup = []
        self.hs_dbs = None

        if not self.entries:
            self.source_terms_regex = None
            return

        start = time.time()
        hs_pattern = []
        for idx, entry in enumerate(self.entries):
            normalized_key = self.normalize_source(entry.source)
            self.normalized_lookup[normalized_key] = (entry.source, entry.target)
            self.id_lookup.append((entry.source, entry.target))

            # get_active_entries_for_text() collapses whitespace runs in the
            # scanned text to a single space, so the pattern gets the same
            # treatment; otherwise a source containing a tab, a newline or a
            # double space could never match.
            pattern_source = TERM_NORM_PATTERN.sub(" ", entry.source)
            hs_pattern.append((re.escape(pattern_source).encode("utf-8"), idx))

        chunk_size = 20000
        total_chunks = -(-len(hs_pattern) // chunk_size)  # ceiling division
        hs_dbs = []
        for i, pattern_chunk in enumerate(
            batched(hs_pattern, chunk_size, strict=False)
        ):
            logger.debug(f"building hs_db chunk {i + 1} / {total_chunks}")
            expressions, ids = zip(*pattern_chunk, strict=False)

            hs_db = hyperscan.Database()
            hs_db.compile(
                expressions=expressions,
                ids=ids,
                elements=len(pattern_chunk),
                flags=hyperscan.HS_FLAG_CASELESS | hyperscan.HS_FLAG_SINGLEMATCH,
                # | hyperscan.HS_FLAG_UTF8
                # | hyperscan.HS_FLAG_UCP,
            )
            hs_dbs.append(hs_db)
        self.hs_dbs = hs_dbs

        end = time.time()
        logger.debug(
            f"finished building regex for glossary {self.name} in {end - start:.2f} seconds"
        )
        logger.debug(
            f"build hs database for glossary {self.name} with {len(self.entries)} entries, hs_info: {self.hs_dbs[0].info()}"
        )

    @classmethod
    def from_csv(cls, file_path: Path, target_lang_out: str) -> "Glossary":
        """
        Loads glossary entries from a CSV file.
        CSV format: source,target,tgt_lng (tgt_lng is optional)
        Filters entries based on tgt_lng matching target_lang_out
        (compared case-insensitively, with "-" and "_" treated alike).
        Rows without a source or without a target value are skipped.
        The glossary name is derived from the CSV filename.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the file cannot be decoded or parsed, or lacks the
                'source' / 'target' columns.
        """
        glossary_name = file_path.stem
        loaded_entries: list[GlossaryEntry] = []

        # Normalize target_lang_out once for comparison
        normalized_target_lang_out = target_lang_out.lower().replace("-", "_")

        try:
            with file_path.open("rb") as f:
                content = f.read()
            # chardet reports no encoding when it cannot decide (for example
            # for an empty file); fall back to UTF-8 in that case.
            encoding = chardet.detect(content)["encoding"] or "utf-8"
            buffer = io.StringIO(content.decode(encoding))
            reader = csv.DictReader(buffer, doublequote=True)
            # fieldnames is None when the file has no header row at all.
            fieldnames = reader.fieldnames or []
            if not all(col in fieldnames for col in ["source", "target"]):
                raise ValueError(
                    f"CSV file {file_path} must contain 'source' and 'target' columns."
                )

            for row in reader:
                source = row["source"]
                target = row["target"]
                tgt_lng = row.get("tgt_lng", None)  # Handle optional tgt_lng

                if not source or target is None:
                    # An empty source cannot be turned into a search pattern
                    # and a short row has no target value.
                    logger.warning(
                        f"skip incomplete glossary row in {file_path}: {row}"
                    )
                    continue

                if tgt_lng and tgt_lng.strip():
                    normalized_entry_tgt_lng = (
                        tgt_lng.strip().lower().replace("-", "_")
                    )
                    if normalized_entry_tgt_lng != normalized_target_lang_out:
                        continue  # Skip if language doesn't match

                loaded_entries.append(GlossaryEntry(source, target, tgt_lng))
        except FileNotFoundError:
            # A missing file is reported unchanged to the caller.
            raise
        except Exception as e:
            # Everything else is reported as a ValueError naming the file.
            raise ValueError(
                f"Error reading or parsing CSV file {file_path}: {e}"
            ) from e

        return cls(name=glossary_name, entries=loaded_entries)

    def to_csv(self) -> str:
        """Exports the glossary entries to a CSV formatted string.

        Columns are source, target and tgt_lng; a missing target language is
        written as an empty field.
        """
        dict_data = [
            {
                "source": x.source,
                "target": x.target,
                "tgt_lng": x.target_language or "",
            }
            for x in self.entries
        ]
        buffer = io.StringIO()
        dict_writer = csv.DictWriter(
            buffer, fieldnames=["source", "target", "tgt_lng"], doublequote=True
        )
        dict_writer.writeheader()
        dict_writer.writerows(dict_data)
        return buffer.getvalue()

    def __repr__(self):
        return f"Glossary(name='{self.name}', num_entries={len(self.entries)})"

    def get_active_entries_for_text(self, text: str) -> list[tuple[str, str]]:
        """Returns a list of (original_source, target_text) tuples for terms found in the given text.

        Whitespace runs in ``text`` are collapsed to a single space before
        scanning. An empty text or a glossary without databases yields ``[]``.
        """
        if not self.hs_dbs or not text:
            return []

        text = TERM_NORM_PATTERN.sub(" ", text)  # Normalize whitespace in the text
        if not text:
            return []

        active_entries = []

        def on_match(
            idx: int, _from: int, _to: int, _flags: int, _context=None
        ) -> bool | None:
            # idx is the pattern id, i.e. the index into id_lookup.
            # NOTE(review): returning False presumably tells hyperscan to
            # keep scanning — confirm against the python-hyperscan docs.
            active_entries.append(self.id_lookup[idx])
            return False

        encoded_text = text.encode("utf-8")
        for hs_db in self.hs_dbs:
            # Scan the text with the hyperscan database
            scratch = hyperscan.Scratch(hs_db)
            hs_db.scan(encoded_text, on_match, scratch=scratch)
        return active_entries


================================================
FILE: babeldoc/main.py
================================================
import asyncio
import logging
import multiprocessing as mp
import queue
import random
import sys
from pathlib import Path
from typing import Any

import configargparse
import tqdm
from rich.progress import BarColumn
from rich.progress import MofNCompleteColumn
from rich.progress import Progress
from rich.progress import TextColumn
from rich.progress import TimeElapsedColumn
from rich.progress import TimeRemainingColumn

import babeldoc.assets.assets
import babeldoc.format.pdf.high_level
from babeldoc.const import enable_process_pool
from babeldoc.format.pdf.translation_config import TranslationConfig
from babeldoc.format.pdf.translation_config import WatermarkOutputMode
from babeldoc.glossary import Glossary
from babeldoc.translator.translator import OpenAITranslator
from babeldoc.translator.translator import set_translate_rate_limiter

logger = logging.getLogger(__name__)
__version__ = "0.5.23"


def create_parser():
    """Build the command-line / config-file argument parser for BabelDOC.

    The parser is a ``configargparse.ArgParser`` whose config-file parser is a
    ``TomlConfigParser`` constructed with ``["babeldoc"]``, so options can be
    supplied either on the command line or through the file given with
    ``-c/--config``.

    Options are registered in three places:

    * directly on the parser: general, asset-management and RPC layout options;
    * the "Translation" group: options consumed while translating, plus a
      mutually exclusive group that currently only contains ``--openai``;
    * the "Translation - OpenAI Options" group: OpenAI-specific settings.

    Returns:
        The fully configured parser; call ``parse_args()`` on it.
    """
    parser = configargparse.ArgParser(
        config_file_parser_class=configargparse.TomlConfigParser(["babeldoc"]),
    )
    # --- general options -------------------------------------------------
    parser.add_argument(
        "-c",
        "--config",
        is_config_file=True,
        help="config file path",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )
    # "append" lets --files be repeated; the attribute stays None when the
    # option is never given.
    parser.add_argument(
        "--files",
        action="append",
        help="One or more paths to PDF files.",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Use debug logging level.",
    )
    parser.add_argument(
        "--warmup",
        action="store_true",
        help="Only download and verify required assets then exit.",
    )
    # --- RPC layout-analysis back ends (one option per client module) -----
    parser.add_argument(
        "--rpc-doclayout",
        help="RPC service host address for document layout analysis",
    )
    parser.add_argument(
        "--rpc-doclayout2",
        help="RPC service host address for document layout analysis",
    )
    parser.add_argument(
        "--rpc-doclayout3",
        help="RPC service host address for document layout analysis",
    )
    parser.add_argument(
        "--rpc-doclayout4",
        help="RPC service host address for document layout analysis",
    )
    parser.add_argument(
        "--rpc-doclayout5",
        help="RPC service host address for document layout analysis",
    )
    parser.add_argument(
        "--rpc-doclayout6",
        help="RPC service host address for document layout analysis",
    )
    parser.add_argument(
        "--rpc-doclayout7",
        help="RPC service host address for document layout analysis",
    )
    # --- offline assets, working directory and misc ------------------------
    parser.add_argument(
        "--generate-offline-assets",
        default=None,
        help="Generate offline assets package in the specified directory",
    )
    parser.add_argument(
        "--restore-offline-assets",
        default=None,
        help="Restore offline assets package from the specified file",
    )
    parser.add_argument(
        "--working-dir",
        default=None,
        help="Working directory for translation. If not set, use temp directory.",
    )
    parser.add_argument(
        "--metadata-extra-data",
        default=None,
        help="Extra data for metadata",
    )
    parser.add_argument(
        "--enable-process-pool",
        action="store_true",
        help="DEBUG ONLY",
    )
    # --- "Translation" argument group: options used during translation ----
    translation_group = parser.add_argument_group(
        "Translation",
        description="Used during translation",
    )
    translation_group.add_argument(
        "--pages",
        "-p",
        help="Pages to translate. If not set, translate all pages. like: 1,2,1-,-3,3-5",
    )
    translation_group.add_argument(
        "--min-text-length",
        type=int,
        default=5,
        help="Minimum text length to translate (default: 5)",
    )
    translation_group.add_argument(
        "--lang-in",
        "-li",
        default="en",
        help="The code of source language.",
    )
    translation_group.add_argument(
        "--lang-out",
        "-lo",
        default="zh",
        help="The code of target language.",
    )
    translation_group.add_argument(
        "--output",
        "-o",
        help="Output directory for files. if not set, use same as input.",
    )
    translation_group.add_argument(
        "--qps",
        "-q",
        type=int,
        default=4,
        help="QPS limit of translation service",
    )
    translation_group.add_argument(
        "--ignore-cache",
        action="store_true",
        help="Ignore translation cache.",
    )
    translation_group.add_argument(
        "--no-dual",
        action="store_true",
        help="Do not output bilingual PDF files",
    )
    translation_group.add_argument(
        "--no-mono",
        action="store_true",
        help="Do not output monolingual PDF files",
    )
    translation_group.add_argument(
        "--formular-font-pattern",
        help="Font pattern to identify formula text",
    )
    translation_group.add_argument(
        "--formular-char-pattern",
        help="Character pattern to identify formula text",
    )
    translation_group.add_argument(
        "--split-short-lines",
        action="store_true",
        help="Force split short lines into different paragraphs (may cause poor typesetting & bugs)",
    )
    translation_group.add_argument(
        "--short-line-split-factor",
        type=float,
        default=0.8,
        help="Split threshold factor. The actual threshold is the median length of all lines on the current page * this factor",
    )
    translation_group.add_argument(
        "--skip-clean",
        action="store_true",
        help="Skip PDF cleaning step",
    )
    translation_group.add_argument(
        "--dual-translate-first",
        action="store_true",
        help="Put translated pages first in dual PDF mode",
    )
    translation_group.add_argument(
        "--disable-rich-text-translate",
        action="store_true",
        help="Disable rich text translation (may help improve compatibility with some PDFs)",
    )
    translation_group.add_argument(
        "--enhance-compatibility",
        action="store_true",
        help="Enable all compatibility enhancement options (equivalent to --skip-clean --dual-translate-first --disable-rich-text-translate)",
    )
    translation_group.add_argument(
        "--use-alternating-pages-dual",
        action="store_true",
        help="Use alternating pages mode for dual PDF. When enabled, original and translated pages are arranged in alternate order.",
    )
    translation_group.add_argument(
        "--watermark-output-mode",
        type=str,
        choices=["watermarked", "no_watermark", "both"],
        default="watermarked",
        help="Control watermark output mode: 'watermarked' (default) adds watermark to translated PDF, 'no_watermark' doesn't add watermark, 'both' outputs both versions.",
    )
    translation_group.add_argument(
        "--max-pages-per-part",
        type=int,
        help="Maximum number of pages per part for split translation. If not set, no splitting will be performed.",
    )
    translation_group.add_argument(
        "--no-watermark",
        action="store_true",
        help="[DEPRECATED] Use --watermark-output-mode=no_watermark instead. Do not add watermark to the translated PDF.",
    )
    translation_group.add_argument(
        "--report-interval",
        type=float,
        default=0.1,
        help="Progress report interval in seconds (default: 0.1)",
    )
    translation_group.add_argument(
        "--translate-table-text",
        action="store_true",
        default=False,
        help="Translate table text (experimental)",
    )
    translation_group.add_argument(
        "--show-char-box",
        action="store_true",
        default=False,
        help="Show character box (debug only)",
    )
    translation_group.add_argument(
        "--skip-scanned-detection",
        action="store_true",
        default=False,
        help="Skip scanned document detection (speeds up processing for non-scanned documents)",
    )
    translation_group.add_argument(
        "--ocr-workaround",
        action="store_true",
        default=False,
        help="Add text fill background (experimental)",
    )
    translation_group.add_argument(
        "--custom-system-prompt",
        help="Custom system prompt for translation.",
        default=None,
    )
    translation_group.add_argument(
        "--add-formula-placehold-hint",
        action="store_true",
        default=False,
        help="Add formula placeholder hint for translation. (Currently not recommended, it may affect translation quality, default: False)",
    )
    translation_group.add_argument(
        "--disable-same-text-fallback",
        action="store_true",
        default=False,
        help="Disable fallback translation when LLM output matches input text. (default: False)",
    )
    translation_group.add_argument(
        "--glossary-files",
        type=str,
        default=None,
        help="Comma-separated paths to glossary CSV files.",
    )
    translation_group.add_argument(
        "--pool-max-workers",
        type=int,
        help="Maximum number of worker threads for internal task processing pools. If not specified, defaults to QPS value. This parameter directly sets the worker count, replacing previous QPS-based dynamic calculations.",
    )
    translation_group.add_argument(
        "--term-pool-max-workers",
        type=int,
        help="Maximum number of worker threads dedicated to automatic term extraction. If not specified, defaults to --pool-max-workers (or QPS value when unset).",
    )
    # Negative flag: stored under `auto_extract_glossary`, True unless given.
    translation_group.add_argument(
        "--no-auto-extract-glossary",
        action="store_false",
        dest="auto_extract_glossary",
        default=True,
        help="Disable automatic term extraction. (Config file: set auto_extract_glossary = false)",
    )
    translation_group.add_argument(
        "--auto-enable-ocr-workaround",
        action="store_true",
        default=False,
        help="Enable automatic OCR workaround. If a document is detected as heavily scanned, this will attempt to enable OCR processing and skip further scan detection. Note: This option interacts with `--ocr-workaround` and `--skip-scanned-detection`. See documentation for details. (default: False)",
    )
    translation_group.add_argument(
        "--primary-font-family",
        type=str,
        choices=["serif", "sans-serif", "script"],
        default=None,
        help="Override primary font family for translated text. Choices: 'serif' for serif fonts, 'sans-serif' for sans-serif fonts, 'script' for script/italic fonts. If not specified, uses automatic font selection based on original text properties.",
    )
    translation_group.add_argument(
        "--only-include-translated-page",
        action="store_true",
        default=False,
        help="Only include translated pages in the output PDF. Effective only when --pages is used.",
    )
    translation_group.add_argument(
        "--save-auto-extracted-glossary",
        action="store_true",
        default=False,
        help="Save automatically extracted glossary terms to a CSV file in the output directory.",
    )
    translation_group.add_argument(
        "--disable-graphic-element-process",
        action="store_true",
        default=False,
        help="Disable graphic element process. (default: False)",
    )
    # Negative flag: stored under `merge_alternating_line_numbers`, True unless given.
    translation_group.add_argument(
        "--no-merge-alternating-line-numbers",
        action="store_false",
        dest="merge_alternating_line_numbers",
        default=True,
        help="Disable post-processing that merges alternating line-number layouts (by default this feature is enabled).",
    )
    translation_group.add_argument(
        "--skip-translation",
        action="store_true",
        default=False,
        help="Skip translation step. (default: False)",
    )
    translation_group.add_argument(
        "--skip-form-render",
        action="store_true",
        default=False,
        help="Skip form rendering. (default: False)",
    )
    translation_group.add_argument(
        "--skip-curve-render",
        action="store_true",
        default=False,
        help="Skip curve rendering. (default: False)",
    )
    translation_group.add_argument(
        "--only-parse-generate-pdf",
        action="store_true",
        default=False,
        help="Only parse PDF and generate output PDF without translation (default: False). This skips all translation-related processing including layout analysis, paragraph finding, style processing, and translation itself.",
    )
    translation_group.add_argument(
        "--remove-non-formula-lines",
        action="store_true",
        default=False,
        help="Remove non-formula lines from paragraph areas. This removes decorative lines that are not part of formulas, while protecting lines in figure/table areas. (default: False)",
    )
    translation_group.add_argument(
        "--non-formula-line-iou-threshold",
        type=float,
        default=0.9,
        help="IoU threshold for detecting paragraph overlap when removing non-formula lines. Higher values are more conservative. (default: 0.9)",
    )
    translation_group.add_argument(
        "--figure-table-protection-threshold",
        type=float,
        default=0.9,
        help="IoU threshold for protecting lines in figure/table areas when removing non-formula lines. Higher values provide more protection. (default: 0.9)",
    )
    translation_group.add_argument(
        "--skip-formula-offset-calculation",
        action="store_true",
        default=False,
        help="Skip formula offset calculation (default: False)",
    )
    # --- translation service selection --------------------------------------
    # Mutually exclusive group nested in the "Translation" group; --openai is
    # the only service flag registered in it here.
    service_group = translation_group.add_mutually_exclusive_group()
    service_group.add_argument(
        "--openai",
        action="store_true",
        help="Use OpenAI translator.",
    )
    # NOTE: `service_group` is rebound below; from here on it refers to the
    # separate "Translation - OpenAI Options" group, not the exclusive group.
    service_group = parser.add_argument_group(
        "Translation - OpenAI Options",
        description="OpenAI specific options",
    )
    service_group.add_argument(
        "--openai-model",
        default="gpt-4o-mini",
        help="The OpenAI model to use for translation.",
    )
    service_group.add_argument(
        "--openai-base-url",
        help="The base URL for the OpenAI API.",
    )
    service_group.add_argument(
        "--openai-api-key",
        "-k",
        help="The API key for the OpenAI API.",
    )
    service_group.add_argument(
        "--openai-term-extraction-model",
        default=None,
        help="OpenAI model to use for automatic term extraction. Defaults to --openai-model when unset.",
    )
    service_group.add_argument(
        "--openai-term-extraction-base-url",
        default=None,
        help="Base URL for the OpenAI API used during automatic term extraction. Falls back to --openai-base-url when unset.",
    )
    service_group.add_argument(
        "--openai-term-extraction-api-key",
        default=None,
        help="API key for the OpenAI API used during automatic term extraction. Falls back to --openai-api-key when unset.",
    )
    service_group.add_argument(
        "--enable-json-mode-if-requested",
        action="store_true",
        default=False,
        help="Enable JSON mode for OpenAI requests.",
    )
    service_group.add_argument(
        "--send-dashscope-header",
        action="store_true",
        default=False,
        help="Send DashScope data inspection header to disable input/output inspection.",
    )
    service_group.add_argument(
        "--no-send-temperature",
        action="store_true",
        default=False,
        help="Do not send temperature parameter to OpenAI API (default: send temperature).",
    )
    service_group.add_argument(
        "--openai-reasoning",
        type=str,
        default=None,
        help="Reasoning string to send in the OpenAI request body 'reasoning' field. If not set, the field is not sent.",
    )
    service_group.add_argument(
        "--openai-term-extraction-reasoning",
        type=str,
        default=None,
        help="Reasoning string for the OpenAI term extraction translator. If not set, no reasoning field is sent for term extraction requests.",
    )

    return parser


async def main():
    """Run the BabelDOC command-line workflow.

    Steps, in order:

    1. Parse arguments (command line and optional config file).
    2. Handle the asset-only modes (``--generate-offline-assets``,
       ``--restore-offline-assets``, ``--warmup``); each returns early.
    3. Validate the translation service selection and the input file list.
    4. Build the translator, and a separate term-extraction translator when
       any ``--openai-term-extraction-*`` option is given.
    5. Pick the layout model (first RPC option that is set, otherwise the
       local ONNX model), the optional table model and the glossaries.
    6. Translate each input PDF in turn, feeding progress events to the
       progress handler, and accumulate term-extraction token usage.
    7. Log token usage totals.

    The process exits through ``parser.error`` when the arguments are invalid
    and through ``sys.exit(1)`` when an input file is missing or not a PDF, or
    when the output / working directory cannot be created.
    """
    parser = create_parser()
    args: Any = parser.parse_args()

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    if args.generate_offline_assets:
        babeldoc.assets.assets.generate_offline_assets_package(
            Path(args.generate_offline_assets)
        )
        logger.info("Offline assets package generated, exiting...")
        return

    if args.restore_offline_assets:
        babeldoc.assets.assets.restore_offline_assets_package(
            Path(args.restore_offline_assets)
        )
        logger.info("Offline assets package restored, exiting...")
        return

    if args.warmup:
        babeldoc.assets.assets.warmup()
        logger.info("Warmup completed, exiting...")
        return

    # Validate the translation service selection
    if not args.openai:
        parser.error("必须选择一个翻译服务：--openai")

    # Validate the OpenAI arguments
    if args.openai and not args.openai_api_key:
        parser.error("使用 OpenAI 服务时必须提供 API key")

    # `--files` uses action="append", so it is None when never supplied.
    # Fail fast here instead of crashing with a TypeError after the models
    # have already been loaded.
    if not args.files:
        parser.error("No input files specified; use --files to provide a PDF")

    if args.enable_process_pool:
        enable_process_pool()

    # Instantiate the translator
    if args.openai:
        translator_kwargs: dict[str, Any] = {}
        if args.openai_reasoning is not None:
            translator_kwargs["reasoning"] = args.openai_reasoning
        translator = OpenAITranslator(
            lang_in=args.lang_in,
            lang_out=args.lang_out,
            model=args.openai_model,
            base_url=args.openai_base_url,
            api_key=args.openai_api_key,
            ignore_cache=args.ignore_cache,
            enable_json_mode_if_requested=args.enable_json_mode_if_requested,
            send_dashscope_header=args.send_dashscope_header,
            send_temperature=not args.no_send_temperature,
            **translator_kwargs,
        )
        # By default term extraction shares the main translator. A dedicated
        # one is built as soon as any term-extraction override is given; the
        # reasoning override is included so that passing it alone is honoured
        # rather than silently ignored.
        term_extraction_translator = translator
        if (
            args.openai_term_extraction_model
            or args.openai_term_extraction_base_url
            or args.openai_term_extraction_api_key
            or args.openai_term_extraction_reasoning is not None
        ):
            term_translator_kwargs: dict[str, Any] = {}
            if args.openai_term_extraction_reasoning is not None:
                term_translator_kwargs["reasoning"] = (
                    args.openai_term_extraction_reasoning
                )
            term_extraction_translator = OpenAITranslator(
                lang_in=args.lang_in,
                lang_out=args.lang_out,
                model=args.openai_term_extraction_model or args.openai_model,
                base_url=(args.openai_term_extraction_base_url or args.openai_base_url),
                api_key=args.openai_term_extraction_api_key or args.openai_api_key,
                ignore_cache=args.ignore_cache,
                enable_json_mode_if_requested=args.enable_json_mode_if_requested,
                send_dashscope_header=args.send_dashscope_header,
                send_temperature=not args.no_send_temperature,
                **term_translator_kwargs,
            )
    else:
        raise ValueError("Invalid translator type")

    # Set the translation rate limit
    set_translate_rate_limiter(args.qps)
    # Initialise the document layout model: the first RPC option that is set
    # wins; without any, the local ONNX model is loaded.
    if args.rpc_doclayout:
        from babeldoc.docvision.rpc_doclayout import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout)
    elif args.rpc_doclayout2:
        from babeldoc.docvision.rpc_doclayout2 import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout2)
    elif args.rpc_doclayout3:
        from babeldoc.docvision.rpc_doclayout3 import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout3)
    elif args.rpc_doclayout4:
        from babeldoc.docvision.rpc_doclayout4 import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout4)
    elif args.rpc_doclayout5:
        from babeldoc.docvision.rpc_doclayout5 import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout5)
    elif args.rpc_doclayout6:
        from babeldoc.docvision.rpc_doclayout6 import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout6)
    elif args.rpc_doclayout7:
        from babeldoc.docvision.rpc_doclayout7 import RpcDocLayoutModel

        doc_layout_model = RpcDocLayoutModel(host=args.rpc_doclayout7)
    else:
        from babeldoc.docvision.doclayout import DocLayoutModel

        doc_layout_model = DocLayoutModel.load_onnx()

    # The table model is only needed when table text is translated.
    if args.translate_table_text:
        from babeldoc.docvision.table_detection.rapidocr import RapidOCRModel

        table_model = RapidOCRModel()
    else:
        table_model = None

    # Load glossaries. A missing, invalid or unreadable glossary file is
    # logged and skipped; it does not abort the run.
    loaded_glossaries: list[Glossary] = []
    if args.glossary_files:
        paths_str = args.glossary_files.split(",")
        for p_str in paths_str:
            file_path = Path(p_str.strip())
            if not file_path.exists():
                logger.error(f"Glossary file not found: {file_path}")
                continue
            if not file_path.is_file():
                logger.error(f"Glossary path is not a file: {file_path}")
                continue
            try:
                glossary_obj = Glossary.from_csv(file_path, args.lang_out)
                if glossary_obj.entries:
                    loaded_glossaries.append(glossary_obj)
                    logger.info(
                        f"Loaded glossary '{glossary_obj.name}' with {len(glossary_obj.entries)} entries."
                    )
                else:
                    logger.info(
                        f"Glossary '{file_path.stem}' loaded with no applicable entries for lang_out '{args.lang_out}'."
                    )
            except Exception as e:
                logger.error(f"Failed to load glossary from {file_path}: {e}")

    pending_files = []
    for file in args.files:
        # Clean the file path: drop a leftover "--files=" prefix, any leading
        # dashes, and the quotes around the path.
        if file.startswith("--files="):
            file = file[len("--files=") :]
        file = file.lstrip("-").strip("\"'")
        if not Path(file).exists():
            logger.error(f"文件不存在：{file}")
            sys.exit(1)
        if not file.lower().endswith(".pdf"):
            logger.error(f"文件不是 PDF 文件：{file}")
            sys.exit(1)
        pending_files.append(file)

    if args.output:
        if not Path(args.output).exists():
            logger.info(f"输出目录不存在，创建：{args.output}")
            try:
                Path(args.output).mkdir(parents=True, exist_ok=True)
            except OSError:
                logger.critical(
                    f"Failed to create output folder at {args.output}",
                    exc_info=True,
                )
                sys.exit(1)
    else:
        # Normalise an empty value to None.
        args.output = None

    if args.working_dir:
        working_dir = Path(args.working_dir)
        if not working_dir.exists():
            logger.info(f"工作目录不存在，创建：{working_dir}")
            try:
                working_dir.mkdir(parents=True, exist_ok=True)
            except OSError:
                logger.critical(
                    f"Failed to create working directory at {working_dir}",
                    exc_info=True,
                )
                sys.exit(1)
    else:
        working_dir = None

    # The deprecated --no-watermark flag takes precedence over
    # --watermark-output-mode.
    watermark_output_mode = WatermarkOutputMode.Watermarked
    if args.no_watermark:
        watermark_output_mode = WatermarkOutputMode.NoWatermark
    elif args.watermark_output_mode == "both":
        watermark_output_mode = WatermarkOutputMode.Both
    elif args.watermark_output_mode == "watermarked":
        watermark_output_mode = WatermarkOutputMode.Watermarked
    elif args.watermark_output_mode == "no_watermark":
        watermark_output_mode = WatermarkOutputMode.NoWatermark

    split_strategy = None
    if args.max_pages_per_part:
        split_strategy = TranslationConfig.create_max_pages_per_part_split_strategy(
            args.max_pages_per_part
        )

    # Term-extraction token usage, accumulated over all files.
    total_term_extraction_total_tokens = 0
    total_term_extraction_prompt_tokens = 0
    total_term_extraction_completion_tokens = 0
    total_term_extraction_cache_hit_prompt_tokens = 0

    def nop(_x):
        """Fallback used when the layout model has no ``init_font_mapper``."""

    for file in pending_files:
        # Clean the file path: strip surrounding quotes
        file = file.strip("\"'")
        # Build the configuration object
        config = TranslationConfig(
            input_file=file,
            font=None,
            pages=args.pages,
            output_dir=args.output,
            translator=translator,
            term_extraction_translator=term_extraction_translator,
            debug=args.debug,
            lang_in=args.lang_in,
            lang_out=args.lang_out,
            no_dual=args.no_dual,
            no_mono=args.no_mono,
            qps=args.qps,
            formular_font_pattern=args.formular_font_pattern,
            formular_char_pattern=args.formular_char_pattern,
            split_short_lines=args.split_short_lines,
            short_line_split_factor=args.short_line_split_factor,
            doc_layout_model=doc_layout_model,
            skip_clean=args.skip_clean,
            dual_translate_first=args.dual_translate_first,
            disable_rich_text_translate=args.disable_rich_text_translate,
            enhance_compatibility=args.enhance_compatibility,
            use_alternating_pages_dual=args.use_alternating_pages_dual,
            report_interval=args.report_interval,
            min_text_length=args.min_text_length,
            watermark_output_mode=watermark_output_mode,
            split_strategy=split_strategy,
            table_model=table_model,
            show_char_box=args.show_char_box,
            skip_scanned_detection=args.skip_scanned_detection,
            ocr_workaround=args.ocr_workaround,
            custom_system_prompt=args.custom_system_prompt,
            working_dir=working_dir,
            add_formula_placehold_hint=args.add_formula_placehold_hint,
            disable_same_text_fallback=args.disable_same_text_fallback,
            glossaries=loaded_glossaries,
            pool_max_workers=args.pool_max_workers,
            auto_extract_glossary=args.auto_extract_glossary,
            auto_enable_ocr_workaround=args.auto_enable_ocr_workaround,
            primary_font_family=args.primary_font_family,
            only_include_translated_page=args.only_include_translated_page,
            save_auto_extracted_glossary=args.save_auto_extracted_glossary,
            enable_graphic_element_process=not args.disable_graphic_element_process,
            merge_alternating_line_numbers=args.merge_alternating_line_numbers,
            skip_translation=args.skip_translation,
            skip_form_render=args.skip_form_render,
            skip_curve_render=args.skip_curve_render,
            only_parse_generate_pdf=args.only_parse_generate_pdf,
            remove_non_formula_lines=args.remove_non_formula_lines,
            non_formula_line_iou_threshold=args.non_formula_line_iou_threshold,
            figure_table_protection_threshold=args.figure_table_protection_threshold,
            skip_formula_offset_calculation=args.skip_formula_offset_calculation,
            metadata_extra_data=args.metadata_extra_data,
            term_pool_max_workers=args.term_pool_max_workers,
        )

        # Call init_font_mapper only on layout models that provide it.
        getattr(doc_layout_model, "init_font_mapper", nop)(config)
        # Create progress handler
        progress_context, progress_handler = create_progress_handler(
            config, show_log=False
        )

        # Start translating; stop consuming events at the first error or at
        # the finish event.
        with progress_context:
            async for event in babeldoc.format.pdf.high_level.async_translate(config):
                progress_handler(event)
                if config.debug:
                    logger.debug(event)
                if event["type"] == "error":
                    logger.error(f"Error: {event['error']}")
                    break
                if event["type"] == "finish":
                    result = event["translate_result"]
                    logger.info(str(result))
                    break
        usage = config.term_extraction_token_usage
        total_term_extraction_total_tokens += usage["total_tokens"]
        total_term_extraction_prompt_tokens += usage["prompt_tokens"]
        total_term_extraction_completion_tokens += usage["completion_tokens"]
        total_term_extraction_cache_hit_prompt_tokens += usage[
            "cache_hit_prompt_tokens"
        ]
    logger.info(f"Total tokens: {translator.token_count.value}")
    logger.info(f"Prompt tokens: {translator.prompt_token_count.value}")
    logger.info(f"Completion tokens: {translator.completion_token_count.value}")
    logger.info(
        f"Cache hit prompt tokens: {translator.cache_hit_prompt_token_count.value}"
    )
    logger.info(
        "Term extraction tokens: total=%s prompt=%s completion=%s cache_hit_prompt=%s",
        total_term_extraction_total_tokens,
        total_term_extraction_prompt_tokens,
        total_term_extraction_completion_tokens,
        total_term_extraction_cache_hit_prompt_tokens,
    )
    # The dedicated translator keeps its own counters; report them as well.
    if term_extraction_translator is not translator:
        logger.info(
            "Term extraction translator raw tokens: total=%s prompt=%s completion=%s cache_hit_prompt=%s",
            term_extraction_translator.token_count.value,
            term_extraction_translator.prompt_token_count.value,
            term_extraction_translator.completion_token_count.value,
            term_extraction_translator.cache_hit_prompt_token_count.value,
        )


def create_progress_handler(
    translation_config: TranslationConfig, show_log: bool = False
):
    """Build a progress display and the callback that feeds it translation events.

    Args:
        translation_config: The translation configuration. Its
            ``use_rich_pbar`` flag selects a rich ``Progress`` display;
            otherwise a single tqdm bar is used.
        show_log: Rich display only. When true, roughly one event in ten is
            additionally written to the module logger.

    Returns:
        A tuple of (progress_context, progress_handler). ``progress_context``
        is the display object, meant to wrap the translation run as a context
        manager; ``progress_handler`` accepts one progress event dict and
        updates the display accordingly.
    """
    if not translation_config.use_rich_pbar:
        bar = tqdm.tqdm(total=100, desc="translate")

        def progress_handler(event):
            kind = event["type"]
            if kind == "progress_update":
                # tqdm advances by increments, so convert the absolute
                # overall progress into a delta from the current position.
                bar.update(event["overall_progress"] - bar.n)
                bar.set_description(
                    f"{event['stage']} ({event['stage_current']}/{event['stage_total']})",
                )
            elif kind == "progress_end":
                bar.set_description(f"{event['stage']} (Complete)")
                bar.refresh()

        return bar, progress_handler

    rich_progress = Progress(
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
    )
    overall_task = rich_progress.add_task("translate", total=100)
    # Maps a stage name to the rich task created for it.
    tasks_by_stage = {}

    def stage_label(event):
        # Shared description text for a stage row.
        return f"{event['stage']} ({event['part_index']}/{event['total_parts']})"

    def sync_overall(event):
        # Mirror the overall progress onto the top-level "translate" task.
        rich_progress.update(
            overall_task,
            completed=event["overall_progress"],
            refresh=True,
        )

    def progress_handler(event):
        if show_log and random.random() <= 0.1:  # noqa: S311
            logger.info(event)

        kind = event["type"]
        if kind == "progress_start":
            stage = event["stage"]
            # A stage row is created once and then reused.
            if stage not in tasks_by_stage:
                tasks_by_stage[stage] = rich_progress.add_task(
                    stage_label(event),
                    total=event.get("stage_total", 100),
                )
        elif kind == "progress_update":
            stage = event["stage"]
            if stage in tasks_by_stage:
                rich_progress.update(
                    tasks_by_stage[stage],
                    completed=event["stage_current"],
                    total=event["stage_total"],
                    description=stage_label(event),
                    refresh=True,
                )
            sync_overall(event)
        elif kind == "progress_end":
            stage = event["stage"]
            if stage in tasks_by_stage:
                # Mark the stage row as fully complete.
                final_total = event["stage_total"]
                rich_progress.update(
                    tasks_by_stage[stage],
                    completed=final_total,
                    total=final_total,
                    description=stage_label(event),
                    refresh=True,
                )
                sync_overall(event)
            rich_progress.refresh()

    return rich_progress, progress_handler


# for backward compatibility
def create_cache_folder():
    """Forward to ``babeldoc.format.pdf.high_level.create_cache_folder``.

    Thin alias kept so that older callers importing the function from this
    module keep working; the delegate's return value is passed through
    unchanged.
    """
    return babeldoc.format.pdf.high_level.create_cache_folder()


# for backward compatibility
def download_font_assets():
    """Forward to ``babeldoc.format.pdf.high_level.download_font_assets``.

    Thin alias kept so that older callers importing the function from this
    module keep working; the delegate's return value is passed through
    unchanged.
    """
    return babeldoc.format.pdf.high_level.download_font_assets()


class EvictQueue(queue.Queue):
    """Bounded queue whose ``put`` never blocks: when full, the oldest item
    is dropped to make room for the new one.

    ``discarded`` counts how many items have been dropped this way. The
    counter is updated without any extra locking.
    """

    def __init__(self, maxsize):
        self.discarded = 0
        super().__init__(maxsize)

    def put(self, item, block=False, timeout=None):
        """Enqueue *item*, evicting the oldest entries while the queue is full.

        ``block`` and ``timeout`` are accepted for signature compatibility
        with ``queue.Queue.put`` but are ignored: the call never blocks.
        """
        while True:
            try:
                super().put(item, block=False)
                return
            except queue.Full:
                try:
                    self.get_nowait()
                except queue.Empty:
                    # A consumer drained the queue in the meantime; retry.
                    continue
                # The evicted item will never be processed, so mark it done
                # here. Otherwise ``unfinished_tasks`` grows forever and
                # ``join()`` can never return.
                self.task_done()
                self.discarded += 1


def speed_up_logs():
    """Route root-logger output through an in-memory queue.

    The handlers currently attached to the root logger are handed to a
    ``QueueListener`` (started here), and the root logger is left with a
    single ``QueueHandler`` feeding an ``EvictQueue`` of size 1000.
    """
    import logging.handlers

    root = logging.getLogger()
    # Snapshot the existing handlers before they are replaced below.
    downstream_handlers = tuple(root.handlers)

    record_buffer = EvictQueue(1000)
    enqueue_handler = logging.handlers.QueueHandler(record_buffer)
    listener = logging.handlers.QueueListener(record_buffer, *downstream_handlers)
    listener.start()

    root.handlers = [enqueue_handler]


def cli():
    """Command line interface entry point."""
    from rich.logging import RichHandler

    logging.basicConfig(level=logging.INFO, handlers=[RichHandler()])

    # Raise the threshold of the HTTP / OpenAI client loggers and detach them
    # from the root logger.
    for noisy_name in ("httpx", "openai", "httpcore", "http11"):
        noisy_logger = logging.getLogger(noisy_name)
        noisy_logger.setLevel("CRITICAL")
        noisy_logger.propagate = False

    # Disable every already-registered logger that belongs to one of the
    # noisy libraries. Entries without a ``name`` attribute are skipped.
    muted_prefixes = ("pdfminer", "peewee", "httpx")
    muted_fragments = ("http11", "openai", "pdfminer")
    for entry in logging.Logger.manager.loggerDict.values():
        entry_name = getattr(entry, "name", None)
        if entry_name is None:
            continue
        if entry_name.startswith(muted_prefixes) or any(
            fragment in entry_name for fragment in muted_fragments
        ):
            entry.disabled = True
            entry.propagate = False

    speed_up_logs()
    babeldoc.format.pdf.high_level.init()
    asyncio.run(main())


if __name__ == "__main__":
    # macOS and Windows use "spawn"; every other platform uses "forkserver".
    start_method = "spawn" if sys.platform in ("darwin", "win32") else "forkserver"
    mp.set_start_method(start_method)
    cli()


================================================
FILE: babeldoc/pdfminer/LICENSE
================================================
Copyright (c) 2004-2016  Yusuke Shinyama <yusuke at shinyama dot jp>

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

================================================
FILE: babeldoc/pdfminer/__init__.py
================================================
from importlib.metadata import PackageNotFoundError
from importlib.metadata import version

# Default used when the "pdfminer.six" distribution is not installed.
__version__ = "0.0"
try:
    __version__ = version("pdfminer.six")
except PackageNotFoundError:
    # Keep the default assigned above.
    pass

if __name__ == "__main__":
    print(__version__)


================================================
FILE: babeldoc/pdfminer/_saslprep.py
================================================
# Copyright 2016-present MongoDB, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some changes copyright 2021-present Matthias Valvekens,
# licensed under the license of the pyHanko project (see LICENSE file).


"""An implementation of RFC4013 SASLprep."""

__all__ = ["saslprep"]

import stringprep
import unicodedata
from collections.abc import Callable

from babeldoc.pdfminer.pdfexceptions import PDFValueError

# RFC4013 section 2.3 prohibited output.
# Each entry is a predicate over a single character; saslprep() rejects the
# string as soon as any predicate matches any character.
_PROHIBITED: tuple[Callable[[str], bool], ...] = (
    # A strict reading of RFC 4013 requires table c12 here, but
    # characters from it are mapped to SPACE in the Map step. Can
    # normalization reintroduce them somehow?
    stringprep.in_table_c12,
    stringprep.in_table_c21_c22,
    stringprep.in_table_c3,
    stringprep.in_table_c4,
    stringprep.in_table_c5,
    stringprep.in_table_c6,
    stringprep.in_table_c7,
    stringprep.in_table_c8,
    stringprep.in_table_c9,
)


def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
    """An implementation of RFC4013 SASLprep.
    :param data:
        The string to SASLprep.
    :param prohibit_unassigned_code_points:
        RFC 3454 and RFCs for various SASL mechanisms distinguish between
        `queries` (unassigned code points allowed) and
        `stored strings` (unassigned code points prohibited). Defaults
        to ``True`` (unassigned code points are prohibited).
    :return: The SASLprep'ed version of `data`. An input that is empty, or
        that becomes empty after the mapping step, yields ``""``.
    :raises PDFValueError:
        If the string fails the bidirectional check or contains a
        prohibited character.
    """
    if prohibit_unassigned_code_points:
        prohibited = _PROHIBITED + (stringprep.in_table_a1,)
    else:
        prohibited = _PROHIBITED

    # RFC3454 section 2, step 1 - Map
    # RFC4013 section 2.1 mappings
    # Map Non-ASCII space characters to SPACE (U+0020). Map
    # commonly mapped to nothing characters to, well, nothing.
    in_table_c12 = stringprep.in_table_c12
    in_table_b1 = stringprep.in_table_b1
    data = "".join(
        [
            "\u0020" if in_table_c12(elt) else elt
            for elt in data
            if not in_table_b1(elt)
        ],
    )

    # RFC3454 section 2, step 2 - Normalize
    # RFC4013 section 2.2 normalization
    data = unicodedata.ucd_3_2_0.normalize("NFKC", data)

    # Nothing is left to check in an empty string. Return before the
    # bidirectional check below indexes data[0] / data[-1], which would
    # otherwise raise IndexError.
    if not data:
        return data

    in_table_d1 = stringprep.in_table_d1
    if in_table_d1(data[0]):
        if not in_table_d1(data[-1]):
            # RFC3454, Section 6, #3. If a string contains any
            # RandALCat character, the first and last characters
            # MUST be RandALCat characters.
            raise PDFValueError("SASLprep: failed bidirectional check")
        # RFC3454, Section 6, #2. If a string contains any RandALCat
        # character, it MUST NOT contain any LCat character.
        prohibited = prohibited + (stringprep.in_table_d2,)
    else:
        # RFC3454, Section 6, #3. Following the logic of #3, if
        # the first character is not a RandALCat, no other character
        # can be either.
        prohibited = prohibited + (in_table_d1,)

    # RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
    for char in data:
        if any(in_table(char) for in_table in prohibited):
            raise PDFValueError("SASLprep: failed prohibited character check")

    return data


================================================
FILE: babeldoc/pdfminer/arcfour.py
================================================
"""Python implementation of Arcfour encryption algorithm.
See https://en.wikipedia.org/wiki/RC4
This code is in the public domain.

"""

from collections.abc import Sequence


class Arcfour:
    """Stateful Arcfour (RC4) stream cipher.

    The permutation table and the two stream indices are kept on the
    instance, so successive ``process`` calls continue the same keystream.
    Encryption and decryption are the same operation.
    """

    def __init__(self, key: Sequence[int]) -> None:
        """Initialise the permutation table from *key* (a sequence of byte
        values, e.g. ``bytes``)."""
        # Start from the identity permutation as a mutable list.
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + key[i % klen]) % 256
            s[i], s[j] = s[j], s[i]
        self.s = s
        self.i = 0
        self.j = 0

    def process(self, data: bytes) -> bytes:
        """XOR *data* with the next ``len(data)`` keystream bytes.

        :param data: Bytes to encrypt or decrypt.
        :return: The transformed bytes, same length as *data*.
        """
        i, j = self.i, self.j
        s = self.s
        # Collect output in a bytearray: appending is amortised O(1), whereas
        # repeated ``bytes`` concatenation copies the whole result each time.
        out = bytearray()
        for c in data:
            i = (i + 1) % 256
            j = (j + s[i]) % 256
            s[i], s[j] = s[j], s[i]
            out.append(c ^ s[(s[i] + s[j]) % 256])
        self.i, self.j = i, j
        return bytes(out)

    encrypt = decrypt = process


================================================
FILE: babeldoc/pdfminer/ascii85.py
================================================
"""Python implementation of ASCII85/ASCIIHex decoder (Adobe version)."""

import re
from base64 import a85decode
from binascii import unhexlify

start_re = re.compile(rb"^\s*<?\s*~\s*")
end_re = re.compile(rb"\s*~\s*>?\s*$")


def ascii85decode(data: bytes) -> bytes:
    """Decode ASCII85 data in the Adobe flavour.

    ASCII85 represents every four bytes as five ASCII characters drawn
    from an 85-character alphabet (possible because 256**4 < 85**5), with
    a special rounding rule when the payload length is not a multiple of 4.

    Adobe's encoder terminates the data with `b"~>"` and, although the PDF
    spec does not say so, may also start it with `b"<~"`. Neither can be
    relied upon: stream lengths can be off by one so that only `~` remains
    at the end, and `<` / `>` are themselves valid ASCII85 digits so they
    cannot simply be stripped. As a compromise, a leading `<~` or `~` and a
    trailing `~` or `~>` are removed before decoding.
    """
    return a85decode(end_re.sub(b"", start_re.sub(b"", data)))


bws_re = re.compile(rb"\s")


def asciihexdecode(data: bytes) -> bytes:
    """ASCIIHexDecode filter (PDFReference v1.4 section 3.3.1).

    Every pair of ASCII hexadecimal digits (0-9, A-F or a-f) yields one
    byte of output. White-space is ignored, and a right angle bracket (>)
    marks end-of-data: anything after it is discarded. When EOD follows an
    odd number of digits, the filter acts as if a trailing 0 were present.
    Any other character is an error.
    """
    compact = bws_re.sub(b"", data)
    digits, eod, _ignored = compact.partition(b">")
    # Pad only when an EOD marker was actually seen after an odd digit count.
    if eod and len(digits) % 2 == 1:
        digits += b"0"
    return unhexlify(digits)


================================================
FILE: babeldoc/pdfminer/casting.py
================================================
import itertools
from typing import Any

from babeldoc.pdfminer.utils import Matrix
from babeldoc.pdfminer.utils import Rect

# Fixed-length float tuples used as return types by the private
# _safe_float_triple / _safe_float_quadruple helpers in this module.
_FloatTriple = tuple[float, float, float]
_FloatQuadruple = tuple[float, float, float, float]


def safe_int(o: Any) -> int | None:
    try:
        return int(o)
    except (TypeError, ValueError):
        return None


def safe_float(o: Any) -> float | None:
    try:
        return float(o)
    except (TypeError, ValueError):
        return None


def safe_matrix(a: Any, b: Any, c: Any, d: Any, e: Any, f: Any) -> Matrix | None:
    """Build a six-float matrix tuple from the arguments.

    Every argument is converted with ``safe_float`` first; if any of the
    conversions yields ``None`` the whole result is ``None``.
    """
    m0, m1, m2, m3, m4, m5 = (safe_float(item) for item in (a, b, c, d, e, f))

    if (
        m0 is not None
        and m1 is not None
        and m2 is not None
        and m3 is not None
        and m4 is not None
        and m5 is not None
    ):
        return m0, m1, m2, m3, m4, m5

    return None


def safe_rgb(r: Any, g: Any, b: Any) -> tuple[float, float, float] | None:
    """Return ``(r, g, b)`` as floats, or ``None`` if any component cannot be
    converted (delegates to ``_safe_float_triple``)."""
    return _safe_float_triple(r, g, b)


def safe_cmyk(
    c: Any, m: Any, y: Any, k: Any
) -> tuple[float, float, float, float] | None:
    """Return ``(c, m, y, k)`` as floats, or ``None`` if any component cannot
    be converted (delegates to ``_safe_float_quadruple``)."""
    return _safe_float_quadruple(c, m, y, k)


def safe_rect_list(value: Any) -> Rect | None:
    """Build a rect from the first four items of the iterable *value*.

    Returns ``None`` when *value* is not iterable, provides fewer than four
    items, or any of the four items cannot be converted by ``safe_rect``.
    Items beyond the fourth are never consumed.
    """
    try:
        leading = tuple(itertools.islice(value, 4))
    except TypeError:
        return None

    return safe_rect(*leading) if len(leading) == 4 else None


def safe_rect(a: Any, b: Any, c: Any, d: Any) -> Rect | None:
    """Return ``(a, b, c, d)`` as floats, or ``None`` if any value cannot be
    converted (delegates to ``_safe_float_quadruple``)."""
    return _safe_float_quadruple(a, b, c, d)


def _safe_float_triple(a: Any, b: Any, c: Any) -> _FloatTriple | None:
    """Convert three values with ``safe_float``; ``None`` if any one fails."""
    first, second, third = (safe_float(item) for item in (a, b, c))

    if first is not None and second is not None and third is not None:
        return first, second, third

    return None


def _safe_float_quadruple(a: Any, b: Any, c: Any, d: Any) -> _FloatQuadruple | None:
    """Convert four values with ``safe_float``; ``None`` if any one fails."""
    first, second, third, fourth = (safe_float(item) for item in (a, b, c, d))

    if (
        first is not None
        and second is not None
        and third is not None
        and fourth is not None
    ):
        return first, second, third, fourth

    return None


================================================
FILE: babeldoc/pdfminer/ccitt.py
================================================
# CCITT Fax decoder
#
# Bugs: uncompressed mode untested.
#
# cf.
#  ITU-T Recommendation T.4
#    "Standardization of Group 3 facsimile terminals
#    for document transmission"
#  ITU-T Recommendation T.6
#    "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS
#    FOR GROUP 4 FACSIMILE APPARATUS"


import array
from collections.abc import Callable
from collections.abc import Iterator
from collections.abc import MutableSequence
from collections.abc import Sequence
from typing import Any
from typing import cast

from babeldoc.pdfminer.pdfexceptions import PDFException
from babeldoc.pdfminer.pdfexceptions import PDFValueError


def get_bytes(data: bytes) -> Iterator[int]:
    """Yield the items of *data* one at a time (integers for ``bytes``)."""
    for value in data:
        yield value


# Workaround https://github.com/python/mypy/issues/731
BitParserState = MutableSequence[Any]
# A better definition (not supported by mypy) would be:
# BitParserState = MutableSequence[Union["BitParserState", int, str, None]]


class BitParser:
    _state: BitParserState

    # _accept is declared Optional solely as a workaround for
    # https://github.com/python/mypy/issues/708
    _accept: Callable[[Any], BitParserState] | None

    def __init__(self) -> None:
        self._pos = 0

    @classmethod
    def add(cls, root: BitParserState, v: int | str, bits: str) -> None:
        p: BitParserState = root
        b = None
        for i in range(len(bits)):
            if i > 0:
                assert b is not None
                if p[b] is None:
                    p[b] = [None, None]
                p = p[b]
            if bits[i] == "1":
                b = 1
            else:
                b = 0
        assert b is not None
        p[b] = v

    def feedbytes(self, data: bytes) -> None:
        for byte in get_bytes(data):
            for m in (128, 64, 32, 16, 8, 4, 2, 1):
                self._parse_bit(byte & m)

    def _parse_bit(self, x: object) -> None:
        if x:
            v = self._state[1]
        else:
            v = self._state[0]
        self._pos += 1
        if isinstance(v, list):
            self._state = v
        else:
            assert self._accept is not None
            self._state = self._accept(v)


class CCITTG4Parser(BitParser):
    """Bit-level CCITT G4 parser that rebuilds an image one line at a time.

    Input bits are matched against the code trees below (built with
    ``BitParser.add``). Each completed line is handed to ``output_line``
    together with its line index, after which it becomes the reference line
    for the next one. Pixels are stored as 0/1 values; a color of 1 selects
    the WHITE run-length table and 0 selects the BLACK table.
    """

    # Mode codes, dispatched in _parse_mode: integer values are offsets passed
    # to _do_vertical; "h" starts a horizontal run pair, "p" is pass mode,
    # "u" enters uncompressed mode and "e" raises EOFB. The "x1".."x7" values
    # have no handler in _parse_mode and end up raising InvalidData.
    MODE = [None, None]
    BitParser.add(MODE, 0, "1")
    BitParser.add(MODE, +1, "011")
    BitParser.add(MODE, -1, "010")
    BitParser.add(MODE, "h", "001")
    BitParser.add(MODE, "p", "0001")
    BitParser.add(MODE, +2, "000011")
    BitParser.add(MODE, -2, "000010")
    BitParser.add(MODE, +3, "0000011")
    BitParser.add(MODE, -3, "0000010")
    BitParser.add(MODE, "u", "0000001111")
    BitParser.add(MODE, "x1", "0000001000")
    BitParser.add(MODE, "x2", "0000001001")
    BitParser.add(MODE, "x3", "0000001010")
    BitParser.add(MODE, "x4", "0000001011")
    BitParser.add(MODE, "x5", "0000001100")
    BitParser.add(MODE, "x6", "0000001101")
    BitParser.add(MODE, "x7", "0000001110")
    BitParser.add(MODE, "e", "000000000001000000000001")

    # Run-length codes read while the current color is 1. A value below 64
    # ends the run; larger values are added to the value(s) that follow
    # (see _parse_horiz1 / _parse_horiz2).
    WHITE = [None, None]
    BitParser.add(WHITE, 0, "00110101")
    BitParser.add(WHITE, 1, "000111")
    BitParser.add(WHITE, 2, "0111")
    BitParser.add(WHITE, 3, "1000")
    BitParser.add(WHITE, 4, "1011")
    BitParser.add(WHITE, 5, "1100")
    BitParser.add(WHITE, 6, "1110")
    BitParser.add(WHITE, 7, "1111")
    BitParser.add(WHITE, 8, "10011")
    BitParser.add(WHITE, 9, "10100")
    BitParser.add(WHITE, 10, "00111")
    BitParser.add(WHITE, 11, "01000")
    BitParser.add(WHITE, 12, "001000")
    BitParser.add(WHITE, 13, "000011")
    BitParser.add(WHITE, 14, "110100")
    BitParser.add(WHITE, 15, "110101")
    BitParser.add(WHITE, 16, "101010")
    BitParser.add(WHITE, 17, "101011")
    BitParser.add(WHITE, 18, "0100111")
    BitParser.add(WHITE, 19, "0001100")
    BitParser.add(WHITE, 20, "0001000")
    BitParser.add(WHITE, 21, "0010111")
    BitParser.add(WHITE, 22, "0000011")
    BitParser.add(WHITE, 23, "0000100")
    BitParser.add(WHITE, 24, "0101000")
    BitParser.add(WHITE, 25, "0101011")
    BitParser.add(WHITE, 26, "0010011")
    BitParser.add(WHITE, 27, "0100100")
    BitParser.add(WHITE, 28, "0011000")
    BitParser.add(WHITE, 29, "00000010")
    BitParser.add(WHITE, 30, "00000011")
    BitParser.add(WHITE, 31, "00011010")
    BitParser.add(WHITE, 32, "00011011")
    BitParser.add(WHITE, 33, "00010010")
    BitParser.add(WHITE, 34, "00010011")
    BitParser.add(WHITE, 35, "00010100")
    BitParser.add(WHITE, 36, "00010101")
    BitParser.add(WHITE, 37, "00010110")
    BitParser.add(WHITE, 38, "00010111")
    BitParser.add(WHITE, 39, "00101000")
    BitParser.add(WHITE, 40, "00101001")
    BitParser.add(WHITE, 41, "00101010")
    BitParser.add(WHITE, 42, "00101011")
    BitParser.add(WHITE, 43, "00101100")
    BitParser.add(WHITE, 44, "00101101")
    BitParser.add(WHITE, 45, "00000100")
    BitParser.add(WHITE, 46, "00000101")
    BitParser.add(WHITE, 47, "00001010")
    BitParser.add(WHITE, 48, "00001011")
    BitParser.add(WHITE, 49, "01010010")
    BitParser.add(WHITE, 50, "01010011")
    BitParser.add(WHITE, 51, "01010100")
    BitParser.add(WHITE, 52, "01010101")
    BitParser.add(WHITE, 53, "00100100")
    BitParser.add(WHITE, 54, "00100101")
    BitParser.add(WHITE, 55, "01011000")
    BitParser.add(WHITE, 56, "01011001")
    BitParser.add(WHITE, 57, "01011010")
    BitParser.add(WHITE, 58, "01011011")
    BitParser.add(WHITE, 59, "01001010")
    BitParser.add(WHITE, 60, "01001011")
    BitParser.add(WHITE, 61, "00110010")
    BitParser.add(WHITE, 62, "00110011")
    BitParser.add(WHITE, 63, "00110100")
    BitParser.add(WHITE, 64, "11011")
    BitParser.add(WHITE, 128, "10010")
    BitParser.add(WHITE, 192, "010111")
    BitParser.add(WHITE, 256, "0110111")
    BitParser.add(WHITE, 320, "00110110")
    BitParser.add(WHITE, 384, "00110111")
    BitParser.add(WHITE, 448, "01100100")
    BitParser.add(WHITE, 512, "01100101")
    BitParser.add(WHITE, 576, "01101000")
    BitParser.add(WHITE, 640, "01100111")
    BitParser.add(WHITE, 704, "011001100")
    BitParser.add(WHITE, 768, "011001101")
    BitParser.add(WHITE, 832, "011010010")
    BitParser.add(WHITE, 896, "011010011")
    BitParser.add(WHITE, 960, "011010100")
    BitParser.add(WHITE, 1024, "011010101")
    BitParser.add(WHITE, 1088, "011010110")
    BitParser.add(WHITE, 1152, "011010111")
    BitParser.add(WHITE, 1216, "011011000")
    BitParser.add(WHITE, 1280, "011011001")
    BitParser.add(WHITE, 1344, "011011010")
    BitParser.add(WHITE, 1408, "011011011")
    BitParser.add(WHITE, 1472, "010011000")
    BitParser.add(WHITE, 1536, "010011001")
    BitParser.add(WHITE, 1600, "010011010")
    BitParser.add(WHITE, 1664, "011000")
    BitParser.add(WHITE, 1728, "010011011")
    BitParser.add(WHITE, 1792, "00000001000")
    BitParser.add(WHITE, 1856, "00000001100")
    BitParser.add(WHITE, 1920, "00000001101")
    BitParser.add(WHITE, 1984, "000000010010")
    BitParser.add(WHITE, 2048, "000000010011")
    BitParser.add(WHITE, 2112, "000000010100")
    BitParser.add(WHITE, 2176, "000000010101")
    BitParser.add(WHITE, 2240, "000000010110")
    BitParser.add(WHITE, 2304, "000000010111")
    BitParser.add(WHITE, 2368, "000000011100")
    BitParser.add(WHITE, 2432, "000000011101")
    BitParser.add(WHITE, 2496, "000000011110")
    BitParser.add(WHITE, 2560, "000000011111")

    # Run-length codes read while the current color is 0; same conventions
    # as the WHITE table.
    BLACK = [None, None]
    BitParser.add(BLACK, 0, "0000110111")
    BitParser.add(BLACK, 1, "010")
    BitParser.add(BLACK, 2, "11")
    BitParser.add(BLACK, 3, "10")
    BitParser.add(BLACK, 4, "011")
    BitParser.add(BLACK, 5, "0011")
    BitParser.add(BLACK, 6, "0010")
    BitParser.add(BLACK, 7, "00011")
    BitParser.add(BLACK, 8, "000101")
    BitParser.add(BLACK, 9, "000100")
    BitParser.add(BLACK, 10, "0000100")
    BitParser.add(BLACK, 11, "0000101")
    BitParser.add(BLACK, 12, "0000111")
    BitParser.add(BLACK, 13, "00000100")
    BitParser.add(BLACK, 14, "00000111")
    BitParser.add(BLACK, 15, "000011000")
    BitParser.add(BLACK, 16, "0000010111")
    BitParser.add(BLACK, 17, "0000011000")
    BitParser.add(BLACK, 18, "0000001000")
    BitParser.add(BLACK, 19, "00001100111")
    BitParser.add(BLACK, 20, "00001101000")
    BitParser.add(BLACK, 21, "00001101100")
    BitParser.add(BLACK, 22, "00000110111")
    BitParser.add(BLACK, 23, "00000101000")
    BitParser.add(BLACK, 24, "00000010111")
    BitParser.add(BLACK, 25, "00000011000")
    BitParser.add(BLACK, 26, "000011001010")
    BitParser.add(BLACK, 27, "000011001011")
    BitParser.add(BLACK, 28, "000011001100")
    BitParser.add(BLACK, 29, "000011001101")
    BitParser.add(BLACK, 30, "000001101000")
    BitParser.add(BLACK, 31, "000001101001")
    BitParser.add(BLACK, 32, "000001101010")
    BitParser.add(BLACK, 33, "000001101011")
    BitParser.add(BLACK, 34, "000011010010")
    BitParser.add(BLACK, 35, "000011010011")
    BitParser.add(BLACK, 36, "000011010100")
    BitParser.add(BLACK, 37, "000011010101")
    BitParser.add(BLACK, 38, "000011010110")
    BitParser.add(BLACK, 39, "000011010111")
    BitParser.add(BLACK, 40, "000001101100")
    BitParser.add(BLACK, 41, "000001101101")
    BitParser.add(BLACK, 42, "000011011010")
    BitParser.add(BLACK, 43, "000011011011")
    BitParser.add(BLACK, 44, "000001010100")
    BitParser.add(BLACK, 45, "000001010101")
    BitParser.add(BLACK, 46, "000001010110")
    BitParser.add(BLACK, 47, "000001010111")
    BitParser.add(BLACK, 48, "000001100100")
    BitParser.add(BLACK, 49, "000001100101")
    BitParser.add(BLACK, 50, "000001010010")
    BitParser.add(BLACK, 51, "000001010011")
    BitParser.add(BLACK, 52, "000000100100")
    BitParser.add(BLACK, 53, "000000110111")
    BitParser.add(BLACK, 54, "000000111000")
    BitParser.add(BLACK, 55, "000000100111")
    BitParser.add(BLACK, 56, "000000101000")
    BitParser.add(BLACK, 57, "000001011000")
    BitParser.add(BLACK, 58, "000001011001")
    BitParser.add(BLACK, 59, "000000101011")
    BitParser.add(BLACK, 60, "000000101100")
    BitParser.add(BLACK, 61, "000001011010")
    BitParser.add(BLACK, 62, "000001100110")
    BitParser.add(BLACK, 63, "000001100111")
    BitParser.add(BLACK, 64, "0000001111")
    BitParser.add(BLACK, 128, "000011001000")
    BitParser.add(BLACK, 192, "000011001001")
    BitParser.add(BLACK, 256, "000001011011")
    BitParser.add(BLACK, 320, "000000110011")
    BitParser.add(BLACK, 384, "000000110100")
    BitParser.add(BLACK, 448, "000000110101")
    BitParser.add(BLACK, 512, "0000001101100")
    BitParser.add(BLACK, 576, "0000001101101")
    BitParser.add(BLACK, 640, "0000001001010")
    BitParser.add(BLACK, 704, "0000001001011")
    BitParser.add(BLACK, 768, "0000001001100")
    BitParser.add(BLACK, 832, "0000001001101")
    BitParser.add(BLACK, 896, "0000001110010")
    BitParser.add(BLACK, 960, "0000001110011")
    BitParser.add(BLACK, 1024, "0000001110100")
    BitParser.add(BLACK, 1088, "0000001110101")
    BitParser.add(BLACK, 1152, "0000001110110")
    BitParser.add(BLACK, 1216, "0000001110111")
    BitParser.add(BLACK, 1280, "0000001010010")
    BitParser.add(BLACK, 1344, "0000001010011")
    BitParser.add(BLACK, 1408, "0000001010100")
    BitParser.add(BLACK, 1472, "0000001010101")
    BitParser.add(BLACK, 1536, "0000001011010")
    BitParser.add(BLACK, 1600, "0000001011011")
    BitParser.add(BLACK, 1664, "0000001100100")
    BitParser.add(BLACK, 1728, "0000001100101")
    BitParser.add(BLACK, 1792, "00000001000")
    BitParser.add(BLACK, 1856, "00000001100")
    BitParser.add(BLACK, 1920, "00000001101")
    BitParser.add(BLACK, 1984, "000000010010")
    BitParser.add(BLACK, 2048, "000000010011")
    BitParser.add(BLACK, 2112, "000000010100")
    BitParser.add(BLACK, 2176, "000000010101")
    BitParser.add(BLACK, 2240, "000000010110")
    BitParser.add(BLACK, 2304, "000000010111")
    BitParser.add(BLACK, 2368, "000000011100")
    BitParser.add(BLACK, 2432, "000000011101")
    BitParser.add(BLACK, 2496, "000000011110")
    BitParser.add(BLACK, 2560, "000000011111")

    # Uncompressed-mode codes. Each value is the literal pixel string to
    # write. A leading "T" marks a code that leaves uncompressed mode; the
    # digit right after it becomes the next color (see _parse_uncompressed).
    UNCOMPRESSED = [None, None]
    BitParser.add(UNCOMPRESSED, "1", "1")
    BitParser.add(UNCOMPRESSED, "01", "01")
    BitParser.add(UNCOMPRESSED, "001", "001")
    BitParser.add(UNCOMPRESSED, "0001", "0001")
    BitParser.add(UNCOMPRESSED, "00001", "00001")
    BitParser.add(UNCOMPRESSED, "00000", "000001")
    BitParser.add(UNCOMPRESSED, "T00", "00000011")
    BitParser.add(UNCOMPRESSED, "T10", "00000010")
    BitParser.add(UNCOMPRESSED, "T000", "000000011")
    BitParser.add(UNCOMPRESSED, "T100", "000000010")
    BitParser.add(UNCOMPRESSED, "T0000", "0000000011")
    BitParser.add(UNCOMPRESSED, "T1000", "0000000010")
    BitParser.add(UNCOMPRESSED, "T00000", "00000000011")
    BitParser.add(UNCOMPRESSED, "T10000", "00000000010")

    # Base class for the parser's own control-flow / error exceptions.
    class CCITTException(PDFException):
        pass

    # Raised by _parse_mode on the "e" code; feedbytes stops reading on it.
    class EOFB(CCITTException):
        pass

    # Raised when a code has no meaning in the current state.
    class InvalidData(CCITTException):
        pass

    # Raised by _flush_line after a completed line when bytealign is set;
    # feedbytes reacts by dropping the remaining bits of the current byte.
    class ByteSkip(CCITTException):
        pass

    # Current color: 1 selects the WHITE table, 0 the BLACK table.
    _color: int

    def __init__(self, width: int, bytealign: bool = False) -> None:
        """Create a parser for lines of *width* pixels.

        When *bytealign* is true, the rest of the current input byte is
        skipped after every completed line.
        """
        BitParser.__init__(self)
        self.width = width
        self.bytealign = bytealign
        self.reset()

    def feedbytes(self, data: bytes) -> None:
        """Parse *data* bit by bit, most significant bit of each byte first.

        A ``ByteSkip`` abandons the remaining bits of the current byte and
        restarts from the mode tree; an ``EOFB`` stops parsing altogether.
        """
        for byte in get_bytes(data):
            try:
                for m in (128, 64, 32, 16, 8, 4, 2, 1):
                    self._parse_bit(byte & m)
            except self.ByteSkip:
                self._accept = self._parse_mode
                self._state = self.MODE
            except self.EOFB:
                break

    def _parse_mode(self, mode: object) -> BitParserState:
        """Handle a decoded mode code and return the tree to read next.

        Raises ``EOFB`` for "e" and ``InvalidData`` for anything that is not
        "p", "h", "u", "e" or an integer offset.
        """
        if mode == "p":
            self._do_pass()
            self._flush_line()
            return self.MODE
        elif mode == "h":
            # Horizontal mode: two run lengths follow, starting with the
            # table for the current color.
            self._n1 = 0
            self._accept = self._parse_horiz1
            if self._color:
                return self.WHITE
            else:
                return self.BLACK
        elif mode == "u":
            self._accept = self._parse_uncompressed
            return self.UNCOMPRESSED
        elif mode == "e":
            raise self.EOFB
        elif isinstance(mode, int):
            self._do_vertical(mode)
            self._flush_line()
            return self.MODE
        else:
            raise self.InvalidData(mode)

    def _parse_horiz1(self, n: Any) -> BitParserState:
        """Accumulate the first run length of a horizontal pair.

        Values of 64 or more keep reading from the same color table; a value
        below 64 ends the first run, flips the color and hands over to
        ``_parse_horiz2``.
        """
        if n is None:
            raise self.InvalidData
        self._n1 += n
        if n < 64:
            self._n2 = 0
            self._color = 1 - self._color
            self._accept = self._parse_horiz2
        if self._color:
            return self.WHITE
        else:
            return self.BLACK

    def _parse_horiz2(self, n: Any) -> BitParserState:
        """Accumulate the second run length of a horizontal pair.

        A value below 64 ends the run: the color is flipped back, both runs
        are written with ``_do_horizontal`` and parsing returns to the mode
        tree.
        """
        if n is None:
            raise self.InvalidData
        self._n2 += n
        if n < 64:
            self._color = 1 - self._color
            self._accept = self._parse_mode
            self._do_horizontal(self._n1, self._n2)
            self._flush_line()
            return self.MODE
        elif self._color:
            return self.WHITE
        else:
            return self.BLACK

    def _parse_uncompressed(self, bits: str | None) -> BitParserState:
        """Handle one uncompressed-mode code.

        A value starting with "T" leaves uncompressed mode: its second
        character becomes the current color and the rest is written as
        pixels. Any other value is written as pixels and the mode continues.
        """
        if not bits:
            raise self.InvalidData
        if bits.startswith("T"):
            self._accept = self._parse_mode
            self._color = int(bits[1])
            self._do_uncompressed(bits[2:])
            return self.MODE
        else:
            self._do_uncompressed(bits)
            return self.UNCOMPRESSED

    def _get_bits(self) -> str:
        """Return the current line up to the current position as a digit string."""
        return "".join(str(b) for b in self._curline[: self._curpos])

    def _get_refline(self, i: int) -> str:
        """Return the reference line as a digit string with index *i* bracketed.

        An empty ``[]`` is put before the line when *i* is negative and after
        it when *i* is past the end.
        """
        if i < 0:
            return "[]" + "".join(str(b) for b in self._refline)
        elif len(self._refline) <= i:
            return "".join(str(b) for b in self._refline) + "[]"
        else:
            return (
                "".join(str(b) for b in self._refline[:i])
                + "["
                + str(self._refline[i])
                + "]"
                + "".join(str(b) for b in self._refline[i + 1 :])
            )

    def reset(self) -> None:
        """Return to the initial state: line 0, all-1 lines, mode tree."""
        self._y = 0
        self._curline = array.array("b", [1] * self.width)
        self._reset_line()
        self._accept = self._parse_mode
        self._state = self.MODE

    def output_line(self, y: int, bits: Sequence[int]) -> None:
        """Receive completed line *y*; this implementation prints it."""
        print(y, "".join(str(b) for b in bits))

    def _reset_line(self) -> None:
        """Make the current line the reference line and start a fresh one."""
        self._refline = self._curline
        self._curline = array.array("b", [1] * self.width)
        # -1 means nothing has been written on the new line yet.
        self._curpos = -1
        self._color = 1

    def _flush_line(self) -> None:
        """Emit the current line once the position has reached the width.

        Raises ``ByteSkip`` after emitting when ``bytealign`` is set.
        """
        if self.width <= self._curpos:
            self.output_line(self._y, self._curline)
            self._y += 1
            self._reset_line()
            if self.bytealign:
                raise self.ByteSkip

    def _do_vertical(self, dx: int) -> None:
        """Fill up to a position derived from the reference line, offset by *dx*.

        Scans the reference line from ``_curpos + 1`` for the first index
        whose value differs from the current color while the previous value
        equals it (or the end of the line), adds *dx*, clamps the result to
        ``[0, width]``, fills the current line between the old position and
        that index with the current color, and finally flips the color.
        """
        x1 = self._curpos + 1
        while 1:
            if x1 == 0:
                if self._color == 1 and self._refline[x1] != self._color:
                    break
            elif x1 == len(self._refline) or (
                self._refline[x1 - 1] == self._color
                and self._refline[x1] != self._color
            ):
                break
            x1 += 1
        x1 += dx
        x0 = max(0, self._curpos)
        x1 = max(0, min(self.width, x1))
        if x1 < x0:
            for x in range(x1, x0):
                self._curline[x] = self._color
        elif x0 < x1:
            for x in range(x0, x1):
                self._curline[x] = self._color
        self._curpos = x1
        self._color = 1 - self._color

    def _do_pass(self) -> None:
        """Fill past the next two color changes found on the reference line.

        The first scan is the same as in ``_do_vertical``; the second one
        continues to the next index where the reference line changes back to
        the current color (or the end of the line). The current line is
        filled with the current color up to that index; the color is kept.
        """
        x1 = self._curpos + 1
        while 1:
            if x1 == 0:
                if self._color == 1 and self._refline[x1] != self._color:
                    break
            elif x1 == len(self._refline) or (
                self._refline[x1 - 1] == self._color
                and self._refline[x1] != self._color
            ):
                break
            x1 += 1
        while 1:
            if x1 == 0:
                if self._color == 0 and self._refline[x1] == self._color:
                    break
            elif x1 == len(self._refline) or (
                self._refline[x1 - 1] != self._color
                and self._refline[x1] == self._color
            ):
                break
            x1 += 1
        # NOTE(review): when _curpos is still -1 this range starts at index
        # -1, i.e. the last pixel of the line - confirm this is intended.
        for x in range(self._curpos, x1):
            self._curline[x] = self._color
        self._curpos = x1

    def _do_horizontal(self, n1: int, n2: int) -> None:
        """Write *n1* pixels of the current color, then *n2* of the other one.

        Writing starts at the current position (0 if nothing has been
        written yet) and stops early at the end of the line.
        """
        if self._curpos < 0:
            self._curpos = 0
        x = self._curpos
        for _ in range(n1):
            if len(self._curline) <= x:
                break
            self._curline[x] = self._color
            x += 1
        for _ in range(n2):
            if len(self._curline) <= x:
                break
            self._curline[x] = 1 - self._color
            x += 1
        self._curpos = x

    def _do_uncompressed(self, bits: str) -> None:
        """Write each digit of *bits* as one pixel, flushing after every pixel."""
        # NOTE(review): at the start of a line _curpos is -1, so the first
        # pixel would land on the last index - confirm (the file header lists
        # uncompressed mode as untested).
        for c in bits:
            self._curline[self._curpos] = int(c)
            self._curpos += 1
            self._flush_line()


class CCITTFaxDecoder(CCITTG4Parser):
    """CCITTG4Parser that packs every decoded line into a byte buffer.

    Each line handed to :meth:`output_line` is packed eight pixels per byte,
    most significant bit first, and appended to an internal buffer that
    :meth:`close` returns.
    """

    def __init__(
        self,
        width: int,
        bytealign: bool = False,
        reversed: bool = False,
    ) -> None:
        """Create a decoder.

        Args:
            width: passed through to ``CCITTG4Parser``.
            bytealign: passed through to ``CCITTG4Parser``.
            reversed: when true, every pixel is inverted (``1 - b``) before it
                is packed.
        """
        CCITTG4Parser.__init__(self, width, bytealign=bytealign)
        self.reversed = reversed
        # A bytearray grows in place; appending to an immutable ``bytes``
        # object would copy the whole buffer again for every scan line.
        self._buf = bytearray()

    def close(self) -> bytes:
        """Return everything packed so far as an immutable ``bytes`` object."""
        return bytes(self._buf)

    def output_line(self, y: int, bits: Sequence[int]) -> None:
        """Pack one line of pixels and append it to the buffer.

        Args:
            y: line index; not used by this implementation.
            bits: pixel values; any truthy value sets the corresponding bit.
                The final byte is zero-padded when ``len(bits)`` is not a
                multiple of eight.
        """
        packed = bytearray((len(bits) + 7) // 8)
        if self.reversed:
            bits = [1 - b for b in bits]
        for i, b in enumerate(bits):
            if b:
                # Bit 7 of each byte holds the leftmost of its eight pixels.
                packed[i // 8] |= 0x80 >> (i % 8)
        self._buf += packed


def ccittfaxdecode(data: bytes, params: dict[str, object]) -> bytes:
    """Decode CCITT-fax encoded ``data`` using the given filter parameters.

    Only ``K == -1`` is handled.  The decoder is configured from the
    ``Columns``, ``EncodedByteAlign`` and ``BlackIs1`` entries of ``params``.

    Raises:
        PDFValueError: if ``params["K"]`` is anything other than ``-1``.
    """
    k_value = params.get("K")
    if k_value != -1:
        raise PDFValueError(k_value)

    decoder = CCITTFaxDecoder(
        cast(int, params.get("Columns")),
        bytealign=cast(bool, params.get("EncodedByteAlign")),
        reversed=cast(bool, params.get("BlackIs1")),
    )
    decoder.feedbytes(data)
    return decoder.close()


# test
def main(argv: list[str]) -> None:
    """Manual test driver.

    Without arguments the module's unit tests are run.  Otherwise every
    argument is treated as the path of an encoded file whose name consists of
    exactly six dot-separated fields, the fourth being the image width.  Each
    file is decoded and rendered with pygame into ``out.bmp`` (later files
    overwrite earlier ones).
    """
    if not argv[1:]:
        import unittest

        unittest.main()
        return

    class Parser(CCITTG4Parser):
        """Parser that paints decoded lines onto a pygame surface."""

        def __init__(self, width: int, bytealign: bool = False) -> None:
            import pygame  # type: ignore[import]

            CCITTG4Parser.__init__(self, width, bytealign=bytealign)
            # The surface height is fixed at 1000 rows.
            self.img = pygame.Surface((self.width, 1000))

        def output_line(self, y: int, bits: Sequence[int]) -> None:
            for x, b in enumerate(bits):
                if b:
                    self.img.set_at((x, y), (255, 255, 255))
                else:
                    self.img.set_at((x, y), (0, 0, 0))

        def close(self) -> None:
            import pygame

            pygame.image.save(self.img, "out.bmp")

    for path in argv[1:]:
        # The context manager closes the file even when the name does not
        # have the expected shape or decoding raises.
        with open(path, "rb") as fp:
            (_, _, _, w, _, _) = path.split(".")
            parser = Parser(int(w))
            parser.feedbytes(fp.read())
            parser.close()


================================================
FILE: babeldoc/pdfminer/cmap/README.txt
================================================
README.txt for cmap

This directory contains *.pickle.gz files converted from Adobe CMap resources.
CMaps are required to decode text data written in CJK (Chinese, Japanese,
Korean) languages.  CMap resources are now freely available from the Adobe web site:
http://opensource.adobe.com/wiki/display/cmap/CMap+Resources

The following files were extracted from the downloadable tarballs:

cid2code_Adobe_CNS1.txt:
	http://download.macromedia.com/pub/opensource/cmap/cmapresources_cns1-6.tar.z

cid2code_Adobe_GB1.txt:
	http://download.macromedia.com/pub/opensource/cmap/cmapresources_gb1-5.tar.z

cid2code_Adobe_Japan1.txt:
	http://download.macromedia.com/pub/opensource/cmap/cmapresources_japan1-6.tar.z

cid2code_Adobe_Korea1.txt:
	http://download.macromedia.com/pub/opensource/cmap/cmapresources_korean1-2.tar.z


These *.pickle.gz files can be generated by running the following commands in
the top directory:

    $ make cmap
    python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt
    reading 'cmaprsrc/cid2code_Adobe_CNS1.txt'...
    writing 'CNS1_H.py'...
    ...

On Windows machines that don't have the `make` command,
paste the following commands at a command line prompt:

    mkdir pdfminer\cmap
    python tools\conv_cmap.py -c B5=cp950 -c UniCNS-UTF8=utf-8 pdfminer\cmap Adobe-CNS1 cmaprsrc\cid2code_Adobe_CNS1.txt
    python tools\conv_cmap.py -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 pdfminer\cmap Adobe-GB1 cmaprsrc\cid2code_Adobe_GB1.txt
    python tools\conv_cmap.py -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 pdfminer\cmap Adobe-Japan1 cmaprsrc\cid2code_Adobe_Japan1.txt
    python tools\conv_cmap.py -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 pdfminer\cmap Adobe-Korea1 cmaprsrc\cid2code_Adobe_Korea1.txt


Here is the license information in the original files:

%%Copyright: -----------------------------------------------------------
%%Copyright: Copyright 1990-20xx Adobe Systems Incorporated.
%%Copyright: All rights reserved.
%%Copyright:
%%Copyright: Redistribution and use in source and binary forms, with or
%%Copyright: without modification, are permitted provided that the
%%Copyright: following conditions are met:
%%Copyright:
%%Copyright: Redistributions of source code must retain the above
%%Copyright: copyright notice, this list of conditions and the following
%%Copyright: disclaimer.
%%Copyright:
%%Copyright: Redistributions in binary form must reproduce the above
%%Copyright: copyright notice, this list of conditions and the following
%%Copyright: disclaimer in the documentation and/or other materials
%%Copyright: provided with the distribution.
%%Copyright:
%%Copyright: Neither the name of Adobe Systems Incorporated nor the names
%%Copyright: of its contributors may be used to endorse or promote
%%Copyright: products derived from this software without specific prior
%%Copyright: written permission.
%%Copyright:
%%Copyright: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
%%Copyright: CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
%%Copyright: INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
%%Copyright: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
%%Copyright: DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
%%Copyright: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
%%Copyright: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
%%Copyright: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
%%Copyright: LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
%%Copyright: HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
%%Copyright: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
%%Copyright: OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
%%Copyright: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%%Copyright: -----------------------------------------------------------


================================================
FILE: babeldoc/pdfminer/cmapdb.py
================================================
"""Adobe character mapping (CMap) support.

CMaps provide the mapping between character codes and Unicode
code-points to character ids (CIDs).

More information is available on:

  https://github.com/adobe-type-tools/cmap-resources

"""

import gzip
import logging
import os
import os.path
import pickle as pickle
import struct
import sys
from collections.abc import Iterable
from collections.abc import Iterator
from collections.abc import MutableMapping
from typing import Any
from typing import BinaryIO
from typing import TextIO
from typing import cast

from babeldoc.pdfminer.encodingdb import name2unicode
from babeldoc.pdfminer.pdfexceptions import PDFException
from babeldoc.pdfminer.pdfexceptions import PDFTypeError
from babeldoc.pdfminer.psexceptions import PSEOF
from babeldoc.pdfminer.psexceptions import PSSyntaxError
from babeldoc.pdfminer.psparser import KWD
from babeldoc.pdfminer.psparser import PSKeyword
from babeldoc.pdfminer.psparser import PSLiteral
from babeldoc.pdfminer.psparser import PSStackParser
from babeldoc.pdfminer.psparser import literal_name
from babeldoc.pdfminer.utils import choplist
from babeldoc.pdfminer.utils import nunpack

# Module-level logger, named after this module; used below for debug traces
# and for the ToUnicode format warnings.
log = logging.getLogger(__name__)


class CMapError(PDFException):
    """Error type for CMap-related failures (base of ``CMapDB.CMapNotFound``)."""


class CMapBase:
    """Interface shared by every CMap-like object in this module.

    The keyword arguments given at construction are kept in ``attrs``.  The
    ``add_*`` and ``use_cmap`` hooks do nothing here; subclasses override the
    ones they support.
    """

    debug = 0

    def __init__(self, **kwargs: object) -> None:
        self.attrs: MutableMapping[str, object] = dict(kwargs)

    def is_vertical(self) -> bool:
        """Return True when the ``WMode`` attribute is present and non-zero."""
        wmode = self.attrs.get("WMode", 0)
        return wmode != 0

    def set_attr(self, k: str, v: object) -> None:
        """Store attribute ``k`` with value ``v``, replacing any previous value."""
        self.attrs[k] = v

    def add_code2cid(self, code: str, cid: int) -> None:
        """Hook for registering a code -> CID mapping; ignored by default."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Hook for registering a CID -> unicode mapping; ignored by default."""

    def use_cmap(self, cmap: "CMapBase") -> None:
        """Hook for importing another CMap's mappings; ignored by default."""

    def decode(self, code: bytes) -> Iterable[int]:
        """Translate a byte string into CIDs; must be provided by subclasses."""
        raise NotImplementedError


class CMap(CMapBase):
    """CMap backed by a trie that maps byte sequences to CIDs.

    ``code2cid`` is a nested dictionary keyed by byte value: an ``int`` value
    is the CID of a complete code, a ``dict`` value is the next trie level.
    """

    def __init__(self, **kwargs: str | int) -> None:
        super().__init__(**kwargs)
        self.code2cid: dict[int, object] = {}

    def __repr__(self) -> str:
        return "<CMap: %s>" % self.attrs.get("CMapName")

    def use_cmap(self, cmap: CMapBase) -> None:
        """Deep-copy the code -> CID trie of ``cmap`` into this CMap.

        Entries already present under the same key are overwritten; nested
        levels are replaced by fresh dictionaries rather than shared.
        """
        assert isinstance(cmap, CMap), str(type(cmap))

        # Iterative deep copy: each work item pairs a destination level with
        # the source level it has to be filled from.
        pending: list[tuple[dict[int, object], dict[int, object]]] = [
            (self.code2cid, cmap.code2cid),
        ]
        while pending:
            target, source = pending.pop()
            for key, value in source.items():
                if isinstance(value, dict):
                    child: dict[int, object] = {}
                    target[key] = child
                    pending.append((child, value))
                else:
                    target[key] = value

    def decode(self, code: bytes) -> Iterator[int]:
        """Yield the CID of every complete code found in ``code``.

        The trie is walked one byte at a time.  A byte with no entry at the
        current level is dropped and the walk restarts from the root.
        """
        log.debug("decode: %r, %r", self, code)
        node = self.code2cid
        for byte in code:
            if byte not in node:
                node = self.code2cid
                continue
            entry = node[byte]
            if isinstance(entry, int):
                yield entry
                node = self.code2cid
            else:
                node = cast(dict[int, object], entry)

    def dump(
        self,
        out: TextIO = sys.stdout,
        code2cid: dict[int, object] | None = None,
        code: tuple[int, ...] = (),
    ) -> None:
        """Write every ``code -> cid`` pair to ``out``, in sorted order.

        ``code2cid`` and ``code`` carry the current trie level and the byte
        prefix leading to it during recursion; callers normally omit both.
        """
        if code2cid is None:
            code2cid = self.code2cid
            code = ()
        for key, value in sorted(code2cid.items()):
            full_code = (*code, key)
            if isinstance(value, int):
                out.write("code %r = cid %d\n" % (full_code, value))
            else:
                self.dump(
                    out=out,
                    code2cid=cast(dict[int, object], value),
                    code=full_code,
                )


class IdentityCMap(CMapBase):
    """CMap whose CIDs are the big-endian 16-bit values of the input itself."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Unpack ``code`` as unsigned 16-bit big-endian integers.

        A trailing odd byte is ignored; input shorter than two bytes yields an
        empty tuple.
        """
        count = len(code) // 2
        if not count:
            return ()
        return struct.unpack_from(f">{count}H", code)


class IdentityCMapByte(IdentityCMap):
    """Identity CMap for one-byte codes: every input byte is its own CID."""

    def decode(self, code: bytes) -> tuple[int, ...]:
        """Return the value of each byte in ``code`` (empty tuple if empty)."""
        count = len(code)
        if not count:
            return ()
        return struct.unpack(">%dB" % count, code)


class UnicodeMap(CMapBase):
    """Mapping from CIDs to unicode strings, held in ``cid2unichr``."""

    def __init__(self, **kwargs: str | int) -> None:
        super().__init__(**kwargs)
        self.cid2unichr: dict[int, str] = {}

    def __repr__(self) -> str:
        return "<UnicodeMap: %s>" % self.attrs.get("CMapName")

    def get_unichr(self, cid: int) -> str:
        """Return the string mapped to ``cid``.

        Raises:
            KeyError: if ``cid`` has no entry.
        """
        log.debug("get_unichr: %r, %r", self, cid)
        return self.cid2unichr[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write every ``cid -> unicode`` pair to ``out``, ordered by CID."""
        table = self.cid2unichr
        for cid in sorted(table):
            out.write("cid %d = unicode %r\n" % (cid, table[cid]))


class IdentityUnicodeMap(UnicodeMap):
    """Unicode map that needs no table: the CID is the code point."""

    def get_unichr(self, cid: int) -> str:
        """Interpret character id as unicode codepoint"""
        log.debug("get_unichr: %r, %r", self, cid)
        character = chr(cid)
        return character


class FileCMap(CMap):
    """CMap whose trie is filled entry by entry through ``add_code2cid``."""

    def add_code2cid(self, code: str, cid: int) -> None:
        """Register ``cid`` for the code given as a string of characters.

        Every character but the last selects (creating it when missing) one
        trie level by its ordinal; the last character's ordinal is the key
        under which ``cid`` is stored.
        """
        assert isinstance(code, str) and isinstance(cid, int), str(
            (type(code), type(cid)),
        )
        node = self.code2cid
        for char in code[:-1]:
            key = ord(char)
            if key not in node:
                node[key] = {}
            node = cast(dict[int, object], node[key])
        node[ord(code[-1])] = cid


class FileUnicodeMap(UnicodeMap):
    """Unicode map that is filled entry by entry through ``add_cid2unichr``."""

    def add_cid2unichr(self, cid: int, code: PSLiteral | bytes | int) -> None:
        """Record the unicode string for ``cid``.

        ``code`` is converted depending on its type: a ``PSLiteral`` is looked
        up as a glyph name, ``bytes`` are decoded as UTF-16BE (undecodable
        parts are ignored) and an ``int`` is taken as a code point.

        Raises:
            PDFTypeError: if ``code`` has any other type.
        """
        assert isinstance(cid, int), str(type(cid))
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            assert isinstance(code.name, str)
            unichr = name2unicode(code.name)
        elif isinstance(code, bytes):
            # Interpret as UTF-16BE.
            unichr = code.decode("UTF-16BE", "ignore")
        elif isinstance(code, int):
            unichr = chr(code)
        else:
            raise PDFTypeError(code)

        # A0 = non-breaking space, some weird fonts can have a collision on a
        # cid here: an existing plain-space mapping is kept in that case.
        keeps_plain_space = unichr == "\u00a0" and self.cid2unichr.get(cid) == " "
        if not keeps_plain_space:
            self.cid2unichr[cid] = unichr


class PyCMap(CMap):
    """CMap whose trie comes from a preloaded data object.

    ``module`` must expose ``CODE2CID`` (used as the trie without copying) and
    ``IS_VERTICAL`` (sets ``WMode`` to 1 when true).
    """

    def __init__(self, name: str, module: Any) -> None:
        super().__init__(CMapName=name)
        self.code2cid = module.CODE2CID
        vertical = module.IS_VERTICAL
        if vertical:
            self.attrs["WMode"] = 1


class PyUnicodeMap(UnicodeMap):
    """Unicode map whose table comes from a preloaded data object.

    ``module`` must expose ``CID2UNICHR_H`` and ``CID2UNICHR_V``; only the one
    matching ``vertical`` is read.
    """

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        super().__init__(CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs["WMode"] = 1


class CMapDB:
    """Loader and process-wide cache for the predefined CMap data files.

    Data is read from ``<name>.pickle.gz`` files searched first in the
    directory named by the ``CMAP_PATH`` environment variable (default
    ``/usr/share/pdfminer/``) and then in the ``cmap`` directory next to this
    module.  Loaded maps are cached on the class and never evicted.
    """

    _cmap_cache: dict[str, PyCMap] = {}
    _umap_cache: dict[str, list[PyUnicodeMap]] = {}

    class CMapNotFound(CMapError):
        """Raised when no data file exists for the requested CMap name."""

    @classmethod
    def _load_data(cls, name: str) -> Any:
        """Load the data file for ``name`` and return it as a namespace class.

        NUL characters are stripped from ``name`` first.  The returned object
        is a class created on the fly whose attributes are the entries of the
        unpickled dictionary.

        Raises:
            CMapDB.CMapNotFound: if the name is not a plain file name or no
                matching file exists in any search directory.
        """
        name = name.replace("\0", "")
        filename = "%s.pickle.gz" % name
        log.debug("loading: %r", name)
        # CMap names can originate from the document being parsed (e.g. the
        # operand of ``usecmap``).  Anything that is not a bare file name --
        # i.e. contains a directory separator or drive prefix -- could make
        # the join below point outside the CMap directories, so refuse it.
        if os.path.basename(filename) != filename:
            raise CMapDB.CMapNotFound(name)
        cmap_paths = (
            os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )
        for directory in cmap_paths:
            path = os.path.join(directory, filename)
            if os.path.exists(path):
                # SECURITY: unpickling executes code embedded in the file, so
                # every directory in ``cmap_paths`` (including CMAP_PATH) must
                # contain trusted files only.
                with gzip.open(path) as gzfile:
                    return type(str(name), (), pickle.loads(gzfile.read()))
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the CMap called ``name``.

        The four identity names are served by fresh ``IdentityCMap`` /
        ``IdentityCMapByte`` instances; every other name is loaded from disk
        once and cached.

        Raises:
            CMapDB.CMapNotFound: if no data file exists for ``name``.
        """
        if name == "Identity-H":
            return IdentityCMap(WMode=0)
        elif name == "Identity-V":
            return IdentityCMap(WMode=1)
        elif name == "OneByteIdentityH":
            return IdentityCMapByte(WMode=0)
        elif name == "OneByteIdentityV":
            return IdentityCMapByte(WMode=1)
        try:
            return cls._cmap_cache[name]
        except KeyError:
            pass
        data = cls._load_data(name)
        cls._cmap_cache[name] = cmap = PyCMap(name, data)
        return cmap

    @classmethod
    def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
        """Return the CID -> unicode map stored as ``to-unicode-<name>``.

        Both the horizontal and the vertical variant are built on first use
        and cached as a two-element list indexed by ``vertical``.

        Raises:
            CMapDB.CMapNotFound: if no data file exists for ``name``.
        """
        try:
            return cls._umap_cache[name][vertical]
        except KeyError:
            pass
        data = cls._load_data("to-unicode-%s" % name)
        cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
        return cls._umap_cache[name][vertical]


class CMapParser(PSStackParser[PSKeyword]):
    """Parser that feeds the contents of a CMap stream into a CMap object.

    Operands are collected on the parser stack; when one of the keywords
    handled in :meth:`do_keyword` is met, the collected operands are turned
    into calls on ``self.cmap``.  Malformed range entries are skipped with a
    warning that is logged once per distinct message.
    """

    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        self._warnings: set[str] = set()

    def run(self) -> None:
        """Parse the stream; reaching the end of input (PSEOF) is not an error."""
        try:
            self.nextobject()
        except PSEOF:
            pass

    KEYWORD_BEGINCMAP = KWD(b"begincmap")
    KEYWORD_ENDCMAP = KWD(b"endcmap")
    KEYWORD_USECMAP = KWD(b"usecmap")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
    KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
    KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
    KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
    KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
    KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
    KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
    KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
    KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
    KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
    KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
    KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """ToUnicode CMaps

        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.

        Keywords that are not handled here are pushed onto the stack like
        ordinary operands.  Everything except ``begincmap`` / ``endcmap`` is
        ignored while the parser is outside a CMap section.
        """
        if token is self.KEYWORD_BEGINCMAP:
            self._in_cmap = True
            self.popall()
            return

        elif token is self.KEYWORD_ENDCMAP:
            self._in_cmap = False
            return

        if not self._in_cmap:
            return

        if token is self.KEYWORD_DEF:
            # "<name> <value> def" sets an attribute on the CMap.
            try:
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            except PSSyntaxError:
                pass
            return

        if token is self.KEYWORD_USECMAP:
            # "<name> usecmap" imports a predefined CMap; unknown names are
            # silently ignored.
            try:
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
            except CMapDB.CMapNotFound:
                pass
            return

        # Code space ranges are not used; their operands are just discarded.
        if token is self.KEYWORD_BEGINCODESPACERANGE:
            self.popall()
            return
        if token is self.KEYWORD_ENDCODESPACERANGE:
            self.popall()
            return

        if token is self.KEYWORD_BEGINCIDRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDRANGE:
            # Operands come in (start code, end code, first cid) triples.
            objs = [obj for (__, obj) in self.popall()]
            for start_byte, end_byte, cid in choplist(3, objs):
                if not isinstance(start_byte, bytes):
                    self._warn_once("The start object of begincidrange is not a byte.")
                    continue
                if not isinstance(end_byte, bytes):
                    self._warn_once("The end object of begincidrange is not a byte.")
                    continue
                if not isinstance(cid, int):
                    self._warn_once(
                        "The cid object of begincidrange is not an integer.",
                    )
                    continue
                if len(start_byte) != len(end_byte):
                    self._warn_once(
                        "The start and end byte of begincidrange have "
                        "different lengths.",
                    )
                    continue
                # Only the last (up to) four bytes may vary inside a range;
                # whatever precedes them has to be identical on both ends.
                start_prefix = start_byte[:-4]
                end_prefix = end_byte[:-4]
                if start_prefix != end_prefix:
                    self._warn_once(
                        "The prefix of the start and end byte of "
                        "begincidrange are not the same.",
                    )
                    continue
                svar = start_byte[-4:]
                evar = end_byte[-4:]
                start = nunpack(svar)
                end = nunpack(evar)
                vlen = len(svar)
                for i in range(end - start + 1):
                    x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                    self.cmap.add_cid2unichr(cid + i, x)
            return

        if token is self.KEYWORD_BEGINCIDCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDCIDCHAR:
            # Operands come in pairs; pairs of the wrong types are skipped.
            objs = [obj for (__, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(code, bytes) and isinstance(cid, int):
                    self.cmap.add_cid2unichr(cid, code)
            return

        if token is self.KEYWORD_BEGINBFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFRANGE:
            # Operands come in (start code, end code, destination) triples.
            objs = [obj for (__, obj) in self.popall()]
            for start_byte, end_byte, code in choplist(3, objs):
                if not isinstance(start_byte, bytes):
                    self._warn_once("The start object is not a byte.")
                    continue
                if not isinstance(end_byte, bytes):
                    self._warn_once("The end object is not a byte.")
                    continue
                if len(start_byte) != len(end_byte):
                    self._warn_once("The start and end byte have different lengths.")
                    continue
                start = nunpack(start_byte)
                end = nunpack(end_byte)
                if isinstance(code, list):
                    # One explicit destination per code.  A length mismatch is
                    # reported, but the entries that do pair up are kept.
                    if len(code) != end - start + 1:
                        self._warn_once(
                            "The difference between the start and end "
                            "offsets does not match the code length.",
                        )
                    for cid, unicode_value in zip(
                        range(start, end + 1), code, strict=False
                    ):
                        self.cmap.add_cid2unichr(cid, unicode_value)
                elif isinstance(code, bytes):
                    # A single destination that is incremented for each code;
                    # only its last (up to) four bytes take part in counting.
                    var = code[-4:]
                    base = nunpack(var)
                    prefix = code[:-4]
                    vlen = len(var)
                    for i in range(end - start + 1):
                        x = prefix + struct.pack(">L", base + i)[-vlen:]
                        self.cmap.add_cid2unichr(start + i, x)
                else:
                    # Malformed destination: skip the entry like every other
                    # malformed operand instead of aborting the whole parse.
                    self._warn_once(
                        "The code object of beginbfrange is neither a list "
                        "nor a byte.",
                    )
            return

        if token is self.KEYWORD_BEGINBFCHAR:
            self.popall()
            return

        if token is self.KEYWORD_ENDBFCHAR:
            # Operands come in (source code, destination) pairs of byte
            # strings; other pairs are skipped.
            objs = [obj for (__, obj) in self.popall()]
            for cid, code in choplist(2, objs):
                if isinstance(cid, bytes) and isinstance(code, bytes):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        # notdef ranges are not used; their operands are just discarded.
        if token is self.KEYWORD_BEGINNOTDEFRANGE:
            self.popall()
            return

        if token is self.KEYWORD_ENDNOTDEFRANGE:
            self.popall()
            return

        self.push((pos, token))

    def _warn_once(self, msg: str) -> None:
        """Warn once for each unique message"""
        if msg not in self._warnings:
            self._warnings.add(msg)
            base_msg = (
                "Ignoring (part of) ToUnicode map because the PDF data "
                "does not conform to the format. This could result in "
                "(cid) values in the output. "
            )
            log.warning(base_msg + msg)


================================================
FILE: babeldoc/pdfminer/converter.py
================================================
import io
import logging
import re
from collections.abc import Sequence
from typing import BinaryIO
from typing import Generic
from typing import TextIO
from typing import TypeVar
from typing import cast

from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.pdfminer.image import ImageWriter
from babeldoc.pdfminer.layout import LAParams
from babeldoc.pdfminer.layout import LTAnno
from babeldoc.pdfminer.layout import LTChar
from babeldoc.pdfminer.layout import LTComponent
from babeldoc.pdfminer.layout import LTContainer
from babeldoc.pdfminer.layout import LTCurve
from babeldoc.pdfminer.layout import LTFigure
from babeldoc.pdfminer.layout import LTImage
from babeldoc.pdfminer.layout import LTItem
from babeldoc.pdfminer.layout import LTLayoutContainer
from babeldoc.pdfminer.layout import LTLine
from babeldoc.pdfminer.layout import LTPage
from babeldoc.pdfminer.layout import LTRect
from babeldoc.pdfminer.layout import LTText
from babeldoc.pdfminer.layout import LTTextBox
from babeldoc.pdfminer.layout import LTTextBoxVertical
from babeldoc.pdfminer.layout import LTTextGroup
from babeldoc.pdfminer.layout import LTTextLine
from babeldoc.pdfminer.layout import TextGroupElement
from babeldoc.pdfminer.pdfcolor import PDFColorSpace
from babeldoc.pdfminer.pdfdevice import PDFTextDevice
from babeldoc.pdfminer.pdfexceptions import PDFValueError
from babeldoc.pdfminer.pdffont import PDFFont
from babeldoc.pdfminer.pdffont import PDFUnicodeNotDefined
from babeldoc.pdfminer.pdfinterp import PDFGraphicState
from babeldoc.pdfminer.pdfinterp import PDFResourceManager
from babeldoc.pdfminer.pdfpage import PDFPage
from babeldoc.pdfminer.pdftypes import PDFStream
from babeldoc.pdfminer.utils import AnyIO
from babeldoc.pdfminer.utils import Matrix
from babeldoc.pdfminer.utils import PathSegment
from babeldoc.pdfminer.utils import Point
from babeldoc.pdfminer.utils import Rect
from babeldoc.pdfminer.utils import apply_matrix_pt
from babeldoc.pdfminer.utils import bbox2str
from babeldoc.pdfminer.utils import enc
from babeldoc.pdfminer.utils import make_compat_str
from babeldoc.pdfminer.utils import mult_matrix
from babeldoc.pdfminer import utils

# Module-level logger, named after this module.
log = logging.getLogger(__name__)


class PDFLayoutAnalyzer(PDFTextDevice):
    """Device that turns rendering callbacks into a tree of layout objects.

    Items created while a page is rendered are added to ``cur_item``.  Figures
    nest by pushing the enclosing container onto ``_stack``.  When a page ends
    the finished ``LTPage`` is handed to :meth:`receive_layout`.

    ``paint_path`` reads ``self.il_creater``, which is not assigned anywhere
    in this class and therefore has to be attached by the code driving it.
    """

    cur_item: LTLayoutContainer
    ctm: Matrix

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: LAParams | None = None,
    ) -> None:
        """Create the device.

        Args:
            rsrcmgr: passed through to ``PDFTextDevice``.
            pageno: number given to the first page; incremented per page.
            laparams: when not ``None``, each finished page is analyzed with
                these parameters before it is delivered.
        """
        PDFTextDevice.__init__(self, rsrcmgr)
        self.pageno = pageno
        self.laparams = laparams
        self._stack: list[LTLayoutContainer] = []

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Start a new ``LTPage`` sized like the transformed media box.

        Both corners of ``page.mediabox`` are transformed by ``ctm``; the page
        box is then anchored at the origin with the resulting width and
        height.
        """
        (x0, y0, x1, y1) = page.mediabox
        (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
        (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
        mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
        self.cur_item = LTPage(self.pageno, mediabox)

    def end_page(self, page: PDFPage) -> None:
        """Finish the current page, analyze it if requested and deliver it."""
        assert not self._stack, str(len(self._stack))
        assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
        if self.laparams is not None:
            self.cur_item.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(self.cur_item)

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Open a nested ``LTFigure``; later items are added to it."""
        self._stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))

    def end_figure(self, _: str) -> None:
        """Close the current figure and add it to its enclosing container."""
        fig = self.cur_item
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        self.cur_item = self._stack.pop()
        self.cur_item.add(fig)

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Add an ``LTImage`` covering the whole bounding box of the figure."""
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        item = LTImage(
            name,
            stream,
            (self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1),
        )
        self.cur_item.add(item)

    @staticmethod
    def _is_axis_aligned_rect(pts: Sequence[Point]) -> bool:
        """Tell whether five points form a closed, axis-aligned rectangle."""
        (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts

        is_closed_loop = pts[0] == pts[4]
        has_square_coordinates = (
            x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
        ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
        return is_closed_loop and has_square_coordinates

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Paint paths described in section 4.4 of the PDF reference manual

        The path becomes exactly one layout item: an ``LTLine`` for a single
        segment, an ``LTRect`` for a closed axis-aligned rectangle and an
        ``LTCurve`` for everything else.  The item also receives the state
        captured from ``self.il_creater`` (pass-through instructions, XObject
        id, render order, clip paths), the current CTM and a copy of the
        untransformed path.  Paths that do not start with ``m`` are ignored.
        """
        shape = "".join(x[0] for x in path)
        current_clip_paths = self.il_creater.current_clip_paths.copy()
        if shape[:1] != "m":
            # Per PDF Reference Section 4.4.1, "path construction operators may
            # be invoked in any sequence, but the first one invoked must be m
            # or re to begin a new subpath." Since pdfminer.six already
            # converts all `re` (rectangle) operators to their equivalent
            # `mlllh` representation, paths ingested by `.paint_path(...)` that
            # do not begin with the `m` operator are invalid.
            return

        # NOTE: a path containing several `m` operators is deliberately not
        # split into sub-paths here; it is emitted as one LTCurve below.

        # Although the 'h' command does not literally provide a
        # point-position, its position is (by definition) equal to the
        # subpath's starting point.
        #
        # And, per Section 4.4's Table 4.9, all other path commands place
        # their point-position in their final two arguments. (Any preceding
        # arguments represent control points on Bézier curves.)
        raw_pts = [
            cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path
        ]
        pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]

        # The complete path, operator by operator, with every operand pair
        # (control points included) transformed by the CTM.
        operators = [str(operation[0]) for operation in path]
        transformed_points = [
            [
                apply_matrix_pt(self.ctm, (float(operand1), float(operand2)))
                for operand1, operand2 in zip(
                    operation[1::2], operation[2::2], strict=False
                )
            ]
            for operation in path
        ]
        transformed_path = [
            cast(PathSegment, (o, *p))
            for o, p in zip(operators, transformed_points, strict=False)
        ]

        # Drop a redundant "l" on a path closed with "h"
        if len(shape) > 3 and shape[-2:] == "lh" and pts[-2] == pts[0]:
            shape = shape[:-2] + "h"
            pts.pop()

        passthrough_instruction = (
            self.il_creater.passthrough_per_char_instruction.copy()
        )
        xobj_id = self.il_creater.xobj_id

        if shape in {"mlh", "ml"}:
            # single line segment
            #
            # Note: 'ml', in conditional above, is a frequent anomaly
            # that we want to support.
            item = LTLine(
                gstate.linewidth,
                pts[0],
                pts[1],
                stroke,
                fill,
                evenodd,
                gstate.scolor,
                gstate.ncolor,
                original_path=transformed_path,
                dashing_style=gstate.dash,
            )
        elif shape in {"mlllh", "mllll"} and self._is_axis_aligned_rect(pts):
            item = LTRect(
                gstate.linewidth,
                (*pts[0], *pts[2]),
                stroke,
                fill,
                evenodd,
                gstate.scolor,
                gstate.ncolor,
                transformed_path,
                gstate.dash,
            )
        else:
            item = LTCurve(
                gstate.linewidth,
                pts,
                stroke,
                fill,
                evenodd,
                gstate.scolor,
                gstate.ncolor,
                transformed_path,
                gstate.dash,
            )

        # Bookkeeping shared by all three item kinds.
        item.passthrough_instruction = passthrough_instruction
        item.xobj_id = xobj_id
        item.render_order = self.il_creater.get_render_order_and_increase()
        item.ctm = self.ctm
        item.raw_path = path.copy()
        item.clip_paths = current_clip_paths
        self.cur_item.add(item)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Add one character to the current container and return its advance.

        When the font has no unicode mapping for ``cid`` the text comes from
        :meth:`handle_undefined_char` instead.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)
        item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
        )
        self.cur_item.add(item)
        return item.adv

    def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
        """Return the ``(cid:N)`` placeholder used for unmapped characters."""
        log.debug("undefined: %r, %r", font, cid)
        return "(cid:%d)" % cid

    def receive_layout(self, ltpage: LTPage) -> None:
        """Hook called with every finished page; does nothing by default."""


class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that keeps the most recently received ``LTPage``."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        pageno: int = 1,
        laparams: LAParams | None = None,
    ) -> None:
        super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
        # Stays None until receive_layout() has been called at least once.
        self.result: LTPage | None = None

    def receive_layout(self, ltpage: LTPage) -> None:
        """Store ``ltpage`` as the current result, replacing any earlier one."""
        self.result = ltpage

    def get_result(self) -> LTPage:
        """Return the stored page; asserts that one has been received."""
        page = self.result
        assert page is not None
        return page


# Constrained type variable for the output stream a PDFConverter writes to
# (one of TextIO, BinaryIO or AnyIO).
# Some PDFConverter children support only binary I/O
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)


class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
    """Layout analyzer that remembers an output stream and its codec.

    ``outfp_binary`` records whether the stream is treated as binary, as
    decided by ``_is_binary_stream``.
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: IOType,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
    ) -> None:
        super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp: IOType = outfp
        self.codec = codec
        self.outfp_binary = self._is_binary_stream(outfp)

    @staticmethod
    def _is_binary_stream(outfp: AnyIO) -> bool:
        """Test whether a stream is binary or not.

        A stream with a ``mode`` attribute is binary exactly when that mode
        contains ``"b"``.  Without a mode, ``io.BytesIO`` is binary,
        ``io.StringIO`` / ``io.TextIOBase`` instances are text, and anything
        else is assumed to be binary.
        """
        if hasattr(outfp, "mode"):
            return "b" in outfp.mode
        if isinstance(outfp, io.BytesIO):
            return True
        return not isinstance(outfp, (io.StringIO, io.TextIOBase))


class TextConverter(PDFConverter[AnyIO]):
    """Converter that writes the text of each received page to ``outfp``."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        showpageno: bool = False,
        imagewriter: ImageWriter | None = None,
    ) -> None:
        super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.imagewriter = imagewriter
        self.showpageno = showpageno

    def write_text(self, text: str) -> None:
        """Write ``text`` to the output stream, encoding it for binary streams."""
        text = utils.compatible_encode_method(text, self.codec, "ignore")
        if not self.outfp_binary:
            cast(TextIO, self.outfp).write(text)
            return
        # Binary output uses str.encode() with its default encoding.
        cast(BinaryIO, self.outfp).write(text.encode())

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write the page's text, followed by a form feed."""

        def emit(node: LTItem) -> None:
            if isinstance(node, LTContainer):
                for member in node:
                    emit(member)
            elif isinstance(node, LTText):
                self.write_text(node.get_text())

            # Checked independently of the branch above: a text box gets a
            # trailing newline after its children have been written.
            if isinstance(node, LTTextBox):
                self.write_text("\n")
            elif isinstance(node, LTImage) and self.imagewriter is not None:
                self.imagewriter.export_image(node)

        if self.showpageno:
            self.write_text("Page %s\n" % ltpage.pageid)
        emit(ltpage)
        self.write_text("\f")

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text.  This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name: str, stream: PDFStream) -> None:
        """Record the image only when an image writer was supplied."""
        if self.imagewriter is None:
            return
        PDFConverter.render_image(self, name, stream)

    def paint_path(
        self,
        gstate: PDFGraphicState,
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Ignore path painting; nothing is recorded for drawings."""
        return None


class HTMLConverter(PDFConverter[AnyIO]):
    """Converter that renders page layouts as absolutely positioned HTML.

    The HTML header is written when the converter is constructed and the
    footer when ``close()`` is called.  ``_yoffset`` grows with every page so
    that successive pages are placed below one another.
    """

    # Border colors merged into ``rect_colors`` when ``debug`` is truthy.
    RECT_COLORS = {
        "figure": "yellow",
        "textline": "magenta",
        "textbox": "cyan",
        "textgroup": "red",
        "curve": "black",
        "page": "gray",
    }

    # Text colors merged into ``text_colors`` when ``debug`` is truthy.
    TEXT_COLORS = {
        "textbox": "blue",
        "char": "black",
    }

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        scale: float = 1,
        fontscale: float = 1.0,
        layoutmode: str = "normal",
        showpageno: bool = True,
        pagemargin: int = 50,
        imagewriter: ImageWriter | None = None,
        debug: int = 0,
        rect_colors: dict[str, str] | None = None,
        text_colors: dict[str, str] | None = None,
    ) -> None:
        """Set up the converter and immediately write the HTML header.

        :param rect_colors: maps element kinds to border colors; kinds that
            are missing are not outlined.  The mapping is copied.
        :param text_colors: maps element kinds to text colors; the mapping
            is copied.
        :param debug: when truthy, ``RECT_COLORS`` and ``TEXT_COLORS`` are
            merged into the two color mappings.
        :raises PDFValueError: if a binary stream is given without a codec,
            or a text stream is given with one.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        if text_colors is None:
            text_colors = {"char": "black"}
        if rect_colors is None:
            rect_colors = {"curve": "black", "page": "gray"}

        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        # Work on private copies: the debug branch below updates these dicts
        # and must not modify the mappings owned by the caller.
        self.rect_colors = dict(rect_colors)
        self.text_colors = dict(text_colors)
        if debug:
            self.rect_colors.update(self.RECT_COLORS)
            self.text_colors.update(self.TEXT_COLORS)
        self._yoffset: float = self.pagemargin
        # Font of the currently open <span>, or None when no span is open.
        self._font: tuple[str, float] | None = None
        # Fonts saved by begin_div() and restored by end_div().
        self._fontstack: list[tuple[str, float] | None] = []
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``codec`` when one is set."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening ``<html>``/``<head>``/``<body>`` markup."""
        self.write("<html><head>\n")
        if self.codec:
            s = (
                '<meta http-equiv="Content-Type" content="text/html; '
                'charset=%s">\n' % self.codec
            )
        else:
            s = '<meta http-equiv="Content-Type" content="text/html">\n'
        self.write(s)
        self.write("</head><body>\n")

    def write_footer(self) -> None:
        """Write links to pages ``1 .. pageno - 1`` and close the document."""
        page_links = [f'<a href="#{i}">{i}</a>' for i in range(1, self.pageno)]
        s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
            page_links,
        )
        self.write(s)
        self.write("</body></html>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` after passing it through ``enc``."""
        self.write(enc(text))

    def place_rect(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Write a bordered ``<span>`` if ``color`` is a key of ``rect_colors``."""
        color2 = self.rect_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; border: %s %dpx solid; '
                'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
                % (
                    color2,
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
        """Outline ``item`` using its ``x0``, ``y1``, ``width`` and ``height``."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(
        self,
        item: LTImage,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
    ) -> None:
        """Export ``item`` and write an ``<img>`` tag; no-op without an image writer."""
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            s = (
                '<img src="%s" border="%d" style="position:absolute; '
                'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
                % (
                    enc(name),
                    borderwidth,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    w * self.scale,
                    h * self.scale,
                )
            )
            self.write(s)

    def place_text(
        self,
        color: str,
        text: str,
        x: float,
        y: float,
        size: float,
    ) -> None:
        """Write positioned text if ``color`` is a key of ``text_colors``."""
        color2 = self.text_colors.get(color)
        if color2 is not None:
            s = (
                '<span style="position:absolute; color:%s; left:%dpx; '
                'top:%dpx; font-size:%dpx;">'
                % (
                    color2,
                    x * self.scale,
                    (self._yoffset - y) * self.scale,
                    size * self.scale * self.fontscale,
                )
            )
            self.write(s)
            self.write_text(text)
            self.write("</span>\n")

    def begin_div(
        self,
        color: str,
        borderwidth: int,
        x: float,
        y: float,
        w: float,
        h: float,
        writing_mode: str = "False",
    ) -> None:
        """Open a positioned ``<div>`` and save the current font.

        NOTE: unlike ``place_rect``, ``color`` and ``writing_mode`` are
        written into the style attribute as given, not looked up.
        """
        self._fontstack.append(self._font)
        self._font = None
        s = (
            '<div style="position:absolute; border: %s %dpx solid; '
            "writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
            'height:%dpx;">'
            % (
                color,
                borderwidth,
                writing_mode,
                x * self.scale,
                (self._yoffset - y) * self.scale,
                w * self.scale,
                h * self.scale,
            )
        )
        self.write(s)

    def end_div(self, color: str) -> None:
        """Close any open font ``<span>``, restore the saved font, close the ``<div>``."""
        if self._font is not None:
            self.write("</span>")
        self._font = self._fontstack.pop()
        self.write("</div>")

    def put_text(self, text: str, fontname: str, fontsize: float) -> None:
        """Write ``text``, opening a new font ``<span>`` when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write("</span>")
            # Remove subset tag from fontname, see PDF Reference 5.5.3
            fontname_without_subset_tag = fontname.split("+")[-1]
            # The font name comes from the document being converted, so it is
            # escaped before being placed inside the style attribute.
            self.write(
                '<span style="font-family: %s; font-size:%dpx">'
                % (
                    enc(fontname_without_subset_tag),
                    fontsize * self.scale * self.fontscale,
                ),
            )
            self._font = font
        self.write_text(text)

    def put_newline(self) -> None:
        """Write a ``<br>`` tag."""
        self.write("<br>")

    def receive_layout(self, ltpage: LTPage) -> None:
        """Render one page and advance ``_yoffset`` by the page margin."""

        def show_group(item: LTTextGroup | TextGroupElement) -> None:
            # Only text groups are outlined; other elements are ignored.
            if isinstance(item, LTTextGroup):
                self.place_border("textgroup", 1, item)
                for child in item:
                    show_group(child)

        def render(item: LTItem) -> None:
            # The order of the isinstance checks below is significant.
            child: LTItem
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border("page", 1, item)
                if self.showpageno:
                    self.write(
                        '<div style="position:absolute; top:%dpx;">'
                        % ((self._yoffset - item.y1) * self.scale),
                    )
                    self.write(
                        f'<a name="{item.pageid}">Page {item.pageid}</a></div>\n',
                    )
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border("curve", 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                self.end_div("figure")
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            elif self.layoutmode == "exact":
                # "exact" mode positions every text line, box and character.
                if isinstance(item, LTTextLine):
                    self.place_border("textline", 1, item)
                    for child in item:
                        render(child)
                elif isinstance(item, LTTextBox):
                    self.place_border("textbox", 1, item)
                    self.place_text(
                        "textbox",
                        str(item.index + 1),
                        item.x0,
                        item.y1,
                        20,
                    )
                    for child in item:
                        render(child)
                elif isinstance(item, LTChar):
                    self.place_border("char", 1, item)
                    self.place_text(
                        "char",
                        item.get_text(),
                        item.x0,
                        item.y1,
                        item.size,
                    )
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
                if self.layoutmode != "loose":
                    self.put_newline()
            elif isinstance(item, LTTextBox):
                self.begin_div(
                    "textbox",
                    1,
                    item.x0,
                    item.y1,
                    item.width,
                    item.height,
                    item.get_writing_mode(),
                )
                for child in item:
                    render(child)
                self.end_div("textbox")
            elif isinstance(item, LTChar):
                fontname = make_compat_str(item.fontname)
                self.put_text(item.get_text(), fontname, item.size)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self) -> None:
        """Write the HTML footer."""
        self.write_footer()


class XMLConverter(PDFConverter[AnyIO]):
    """Converter that serialises page layouts as XML inside a ``<pages>`` element.

    The XML header is written when the converter is constructed and the
    closing ``</pages>`` tag when ``close()`` is called.
    """

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf-8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        imagewriter: ImageWriter | None = None,
        stripcontrol: bool = False,
    ) -> None:
        """Set up the converter and immediately write the XML header.

        :raises PDFValueError: if a binary stream is given without a codec,
            or a text stream is given with one.
        """
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )

        # write() assumes a codec for binary I/O, or no codec for text I/O.
        # The two failure modes are reported separately so that the message
        # describes the actual problem.
        if self.outfp_binary and not self.codec:
            raise PDFValueError("Codec is required for a binary I/O output")
        if not self.outfp_binary and self.codec:
            raise PDFValueError("Codec must not be specified for a text I/O output")

        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``codec`` when one is set."""
        if self.codec:
            cast(BinaryIO, self.outfp).write(text.encode(self.codec))
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the XML declaration and the opening ``<pages>`` tag."""
        if self.codec:
            self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write("<pages>\n")

    def write_footer(self) -> None:
        """Write the closing ``</pages>`` tag."""
        self.write("</pages>\n")

    def write_text(self, text: str) -> None:
        """Write ``text`` through ``enc``, first dropping control characters if requested."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(enc(text))

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one ``<page>`` element describing ``ltpage``."""

        def show_group(item: LTItem) -> None:
            # Emits the <layout> tree: text boxes are leaves, groups nest.
            if isinstance(item, LTTextBox):
                self.write(
                    '<textbox id="%d" bbox="%s" />\n'
                    % (item.index, bbox2str(item.bbox)),
                )
            elif isinstance(item, LTTextGroup):
                self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    show_group(child)
                self.write("</textgroup>\n")

        def render(item: LTItem) -> None:
            # The order of the isinstance checks below is significant.
            child: LTItem
            if isinstance(item, LTPage):
                s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
                    item.pageid,
                    bbox2str(item.bbox),
                    item.rotate,
                )
                self.write(s)
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write("<layout>\n")
                    for group in item.groups:
                        show_group(group)
                    self.write("</layout>\n")
                self.write("</page>\n")
            elif isinstance(item, LTLine):
                s = '<line linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTRect):
                s = '<rect linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTCurve):
                s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                    item.get_pts(),
                )
                self.write(s)
            elif isinstance(item, LTFigure):
                s = f'<figure name="{item.name}" bbox="{bbox2str(item.bbox)}">\n'
                self.write(s)
                for child in item:
                    render(child)
                self.write("</figure>\n")
            elif isinstance(item, LTTextLine):
                self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    render(child)
                self.write("</textline>\n")
            elif isinstance(item, LTTextBox):
                wmode = ""
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                s = '<textbox id="%d" bbox="%s"%s>\n' % (
                    item.index,
                    bbox2str(item.bbox),
                    wmode,
                )
                self.write(s)
                for child in item:
                    render(child)
                self.write("</textbox>\n")
            elif isinstance(item, LTChar):
                s = (
                    '<text font="%s" bbox="%s" colourspace="%s" '
                    'ncolour="%s" size="%.3f">'
                    % (
                        enc(item.fontname),
                        bbox2str(item.bbox),
                        item.ncs.name,
                        item.graphicstate.ncolor,
                        item.size,
                    )
                )
                self.write(s)
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTText):
                self.write("<text>%s</text>\n" % item.get_text())
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write(
                        '<image src="%s" width="%d" height="%d" />\n'
                        % (enc(name), item.width, item.height),
                    )
                else:
                    self.write(
                        '<image width="%d" height="%d" />\n'
                        % (item.width, item.height),
                    )
            else:
                assert False, str(("Unhandled", item))

        render(ltpage)

    def close(self) -> None:
        """Write the XML footer."""
        self.write_footer()


class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF."""

    #   Where text is being extracted from a variety of types of PDF within a
    #   business process, those PDFs where the text is only present in image
    #   form will need to be analysed using an OCR tool which will typically
    #   output hOCR. This converter extracts the explicit text information from
    #   those PDFs that do have it and uses it to generate a basic hOCR
    #   representation that is designed to be used in conjunction with the image
    #   of the PDF in the same way as genuine OCR output would be, but without the
    #   inevitable OCR errors.

    #   The converter does not handle images, diagrams or text colors.

    #   In the examples processed by the contributor it was necessary to set
    #   LAParams.all_texts to True.

    # Control characters removed from text when ``stripcontrol`` is set.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: LAParams | None = None,
        stripcontrol: bool = False,
    ):
        """Set up the converter and immediately write the hOCR header."""
        PDFConverter.__init__(
            self,
            rsrcmgr,
            outfp,
            codec=codec,
            pageno=pageno,
            laparams=laparams,
        )
        self.stripcontrol = stripcontrol
        # True while characters are being accumulated into the working word.
        self.within_chars = False
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Format ``bbox`` as an hOCR ``bbox`` string relative to the page top.

        Uses ``self.page_bbox``, which ``receive_layout`` sets for each page.
        """
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write raw markup, encoded with ``codec`` when one is set."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening ``<html>``, the ``<head>`` metadata and ``<body>``."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec,
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en' lang='en'>\n",
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        self.write(
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />\n",
        )
        self.write(
            "<meta name='ocr-system' content='pdfminer.six HOCR Converter' />\n",
        )
        self.write(
            "  <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n",
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Write the commented-out debug script tag and close the document."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'></script--></body></html>\n",
        )

    def write_text(self, text: str) -> None:
        """Write ``text``, first dropping control characters if requested."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Write the accumulated word as an ``ocrx_word`` span and leave word mode.

        Nothing is written when the working text is empty; ``within_chars``
        is cleared in either case.  The working text itself is not reset.
        """
        if len(self.working_text) > 0:
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    (
                        self.working_font,
                        self.working_size,
                        bold_and_italic_styles,
                        self.bbox_repr(self.working_bbox),
                        self.working_font,
                        self.working_size,
                        self.working_text.strip(),
                    )
                ),
            )
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Write one ``ocr_page`` element describing ``ltpage``."""

        def render(item: LTItem) -> None:
            # An annotation item ends the word that is being accumulated.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % (self.bbox_repr(item.bbox)),
                )
                for child_line in item:
                    render(child_line)
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox)),
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                if not self.within_chars:
                    # First character of a new word.
                    self.within_chars = True
                    self.working_text = item.get_text()
                    self.working_bbox = item.bbox
                    self.working_font = item.fontname
                    self.working_size = item.size
                elif len(item.get_text().strip()) == 0:
                    # Whitespace ends the word and is passed through as is.
                    self.write_word()
                    self.write(item.get_text())
                else:
                    if (
                        self.working_bbox[1] != item.bbox[1]
                        or self.working_font != item.fontname
                        or self.working_size != item.size
                    ):
                        # Baseline, font or size changed: flush the old word
                        # and begin a new one with this character.
                        self.write_word()
                        # write_word() clears within_chars and keeps the old
                        # text; without the two resets below this character
                        # would be appended to already written text and then
                        # dropped when the next character restarts the word.
                        self.within_chars = True
                        self.working_text = ""
                        self.working_bbox = item.bbox
                        self.working_font = item.fontname
                        self.working_size = item.size
                    self.working_text += item.get_text()
                    # Extend the word's box to the right edge of this character.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Write the hOCR footer."""
        self.write_footer()


================================================
FILE: babeldoc/pdfminer/data_structures.py
================================================
from collections.abc import Iterable
from typing import Any

from babeldoc.pdfminer.pdfparser import PDFSyntaxError
from babeldoc.pdfminer.pdftypes import dict_value
from babeldoc.pdfminer.pdftypes import int_value
from babeldoc.pdfminer.pdftypes import list_value
from babeldoc.pdfminer.utils import choplist
from babeldoc.pdfminer import settings


class NumberTree:
    """A PDF number tree.

    See Section 3.8.6 of the PDF Reference.
    """

    def __init__(self, obj: Any):
        node = dict_value(obj)
        self._obj = node
        # Each attribute stays None when the node dictionary lacks the key.
        self.nums: Iterable[Any] | None = (
            list_value(node["Nums"]) if "Nums" in node else None
        )
        self.kids: Iterable[Any] | None = (
            list_value(node["Kids"]) if "Kids" in node else None
        )
        self.limits: Iterable[Any] | None = (
            list_value(node["Limits"]) if "Limits" in node else None
        )

    def _parse(self) -> list[tuple[int, Any]]:
        """Collect ``(key, value)`` pairs from this node and all of its kids."""
        pairs: list[tuple[int, Any]] = []

        # Leaf node: "Nums" alternates keys and values.
        if self.nums:
            pairs.extend(
                (int_value(key), value) for key, value in choplist(2, self.nums)
            )

        # Root or intermediate node: descend into every kid.
        if self.kids:
            for kid in self.kids:
                pairs.extend(NumberTree(kid)._parse())

        return pairs

    values: list[tuple[int, Any]]  # workaround decorators unsupported by mypy

    @property  # type: ignore[no-redef,misc]
    def values(self) -> list[tuple[int, Any]]:
        """All pairs of the tree, ordered by key.

        In strict mode an out-of-order tree raises ``PDFSyntaxError``;
        otherwise the pairs are sorted by key before being returned.
        """
        pairs = self._parse()

        if not settings.STRICT:
            pairs.sort(key=lambda pair: pair[0])
            return pairs

        if any(
            prev[0] > cur[0] for prev, cur in zip(pairs, pairs[1:], strict=False)
        ):
            raise PDFSyntaxError("Number tree elements are out of order")
        return pairs


================================================
FILE: babeldoc/pdfminer/encodingdb.py
================================================
import logging
import re
from collections.abc import Iterable
from typing import cast

from babeldoc.pdfminer.glyphlist import glyphname2unicode
from babeldoc.pdfminer.latin_enc import ENCODING
from babeldoc.pdfminer.pdfexceptions import PDFKeyError
from babeldoc.pdfminer.psparser import PSLiteral

HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")

log = logging.getLogger(__name__)


def name2unicode(name: str) -> str:
    """Converts Adobe glyph names to Unicode numbers.

    In contrast to the specification, this raises a KeyError instead of return
    an empty string when the key is unknown.
    This way the caller must explicitly define what to do
    when there is not a match.

    Everything after the first "." is dropped, and a name containing "_" is
    converted component by component.  A component is then resolved through
    the glyph list, as ``uni`` followed by groups of four hexadecimal digits,
    or as ``u`` followed by four to six hexadecimal digits.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :returns unicode character if name resembles something,
    otherwise a KeyError
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            'Could not convert unicode name "%s" to character because '
            "it should be of type str but is of type %s" % (name, type(name)),
        )

    name = name.split(".")[0]
    components = name.split("_")

    if len(components) > 1:
        return "".join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith("uni"):
        # Remove exactly the prefix; str.strip("uni") would also eat any
        # further leading or trailing "u", "n" and "i" characters.
        hex_digits = name[len("uni") :]

        # fullmatch() so that trailing non-hex characters are rejected here
        # instead of surfacing as a ValueError from int() below.
        if HEXADECIMAL.fullmatch(hex_digits) and len(hex_digits) % 4 == 0:
            unicode_digits = [
                int(hex_digits[i : i + 4], base=16)
                for i in range(0, len(hex_digits), 4)
            ]
            for digit in unicode_digits:
                raise_key_error_for_invalid_unicode(digit)
            return "".join(map(chr, unicode_digits))

    elif name.startswith("u"):
        hex_digits = name[1:]

        if HEXADECIMAL.fullmatch(hex_digits) and 4 <= len(hex_digits) <= 6:
            unicode_digit = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(unicode_digit)
            # Six hex digits can exceed the largest code point chr() accepts;
            # such names fall through to the PDFKeyError below.
            if unicode_digit <= 0x10FFFF:
                return chr(unicode_digit)

    raise PDFKeyError(
        'Could not convert unicode name "%s" to character because '
        "it does not match specification" % name,
    )


def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Reject values in the range D800 through DFFF, which UTF-16 reserves
    for surrogate pairs.

    :raises KeyError if unicode digit is invalid
    """
    in_surrogate_range = 0xD7FF < unicode_digit < 0xE000
    if not in_surrogate_range:
        return

    raise PDFKeyError(
        "Unicode digit %d is invalid because "
        "it is in the range D800 through DFFF" % unicode_digit,
    )


class EncodingDB:
    """Tables mapping character codes to unicode strings, one per encoding.

    The four tables are filled from the rows of ``ENCODING`` while the class
    body is executed, and are exposed by encoding name through ``encodings``.
    """

    # One code -> character table for each predefined encoding.
    std2unicode: dict[int, str] = {}
    mac2unicode: dict[int, str] = {}
    win2unicode: dict[int, str] = {}
    pdf2unicode: dict[int, str] = {}

    # Every ENCODING row is unpacked as a glyph name followed by one code per
    # table; a falsy code leaves that table without an entry for the glyph.
    # NOTE: this loop runs in the class body, so its variables stay behind as
    # class attributes; their names are kept as they are for that reason.
    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # Lookup of the tables above by encoding name.
    encodings = {
        "StandardEncoding": std2unicode,
        "MacRomanEncoding": mac2unicode,
        "WinAnsiEncoding": win2unicode,
        "PDFDocEncoding": pdf2unicode,
    }

    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Iterable[object] | None = None,
    ) -> dict[int, str]:
        """Return the code -> unicode table for ``name``, patched by ``diff``.

        :param name: key into ``encodings``; an unknown name falls back to
            ``std2unicode``.
        :param diff: optional sequence of overrides. An ``int`` entry sets the
            current code; a ``PSLiteral`` entry assigns the character for its
            name to the current code and then advances the code by one.
            Entries of any other type are ignored.
        :returns: the shared class-level table when ``diff`` is falsy,
            otherwise a patched copy of it.
        """
        base_table = cls.encodings.get(name, cls.std2unicode)
        if not diff:
            # No overrides: hand back the shared table itself, not a copy.
            return base_table

        patched = base_table.copy()
        code = 0
        for entry in diff:
            if isinstance(entry, int):
                code = entry
                continue
            if not isinstance(entry, PSLiteral):
                continue
            try:
                patched[code] = name2unicode(cast(str, entry.name))
            except (KeyError, ValueError) as e:
                # A glyph name that cannot be converted is only logged; the
                # code still advances below.
                log.debug(str(e))
            code += 1
        return patched


================================================
FILE: babeldoc/pdfminer/fontmetrics.py
================================================
"""Font metrics for the Adobe core 14 fonts.

Font metrics are used to compute the boundary of each character
written with a proportional font.

The following data were extracted from the AFM files:

  http://www.ctan.org/tex-archive/fonts/adobe/afm/

"""

###  BEGIN Verbatim copy of the license part

#
# Adobe Core 35 AFM Files with 314 Glyph Entries - ReadMe
#
# This file and the 35 PostScript(R) AFM files it accompanies may be
# used, copied, and distributed for any purpose and without charge,
# with or without modification, provided that all copyright notices
# are retained; that the AFM files are not distributed without this
# file; that all modifications to this file or any of the AFM files
# are prominently noted in the modified file(s); and that this
# paragraph is not modified. Adobe Systems has no responsibility or
# obligation to support the use of the AFM files.
#

###  END Verbatim copy of the license part

# flake8: noqa
from typing import Dict


def convert_font_metrics(path: str) -> None:
    """Convert an AFM file to a mapping of font metrics.

    Reads the file at ``path`` line by line and prints a Python
    ``FONT_METRICS`` literal to stdout. See below for the output.

    Recognised keys:

    * ``FontName`` starts a new font entry.
    * ``C`` records the width of a character whose code is in 0..255.
    * ``CapHeight``, ``XHeight``, ``ItalicAngle``, ``Ascender`` and
      ``Descender`` are stored as floats (the last two as ``Ascent`` and
      ``Descent``).
    * ``FamilyName`` and ``Weight`` are stored as ``FontFamily`` and
      ``FontWeight``.
    * ``IsFixedPitch true`` sets ``Flags`` to 64.
    * ``FontBBox`` is stored as a tuple of four floats.

    :param path: location of the AFM file to read.
    """
    fonts = {}
    with open(path) as fileinput:
        # Iterate the file directly rather than loading every line up front.
        for line in fileinput:
            # split() without an argument collapses runs of whitespace and
            # returns [] for a blank line, so the guard below really skips
            # blank lines and repeated spaces cannot shift the token indices.
            f = line.split()
            if not f:
                continue
            k = f[0]
            if k == "FontName":
                fontname = f[1]
                props = {"FontName": fontname, "Flags": 0}
                chars: Dict[int, int] = {}
                fonts[fontname] = (props, chars)
            elif k == "C":
                cid = int(f[1])
                # Only codes in 0..255 are kept; other entries are skipped.
                if 0 <= cid <= 255:
                    # In "C <code> ; WX <width> ; ..." the width is token 4.
                    width = int(f[4])
                    chars[cid] = width
            elif k in ("CapHeight", "XHeight", "ItalicAngle", "Ascender", "Descender"):
                k = {"Ascender": "Ascent", "Descender": "Descent"}.get(k, k)
                props[k] = float(f[1])
            elif k in ("FamilyName", "Weight"):
                k = {"FamilyName": "FontFamily", "Weight": "FontWeight"}.get(k, k)
                props[k] = f[1]
            elif k == "IsFixedPitch":
                if f[1].lower() == "true":
                    props["Flags"] = 64
            elif k == "FontBBox":
                props[k] = tuple(map(float, f[1:5]))

    # The file is fully parsed and closed before anything is emitted.
    print("# -*- python -*-")
    print("FONT_METRICS = {")
    for fontname, (props, chars) in fonts.items():
        print(f" {fontname!r}: {(props, chars)!r},")
    print("}")


FONT_METRICS = {
    "Courier": (
        {
            "FontName": "Courier",
            "Descent": -194.0,
            "FontBBox": (-6.0, -249.0, 639.0, 803.0),
            "FontWeight": "Medium",
            "CapHeight": 572.0,
            "FontFamily": "Courier",
            "Flags": 64,
            "XHeight": 434.0,
            "ItalicAngle": 0.0,
            "Ascent": 627.0,
        },
        {
            " ": 600,
            "!": 600,
            '"': 600,
            "#": 600,
            "$": 600,
            "%": 600,
            "&": 600,
            "'": 600,
            "(": 600,
            ")": 600,
            "*": 600,
            "+": 600,
            ",": 600,
            "-": 600,
            ".": 600,
            "/": 600,
            "0": 600,
            "1": 600,
            "2": 600,
            "3": 600,
            "4": 600,
            "5": 600,
            "6": 600,
            "7": 600,
            "8": 600,
            "9": 600,
            ":": 600,
            ";": 600,
            "<": 600,
            "=": 600,
            ">": 600,
            "?": 600,
            "@": 600,
            "A": 600,
            "B": 600,
            "C": 600,
            "D": 600,
            "E": 600,
            "F": 600,
            "G": 600,
            "H": 600,
            "I": 600,
            "J": 600,
            "K": 600,
            "L": 600,
            "M": 600,
            "N": 600,
            "O": 600,
            "P": 600,
            "Q": 600,
            "R": 600,
            "S": 600,
            "T": 600,
            "U": 600,
            "V": 600,
            "W": 600,
            "X": 600,
            "Y": 600,
            "Z": 600,
            "[": 600,
            "\\": 600,
            "]": 600,
            "^": 600,
            "_": 600,
            "`": 600,
            "a": 600,
            "b": 600,
            "c": 600,
            "d": 600,
            "e": 600,
            "f": 600,
            "g": 600,
            "h": 600,
            "i": 600,
            "j": 600,
            "k": 600,
            "l": 600,
            "m": 600,
            "n": 600,
            "o": 600,
            "p": 600,
            "q": 600,
            "r": 600,
            "s": 600,
            "t": 600,
            "u": 600,
            "v": 600,
            "w": 600,
            "x": 600,
            "y": 600,
            "z": 600,
            "{": 600,
            "|": 600,
            "}": 600,
            "~": 600,
            "\xa1": 600,
            "\xa2": 600,
            "\xa3": 600,
            "\xa4": 600,
            "\xa5": 600,
            "\xa6": 600,
            "\xa7": 600,
            "\xa8": 600,
            "\xa9": 600,
            "\xaa": 600,
            "\xab": 600,
            "\xac": 600,
            "\xae": 600,
            "\xaf": 600,
            "\xb0": 600,
            "\xb1": 600,
            "\xb2": 600,
            "\xb3": 600,
            "\xb4": 600,
            "\xb5": 600,
            "\xb6": 600,
            "\xb7": 600,
            "\xb8": 600,
            "\xb9": 600,
            "\xba": 600,
            "\xbb": 600,
            "\xbc": 600,
            "\xbd": 600,
            "\xbe": 600,
            "\xbf": 600,
            "\xc0": 600,
            "\xc1": 600,
            "\xc2": 600,
            "\xc3": 600,
            "\xc4": 600,
            "\xc5": 600,
            "\xc6": 600,
            "\xc7": 600,
            "\xc8": 600,
            "\xc9": 600,
            "\xca": 600,
            "\xcb": 600,
            "\xcc": 600,
            "\xcd": 600,
            "\xce": 600,
            "\xcf": 600,
            "\xd0": 600,
            "\xd1": 600,
            "\xd2": 600,
            "\xd3": 600,
            "\xd4": 600,
            "\xd5": 600,
            "\xd6": 600,
            "\xd7": 600,
            "\xd8": 600,
            "\xd9": 600,
            "\xda": 600,
            "\xdb": 600,
            "\xdc": 600,
            "\xdd": 600,
            "\xde": 600,
            "\xdf": 600,
            "\xe0": 600,
            "\xe1": 600,
            "\xe2": 600,
            "\xe3": 600,
            "\xe4": 600,
            "\xe5": 600,
            "\xe6": 600,
            "\xe7": 600,
            "\xe8": 600,
            "\xe9": 600,
            "\xea": 600,
            "\xeb": 600,
            "\xec": 600,
            "\xed": 600,
            "\xee": 600,
            "\xef": 600,
            "\xf0": 600,
            "\xf1": 600,
            "\xf2": 600,
            "\xf3": 600,
            "\xf4": 600,
            "\xf5": 600,
            "\xf6": 600,
            "\xf7": 600,
            "\xf8": 600,
            "\xf9": 600,
            "\xfa": 600,
            "\xfb": 600,
            "\xfc": 600,
            "\xfd": 600,
            "\xfe": 600,
            "\xff": 600,
            "\u0100": 600,
            "\u0101": 600,
            "\u0102": 600,
            "\u0103": 600,
            "\u0104": 600,
            "\u0105": 600,
            "\u0106": 600,
            "\u0107": 600,
            "\u010c": 600,
            "\u010d": 600,
            "\u010e": 600,
            "\u010f": 600,
            "\u0110": 600,
            "\u0111": 600,
            "\u0112": 600,
            "\u0113": 600,
            "\u0116": 600,
            "\u0117": 600,
            "\u0118": 600,
            "\u0119": 600,
            "\u011a": 600,
            "\u011b": 600,
            "\u011e": 600,
            "\u011f": 600,
            "\u0122": 600,
            "\u0123": 600,
            "\u012a": 600,
            "\u012b": 600,
            "\u012e": 600,
            "\u012f": 600,
            "\u0130": 600,
            "\u0131": 600,
            "\u0136": 600,
            "\u0137": 600,
            "\u0139": 600,
            "\u013a": 600,
            "\u013b": 600,
            "\u013c": 600,
            "\u013d": 600,
            "\u013e": 600,
            "\u0141": 600,
            "\u0142": 600,
            "\u0143": 600,
            "\u0144": 600,
            "\u0145": 600,
            "\u0146": 600,
            "\u0147": 600,
            "\u0148": 600,
            "\u014c": 600,
            "\u014d": 600,
            "\u0150": 600,
            "\u0151": 600,
            "\u0152": 600,
            "\u0153": 600,
            "\u0154": 600,
            "\u0155": 600,
            "\u0156": 600,
            "\u0157": 600,
            "\u0158": 600,
            "\u0159": 600,
            "\u015a": 600,
            "\u015b": 600,
            "\u015e": 600,
            "\u015f": 600,
            "\u0160": 600,
            "\u0161": 600,
            "\u0162": 600,
            "\u0163": 600,
            "\u0164": 600,
            "\u0165": 600,
            "\u016a": 600,
            "\u016b": 600,
            "\u016e": 600,
            "\u016f": 600,
            "\u0170": 600,
            "\u0171": 600,
            "\u0172": 600,
            "\u0173": 600,
            "\u0178": 600,
            "\u0179": 600,
            "\u017a": 600,
            "\u017b": 600,
            "\u017c": 600,
            "\u017d": 600,
            "\u017e": 600,
            "\u0192": 600,
            "\u0218": 600,
            "\u0219": 600,
            "\u02c6": 600,
            "\u02c7": 600,
            "\u02d8": 600,
            "\u02d9": 600,
            "\u02da": 600,
            "\u02db": 600,
            "\u02dc": 600,
            "\u02dd": 600,
            "\u2013": 600,
            "\u2014": 600,
            "\u2018": 600,
            "\u2019": 600,
            "\u201a": 600,
            "\u201c": 600,
            "\u201d": 600,
            "\u201e": 600,
            "\u2020": 600,
            "\u2021": 600,
            "\u2022": 600,
            "\u2026": 600,
            "\u2030": 600,
            "\u2039": 600,
            "\u203a": 600,
            "\u2044": 600,
            "\u2122": 600,
            "\u2202": 600,
            "\u2206": 600,
            "\u2211": 600,
            "\u2212": 600,
            "\u221a": 600,
            "\u2260": 600,
            "\u2264": 600,
            "\u2265": 600,
            "\u25ca": 600,
            "\uf6c3": 600,
            "\ufb01": 600,
            "\ufb02": 600,
        },
    ),
    "Courier-Bold": (
        {
            "FontName": "Courier-Bold",
            "Descent": -194.0,
            "FontBBox": (-88.0, -249.0, 697.0, 811.0),
            "FontWeight": "Bold",
            "CapHeight": 572.0,
            "FontFamily": "Courier",
            "Flags": 64,
            "XHeight": 434.0,
            "ItalicAngle": 0.0,
            "Ascent": 627.0,
        },
        {
            " ": 600,
            "!": 600,
            '"': 600,
            "#": 600,
            "$": 600,
            "%": 600,
            "&": 600,
            "'": 600,
            "(": 600,
            ")": 600,
            "*": 600,
            "+": 600,
            ",": 600,
            "-": 600,
            ".": 600,
            "/": 600,
            "0": 600,
            "1": 600,
            "2": 600,
            "3": 600,
            "4": 600,
            "5": 600,
            "6": 600,
            "7": 600,
            "8": 600,
            "9": 600,
            ":": 600,
            ";": 600,
            "<": 600,
            "=": 600,
            ">": 600,
            "?": 600,
            "@": 600,
            "A": 600,
            "B": 600,
            "C": 600,
            "D": 600,
            "E": 600,
            "F": 600,
            "G": 600,
            "H": 600,
            "I": 600,
            "J": 600,
            "K": 600,
            "L": 600,
            "M": 600,
            "N": 600,
            "O": 600,
            "P": 600,
            "Q": 600,
            "R": 600,
            "S": 600,
            "T": 600,
            "U": 600,
            "V": 600,
            "W": 600,
            "X": 600,
            "Y": 600,
            "Z": 600,
            "[": 600,
            "\\": 600,
            "]": 600,
            "^": 600,
            "_": 600,
            "`": 600,
            "a": 600,
            "b": 600,
            "c": 600,
            "d": 600,
            "e": 600,
            "f": 600,
            "g": 600,
            "h": 600,
            "i": 600,
            "j": 600,
            "k": 600,
            "l": 600,
            "m": 600,
            "n": 600,
            "o": 600,
            "p": 600,
            "q": 600,
            "r": 600,
            "s": 600,
            "t": 600,
            "u": 600,
            "v": 600,
            "w": 600,
            "x": 600,
            "y": 600,
            "z": 600,
            "{": 600,
            "|": 600,
            "}": 600,
            "~": 600,
            "\xa1": 600,
            "\xa2": 600,
            "\xa3": 600,
            "\xa4": 600,
            "\xa5": 600,
            "\xa6": 600,
            "\xa7": 600,
            "\xa8": 600,
            "\xa9": 600,
            "\xaa": 600,
            "\xab": 600,
            "\xac": 600,
            "\xae": 600,
            "\xaf": 600,
            "\xb0": 600,
            "\xb1": 600,
            "\xb2": 600,
            "\xb3": 600,
            "\xb4": 600,
            "\xb5": 600,
            "\xb6": 600,
            "\xb7": 600,
            "\xb8": 600,
            "\xb9": 600,
            "\xba": 600,
            "\xbb": 600,
            "\xbc": 600,
            "\xbd": 600,
            "\xbe": 600,
            "\xbf": 600,
            "\xc0": 600,
            "\xc1": 600,
            "\xc2": 600,
            "\xc3": 600,
            "\xc4": 600,
            "\xc5": 600,
            "\xc6": 600,
            "\xc7": 600,
            "\xc8": 600,
            "\xc9": 600,
            "\xca": 600,
            "\xcb": 600,
            "\xcc": 600,
            "\xcd": 600,
            "\xce": 600,
            "\xcf": 600,
            "\xd0": 600,
            "\xd1": 600,
            "\xd2": 600,
            "\xd3": 600,
            "\xd4": 600,
            "\xd5": 600,
            "\xd6": 600,
            "\xd7": 600,
            "\xd8": 600,
            "\xd9": 600,
            "\xda": 600,
            "\xdb": 600,
            "\xdc": 600,
            "\xdd": 600,
            "\xde": 600,
            "\xdf": 600,
            "\xe0": 600,
            "\xe1": 600,
            "\xe2": 600,
            "\xe3": 600,
            "\xe4": 600,
            "\xe5": 600,
            "\xe6": 600,
            "\xe7": 600,
            "\xe8": 600,
            "\xe9": 600,
            "\xea": 600,
            "\xeb": 600,
            "\xec": 600,
            "\xed": 600,
            "\xee": 600,
            "\xef": 600,
            "\xf0": 600,
            "\xf1": 600,
            "\xf2": 600,
            "\xf3": 600,
            "\xf4": 600,
            "\xf5": 600,
            "\xf6": 600,
            "\xf7": 600,
            "\xf8": 600,
            "\xf9": 600,
            "\xfa": 600,
            "\xfb": 600,
            "\xfc": 600,
            "\xfd": 600,
            "\xfe": 600,
            "\xff": 600,
            "\u0100": 600,
            "\u0101": 600,
            "\u0102": 600,
            "\u0103": 600,
            "\u0104": 600,
            "\u0105": 600,
            "\u0106": 600,
            "\u0107": 600,
            "\u010c": 600,
            "\u010d": 600,
            "\u010e": 600,
            "\u010f": 600,
            "\u0110": 600,
            "\u0111": 600,
            "\u0112": 600,
            "\u0113": 600,
            "\u0116": 600,
            "\u0117": 600,
            "\u0118": 600,
            "\u0119": 600,
            "\u011a": 600,
            "\u011b": 600,
            "\u011e": 600,
            "\u011f": 600,
            "\u0122": 600,
            "\u0123": 600,
            "\u012a": 600,
            "\u012b": 600,
            "\u012e": 600,
            "\u012f": 600,
            "\u0130": 600,
            "\u0131": 600,
            "\u0136": 600,
            "\u0137": 600,
            "\u0139": 600,
            "\u013a": 600,
            "\u013b": 600,
            "\u013c": 600,
            "\u013d": 600,
            "\u013e": 600,
            "\u0141": 600,
            "\u0142": 600,
            "\u0143": 600,
            "\u0144": 600,
            "\u0145": 600,
            "\u0146": 600,
            "\u0147": 600,
            "\u0148": 600,
            "\u014c": 600,
            "\u014d": 600,
            "\u0150": 600,
            "\u0151": 600,
            "\u0152": 600,
            "\u0153": 600,
            "\u0154": 600,
            "\u0155": 600,
            "\u0156": 600,
            "\u0157": 600,
            "\u0158": 600,
            "\u0159": 600,
            "\u015a": 600,
            "\u015b": 600,
            "\u015e": 600,
            "\u015f": 600,
            "\u0160": 600,
            "\u0161": 600,
            "\u0162": 600,
            "\u0163": 600,
            "\u0164": 600,
            "\u0165": 600,
            "\u016a": 600,
            "\u016b": 600,
            "\u016e": 600,
            "\u016f": 600,
            "\u0170": 600,
            "\u0171": 600,
            "\u0172": 600,
            "\u0173": 600,
            "\u0178": 600,
            "\u0179": 600,
            "\u017a": 600,
            "\u017b": 600,
            "\u017c": 600,
            "\u017d": 600,
            "\u017e": 600,
            "\u0192": 600,
            "\u0218": 600,
            "\u0219": 600,
            "\u02c6": 600,
            "\u02c7": 600,
            "\u02d8": 600,
            "\u02d9": 600,
            "\u02da": 600,
            "\u02db": 600,
            "\u02dc": 600,
            "\u02dd": 600,
            "\u2013": 600,
            "\u2014": 600,
            "\u2018": 600,
            "\u2019": 600,
            "\u201a": 600,
            "\u201c": 600,
            "\u201d": 600,
            "\u201e": 600,
            "\u2020": 600,
            "\u2021": 600,
            "\u2022": 600,
            "\u2026": 600,
            "\u2030": 600,
            "\u2039": 600,
            "\u203a": 600,
            "\u2044": 600,
            "\u2122": 600,
            "\u2202": 600,
            "\u2206": 600,
            "\u2211": 600,
            "\u2212": 600,
            "\u221a": 600,
            "\u2260": 600,
            "\u2264": 600,
            "\u2265": 600,
            "\u25ca": 600,
            "\uf6c3": 600,
            "\ufb01": 600,
            "\ufb02": 600,
        },
    ),
    "Courier-BoldOblique": (
        {
            "FontName": "Courier-BoldOblique",
            "Descent": -194.0,
            "FontBBox": (-49.0, -249.0, 758.0, 811.0),
            "FontWeight": "Bold",
            "CapHeight": 572.0,
            "FontFamily": "Courier",
            "Flags": 64,
            "XHeight": 434.0,
            "ItalicAngle": -11.0,
            "Ascent": 627.0,
        },
        {
            " ": 600,
            "!": 600,
            '"': 600,
            "#": 600,
            "$": 600,
            "%": 600,
            "&": 600,
            "'": 600,
            "(": 600,
            ")": 600,
            "*": 600,
            "+": 600,
            ",": 600,
            "-": 600,
            ".": 600,
            "/": 600,
            "0": 600,
            "1": 600,
            "2": 600,
            "3": 600,
            "4": 600,
            "5": 600,
            "6": 600,
            "7": 600,
            "8": 600,
            "9": 600,
            ":": 600,
            ";": 600,
            "<": 600,
            "=": 600,
            ">": 600,
            "?": 600,
            "@": 600,
            "A": 600,
            "B": 600,
            "C": 600,
            "D": 600,
            "E": 600,
            "F": 600,
            "G": 600,
            "H": 600,
            "I": 600,
            "J": 600,
            "K": 600,
            "L": 600,
            "M": 600,
            "N": 600,
            "O": 600,
            "P": 600,
            "Q": 600,
            "R": 600,
            "S": 600,
            "T": 600,
            "U": 600,
            "V": 600,
            "W": 600,
            "X": 600,
            "Y": 600,
            "Z": 600,
            "[": 600,
            "\\": 600,
            "]": 600,
            "^": 600,
            "_": 600,
            "`": 600,
            "a": 600,
            "b": 600,
            "c": 600,
            "d": 600,
            "e": 600,
            "f": 600,
            "g": 600,
            "h": 600,
            "i": 600,
            "j": 600,
            "k": 600,
            "l": 600,
            "m": 600,
            "n": 600,
            "o": 600,
            "p": 600,
            "q": 600,
            "r": 600,
            "s": 600,
            "t": 600,
            "u": 600,
            "v": 600,
            "w": 600,
            "x": 600,
            "y": 600,
            "z": 600,
            "{": 600,
            "|": 600,
            "}": 600,
            "~": 600,
            "\xa1": 600,
            "\xa2": 600,
            "\xa3": 600,
            "\xa4": 600,
            "\xa5": 600,
            "\xa6": 600,
            "\xa7": 600,
            "\xa8": 600,
            "\xa9": 600,
            "\xaa": 600,
            "\xab": 600,
            "\xac": 600,
            "\xae": 600,
            "\xaf": 600,
            "\xb0": 600,
            "\xb1": 600,
            "\xb2": 600,
            "\xb3": 600,
            "\xb4": 600,
            "\xb5": 600,
            "\xb6": 600,
            "\xb7": 600,
            "\xb8": 600,
            "\xb9": 600,
            "\xba": 600,
            "\xbb": 600,
            "\xbc": 600,
            "\xbd": 600,
            "\xbe": 600,
            "\xbf": 600,
            "\xc0": 600,
            "\xc1": 600,
            "\xc2": 600,
            "\xc3": 600,
            "\xc4": 600,
            "\xc5": 600,
            "\xc6": 600,
            "\xc7": 600,
            "\xc8": 600,
            "\xc9": 600,
            "\xca": 600,
            "\xcb": 600,
            "\xcc": 600,
            "\xcd": 600,
            "\xce": 600,
            "\xcf": 600,
            "\xd0": 600,
            "\xd1": 600,
            "\xd2": 600,
            "\xd3": 600,
            "\xd4": 600,
            "\xd5": 600,
            "\xd6": 600,
            "\xd7": 600,
            "\xd8": 600,
            "\xd9": 600,
            "\xda": 600,
            "\xdb": 600,
            "\xdc": 600,
            "\xdd": 600,
            "\xde": 600,
            "\xdf": 600,
            "\xe0": 600,
            "\xe1": 600,
            "\xe2": 600,
            "\xe3": 600,
            "\xe4": 600,
            "\xe5": 600,
            "\xe6": 600,
            "\xe7": 600,
            "\xe8": 600,
            "\xe9": 600,
            "\xea": 600,
            "\xeb": 600,
            "\xec": 600,
            "\xed": 600,
            "\xee": 600,
            "\xef": 600,
            "\xf0": 600,
            "\xf1": 600,
            "\xf2": 600,
            "\xf3": 600,
            "\xf4": 600,
            "\xf5": 600,
            "\xf6": 600,
            "\xf7": 600,
            "\xf8": 600,
            "\xf9": 600,
            "\xfa": 600,
            "\xfb": 600,
            "\xfc": 600,
            "\xfd": 600,
            "\xfe": 600,
            "\xff": 600,
            "\u0100": 600,
            "\u0101": 600,
            "\u0102": 600,
            "\u0103": 600,
            "\u0104": 600,
            "\u0105": 600,
            "\u0106": 600,
            "\u0107": 600,
            "\u010c": 600,
            "\u010d": 600,
            "\u010e": 600,
            "\u010f": 600,
            "\u0110": 600,
            "\u0111": 600,
            "\u0112": 600,
            "\u0113": 600,
            "\u0116": 600,
            "\u0117": 600,
            "\u0118": 600,
            "\u0119": 600,
            "\u011a": 600,
            "\u011b": 600,
            "\u011e": 600,
            "\u011f": 600,
            "\u0122": 600,
            "\u0123": 600,
            "\u012a": 600,
            "\u012b": 600,
            "\u012e": 600,
            "\u012f": 600,
            "\u0130": 600,
            "\u0131": 600,
            "\u0136": 600,
            "\u0137": 600,
            "\u0139": 600,
            "\u013a": 600,
            "\u013b": 600,
            "\u013c": 600,
            "\u013d": 600,
            "\u013e": 600,
            "\u0141": 600,
            "\u0142": 600,
            "\u0143": 600,
            "\u0144": 600,
            "\u0145": 600,
            "\u0146": 600,
            "\u0147": 600,
            "\u0148": 600,
            "\u014c": 600,
            "\u014d": 600,
            "\u0150": 600,
            "\u0151": 600,
            "\u0152": 600,
            "\u0153": 600,
            "\u0154": 600,
            "\u0155": 600,
            "\u0156": 600,
            "\u0157": 600,
            "\u0158": 600,
            "\u0159": 600,
            "\u015a": 600,
            "\u015b": 600,
            "\u015e": 600,
            "\u015f": 600,
            "\u0160": 600,
            "\u0161": 600,
            "\u0162": 600,
            "\u0163": 600,
            "\u0164": 600,
            "\u0165": 600,
            "\u016a": 600,
            "\u016b": 600,
            "\u016e": 600,
            "\u016f": 600,
            "\u0170": 600,
            "\u0171": 600,
            "\u0172": 600,
            "\u0173": 600,
            "\u0178": 600,
            "\u0179": 600,
            "\u017a": 600,
            "\u017b": 600,
            "\u017c": 600,
            "\u017d": 600,
            "\u017e": 600,
            "\u0192": 600,
            "\u0218": 600,
            "\u0219": 600,
            "\u02c6": 600,
            "\u02c7": 600,
            "\u02d8": 600,
            "\u02d9": 600,
            "\u02da": 600,
            "\u02db": 600,
            "\u02dc": 600,
            "\u02dd": 600,
            "\u2013": 600,
            "\u2014": 600,
            "\u2018": 600,
            "\u2019": 600,
            "\u201a": 600,
            "\u201c": 600,
            "\u201d": 600,
            "\u201e": 600,
            "\u2020": 600,
            "\u2021": 600,
            "\u2022": 600,
            "\u2026": 600,
            "\u2030": 600,
            "\u2039": 600,
            "\u203a": 600,
            "\u2044": 600,
            "\u2122": 600,
            "\u2202": 600,
            "\u2206": 600,
            "\u2211": 600,
            "\u2212": 600,
            "\u221a": 600,
            "\u2260": 600,
            "\u2264": 600,
            "\u2265": 600,
            "\u25ca": 600,
            "\uf6c3": 600,
            "\ufb01": 600,
            "\ufb02": 600,
        },
    ),
    "Courier-Oblique": (
        {
            "FontName": "Courier-Oblique",
            "Descent": -194.0,
            "FontBBox": (-49.0, -249.0, 749.0, 803.0),
            "FontWeight": "Medium",
            "CapHeight": 572.0,
            "FontFamily": "Courier",
            "Flags": 64,
            "XHeight": 434.0,
            "ItalicAngle": -11.0,
            "Ascent": 627.0,
        },
        {
            " ": 600,
            "!": 600,
            '"': 600,
            "#": 600,
            "$": 600,
            "%": 600,
            "&": 600,
            "'": 600,
            "(": 600,
            ")": 600,
            "*": 600,
            "+": 600,
            ",": 600,
            "-": 600,
            ".": 600,
            "/": 600,
            "0": 600,
            "1": 600,
            "2": 600,
            "3": 600,
            "4": 600,
            "5": 600,
            "6": 600,
            "7": 600,
            "8": 600,
            "9": 600,
            ":": 600,
            ";": 600,
            "<": 600,
            "=": 600,
            ">": 600,
            "?": 600,
            "@": 600,
            "A": 600,
            "B": 600,
            "C": 600,
            "D": 600,
            "E": 600,
            "F": 600,
            "G": 600,
            "H": 600,
            "I": 600,
            "J": 600,
            "K": 600,
            "L": 600,
            "M": 600,
            "N": 600,
            "O": 600,
            "P": 600,
            "Q": 600,
            "R": 600,
            "S": 600,
            "T": 600,
            "U": 600,
            "V": 600,
            "W": 600,
            "X": 600,
            "Y": 600,
            "Z": 600,
            "[": 600,
            "\\": 600,
            "]": 600,
            "^": 600,
            "_": 600,
            "`": 600,
            "a": 600,
            "b": 600,
            "c": 600,
            "d": 600,
            "e": 600,
            "f": 600,
            "g": 600,
            "h": 600,
            "i": 600,
            "j": 600,
            "k": 600,
            "l": 600,
            "m": 600,
            "n": 600,
            "o": 600,
            "p": 600,
            "q": 600,
            "r": 600,
            "s": 600,
            "t": 600,
            "u": 600,
            "v": 600,
            "w": 600,
            "x": 600,
            "y": 600,
            "z": 600,
            "{": 600,
            "|": 600,
            "}": 600,
            "~": 600,
            "\xa1": 600,
            "\xa2": 600,
            "\xa3": 600,
            "\xa4": 600,
            "\xa5": 600,
            "\xa6": 600,
            "\xa7": 600,
            "\xa8": 600,
            "\xa9": 600,
            "\xaa": 600,
            "\xab": 600,
            "\xac": 600,
            "\xae": 600,
            "\xaf": 600,
            "\xb0": 600,
            "\xb1": 600,
            "\xb2": 600,
            "\xb3": 600,
            "\xb4": 600,
            "\xb5": 600,
            "\xb6": 600,
            "\xb7": 600,
            "\xb8": 600,
            "\xb9": 600,
            "\xba": 600,
            "\xbb": 600,
            "\xbc": 600,
            "\xbd": 600,
            "\xbe": 600,
            "\xbf": 600,
            "\xc0": 600,
            "\xc1": 600,
            "\xc2": 600,
            "\xc3": 600,
            "\xc4": 600,
            "\xc5": 600,
            "\xc6": 600,
            "\xc7": 600,
            "\xc8": 600,
            "\xc9": 600,
            "\xca": 600,
            "\xcb": 600,
            "\xcc": 600,
            "\xcd": 600,
            "\xce": 600,
            "\xcf": 600,
            "\xd0": 600,
            "\xd1": 600,
            "\xd2": 600,
            "\xd3": 600,
            "\xd4": 600,
            "\xd5": 600,
            "\xd6": 600,
            "\xd7": 600,
            "\xd8": 600,
            "\xd9": 600,
            "\xda": 600,
            "\xdb": 600,
            "\xdc": 600,
            "\xdd": 600,
            "\xde": 600,
            "\xdf": 600,
            "\xe0": 600,
            "\xe1": 600,
            "\xe2": 600,
            "\xe3": 600,
            "\xe4": 600,
            "\xe5": 600,
            "\xe6": 600,
            "\xe7": 600,
            "\xe8": 600,
            "\xe9": 600,
            "\xea": 600,
            "\xeb": 600,
            "\xec": 600,
            "\xed": 600,
            "\xee": 600,
            "\xef": 600,
            "\xf0": 600,
            "\xf1": 600,
            "\xf2": 600,
            "\xf3": 600,
            "\xf4": 600,
            "\xf5": 600,
            "\xf6": 600,
            "\xf7": 600,
            "\xf8": 600,
            "\xf9": 600,
            "\xfa": 600,
            "\xfb": 600,
            "\xfc": 600,
            "\xfd": 600,
            "\xfe": 600,
            "\xff": 600,
            "\u0100": 600,
            "\u0101": 600,
            "\u0102": 600,
            "\u0103": 600,
            "\u0104": 600,
            "\u0105": 600,
            "\u0106": 600,
            "\u0107": 600,
            "\u010c": 600,
            "\u010d": 600,
            "\u010e": 600,
            "\u010f": 600,
            "\u0110": 600,
            "\u0111": 600,
            "\u0112": 600,
            "\u0113": 600,
            "\u0116": 600,
            "\u0117": 600,
            "\u0118": 600,
            "\u0119": 600,
            "\u011a": 600,
            "\u011b": 600,
            "\u011e": 600,
            "\u011f": 600,
            "\u0122": 600,
            "\u0123": 600,
            "\u012a": 600,
            "\u012b": 600,
            "\u012e": 600,
            "\u012f": 600,
            "\u0130": 600,
            "\u0131": 600,
            "\u0136": 600,
            "\u0137": 600,
            "\u0139": 600,
            "\u013a": 600,
            "\u013b": 600,
            "\u013c": 600,
            "\u013d": 600,
            "\u013e": 600,
            "\u0141": 600,
            "\u0142": 600,
            "\u0143": 600,
            "\u0144": 600,
            "\u0145": 600,
            "\u0146": 600,
            "\u0147": 600,
            "\u0148": 600,
            "\u014c": 600,
            "\u014d": 600,
            "\u0150": 600,
            "\u0151": 600,
            "\u0152": 600,
            "\u0153": 600,
            "\u0154": 600,
            "\u0155": 600,
            "\u0156": 600,
            "\u0157": 600,
            "\u0158": 600,
            "\u0159": 600,
            "\u015a": 600,
            "\u015b": 600,
            "\u015e": 600,
            "\u015f": 600,
            "\u0160": 600,
            "\u0161": 600,
            "\u0162": 600,
            "\u0163": 600,
            "\u0164": 600,
            "\u0165": 600,
            "\u016a": 600,
            "\u016b": 600,
            "\u016e": 600,
            "\u016f": 600,
            "\u0170": 600,
            "\u0171": 600,
            "\u0172": 600,
            "\u0173": 600,
            "\u0178": 600,
            "\u0179": 600,
            "\u017a": 600,
            "\u017b": 600,
            "\u017c": 600,
            "\u017d": 600,
            "\u017e": 600,
            "\u0192": 600,
            "\u0218": 600,
            "\u0219": 600,
            "\u02c6": 600,
            "\u02c7": 600,
            "\u02d8": 600,
            "\u02d9": 600,
            "\u02da": 600,
            "\u02db": 600,
            "\u02dc": 600,
            "\u02dd": 600,
            "\u2013": 600,
            "\u2014": 600,
            "\u2018": 600,
            "\u2019": 600,
            "\u201a": 600,
            "\u201c": 600,
            "\u201d": 600,
            "\u201e": 600,
            "\u2020": 600,
            "\u2021": 600,
            "\u2022": 600,
            "\u2026": 600,
            "\u2030": 600,
            "\u2039": 600,
            "\u203a": 600,
            "\u2044": 600,
            "\u2122": 600,
            "\u2202": 600,
            "\u2206": 600,
            "\u2211": 600,
            "\u2212": 600,
            "\u221a": 600,
            "\u2260": 600,
            "\u2264": 600,
            "\u2265": 600,
            "\u25ca": 600,
            "\uf6c3": 600,
            "\ufb01": 600,
            "\ufb02": 600,
        },
    ),
    "Helvetica": (
        {
            "FontName": "Helvetica",
            "Descent": -207.0,
            "FontBBox": (-166.0, -225.0, 1000.0, 931.0),
            "FontWeight": "Medium",
            "CapHeight": 718.0,
            "FontFamily": "Helvetica",
            "Flags": 0,
            "XHeight": 523.0,
            "ItalicAngle": 0.0,
            "Ascent": 718.0,
        },
        {
            " ": 278,
            "!": 278,
            '"': 355,
            "#": 556,
            "$": 556,
            "%": 889,
            "&": 667,
            "'": 191,
            "(": 333,
            ")": 333,
            "*": 389,
            "+": 584,
            ",": 278,
            "-": 333,
            ".": 278,
            "/": 278,
            "0": 556,
            "1": 556,
            "2": 556,
            "3": 556,
            "4": 556,
            "5": 556,
            "6": 556,
            "7": 556,
            "8": 556,
            "9": 556,
            ":": 278,
            ";": 278,
            "<": 584,
            "=": 584,
            ">": 584,
            "?": 556,
            "@": 1015,
            "A": 667,
            "B": 667,
            "C": 722,
            "D": 722,
            "E": 667,
            "F": 611,
            "G": 778,
            "H": 722,
            "I": 278,
            "J": 500,
            "K": 667,
            "L": 556,
            "M": 833,
            "N": 722,
            "O": 778,
            "P": 667,
            "Q": 778,
            "R": 722,
            "S": 667,
            "T": 611,
            "U": 722,
            "V": 667,
            "W": 944,
            "X": 667,
            "Y": 667,
            "Z": 611,
            "[": 278,
            "\\": 278,
            "]": 278,
            "^": 469,
            "_": 556,
            "`": 333,
            "a": 556,
            "b": 556,
            "c": 500,
            "d": 556,
            "e": 556,
            "f": 278,
            "g": 556,
            "h": 556,
            "i": 222,
            "j": 222,
            "k": 500,
            "l": 222,
            "m": 833,
            "n": 556,
            "o": 556,
            "p": 556,
            "q": 556,
            "r": 333,
            "s": 500,
            "t": 278,
            "u": 556,
            "v": 500,
            "w": 722,
            "x": 500,
            "y": 500,
            "z": 500,
            "{": 334,
            "|": 260,
            "}": 334,
            "~": 584,
            "\xa1": 333,
            "\xa2": 556,
            "\xa3": 556,
            "\xa4": 556,
            "\xa5": 556,
            "\xa6": 260,
            "\xa7": 556,
            "\xa8": 333,
            "\xa9": 737,
            "\xaa": 370,
            "\xab": 556,
            "\xac": 584,
            "\xae": 737,
            "\xaf": 333,
            "\xb0": 400,
            "\xb1": 584,
            "\xb2": 333,
            "\xb3": 333,
            "\xb4": 333,
            "\xb5": 556,
            "\xb6": 537,
            "\xb7": 278,
            "\xb8": 333,
            "\xb9": 333,
            "\xba": 365,
            "\xbb": 556,
            "\xbc": 834,
            "\xbd": 834,
            "\xbe": 834,
            "\xbf": 611,
            "\xc0": 667,
            "\xc1": 667,
            "\xc2": 667,
            "\xc3": 667,
            "\xc4": 667,
            "\xc5": 667,
            "\xc6": 1000,
            "\xc7": 722,
            "\xc8": 667,
            "\xc9": 667,
            "\xca": 667,
            "\xcb": 667,
            "\xcc": 278,
            "\xcd": 278,
            "\xce": 278,
            "\xcf": 278,
            "\xd0": 722,
            "\xd1": 722,
            "\xd2": 778,
            "\xd3": 778,
            "\xd4": 778,
            "\xd5": 778,
            "\xd6": 778,
            "\xd7": 584,
            "\xd8": 778,
            "\xd9": 722,
            "\xda": 722,
            "\xdb": 722,
            "\xdc": 722,
            "\xdd": 667,
            "\xde": 667,
            "\xdf": 611,
            "\xe0": 556,
            "\xe1": 556,
            "\xe2": 556,
            "\xe3": 556,
            "\xe4": 556,
            "\xe5": 556,
            "\xe6": 889,
            "\xe7": 500,
            "\xe8": 556,
            "\xe9": 556,
            "\xea": 556,
            "\xeb": 556,
            "\xec": 278,
            "\xed": 278,
            "\xee": 278,
            "\xef": 278,
            "\xf0": 556,
            "\xf1": 556,
            "\xf2": 556,
            "\xf3": 556,
            "\xf4": 556,
            "\xf5": 556,
            "\xf6": 556,
            "\xf7": 584,
            "\xf8": 611,
            "\xf9": 556,
            "\xfa": 556,
            "\xfb": 556,
            "\xfc": 556,
            "\xfd": 500,
            "\xfe": 556,
            "\xff": 500,
            "\u0100": 667,
            "\u0101": 556,
            "\u0102": 667,
            "\u0103": 556,
            "\u0104": 667,
            "\u0105": 556,
            "\u0106": 722,
            "\u0107": 500,
            "\u010c": 722,
            "\u010d": 500,
            "\u010e": 722,
            "\u010f": 643,
            "\u0110": 722,
            "\u0111": 556,
            "\u0112": 667,
            "\u0113": 556,
            "\u0116": 667,
            "\u0117": 556,
            "\u0118": 667,
            "\u0119": 556,
            "\u011a": 667,
            "\u011b": 556,
            "\u011e": 778,
            "\u011f": 556,
            "\u0122": 778,
            "\u0123": 556,
            "\u012a": 278,
            "\u012b": 278,
            "\u012e": 278,
            "\u012f": 222,
            "\u0130": 278,
            "\u0131": 278,
            "\u0136": 667,
            "\u0137": 500,
            "\u0139": 556,
            "\u013a": 222,
            "\u013b": 556,
            "\u013c": 222,
            "\u013d": 556,
            "\u013e": 299,
            "\u0141": 556,
            "\u0142": 222,
            "\u0143": 722,
            "\u0144": 556,
            "\u0145": 722,
            "\u0146": 556,
            "\u0147": 722,
            "\u0148": 556,
            "\u014c": 778,
            "\u014d": 556,
            "\u0150": 778,
            "\u0151": 556,
            "\u0152": 1000,
            "\u0153": 944,
            "\u0154": 722,
            "\u0155": 333,
            "\u0156": 722,
            "\u0157": 333,
            "\u0158": 722,
            "\u0159": 333,
            "\u015a": 667,
            "\u015b": 500,
            "\u015e": 667,
            "\u015f": 500,
            "\u0160": 667,
            "\u0161": 500,
            "\u0162": 611,
            "\u0163": 278,
            "\u0164": 611,
            "\u0165": 317,
            "\u016a": 722,
            "\u016b": 556,
            "\u016e": 722,
            "\u016f": 556,
            "\u0170": 722,
            "\u0171": 556,
            "\u0172": 722,
            "\u0173": 556,
            "\u0178": 667,
            "\u0179": 611,
            "\u017a": 500,
            "\u017b": 611,
            "\u017c": 500,
            "\u017d": 611,
            "\u017e": 500,
            "\u0192": 556,
            "\u0218": 667,
            "\u0219": 500,
            "\u02c6": 333,
            "\u02c7": 333,
            "\u02d8": 333,
            "\u02d9": 333,
            "\u02da": 333,
            "\u02db": 333,
            "\u02dc": 333,
            "\u02dd": 333,
            "\u2013": 556,
            "\u2014": 1000,
            "\u2018": 222,
            "\u2019": 222,
            "\u201a": 222,
            "\u201c": 333,
            "\u201d": 333,
            "\u201e": 333,
            "\u2020": 556,
            "\u2021": 556,
            "\u2022": 350,
            "\u2026": 1000,
            "\u2030": 1000,
            "\u2039": 333,
            "\u203a": 333,
            "\u2044": 167,
            "\u2122": 1000,
            "\u2202": 476,
            "\u2206": 612,
            "\u2211": 600,
            "\u2212": 584,
            "\u221a": 453,
            "\u2260": 549,
            "\u2264": 549,
            "\u2265": 549,
            "\u25ca": 471,
            "\uf6c3": 250,
            "\ufb01": 500,
            "\ufb02": 500,
        },
    ),
    "Helvetica-Bold": (
        {
            "FontName": "Helvetica-Bold",
            "Descent": -207.0,
            "FontBBox": (-170.0, -228.0, 1003.0, 962.0),
            "FontWeight": "Bold",
            "CapHeight": 718.0,
            "FontFamily": "Helvetica",
            "Flags": 0,
            "XHeight": 532.0,
            "ItalicAngle": 0.0,
            "Ascent": 718.0,
        },
        {
            " ": 278,
            "!": 333,
            '"': 474,
            "#": 556,
            "$": 556,
            "%": 889,
            "&": 722,
            "'": 238,
            "(": 333,
            ")": 333,
            "*": 389,
            "+": 584,
            ",": 278,
            "-": 333,
            ".": 278,
            "/": 278,
            "0": 556,
            "1": 556,
            "2": 556,
            "3": 556,
            "4": 556,
            "5": 556,
            "6": 556,
            "7": 556,
            "8": 556,
            "9": 556,
            ":": 333,
            ";": 333,
            "<": 584,
            "=": 584,
            ">": 584,
            "?": 611,
            "@": 975,
            "A": 722,
            "B": 722,
            "C": 722,
            "D": 722,
            "E": 667,
            "F": 611,
            "G": 778,
            "H": 722,
            "I": 278,
            "J": 556,
            "K": 722,
            "L": 611,
            "M": 833,
            "N": 722,
            "O": 778,
            "P": 667,
            "Q": 778,
            "R": 722,
            "S": 667,
            "T": 611,
            "U": 722,
            "V": 667,
            "W": 944,
            "X": 667,
            "Y": 667,
            "Z": 611,
            "[": 333,
            "\\": 278,
            "]": 333,
            "^": 584,
            "_": 556,
            "`": 333,
            "a": 556,
            "b": 611,
            "c": 556,
            "d": 611,
            "e": 556,
            "f": 333,
            "g": 611,
            "h": 611,
            "i": 278,
            "j": 278,
            "k": 556,
            "l": 278,
            "m": 889,
            "n": 611,
            "o": 611,
            "p": 611,
            "q": 611,
            "r": 389,
            "s": 556,
            "t": 333,
            "u": 611,
            "v": 556,
            "w": 778,
            "x": 556,
            "y": 556,
            "z": 500,
            "{": 389,
            "|": 280,
            "}": 389,
            "~": 584,
            "\xa1": 333,
            "\xa2": 556,
            "\xa3": 556,
            "\xa4": 556,
            "\xa5": 556,
            "\xa6": 280,
            "\xa7": 556,
            "\xa8": 333,
            "\xa9": 737,
            "\xaa": 370,
            "\xab": 556,
            "\xac": 584,
            "\xae": 737,
            "\xaf": 333,
            "\xb0": 400,
            "\xb1": 584,
            "\xb2": 333,
            "\xb3": 333,
            "\xb4": 333,
            "\xb5": 611,
            "\xb6": 556,
            "\xb7": 278,
            "\xb8": 333,
            "\xb9": 333,
            "\xba": 365,
            "\xbb": 556,
            "\xbc": 834,
            "\xbd": 834,
            "\xbe": 834,
            "\xbf": 611,
            "\xc0": 722,
            "\xc1": 722,
            "\xc2": 722,
            "\xc3": 722,
            "\xc4": 722,
            "\xc5": 722,
            "\xc6": 1000,
            "\xc7": 722,
            "\xc8": 667,
            "\xc9": 667,
            "\xca": 667,
            "\xcb": 667,
            "\xcc": 278,
            "\xcd": 278,
            "\xce": 278,
            "\xcf": 278,
            "\xd0": 722,
            "\xd1": 722,
            "\xd2": 778,
            "\xd3": 778,
            "\xd4": 778,
            "\xd5": 778,
            "\xd6": 778,
            "\xd7": 584,
            "\xd8": 778,
            "\xd9": 722,
            "\xda": 722,
            "\xdb": 722,
            "\xdc": 722,
            "\xdd": 667,
            "\xde": 667,
            "\xdf": 611,
            "\xe0": 556,
            "\xe1": 556,
            "\xe2": 556,
            "\xe3": 556,
            "\xe4": 556,
            "\xe5": 556,
            "\xe6": 889,
            "\xe7": 556,
            "\xe8": 556,
            "\xe9": 556,
            "\xea": 556,
            "\xeb": 556,
            "\xec": 278,
            "\xed": 278,
            "\xee": 278,
            "\xef": 278,
            "\xf0": 611,
            "\xf1": 611,
            "\xf2": 611,
            "\xf3": 611,
            "\xf4": 611,
            "\xf5": 611,
            "\xf6": 611,
            "\xf7": 584,
            "\xf8": 611,
            "\xf9": 611,
            "\xfa": 611,
            "\xfb": 611,
            "\xfc": 611,
            "\xfd": 556,
            "\xfe": 611,
            "\xff": 556,
            "\u0100": 722,
            "\u0101": 556,
            "\u0102": 722,
            "\u0103": 556,
            "\u0104": 722,
            "\u0105": 556,
            "\u0106": 722,
            "\u0107": 556,
            "\u010c": 722,
            "\u010d": 556,
            "\u010e": 722,
            "\u010f": 743,
            "\u0110": 722,
            "\u0111": 611,
            "\u0112": 667,
            "\u0113": 556,
            "\u0116": 667,
            "\u0117": 556,
            "\u0118": 667,
            "\u0119": 556,
            "\u011a": 667,
            "\u011b": 556,
            "\u011e": 778,
            "\u011f": 611,
            "\u0122": 778,
            "\u0123": 611,
            "\u012a": 278,
            "\u012b": 278,
            "\u012e": 278,
            "\u012f": 278,
            "\u0130": 278,
            "\u0131": 278,
            "\u0136": 722,
            "\u0137": 556,
            "\u0139": 611,
            "\u013a": 278,
            "\u013b": 611,
            "\u013c": 278,
            "\u013d": 611,
            "\u013e": 400,
            "\u0141": 611,
            "\u0142": 278,
            "\u0143": 722,
            "\u0144": 611,
            "\u0145": 722,
            "\u0146": 611,
            "\u0147": 722,
            "\u0148": 611,
            "\u014c": 778,
            "\u014d": 611,
            "\u0150": 778,
            "\u0151": 611,
            "\u0152": 1000,
            "\u0153": 944,
            "\u0154": 722,
            "\u0155": 389,
            "\u0156": 722,
            "\u0157": 389,
            "\u0158": 722,
            "\u0159": 389,
            "\u015a": 667,
            "\u015b": 556,
            "\u015e": 667,
            "\u015f": 556,
            "\u0160": 667,
            "\u0161": 556,
            "\u0162": 611,
            "\u0163": 333,
            "\u0164": 611,
            "\u0165": 389,
            "\u016a": 722,
            "\u016b": 611,
            "\u016e": 722,
            "\u016f": 611,
            "\u0170": 722,
            "\u0171": 611,
            "\u0172": 722,
            "\u0173": 611,
            "\u0178": 667,
            "\u0179": 611,
            "\u017a": 500,
            "\u017b": 611,
            "\u017c": 500,
            "\u017d": 611,
            "\u017e": 500,
            "\u0192": 556,
            "\u0218": 667,
            "\u0219": 556,
            "\u02c6": 333,
            "\u02c7": 333,
            "\u02d8": 333,
            "\u02d9": 333,
            "\u02da": 333,
            "\u02db": 333,
            "\u02dc": 333,
            "\u02dd": 333,
            "\u2013": 556,
            "\u2014": 1000,
            "\u2018": 278,
            "\u2019": 278,
            "\u201a": 278,
            "\u201c": 500,
            "\u201d": 500,
            "\u201e": 500,
            "\u2020": 556,
            "\u2021": 556,
            "\u2022": 350,
            "\u2026": 1000,
            "\u2030": 1000,
            "\u2039": 333,
            "\u203a": 333,
            "\u2044": 167,
            "\u2122": 1000,
            "\u2202": 494,
            "\u2206": 612,
            "\u2211": 600,
            "\u2212": 584,
            "\u221a": 549,
            "\u2260": 549,
            "\u2264": 549,
            "\u2265": 549,
            "\u25ca": 494,
            "\uf6c3": 250,
            "\ufb01": 611,
            "\ufb02": 611,
        },
    ),
    "Helvetica-BoldOblique": (
        {
            "FontName": "Helvetica-BoldOblique",
            "Descent": -207.0,
            "FontBBox": (-175.0, -228.0, 1114.0, 962.0),
            "FontWeight": "Bold",
            "CapHeight": 718.0,
            "FontFamily": "Helvetica",
            "Flags": 0,
            "XHeight": 532.0,
            "ItalicAngle": -12.0,
            "Ascent": 718.0,
        },
        {
            " ": 278,
            "!": 333,
            '"': 474,
            "#": 556,
            "$": 556,
            "%": 889,
            "&": 722,
            "'": 238,
            "(": 333,
            ")": 333,
            "*": 389,
            "+": 584,
            ",": 278,
            "-": 333,
            ".": 278,
            "/": 278,
            "0": 556,
            "1": 556,
            "2": 556,
            "3": 556,
            "4": 556,
            "5": 556,
            "6": 556,
            "7": 556,
            "8": 556,
            "9": 556,
            ":": 333,
            ";": 333,
            "<": 584,
            "=": 584,
            ">": 584,
            "?": 611,
            "@": 975,
            "A": 722,
            "B": 722,
            "C": 722,
            "D": 722,
            "E": 667,
            "F": 611,
            "G": 778,
            "H": 722,
            "I": 278,
            "J": 556,
            "K": 722,
            "L": 611,
            "M": 833,
            "N": 722,
            "O": 778,
            "P": 667,
            "Q": 778,
            "R": 722,
            "S": 667,
            "T": 611,
            "U": 722,
            "V": 667,
            "W": 944,
            "X": 667,
            "Y": 667,
            "Z": 611,
            "[": 333,
            "\\": 278,
            "]": 333,
            "^": 584,
            "_": 556,
            "`": 333,
            "a": 556,
            "b": 611,
            "c": 556,
            "d": 611,
            "e": 556,
            "f": 333,
            "g": 611,
            "h": 611,
            "i": 278,
            "j": 278,
            "k": 556,
            "l": 278,
            "m": 889,
            "n": 611,
            "o": 611,
            "p": 611,
            "q": 611,
            "r": 389,
            "s": 556,
            "t": 333,
            "u": 611,
            "v": 556,
            "w": 778,
            "x": 556,
            "y": 556,
            "z": 500,
            "{": 389,
            "|": 280,
            "}": 389,
            "~": 584,
            "\xa1": 333,
            "\xa2": 556,
            "\xa3": 556,
            "\xa4": 556,
            "\xa5": 556,
            "\xa6": 280,
            "\xa7": 556,
            "\xa8": 333,
            "\xa9": 737,
            "\xaa": 370,
            "\xab": 556,
            "\xac": 584,
            "\xae": 737,
            "\xaf": 333,
            "\xb0": 400,
            "\xb1": 584,
            "\xb2": 333,
            "\xb3": 333,
            "\xb4": 333,
            "\xb5": 611,
            "\xb6": 556,
            "\xb7": 278,
            "\xb8": 333,
            "\xb9": 333,
            "\xba": 365,
            "\xbb": 556,
            "\xbc": 834,
            "\xbd": 834,
            "\xbe": 834,
            "\xbf": 611,
            "\xc0": 722,
            "\xc1": 722,
            "\xc2": 722,
            "\xc3": 722,
            "\xc4": 722,
            "\xc5": 722,
            "\xc6": 1000,
            "\xc7": 722,
            "\xc8": 667,
            "\xc9": 667,
            "\xca": 667,
            "\xcb": 667,
            "\xcc": 278,
            "\xcd": 278,
            "\xce": 278,
            "\xcf": 278,
            "\xd0": 722,
            "\xd1": 722,
            "\xd2": 778,
            "\xd3": 778,
            "\xd4": 778,
            "\xd5": 778,
            "\xd6": 778,
            "\xd7": 584,
            "\xd8": 778,
            "\xd9": 722,
            "\xda": 722,
            "\xdb": 722,
            "\xdc": 722,
            "\xdd": 667,
            "\xde": 667,
            "\xdf": 611,
            "\xe0": 556,
            "\xe1": 556,
            "\xe2": 556,
            "\xe3": 556,
            "\xe4": 556,
            "\xe5": 556,
            "\xe6": 889,
            "\xe7": 556,
            "\xe8": 556,
            "\xe9": 556,
            "\xea": 556,
            "\xeb": 556,
            "\xec": 278,
            "\xed": 278,
            "\xee": 278,
            "\xef": 278,
            "\xf0": 611,
            "\xf1": 611,
            "\xf2": 611,
            "\xf3": 611,
            "\xf4": 611,
            "\xf5": 611,
            "\xf6": 611,
            "\xf7": 584,
            "\xf8": 611,
            "\xf9": 611,
            "\xfa": 611,
            "\xfb": 611,
            "\xfc": 611,
            "\xfd": 556,
            "\xfe": 611,
            "\xff": 556,
            "\u0100": 722,
            "\u0101": 556,
            "\u0102": 722,
            "\u0103": 556,
            "\u0104": 722,
            "\u0105": 556,
            "\u0106": 722,
            "\u0107": 556,
            "\u010c": 722,
            "\u010d": 556,
            "\u010e": 722,
            "\u010f": 743,
            "\u0110": 722,
            "\u0111": 611,
            "\u0112": 667,
            "\u0113": 556,
            "\u0116": 667,
            "\u0117": 556,
            "\u0118": 667,
            "\u0119": 556,
            "\u011a": 667,
            "\u011b": 556,
            "\u011e": 778,
            "\u011f": 611,
            "\u0122": 778,
            "\u0123": 611,
            "\u012a": 278,
            "\u012b": 278,
            "\u012e": 278,
            "\u012f": 278,
            "\u0130": 278,
            "\u0131": 278,
            "\u0136": 722,
            "\u0137": 556,
            "\u0139": 611,
            "\u013a": 278,
            "\u013b": 611,
            "\u013c": 278,
            "\u013d": 611,
            "\u013e": 400,
            "\u0141": 611,
            "\u0142": 278,
            "\u0143": 722,
            "\u0144": 611,
            "\u0145": 722,
            "\u0146": 611,
            "\u0147": 722,
            "\u0148": 611,
            "\u014c": 778,
            "\u014d": 611,
            "\u0150": 778,
            "\u0151": 611,
            "\u0152": 1000,
            "\u0153": 944,
            "\u0154": 722,
            "\u0155": 389,
            "\u0156": 722,
            "\u0157": 389,
            "\u0158": 722,
            "\u0159": 389,
            "\u015a": 667,
            "\u015b": 556,
            "\u015e": 667,
            "\u015f": 556,
            "\u0160": 667,
            "\u0161": 556,
            "\u0162": 611,
            "\u0163": 333,
            "\u0164": 611,
            "\u0165": 389,
            "\u016a": 722,
            "\u016b": 611,
            "\u016e": 722,
            "\u016f": 611,
            "\u0170": 722,
            "\u0171": 611,
            "\u0172": 722,
            "\u0173": 611,
            "\u0178": 667,
            "\u0179": 611,
            "\u017a": 500,
            "\u017b": 611,
            "\u017c": 500,
            "\u017d": 611,
            "\u017e": 500,
            "\u0192": 556,
            "\u0218": 667,
            "\u0219": 556,
            "\u02c6": 333,
            "\u02c7": 333,
            "\u02d8": 333,
            "\u02d9": 333,
            "\u02da": 333,
            "\u02db": 333,
            "\u02dc": 333,
            "\u02dd": 333,
            "\u2013": 556,
            "\u2014": 1000,
            "\u2018": 278,
            "\u2019": 278,
            "\u201a": 278,
            "\u201c": 500,
            "\u201d": 500,
            "\u201e": 500,
            "\u2020": 556,
            "\u2021": 556,
            "\u2022": 350,
            "\u2026": 1000,
            "\u2030": 1000,
            "\u2039": 333,
            "\u203a": 333,
            "\u2044": 167,
            "\u2122": 1000,
            "\u2202": 494,
            "\u2206": 612,
            "\u2211": 600,
            "\u2212": 584,
            "\u221a": 549,
            "\u2260": 549,
            "\u2264": 549,
            "\u2265": 549,
            "\u25ca": 494,
            "\uf6c3": 250,
            "\ufb01": 611,
            "\ufb02": 611,
        },
    ),
    "Helvetica-Oblique": (
        {
            "FontName": "Helvetica-Oblique",
            "Descent": -207.0,
            "FontBBox": (-171.0, -225.0, 1116.0, 931.0),
            "FontWeight": "Medium",
            "CapHeight": 718.0,
            "FontFamily": "Helvetica",
            "Flags": 0,
            "XHeight": 523.0,
            "ItalicAngle": -12.0,
            "Ascent": 718.0,
        },
        {
            " ": 278,
            "!": 278,
            '"': 355,
            "#": 556,
            "$": 556,
            "%": 889,
            "&": 667,
            "'": 191,
            "(": 333,
            ")": 333,
            "*": 389,
            "+": 584,
            ",": 278,
            "-": 333,
            ".": 278,
            "/": 278,
            "0": 556,
            "1": 556,
            "2": 556,
            "3": 556,
            "4": 556,
            "5": 556,
            "6": 556,
            "7": 556,
            "8": 556,
            "9": 556,
            ":": 278,
            ";": 278,
            "<": 584,
            "=": 584,
            ">": 584,
            "?": 556,
            "@": 1015,
            "A": 667,
            "B": 667,
            "C": 722,
            "D": 722,
            "E": 667,
            "F": 611,
            "G": 778,
            "H": 722,
            "I": 278,
            "J": 500,
            "K": 667,
            "L": 556,
            "M": 833,
            "N": 722,
            "O": 778,
            "P": 667,
            "Q": 778,
            "R": 722,
            "S": 667,
            "T": 611,
            "U": 722,
            "V": 667,
            "W": 944,
            "X": 667,
            "Y": 667,
            "Z": 611,
            "[": 278,
            "\\": 278,
            "]": 278,
            "^": 469,
            "_": 556,
            "`": 333,
            "a": 556,
            "b": 556,
            "c": 500,
            "d": 556,
            "e": 556,
            "f": 278,
            "g": 556,
            "h": 556,
            "i": 222,
            "j": 222,
            "k": 500,
            "l": 222,
            "m": 833,
            "n": 556,
            "o": 556,
            "p": 556,
            "q": 556,
            "r": 333,
            "s": 500,
            "t": 278,
            "u": 556,
            "v": 500,
            "w": 722,
            "x": 500,
            "y": 500,
            "z": 500,
            "{": 334,
            "|": 260,
            "}": 334,
            "~": 584,
            "\xa1": 333,
            "\xa2": 556,
            "\xa3": 556,
            "\xa4": 556,
            "\xa5": 556,
            "\xa6": 260,
            "\xa7": 556,
            "\xa8": 333,
            "\xa9": 737,
            "\xaa": 370,
            "\xab": 556,
            "\xac": 584,
            "\xae": 737,
            "\xaf": 333,
            "\xb0": 400,
            "\xb1": 584,
            "\xb2": 333,
            "\xb3": 333,
            "\xb4": 333,
            "\xb5": 556,
            "\xb6": 537,
            "\xb7": 278,
            "\xb8": 333,
            "\xb9": 333,
            "\xba": 365,
            "\xbb": 556,
            "\xbc": 834,
            "\xbd": 834,
            "\xbe": 834,
            "\xbf": 611,
            "\xc0": 667,
            "\xc1": 667,
            "\xc2": 667,
            "\xc3": 667,
            "\xc4": 667,
            "\xc5": 667,
            "\xc6": 1000,
            "\xc7": 722,
            "\xc8": 667,
            "\xc9": 667,
            "\xca": 667,
            "\xcb": 667,
            "\xcc": 278,
            "\xcd": 278,
            "\xce": 278,
            "\xcf": 278,
            "\xd0": 722,
            "\xd1": 722,
            "\xd2": 778,
            "\xd3": 778,
            "\xd4": 778,
            "\xd5": 778,
            "\xd6": 778,
            "\xd7": 584,
            "\xd8": 778,
            "\xd9": 722,
            "\xda": 722,
            "\xdb": 722,
            "\xdc": 722,
            "\xdd": 667,
            "\xde": 667,
            "\xdf": 611,
            "\xe0": 556,
            "\xe1": 556,
            "\xe2": 556,
            "\xe3": 556,
            "\xe4": 556,
            "\xe5": 556,
            "\xe6": 889,
            "\xe7": 500,
            "\xe8": 556,
            "\xe9": 556,
            "\xea": 556,
            "\xeb": 556,
            "\xec": 278,
            "\xed": 278,
            "\xee": 278,
            "\xef": 278,
            "\xf0": 556,
            "\xf1": 556,
            "\xf2": 556,
            "\xf3": 556,
            "\xf4": 556,
            "\xf5": 556,
            "\xf6": 556,
            "\xf7": 584,
            "\xf8": 611,
            "\xf9": 556,
            "\xfa": 556,
            "\xfb": 556,
            "\xfc": 556,
            "\xfd": 500,
            "\xfe": 556,
            "\xff": 500,
            "\u0100": 667,
            "\u0101": 556,
            "\u0102": 667,
            "\u0103": 556,
            "\u0104": 667,
            "\u0105": 556,
            "\u0106": 722,
            "\u0107": 500,
            "\u010c": 722,
            "\u010d": 500,
            "\u010e": 722,
            "\u010f": 643,
            "\u0110": 722,
            "\u0111": 556,
            "\u0112": 667,
            "\u0113": 556,
            "\u0116": 667,
            "\u0117": 556,
            "\u0118": 667,
            "\u0119": 556,
            "\u011a": 667,
            "\u011b": 556,
            "\u011e": 778,
            "\u011f": 556,
            "\u0122": 778,
            "\u0123": 556,
            "\u012a": 278,
            "\u012b": 278,
            "\u012e": 278,
            "\u012f": 222,
            "\u0130": 278,
            "\u0131": 278,
            "\u0136": 667,
            "\u0137": 500,
            "\u0139": 556,
            "\u013a": 222,
            "\u013b": 556,
            "\u013c": 222,
            "\u013d": 556,
            "\u013e": 299,
            "\u0141": 556,
            "\u0142": 222,
            "\u0143": 722,
            "\u0144": 556,
            "\u0145": 722,
            "\u0146": 556,
            "\u0147": 722,
            "\u0148": 556,
            "\u014c": 778,
            "\u014d": 556,
            "\u0150": 778,
            "\u0151": 556,
            "\u0152": 1000,
            "\u0153": 944,
            "\u0154": 722,
            "\u0155": 333,
            "\u0156": 722,
            "\u0157": 333,
            "\u0158": 722,
            "\u0159": 333,
            "\u015a": 667,
            "\u015b": 500,
            "\u015e": 667,
            "\u015f": 500,
            "\u0160": 667,
            "\u0161": 500,
            "\u0162": 611,
            "\u0163": 278,
            "\u0164": 611,
            "\u0165": 317,
            "\u016a": 722,
            "\u016b": 556,
            "\u016e": 722,
            "\u016f": 556,
            "\u0170": 722,
            "\u0171": 556,
            "\u0172": 722,
            "\u0173": 556,
            "\u0178": 667,
            "\u0179": 611,
            "\u017a": 500,
            "\u017b": 611,
            "\u017c": 500,
            "\u017d": 611,
            "\u017e": 500,
            "\u0192": 556,
            "\u0218": 667,
            "\u0219": 500,
            "\u02c6": 333,
            "\u02c7": 333,
            "\u02d8": 333,
            "\u02d9": 333,
            "\u02da": 333,
            "\u02db": 333,
            "\u02dc": 333,
            "\u02dd": 333,
            "\u2013": 556,
            "\u2014": 1000,
            "\u2018": 222,
            "\u2019": 222,
            "\u201a": 222,
            "\u201c": 333,
            "\u201d": 333,
            "\u201e": 333,
            "\u2020": 556,
            "\u2021": 556,
            "\u2022": 350,
            "\u2026": 1000,
            "\u2030": 1000,
            "\u2039": 333,
            "\u203a": 333,
            "\u2044": 167,
            "\u2122": 1000,
            "\u2202": 476,
            "\u2206": 612,
            "\u2211": 600,
            "\u2212": 584,
            "\u221a": 453,
            "\u2260": 549,
            "\u2264": 549,
            "\u2265": 549,
            "\u25ca": 471,
            "\uf6c3": 250,
            "\ufb01": 500,
            "\ufb02": 500,
        },
    ),
    "Symbol": (
        {
            "FontName": "Symbol",
            "FontBBox": (-180.0, -293.0, 1090.0, 1010.0),
            "FontWeight": "Medium",
            "FontFamily": "Symbol",
            "Flags": 0,
            "ItalicAngle": 0.0,
        },
        {
            " ": 250,
            "!": 333,
            "#": 500,
            "%": 833,
            "&": 778,
            "(": 333,
            ")": 333,
            "+": 549,
            ",": 250,
            ".": 250,
            "/": 278,
            "0": 500,
            "1": 500,
            "2": 500,
            "3": 500,
            "4": 500,
            "5": 500,
            "6": 500,
            "7": 500,
            "8": 500,
            "9": 500,
            ":": 278,
            ";": 278,
            "<": 549,
            "=": 549,
            ">": 549,
            "?": 444,
            "[": 333,
            "]": 333,
            "_": 500,
            "{": 480,
            "|": 200,
            "}": 480,
            "\xac": 713,
            "\xb0": 400,
            "\xb1": 549,
            "\xb5": 576,
            "\xd7": 549,
            "\xf7": 549,
            "\u0192": 500,
            "\u0391": 722,
            "\u0392": 667,
            "\u0393": 603,
            "\u0395": 611,
            "\u0396": 611,
            "\u0397": 722,
            "\u0398": 741,
            "\u0399": 333,
            "\u039a": 722,
            "\u039b": 686,
            "\u039c": 889,
            "\u039d": 722,
            "\u039e": 645,
            "\u039f": 722,
            "\u03a0": 768,
            "\u03a1": 556,
            "\u03a3": 592,
            "\u03a4": 611,
            "\u03a5": 690,
            "\u03a6": 763,
            "\u03a7": 722,
            "\u03a8": 795,
            "\u03b1": 631,
            "\u03b2": 549,
            "\u03b3": 411,
            "\u03b4": 494,
            "\u03b5": 439,
            "\u03b6": 494,
            "\u03b7": 603,
            "\u03b8": 521,
            "\u03b9": 329,
            "\u03ba": 549,
            "\u03bb": 549,
            "\u03bd": 521,
            "\u03be": 493,
            "\u03bf": 549,
            "\u03c0": 549,
            "\u03c1": 549,
            "\u03c2": 439,
            "\u03c3": 603,
            "\u03c4": 439,
            "\u03c5": 576,
            "\u03c6": 521,
            "\u03c7": 549,
            "\u03c8": 686,
            "\u03c9": 686,
            "\u03d1": 631,
            "\u03d2": 620,
            "\u03d5": 603,
            "\u03d6": 713,
            "\u2022": 460,
            "\u2026": 1000,
            "\u2032": 247,
            "\u2033": 411,
            "\u2044": 167,
            "\u20ac": 750,
            "\u2111": 686,
            "\u2118": 987,
            "\u211c": 795,
            "\u2126": 768,
            "\u2135": 823,
            "\u2190": 987,
            "\u2191": 603,
            "\u2192": 987,
            "\u2193": 603,
            "\u2194": 1042,
            "\u21b5": 658,
            "\u21d0": 987,
            "\u21d1": 603,
            "\u21d2": 987,
            "\u21d3": 603,
            "\u21d4": 1042,
            "\u2200": 713,
            "\u2202": 494,
            "\u2203": 549,
            "\u2205": 823,
            "\u2206": 612,
            "\u2207": 713,
            "\u2208": 713,
            "\u2209": 713,
            "\u220b": 439,
            "\u220f": 823,
            "\u2211": 713,
            "\u2212": 549,
            "\u2217": 500,
            "\u221a": 549,
            "\u221d": 713,
            "\u221e": 713,
            "\u2220": 768,
            "\u2227": 603,
            "\u2228": 603,
            "\u2229": 768,
            "\u222a": 768,
            "\u222b": 274,
            "\u2234": 863,
            "\u223c": 549,
            "\u2245": 549,
            "\u2248": 549,
            "\u2260": 549,
            "\u2261": 549,
            "\u2264": 549,
            "\u2265": 549,
            "\u2282": 713,
            "\u2283": 713,
            "\u2284": 713,
            "\u2286": 713,
            "\u2287": 713,
            "\u2295": 768,
            "\u2297": 768,
            "\u22a5": 658,
            "\u22c5": 250,
            "\u2320": 686,
            "\u2321": 686,
            "\u2329": 329,
            "\u232a": 329,
            "\u25ca": 494,
            "\u2660": 753,
            "\u2663": 753,
            "\u2665": 753,
            "\u2666": 753,
            "\uf6d9": 790,
            "\uf6da": 790,
            "\uf6db": 890,
            "\uf8e5": 500,
            "\uf8e6": 603,
            "\uf8e7": 1000,
            "\uf8e8": 790,
            "\uf8e9": 790,
            "\uf8ea": 786,
            "\uf8eb": 384,
            "\uf8ec": 384,
            "\uf8ed": 384,
            "\uf8ee": 384,
            "\uf8ef": 384,
            "\uf8f0": 384,
            "\uf8f1": 494,
            "\uf8f2": 494,
            "\uf8f3": 494,
            "\uf8f4": 494,
            "\uf8f5": 686,
            "\uf8f6": 384,
            "\uf8f7": 384,
            "\uf8f8": 384,
            "\uf8f9": 384,
            "\uf8fa": 384,
            "\uf8fb": 384,
            "\uf8fc": 494,
            "\uf8fd": 494,
            "\uf8fe": 494,
            "\uf8ff": 790,
        },
    ),
    "Times-Bold": (
        {
            "FontName": "Times-Bold",
            "Descent": -217.0,
            "FontBBox": (-168.0, -218.0, 1000.0, 935.0),
            "FontWeight": "Bold",
            "CapHeight": 676.0,
            "FontFamily": "Times",
            "Flags": 0,
            "XHeight": 461.0,
            "ItalicAngle": 0.0,
            "Ascent": 683.0,
        },
        {
            " ": 250,
            "!": 333,
            '"': 555,
            "#": 500,
            "$": 500,
            "%": 1000,
            "&": 833,
            "'": 278,
            "(": 333,
            ")": 333,
            "*": 500,
            "+": 570,
            ",": 250,
            "-": 333,
            ".": 250,
            "/": 278,
            "0": 500,
            "1": 500,
            "2": 500,
            "3": 500,
            "4": 500,
            "5": 500,
            "6": 500,
            "7": 500,
            "8": 500,
            "9": 500,
            ":": 333,
            ";": 333,
            "<": 570,
            "=": 570,
            ">": 570,
            "?": 500,
            "@": 930,
            "A": 722,
            "B": 667,
            "C": 722,
            "D": 722,
            "E": 667,
            "F": 611,
            "G": 778,
            "H": 778,
            "I": 389,
            "J": 500,
            "K": 778,
            "L": 667,
            "M": 944,
            "N": 722,
            "O": 778,
            "P": 611,
            "Q": 778,
            "R": 722,
            "S": 556,
            "T": 667,
            "U": 722,
            "V": 722,
            "W": 1000,
            "X": 722,
            "Y": 722,
            "Z": 667,
            "[": 333,
            "\\": 278,
            "]": 333,
            "^": 581,
            "_": 500,
            "`": 333,
            "a": 500,
            "b": 556,
            "c": 444,
            "d": 556,
            "e": 444,
            "f": 333,
            "g": 500,
            "h": 556,
            "i": 278,
            "j": 333,
            "k": 556,
            "l": 278,
            "m": 833,
            "n": 556,
            "o": 500,
            "p": 556,
            "q": 556,
            "r": 444,
            "s": 389,
            "t": 333,
            "u": 556,
            "v": 500,
            "w": 722,
            "x": 500,
            "y": 500,
            "z": 444,
            "{": 394,
            "|": 220,
            "}": 394,
            "~": 520,
            "\xa1": 333,
            "\xa2": 500,
            "\xa3": 500,
            "\xa4": 500,
            "\xa5": 500,
            "\xa6": 220,
            "\xa7": 500,
            "\xa8": 333,
            "\xa9": 747,
            "\xaa": 300,
            "\xab": 500,
            "\xac": 570,
            "\xae": 747,
            "\xaf": 333,
            "\xb0": 400,
            "\xb1": 570,
            "\xb2": 300,
            "\xb3": 300,
            "\xb4": 333,
            "\xb5": 556,
            "\xb6": 540,
            "\xb7": 250,
            "\xb8": 333,
            "\xb9": 300,
            "\xba": 330,
            "\xbb": 500,
            "\xbc": 750,
            "\xbd": 750,
            "\xbe": 750,
            "\xbf": 500,
            "\xc0": 722,
            "\xc1": 722,
            "\xc2": 722,
            "\xc3": 722,
            "\xc4": 722,
            "\xc5": 722,
            "\xc6": 1000,
            "\xc7": 722,
            "\xc8": 667,
            "\xc9": 667,
            "\xca": 667,
            "\xcb": 667,
            "\xcc": 389,
            "\xcd": 389,
            "\xce": 389,
            "\xcf": 389,
            "\xd0": 722,
            "\xd1": 722,
            "\xd2": 778,
            "\xd3": 778,
            "\xd4": 778,
            "\xd5": 778,
            "\xd6": 778,
            "\xd7": 570,
            "\xd8": 778,
            "\xd9": 722,
            "\xda": 722,
            "\xdb": 722,
            "\xdc": 722,
            "\xdd": 722,
            "\xde": 611,
            "\xdf": 556,
            "\xe0": 500,
            "\xe1": 500,
            "\xe2": 500,
            "\xe3": 500,
            "\xe4": 500,
            "\xe5": 500,
            "\xe6": 722,
            "\xe7": 444,
            "\xe8": 444,
            "\xe9": 444,
            "\xea": 444,
            "\xeb": 444,
            "\xec": 278,
            "\xed": 278,
            "\xee": 278,
            "\xef": 278,
            "\xf0": 500,
            "\xf1": 556,
            "\xf2": 500,
            "\xf3": 500,
            "\xf4": 500,
            "\xf5": 500,
            "\xf6": 500,
            "\xf7": 570,
            "\xf8": 500,
            "\xf9": 556,
            "\xfa": 556,
            "\xfb": 556,
            "\xfc": 556,
            "\xfd": 500,
            "\xfe": 556,
            "\xff": 500,
            "\u0100": 722,
            "\u0101": 500,
            "\u0102": 722,
            "\u0103": 500,
            "\u0104": 722,
            "\u0105": 500,
            "\u0106": 722,
            "\u0107": 444,
            "\u010c": 722,
            "\u010d": 444,
            "\u010e": 722,
            "\u010f": 672,
            "\u0110": 722,
            "\u0111": 556,
            "\u0112": 667,
            "\u0113": 444,
            "\u0116": 667,
            "\u0117": 444,
            "\u0118": 667,
            "\u0119": 444,
            "\u011a": 667,
            "\u011b": 444,
            "\u011e": 778,
            "\u011f": 500,
            "\u0122": 778,
            "\u0123": 500,
            "\u012a": 389,
            "\u012b": 278,
            "\u012e": 389,
            "\u012f": 278,
            "\u0130": 389,
            "\u0131": 278,
            "\u0136": 778,
            "\u0137": 556,
            "\u0139": 667,
            "\u013a": 278,
            "\u013b": 667,
            "\u013c": 278,
            "\u013d": 667,
            "\u013e": 394,
            "\u0141": 667,
            "\u0142": 278,
            "\u0143": 722,
            "\u0144": 556,
            "\u0145": 722,
            "\u0146": 556,
            "\u0147": 722,
            "\u0148": 556,
            "\u014c": 778,
            "\u014d": 500,
            "\u0150": 778,
            "\u0151": 500,
            "\u0152": 1000,
            "\u0153": 722,
            "\u0154": 722,
            "\u0155": 444,
            "\u0156": 722,
            "\u0157": 444,
            "\u0158": 722,
            "\u0159": 444,
            "\u015a": 556,
            "\u015b": 389,
            "\u015e": 556,
            "\u015f": 389,
            "\u0160": 556,
            "\u0161": 389,
            "\u0162": 667,
            "\u0163": 333,
            "\u0164": 667,
            "\u0165": 416,
            "\u016a": 722,
            "\u016b": 556,
            "\u016e": 722,
            "\u016f": 556,
            "\u0170": 722,
            "\u0171": 556,
            "\u0172": 722,
            "\u0173": 556,
            "\u0178": 722,
            "\u0179": 667,
            "\u017a": 444,
            "\u017b": 667,
            "\u017c": 444,
            "\u017d": 667,
            "\u017e": 444,
            "\u0192": 500,
            "\u0218": 556,
            "\u0219": 389,
            "\u02c6": 333,
            "\u02c7": 333,
            "\u02d8": 333,
            "\u02d9": 333,
            "\u02da": 333,
            "\u02db": 333,
            "\u02dc": 333,
            "\u02dd": 333,
            "\u2013": 500,
            "\u2014": 1000,
            "\u2018": 333,
            "\u2019": 333,
            "\u201a": 333,
            "\u201c": 500,
            "\u201d": 500,
            "\u201e": 500,
            "\u2020": 500,
            "\u2021": 500,
            "\u2022": 350,
            "\u2026": 1000,
            "\u2030": 1000,
            "\u2039": 333,
            "\u203a": 333,
            "\u2044": 167,
            "\u2122": 1000,
            "\u2202": 494,
            "\u2206": 612,
            "\u2211": 600,
            "\u2212": 570,
            "\u221a": 549,
            "\u2260": 549,
            "\u2264": 549,
            "\u2265": 549,
            "\u25ca": 494,
            "\uf6c3": 250,
            "\ufb01": 556,
            "\ufb02": 556,
        },
    ),
    "Times-BoldItalic": (
        {
            "FontName": "Times-BoldItalic",
            "Descent": -217.0,
            "FontBBox": (-200.0, -218.0, 996.0, 921.0),
            "FontWeight": "Bold",
            "CapHeight": 669.0,
            "FontFamily": "Times",
            "Flags": 0,
            "XHeight": 462.0,
            "ItalicAngle": -15.0,
            "Ascent": 683.0,
        },
        {
            " ": 250,
            "!": 389,
            '"': 555,
            "#": 500,
            "$": 500,
            "%": 833,
            "&": 778,
            "'": 278,
            "(": 333,
            ")": 333,
            "*": 500,
            "+": 570,
            ",": 250,
            "-": 333,
            ".": 250,
            "/": 278,
            "0": 500,
            "1": 500,
            "2": 500,
            "3": 500,
            "4": 500,
            "5": 500,
            "6": 500,
            "7": 500,
            "8": 500,
            "9": 500,
            ":": 333,
            ";": 333,
            "<": 570,
            "=": 570,
            ">": 570,
            "?": 500,
            "@": 832,
            "A": 667,
            "B": 667,
            "C": 667,
            "D": 722,
            "E": 667,
            "F": 667,
            "G": 722,
            "H": 778,
            "I": 389,
            "J": 500,
            "K": 667,
            "L": 611,
            "M": 889,
            "N": 722,
            "O": 722,
            "P": 611,
            "Q": 722,
            "R": 667,
            "S": 556,
            "T": 611,
            "U": 722,
            "V": 667,
            "W": 889,
            "X": 667,
            "Y": 611,
            "Z": 611,
            "[": 333,
            "\\": 278,
            "]": 333,
            "^": 570,
            "_": 500,
            "`": 333,
            "a": 500,
            "b": 500,
            "c": 444,
            "d": 500,
            "e": 444,
            "f": 333,
            "g": 500,
            "h": 556,
            "i": 278,
            "j": 278,
            "k": 500,
            "l": 278,
            "m": 778,
            "n": 556,
            "o": 500,
            "p": 500,
            "q": 500,
            "r": 389,
            "s": 389,
            "t": 278,
            "u": 556,
            "v": 444,
            "w": 667,
            "x": 500,
            "y": 444,
            "z": 389,
            "{": 348,
            "|": 220,
            "}": 348,
            "~": 570,
            "\xa1": 389,
            "\xa2": 500,
            "\xa3": 500,
            "\xa4": 500,
            "\xa5": 500,
            "\xa6": 220,
            "\xa7": 500,
            "\xa8": 333,
            "\xa9": 747,
            "\xaa": 266,
            "\xab": 500,
            "\xac": 606,
            "\xae": 747,
            "\xaf": 333,
            "\xb0": 400,
            "\xb1": 570,
            "\xb2": 300,
            "\xb3": 300,
            "\xb4": 333,
            "\xb5": 576,
            "\xb6": 500,
            "\xb7": 250,
            "\xb8": 333,
            "\xb9": 300,
            "\xba": 300,
            "\xbb": 500,
            "\xbc": 750,
            "\xbd": 750,
            "\xbe": 750,
            "\xbf": 500,
            "\xc0": 667,
            "\xc1": 667,
            "\xc2": 667,
            "\xc3": 667,
            "\xc4": 667,
            "\xc5": 667,
            "\xc6": 944,
            "\xc7": 667,
            "\xc8": 667,
            "\xc9": 667,
            "\xca": 667,
            "\xcb": 667,
            "\xcc": 389,
            "\xcd": 389,
            "\xce": 389,
            "\xcf": 389,
            "\xd0": 722,
            "\xd1": 722,
            "\xd2": 722,
            "\xd3": 722,
            "\xd4": 722,
            "\xd5": 722,
            "\xd6": 722,
            "\xd7": 570,
            "\xd8": 722,
            "\xd9": 722,
            "\xda": 722,
            "\xdb": 722,
            "\xdc": 722,
            "\xdd": 611,
            "\xde": 611,
            "\xdf": 500,
            "\xe0": 500,
            "\xe1": 500,
            "\xe2": 500,
            "\xe3": 500,
            "\xe4": 500,
            "\xe5": 500,
            "\xe6": 722,
            "\xe7": 444,
            "\xe8": 444,
            "\xe9": 444,
            "\xea": 444,
            "\xeb": 444,
            "\xec": 278,
            "\xed": 278,
            "\xee": 278,
            "\xef": 278,
            "\xf0": 500,
            "\xf1": 556,
            "\xf2": 500,
            "\xf3": 500,
            "\xf4": 500,
            "\xf5": 500,
            "\xf6": 500,
            "\xf7": 570,
            "\xf8": 500,
            "\xf9": 556,
            "\xfa": 556,
            "\xfb": 556,
            "\xfc": 556,
            "\xfd": 444,
            "\xfe": 500,
            "\xff": 444,
            "\u0100": 667,
            "\u0101": 500,
            "\u0102": 667,
            "\u0103": 500,
            "\u0104": 667,
            "\u0105": 500,
            "\u0106": 667,
            "\u0107": 444,
            "\u010c": 667,
            "\u010d": 444,
            "\u010e": 722,
            "\u010f": 608,
            "\u0110": 722,
            "\u0111": 500,
            "\u0112": 667,
            "\u0113": 444,
            "\u0116": 667,
            "\u0117": 444,
            "\u0118": 667,
            "\u0119": 444,
            "\u011a": 667,
            "\u011b": 444,
            "\u011e": 722,
            "\u011f": 500,
            "\u0122": 722,
            "\u0123": 500,
            "\u012a": 389,
            "\u012b": 278,
            "\u012e": 389,
            "\u012f": 278,
            "\u0130": 389,
            "\u0131": 278,
            "\u0136": 667,
            "\u0137": 500,
            "\u0139": 611,
            "\u013a": 278,
            "\u013b": 611,
            "\u013c": 278,
            "\u013d": 611,
            "\u013e": 382,
            "\u0141": 611,
            "\u0142": 278,
            "\u0143": 722,
            "\u0144": 556,
            "\u0145": 722,
            "\u0146": 556,
            "\u0147": 722,
            "\u0148": 556,
            "\u014c": 722,
            "\u014d": 500,
            "\u0150": 722,
            "\u0151": 500,
            "\u0152": 944,
            "\u0153": 722,
            "\u0154": 667,
            "\u0155": 389,
            "\u0156": 667,
            "\u0157": 389,
            "\u0158": 667,
            "\u0159": 389,
            "\u015a": 556,
            "\u015b": 389,
            "\u015e": 556,
            "\u015f": 389,
            "\u0160": 556,
            "\u0161": 389,
            "\u0162": 611,
            "\u0163": 278,
            "\u0164": 611,
            "\u0165": 366,
            "\u016a": 722,
            "\u016b": 556,
            "\u016e": 722,
            "\u016f": 556,
            "\u0170": 722,
            "\u0171": 556,
            "\u0172": 722,
            "\u0173": 556,
            "\u0178": 611,
            "\u0179": 611,
            "\u017a": 389,
            "\u017b": 611,
            "\u017c": 389,
            "\u017d": 611,
            "\u017e": 389,
            "\u0192": 500,
            "\u0218": 556,
            "\u0219": 389,
            "\u02c6": 333,
            "\u02c7": 333,
            "\u02d8": 333,
            "\u02d9": 333,
            "\u02da": 333,
            "\u02db": 333,
            "\u02dc": 333,
            "\u02dd": 333,
            "\u2013": 500,
            "\u2014": 1000,
            "\u2018": 333,
            "\u2019": 333,
            "\u201a": 333,
            "\u201c": 500,
            "\u201d": 500,
            "\u201e": 500,
            "\u2020": 500,
            "\u2021": 500,
            "\u2022": 350,
            "\u2026": 1000,
            "\u2030": 1000,
            "\u2039": 333,
            "\u203a": 333,
            "\u2044": 167,
            "\u2122": 1000,
            "\u2202": 494,
            "\u2206": 612,
            "\u2211": 600,
            "\u2212": 606,
            "\u221a": 549,
            "\u2260": 549,
            "\u2264": 549,
            "\u2265": 549,
            "\u25ca": 494,
            "\uf6c3": 250,
            "\ufb01": 556,
            "\ufb02": 556,
        },
    ),
    "Times-Italic": (
        {
            "FontName": "Times-Italic",
            "Descent": -217.0,
            "FontBBox": (-169.0, -217.0, 1010.0, 883.0),
            "FontWeight": "Medium",
            "CapHeight": 653.0,
            "FontFamily": "Times",
            "Flags": 0,
            "XHeight": 441.0,
            "ItalicAngle": -15.5,
            "Ascent": 683.0,
        },
        {
            " ": 250,
            "!": 333,
            '"': 420,
            "#": 500,
            "$": 500,
            "%": 833,
            "&": 778,
            "'": 214,
            "(": 333,
            ")": 333,
            "*": 500,
            "+": 675,
            ",": 250,
            "-": 333,
            ".": 250,
            "/": 278,
            "0": 500,
            "1": 500,
            "2": 500,
            "3": 500,
            "4": 500,
            "5": 500,
            "6": 500,
            "7": 500,
            "8": 500,
            "9": 500,
            ":": 333,
            ";": 333,
            "<": 675,
            "=": 675,
            ">": 675,
            "?": 500,
            "@": 920,
            "A": 611,
            "B": 611,
            "C": 667,
            "D": 722,
            "E": 611,
            "F": 611,
            "G": 722,
            "H": 722,
            "I": 333,
            "J": 444,
            "K": 667,
            "L": 556,
            "M": 833,
            "N": 667,
            "O": 722,
            "P": 611,
            "Q": 722,
            "R": 611,
            "S": 500,
            "T": 556,
            "U": 722,
            "V": 611,
            "W": 833,
            "X": 611,
            "Y": 556,
            "Z": 556,
            "[": 389,
            "\\": 278,
            "]": 389,
            "^": 422,
            "_": 500,
            "`": 333,
            "a": 500,
            "b": 500,
            "c": 444,
            "d": 500,
            "e": 444,
            "f": 278,
            "g": 500,
            "h": 500,
            "i": 278,
            "j": 278,
            "k": 444,
            "l": 278,
            "m": 722,
            "n": 500,
            "o": 500,
            "p": 500,
            "q": 500,
            "r": 389,
            "s": 389,
            "t": 278,
            "u": 500,
            "v": 444,
            "w": 667,
            "x": 444,
            "y": 444,
            "z": 389,
            "{": 400,
            "|": 275,
            "}": 400,
            "~": 541,
            "\xa1": 389,
            "\xa2": 500,
            "\xa3": 500,
            "\xa4": 500,
            "\xa5": 500,
            "\xa6": 275,
            "\xa7": 500,
            "\xa8": 333,
            "\xa9": 760,
            "\xaa": 276,
            "\xab": 500,
            "\xac": 675,
            "\xae": 760,
            "\xaf": 333,
            "\xb0": 400,
            "\xb1": 675,
            "\xb2": 300,
            "\xb3": 300,
            "\xb4": 333,
            "\xb5": 500,
            "\xb6": 523,
            "\xb7": 250,
            "\xb8": 333,
            "\xb9": 300,
            "\xba": 310,
            "\xbb": 500,
            "\xbc": 750,
            "\xbd": 750,
            "\xbe": 750,
            "\xbf": 500,
            "\xc0": 611,
            "\xc1": 611,
            "\xc2": 611,
            "\xc3": 611,
            "\xc4": 611,
            "\xc5": 611,
            "\xc6": 889,
            "\xc7": 667,
            "\xc8": 611,
            "\xc9": 611,
            "\xca": 611,
            "\xcb": 611,
            "\xcc": 333,
            "\xcd": 333,
            "\xce": 333,
            "\xcf": 333,
            "\xd0": 722,
            "\xd1": 667,
            "\xd2": 722,
            "\xd3": 722,
            "\xd4": 722,
            "\xd5": 722,
            "\xd6": 722,
            "\xd7": 675,
            "\xd8": 722,
            "\xd9": 722,
            "\xda": 722,
            "\xdb": 722,
            "\xdc": 722,
            "\xdd": 556,
            "\xde": 611,
            "\xdf": 500,
            "\xe0": 500,
            "\xe1": 500,
            "\xe2": 500,
            "\xe3": 500,
            "\xe4": 500,
            "\xe5": 500,
            "\xe6": 667,
            "\xe7": 444,
            "\xe8": 444,
            "\xe9": 444,
            "\xea": 444,
            "\xeb": 444,
            "\xec": 278,
            "\xed": 278,
            "\xee": 278,
            "\xef": 278,
            "\xf0": 500,
            "\xf1": 500,
            "\xf2": 500,
            "\xf3": 500,
            "\xf4": 500,
            "\xf5": 500,
            "\xf6": 500,
            "\xf7": 675,
            "\xf8": 500,
            "\xf9": 500,
            "\xfa": 500,
            "\xfb": 500,
            "\xfc": 500,
            "\xfd": 444,
            "\xfe": 500,
            "\xff": 444,
            "\u0100": 611,
            "\u0101": 500,
            "\u0102": 611,
            "\u0103": 500,
            "\u0104": 611,
            "\u0105": 500,
            "\u0106": 667,
            "\u0107": 444,
            "\u010c": 667,
            "\u010d": 444,
            "\u010e": 722,
            "\u010f": 544,
            "\u0110": 722,
            "\u0111": 500,
            "\u0112": 611,
            "\u0113": 444,
            "\u0116": 611,
            "\u0117": 444,
            "\u0118": 611,
            "\u0119": 444,
            "\u011a": 611,
            "\u011b": 444,
            "\u011e": 722,
            "\u011f": 500,
            "\u0122": 722,
            "\u0123": 500,
            "\u012a": 333,
            "\u012b": 278,
            "\u012e": 333,
            "\u012f": 278,
            "\u0130": 333,
            "\u0131": 278,
            "\u0136": 667,
            "\u0137": 444,
            "\u0139": 556,
            "\u013a": 278,
            "\u013b": 556,
            "\u013c": 278,
            "\u013d": 611,
            "\u013e": 300,
            "\u0141": 556,
            "\u0142": 278,
            "\u0143": 667,
            "\u0144": 500,
            "\u0145": 667,
            "\u0146": 500,
            "\u0147": 667,
            "\u0148": 500,
            "\u014c": 722,
            "\u014d": 500,
            "\u0150": 722,
            "\u0151": 500,
            "\u0152": 944,
            "\u0153": 667,
            "\u0154": 611,
            "\u0155": 389,
            "\u0156": 611,
            "\u0157": 389,
            "\u0158": 611,
            "\u0159": 389,
            "\u015a": 500,
            "\u015b": 389,
            "\u015e": 500,
            "\u015f": 389,
            "\u0160": 500,
            "\u0161": 389,
            "\u0162": 556,
            "\u0163": 278,
            "\u0164": 556,
            "\u0165": 300,
            "\u016a": 722,
            "\u016b": 500,
            "\u016e": 722,
            "\u016f": 500,
            "\u0170": 722,
            "\u0171": 500,
            "\u0172": 722,
            "\u0173": 500,
            "\u0178": 556,
            "\u0179": 556,
            "\u017a": 389,
            "\u017b": 556,
            "\u017c": 389,
            "\u017d": 556,
            "\u017e": 389,
            "\u0192": 500,
            "\u0218": 500,
            "\u0219": 389,
            "\u02c6": 333,
            "\u02c7": 333,
            "\u02d8": 333,
            "\u02d9": 333,
            "\u02da": 333,
            "\u02db": 333,
            "\u02dc": 333,
            "\u02dd": 333,
            "\u2013": 500,
            "\u2014": 889,
            "\u2018": 333,
            "\u2019": 333,
            "\u201a": 333,
            "\u201c": 556,
            "\u201d": 556,
            "\u201e": 556,
            "\u2020": 500,
            "\u2021": 500,
            "\u2022": 350,
            "\u2026": 889,
            "\u2030": 1000,
            "\u2039": 333,
            "\u203a": 333,
            "\u2044": 167,
            "\u2122": 980,
            "\u2202": 476,
            "\u2206": 612,
            "\u2211": 600,
            "\u2212": 675,
            "\u221a": 453,
            "\u2260": 549,
            "\u2264": 549,
            "\u2265": 549,
            "\u25ca": 471,
            "\uf6c3": 250,
            "\ufb01": 500,
            "\ufb02": 500,
        },
    ),
    "Times-Roman": (
        {
            "FontName": "Times-Roman",
            "Descent": -217.0,
            "FontBBox": (-168.0, -218.0, 1000.0, 898.0),
            "FontWeight": "Roman",
            "CapHeight": 662.0,
            "FontFamily": "Times",
            "Flags": 0,
            "XHeight": 450.0,
            "ItalicAngle": 0.0,
            "Ascent": 683.0,
        },
        {
            " ": 250,
            "!": 333,
            '"': 408,
            "#": 500,
            "$": 500,
            "%": 833,
            "&": 778,
            "'": 180,
            "(": 333,
            ")": 333,
            "*": 500,
            "+": 564,
            ",": 250,
            "-": 333,
            ".": 250,
            "/": 278,
            "0": 500,
            "1": 500,
            "2": 500,
            "3": 500,
            "4": 500,
            "5": 500,
            "6": 500,
            "7": 500,
            "8": 500,
            "9": 500,
            ":": 278,
            ";": 278,
            "<": 564,
            "=": 564,
            ">": 564,
            "?": 444,
            "@": 921,
            "A": 722,
            "B": 667,
            "C": 667,
            "D": 722,
            "E": 611,
            "F": 556,
            "G": 722,
            "H": 722,
            "I": 333,
            "J": 389,
            "K": 722,
            "L": 611,
            "M": 889,
            "N": 722,
            "O": 722,
            "P": 556,
            "Q": 722,
            "R": 667,
            "S": 556,
            "T": 611,
            "U": 722,
            "V": 722,
            "W": 944,
            "X": 722,
            "Y": 722,
            "Z": 611,
            "[": 333,
            "\\": 278,
            "]": 333,
            "^": 469,
            "_": 500,
            "`": 333,
            "a": 444,
            "b": 500,
            "c": 444,
            "d": 500,
            "e": 444,
            "f": 333,
            "g": 500,
            "h": 500,
            "i": 278,
            "j": 278,
            "k": 500,
            "l": 278,
            "m": 778,
            "n": 500,
            "o": 500,
            "p": 500,
            "q": 500,
            "r": 333,
            "s": 389,
            "t": 278,
            "u": 500,
            "v": 500,
            "w": 722,
            "x": 500,
            "y": 500,
            "z": 444,
            "{": 480,
            "|": 200,
            "}": 480,
            "~": 541,
            "\xa1": 333,
            "\xa2": 500,
            "\xa3": 500,
            "\xa4": 500,
            "\xa5": 500,
            "\xa6": 200,
            "\xa7": 500,
            "\xa8": 333,
            "\xa9": 760,
            "\xaa": 276,
            "\xab": 500,
            "\xac": 564,
            "\xae": 760,
            "\xaf": 333,
            "\xb0": 400,
            "\xb1": 564,
            "\xb2": 300,
            "\xb3": 300,
            "\xb4": 333,
            "\xb5": 500,
            "\xb6": 453,
            "\xb7": 250,
            "\xb8": 333,
            "\xb9": 300,
            "\xba": 310,
            "\xbb": 500,
            "\xbc": 750,
            "\xbd": 750,
            "\xbe": 750,
            "\xbf": 444,
            "\xc0": 722,
            "\xc1": 722,
            "\xc2": 722,
            "\xc3": 722,
            "\xc4": 722,
            "\xc5": 722,
            "\xc6": 889,
            "\xc7": 667,
            "\xc8": 611,
            "\xc9": 611,
            "\xca": 611,
            "\xcb": 611,
            "\xcc": 333,
            "\xcd": 333,
            "\xce": 333,
            "\xcf": 333,
            "\xd0": 722,
            "\xd1": 722,
            "\xd2": 722,
            "\xd3": 722,
            "\xd4": 722,
            "\xd5": 722,
            "\xd6": 722,
            "\xd7": 564,
            "\xd8": 722,
            "\xd9": 722,
            "\xda": 722,
            "\xdb": 722,
            "\xdc": 722,
            "\xdd": 722,
            "\xde": 556,
            "\xdf": 500,
            "\xe0": 444,
            "\xe1": 444,
            "\xe2": 444,
            "\xe3": 444,
            "\xe4": 444,
            "\xe5": 444,
            "\xe6": 667,
            "\xe7": 444,
            "\xe8": 444,
            "\xe9": 444,
            "\xea": 444,
            "\xeb": 444,
            "\xec": 278,
            "\xed": 278,
            "\xee": 278,
            "\xef": 278,
            "\xf0": 500,
            "\xf1": 500,
            "\xf2": 500,
            "\xf3": 500,
            "\xf4": 500,
            "\xf5": 500,
            "\xf6": 500,
            "\xf7": 564,
            "\xf8": 500,
            "\xf9": 500,
            "\xfa": 500,
            "\xfb": 500,
            "\xfc": 500,
            "\xfd": 500,
            "\xfe": 500,
            "\xff": 500,
            "\u0100": 722,
            "\u0101": 444,
            "\u0102": 722,
            "\u0103": 444,
            "\u0104": 722,
            "\u0105": 444,
            "\u0106": 667,
            "\u0107": 444,
            "\u010c": 667,
            "\u010d": 444,
            "\u010e": 722,
            "\u010f": 588,
            "\u0110": 722,
            "\u0111": 500,
            "\u0112": 611,
            "\u0113": 444,
            "\u0116": 611,
            "\u0117": 444,
            "\u0118": 611,
            "\u0119": 444,
            "\u011a": 611,
            "\u011b": 444,
            "\u011e": 722,
            "\u011f": 500,
            "\u0122": 722,
            "\u0123": 500,
            "\u012a": 333,
            "\u012b": 278,
            "\u012e": 333,
            "\u012f": 278,
            "\u0130": 333,
            "\u0131": 278,
            "\u0136": 722,
            "\u0137": 500,
            "\u0139": 611,
            "\u013a": 278,
            "\u013b": 611,
            "\u013c": 278,
            "\u013d": 611,
            "\u013e": 344,
            "\u0141": 611,
            "\u0142": 278,
            "\u0143": 722,
            "\u0144": 500,
            "\u0145": 722,
            "\u0146": 500,
            "\u0147": 722,
            "\u0148": 500,
            "\u014c": 722,
            "\u014d": 500,
            "\u0150": 722,
            "\u0151": 500,
            "\u0152": 889,
            "\u0153": 722,
            "\u0154": 667,
            "\u0155": 333,
            "\u0156": 667,
            "\u0157": 333,
            "\u0158": 667,
            "\u0159": 333,
            "\u015a": 556,
            "\u015b": 389,
            "\u015e": 556,
            "\u015f": 389,
            "\u0160": 556,
            "\u0161": 389,
            "\u0162": 611,
            "\u0163": 278,
            "\u0164": 611,
            "\u0165": 326,
            "\u016a": 722,
            "\u016b": 500,
            "\u016e": 722,
            "\u016f": 500,
            "\u0170": 722,
            "\u0171": 500,
            "\u0172": 722,
            "\u0173": 500,
            "\u0178": 722,
            "\u0179": 611,
            "\u017a": 444,
            "\u017b": 611,
            "\u017c": 444,
            "\u017d": 611,
            "\u017e": 444,
            "\u0192": 500,
            "\u0218": 556,
            "\u0219": 389,
            "\u02c6": 333,
            "\u02c7": 333,
            "\u02d8": 333,
            "\u02d9": 333,
            "\u02da": 333,
            "\u02db": 333,
            "\u02dc": 333,
            "\u02dd": 333,
            "\u2013": 500,
            "\u2014": 1000,
            "\u2018": 333,
            "\u2019": 333,
            "\u201a": 333,
            "\u201c": 444,
            "\u201d": 444,
            "\u201e": 444,
            "\u2020": 500,
            "\u2021": 500,
            "\u2022": 350,
            "\u2026": 1000,
            "\u2030": 1000,
            "\u2039": 333,
            "\u203a": 333,
            "\u2044": 167,
            "\u2122": 980,
            "\u2202": 476,
            "\u2206": 612,
            "\u2211": 600,
            "\u2212": 564,
            "\u221a": 453,
            "\u2260": 549,
            "\u2264": 549,
            "\u2265": 549,
            "\u25ca": 471,
            "\uf6c3": 250,
            "\ufb01": 556,
            "\ufb02": 556,
        },
    ),
    "ZapfDingbats": (
        {
            "FontName": "ZapfDingbats",
            "FontBBox": (-1.0, -143.0, 981.0, 820.0),
            "FontWeight": "Medium",
            "FontFamily": "ITC",
            "Flags": 0,
            "ItalicAngle": 0.0,
        },
        {
            "\x01": 974,
            "\x02": 961,
            "\x03": 980,
            "\x04": 719,
            "\x05": 789,
            "\x06": 494,
            "\x07": 552,
            "\x08": 537,
            "\t": 577,
            "\n": 692,
            "\x0b": 960,
            "\x0c": 939,
            "\r": 549,
            "\x0e": 855,
            "\x0f": 911,
            "\x10": 933,
            "\x11": 945,
            "\x12": 974,
            "\x13": 755,
            "\x14": 846,
            "\x15": 762,
            "\x16": 761,
            "\x17": 571,
            "\x18": 677,
            "\x19": 763,
            "\x1a": 760,
            "\x1b": 759,
            "\x1c": 754,
            "\x1d": 786,
            "\x1e": 788,
            "\x1f": 788,
            " ": 790,
            "!": 793,
            '"': 794,
            "#": 816,
            "$": 823,
            "%": 789,
            "&": 841,
            "'": 823,
            "(": 833,
            ")": 816,
            "*": 831,
            "+": 923,
            ",": 744,
            "-": 723,
            ".": 749,
            "/": 790,
            "0": 792,
            "1": 695,
            "2": 776,
            "3": 768,
            "4": 792,
            "5": 759,
            "6": 707,
            "7": 708,
            "8": 682,
            "9": 701,
            ":": 826,
            ";": 815,
            "<": 789,
            "=": 789,
            ">": 707,
            "?": 687,
            "@": 696,
            "A": 689,
            "B": 786,
            "C": 787,
            "D": 713,
            "E": 791,
            "F": 785,
            "G": 791,
            "H": 873,
            "I": 761,
            "J": 762,
            "K": 759,
            "L": 892,
            "M": 892,
            "N": 788,
            "O": 784,
            "Q": 438,
            "R": 138,
            "S": 277,
            "T": 415,
            "U": 509,
            "V": 410,
            "W": 234,
            "X": 234,
            "Y": 390,
            "Z": 390,
            "[": 276,
            "\\": 276,
            "]": 317,
            "^": 317,
            "_": 334,
            "`": 334,
            "a": 392,
            "b": 392,
            "c": 668,
            "d": 668,
            "e": 732,
            "f": 544,
            "g": 544,
            "h": 910,
            "i": 911,
            "j": 667,
            "k": 760,
            "l": 760,
            "m": 626,
            "n": 694,
            "o": 595,
            "p": 776,
            "u": 690,
            "v": 791,
            "w": 790,
            "x": 788,
            "y": 788,
            "z": 788,
            "{": 788,
            "|": 788,
            "}": 788,
            "~": 788,
            "\x7f": 788,
            "\x80": 788,
            "\x81": 788,
            "\x82": 788,
            "\x83": 788,
            "\x84": 788,
            "\x85": 788,
            "\x86": 788,
            "\x87": 788,
            "\x88": 788,
            "\x89": 788,
            "\x8a": 788,
            "\x8b": 788,
            "\x8c": 788,
            "\x8d": 788,
            "\x8e": 788,
            "\x8f": 788,
            "\x90": 788,
            "\x91": 788,
            "\x92": 788,
            "\x93": 788,
            "\x94": 788,
            "\x95": 788,
            "\x96": 788,
            "\x97": 788,
            "\x98": 788,
            "\x99": 788,
            "\x9a": 788,
            "\x9b": 788,
            "\x9c": 788,
            "\x9d": 788,
            "\x9e": 788,
            "\x9f": 788,
            "\xa0": 894,
            "\xa1": 838,
            "\xa2": 924,
            "\xa3": 1016,
            "\xa4": 458,
            "\xa5": 924,
            "\xa6": 918,
            "\xa7": 927,
            "\xa8": 928,
            "\xa9": 928,
            "\xaa": 834,
            "\xab": 873,
            "\xac": 828,
            "\xad": 924,
            "\xae": 917,
            "\xaf": 930,
            "\xb0": 931,
            "\xb1": 463,
            "\xb2": 883,
            "\xb3": 836,
            "\xb4": 867,
            "\xb5": 696,
            "\xb6": 874,
            "\xb7": 760,
            "\xb8": 946,
            "\xb9": 865,
            "\xba": 967,
            "\xbb": 831,
            "\xbc": 873,
            "\xbd": 927,
            "\xbe": 970,
            "\xbf": 918,
            "\xc0": 748,
            "\xc1": 836,
            "\xc2": 771,
            "\xc3": 888,
            "\xc4": 748,
            "\xc5": 771,
            "\xc6": 888,
            "\xc7": 867,
            "\xc8": 696,
            "\xc9": 874,
            "\xca": 974,
            "\xcb": 762,
            "\xcc": 759,
            "\xcd": 509,
            "\xce": 410,
        },
    ),
}

# Aliases defined in implementation note 62 in Appendix H, related to
# section 5.5.1 (Type 1 Fonts), of the PDF Reference. Each alias key is bound
# to the very same metrics entry as the standard font it stands for.
FONT_METRICS.update(
    {
        "Arial": FONT_METRICS["Helvetica"],
        "Arial,Italic": FONT_METRICS["Helvetica-Oblique"],
        "Arial,Bold": FONT_METRICS["Helvetica-Bold"],
        "Arial,BoldItalic": FONT_METRICS["Helvetica-BoldOblique"],
        "CourierNew": FONT_METRICS["Courier"],
        "CourierNew,Italic": FONT_METRICS["Courier-Oblique"],
        "CourierNew,Bold": FONT_METRICS["Courier-Bold"],
        "CourierNew,BoldItalic": FONT_METRICS["Courier-BoldOblique"],
        "TimesNewRoman": FONT_METRICS["Times-Roman"],
        "TimesNewRoman,Italic": FONT_METRICS["Times-Italic"],
        "TimesNewRoman,Bold": FONT_METRICS["Times-Bold"],
        "TimesNewRoman,BoldItalic": FONT_METRICS["Times-BoldItalic"],
    },
)


================================================
FILE: babeldoc/pdfminer/glyphlist.py
================================================
"""Mappings from Adobe glyph names to Unicode characters.

In some CMap tables, Adobe glyph names are used for specifying
Unicode characters instead of using decimal/hex character code.

The following data was downloaded with

  $ wget https://partners.adobe.com/public/developer/en/opentype/glyphlist.txt

and converted into the Python dictionary below with

```python
from babeldoc.pdfminer.glyphlist import convert_glyphlist

convert_glyphlist("glyphlist.txt")
```
"""

# ###################################################################################
# Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this documentation file to use, copy, publish, distribute,
# sublicense, and/or sell copies of the documentation, and to permit
# others to do the same, provided that:
# - No modification, editing or other alteration of this document is
# allowed; and
# - The above copyright notice and this permission notice shall be
# included in all copies of the documentation.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this documentation file, to create their own derivative works
# from the content of this document to use, copy, publish, distribute,
# sublicense, and/or sell the derivative works, and to permit others to do
# the same, provided that the derived work is not represented as being a
# copy or version of this document.
#
# Adobe shall not be liable to any party for any loss of revenue or profit
# or for indirect, incidental, special, consequential, or other similar
# damages, whether based on tort (including without limitation negligence
# or strict liability), contract or other legal or equitable grounds even
# if Adobe has been advised or had reason to know of the possibility of
# such damages. The Adobe materials are provided on an "AS IS" basis.
# Adobe specifically disclaims all express, statutory, or implied
# warranties relating to the Adobe materials, including but not limited to
# those concerning merchantability or fitness for a particular purpose or
# non-infringement of any third party rights regarding the Adobe
# materials.
# ###################################################################################
# Name:          Adobe Glyph List
# Table version: 2.0
# Date:          September 20, 2002
#
# See http://partners.adobe.com/asn/developer/typeforum/unicodegn.html
#
# Format: Semicolon-delimited fields:
#            (1) glyph name
#            (2) Unicode scalar value


def convert_glyphlist(path: str) -> None:
    """Convert a glyph list file into Python source printed on stdout.

    Blank lines and lines starting with ``#`` are echoed as they are (after
    stripping surrounding whitespace). Every other line must have the form
    ``name;CODE`` or ``name;CODE CODE ...`` and is printed as one entry of a
    ``glyphname2unicode = {...}`` dict literal, with each code turned into a
    ``\\uCODE`` escape.

    Args:
        path: Path of the glyph list text file to read.

    Raises:
        ValueError: If a data line does not contain exactly one ``;``.
    """
    # 0: no entry seen yet, 1: inside the dict literal, 2: dict literal closed.
    state = 0
    with open(path) as glyph_file:
        for raw_line in glyph_file:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                if state == 1:
                    # First blank/comment line after the entries ends the dict.
                    state = 2
                    print("}\n")
                print(line)
                continue
            if state == 0:
                print("\nglyphname2unicode = {")
                state = 1
            (name, x) = line.split(";")
            codes = x.split(" ")
            escaped = "".join(f"\\u{code}" for code in codes)
            print(f" {name!r}: u'{escaped}',")
    if state == 1:
        # The input ended on an entry line: close the dict literal so the
        # generated source is still syntactically complete.
        print("}\n")


glyphname2unicode = {
    "A": "\u0041",
    "AE": "\u00c6",
    "AEacute": "\u01fc",
    "AEmacron": "\u01e2",
    "AEsmall": "\uf7e6",
    "Aacute": "\u00c1",
    "Aacutesmall": "\uf7e1",
    "Abreve": "\u0102",
    "Abreveacute": "\u1eae",
    "Abrevecyrillic": "\u04d0",
    "Abrevedotbelow": "\u1eb6",
    "Abrevegrave": "\u1eb0",
    "Abrevehookabove": "\u1eb2",
    "Abrevetilde": "\u1eb4",
    "Acaron": "\u01cd",
    "Acircle": "\u24b6",
    "Acircumflex": "\u00c2",
    "Acircumflexacute": "\u1ea4",
    "Acircumflexdotbelow": "\u1eac",
    "Acircumflexgrave": "\u1ea6",
    "Acircumflexhookabove": "\u1ea8",
    "Acircumflexsmall": "\uf7e2",
    "Acircumflextilde": "\u1eaa",
    "Acute": "\uf6c9",
    "Acutesmall": "\uf7b4",
    "Acyrillic": "\u0410",
    "Adblgrave": "\u0200",
    "Adieresis": "\u00c4",
    "Adieresiscyrillic": "\u04d2",
    "Adieresismacron": "\u01de",
    "Adieresissmall": "\uf7e4",
    "Adotbelow": "\u1ea0",
    "Adotmacron": "\u01e0",
    "Agrave": "\u00c0",
    "Agravesmall": "\uf7e0",
    "Ahookabove": "\u1ea2",
    "Aiecyrillic": "\u04d4",
    "Ainvertedbreve": "\u0202",
    "Alpha": "\u0391",
    "Alphatonos": "\u0386",
    "Amacron": "\u0100",
    "Amonospace": "\uff21",
    "Aogonek": "\u0104",
    "Aring": "\u00c5",
    "Aringacute": "\u01fa",
    "Aringbelow": "\u1e00",
    "Aringsmall": "\uf7e5",
    "Asmall": "\uf761",
    "Atilde": "\u00c3",
    "Atildesmall": "\uf7e3",
    "Aybarmenian": "\u0531",
    "B": "\u0042",
    "Bcircle": "\u24b7",
    "Bdotaccent": "\u1e02",
    "Bdotbelow": "\u1e04",
    "Becyrillic": "\u0411",
    "Benarmenian": "\u0532",
    "Beta": "\u0392",
    "Bhook": "\u0181",
    "Blinebelow": "\u1e06",
    "Bmonospace": "\uff22",
    "Brevesmall": "\uf6f4",
    "Bsmall": "\uf762",
    "Btopbar": "\u0182",
    "C": "\u0043",
    "Caarmenian": "\u053e",
    "Cacute": "\u0106",
    "Caron": "\uf6ca",
    "Caronsmall": "\uf6f5",
    "Ccaron": "\u010c",
    "Ccedilla": "\u00c7",
    "Ccedillaacute": "\u1e08",
    "Ccedillasmall": "\uf7e7",
    "Ccircle": "\u24b8",
    "Ccircumflex": "\u0108",
    "Cdot": "\u010a",
    "Cdotaccent": "\u010a",
    "Cedillasmall": "\uf7b8",
    "Chaarmenian": "\u0549",
    "Cheabkhasiancyrillic": "\u04bc",
    "Checyrillic": "\u0427",
    "Chedescenderabkhasiancyrillic": "\u04be",
    "Chedescendercyrillic": "\u04b6",
    "Chedieresiscyrillic": "\u04f4",
    "Cheharmenian": "\u0543",
    "Chekhakassiancyrillic": "\u04cb",
    "Cheverticalstrokecyrillic": "\u04b8",
    "Chi": "\u03a7",
    "Chook": "\u0187",
    "Circumflexsmall": "\uf6f6",
    "Cmonospace": "\uff23",
    "Coarmenian": "\u0551",
    "Csmall": "\uf763",
    "D": "\u0044",
    "DZ": "\u01f1",
    "DZcaron": "\u01c4",
    "Daarmenian": "\u0534",
    "Dafrican": "\u0189",
    "Dcaron": "\u010e",
    "Dcedilla": "\u1e10",
    "Dcircle": "\u24b9",
    "Dcircumflexbelow": "\u1e12",
    "Dcroat": "\u0110",
    "Ddotaccent": "\u1e0a",
    "Ddotbelow": "\u1e0c",
    "Decyrillic": "\u0414",
    "Deicoptic": "\u03ee",
    "Delta": "\u2206",
    "Deltagreek": "\u0394",
    "Dhook": "\u018a",
    "Dieresis": "\uf6cb",
    "DieresisAcute": "\uf6cc",
    "DieresisGrave": "\uf6cd",
    "Dieresissmall": "\uf7a8",
    "Digammagreek": "\u03dc",
    "Djecyrillic": "\u0402",
    "Dlinebelow": "\u1e0e",
    "Dmonospace": "\uff24",
    "Dotaccentsmall": "\uf6f7",
    "Dslash": "\u0110",
    "Dsmall": "\uf764",
    "Dtopbar": "\u018b",
    "Dz": "\u01f2",
    "Dzcaron": "\u01c5",
    "Dzeabkhasiancyrillic": "\u04e0",
    "Dzecyrillic": "\u0405",
    "Dzhecyrillic": "\u040f",
    "E": "\u0045",
    "Eacute": "\u00c9",
    "Eacutesmall": "\uf7e9",
    "Ebreve": "\u0114",
    "Ecaron": "\u011a",
    "Ecedillabreve": "\u1e1c",
    "Echarmenian": "\u0535",
    "Ecircle": "\u24ba",
    "Ecircumflex": "\u00ca",
    "Ecircumflexacute": "\u1ebe",
    "Ecircumflexbelow": "\u1e18",
    "Ecircumflexdotbelow": "\u1ec6",
    "Ecircumflexgrave": "\u1ec0",
    "Ecircumflexhookabove": "\u1ec2",
    "Ecircumflexsmall": "\uf7ea",
    "Ecircumflextilde": "\u1ec4",
    "Ecyrillic": "\u0404",
    "Edblgrave": "\u0204",
    "Edieresis": "\u00cb",
    "Edieresissmall": "\uf7eb",
    "Edot": "\u0116",
    "Edotaccent": "\u0116",
    "Edotbelow": "\u1eb8",
    "Efcyrillic": "\u0424",
    "Egrave": "\u00c8",
    "Egravesmall": "\uf7e8",
    "Eharmenian": "\u0537",
    "Ehookabove": "\u1eba",
    "Eightroman": "\u2167",
    "Einvertedbreve": "\u0206",
    "Eiotifiedcyrillic": "\u0464",
    "Elcyrillic": "\u041b",
    "Elevenroman": "\u216a",
    "Emacron": "\u0112",
    "Emacronacute": "\u1e16",
    "Emacrongrave": "\u1e14",
    "Emcyrillic": "\u041c",
    "Emonospace": "\uff25",
    "Encyrillic": "\u041d",
    "Endescendercyrillic": "\u04a2",
    "Eng": "\u014a",
    "Enghecyrillic": "\u04a4",
    "Enhookcyrillic": "\u04c7",
    "Eogonek": "\u0118",
    "Eopen": "\u0190",
    "Epsilon": "\u0395",
    "Epsilontonos": "\u0388",
    "Ercyrillic": "\u0420",
    "Ereversed": "\u018e",
    "Ereversedcyrillic": "\u042d",
    "Escyrillic": "\u0421",
    "Esdescendercyrillic": "\u04aa",
    "Esh": "\u01a9",
    "Esmall": "\uf765",
    "Eta": "\u0397",
    "Etarmenian": "\u0538",
    "Etatonos": "\u0389",
    "Eth": "\u00d0",
    "Ethsmall": "\uf7f0",
    "Etilde": "\u1ebc",
    "Etildebelow": "\u1e1a",
    "Euro": "\u20ac",
    "Ezh": "\u01b7",
    "Ezhcaron": "\u01ee",
    "Ezhreversed": "\u01b8",
    "F": "\u0046",
    "Fcircle": "\u24bb",
    "Fdotaccent": "\u1e1e",
    "Feharmenian": "\u0556",
    "Feicoptic": "\u03e4",
    "Fhook": "\u0191",
    "Fitacyrillic": "\u0472",
    "Fiveroman": "\u2164",
    "Fmonospace": "\uff26",
    "Fourroman": "\u2163",
    "Fsmall": "\uf766",
    "G": "\u0047",
    "GBsquare": "\u3387",
    "Gacute": "\u01f4",
    "Gamma": "\u0393",
    "Gammaafrican": "\u0194",
    "Gangiacoptic": "\u03ea",
    "Gbreve": "\u011e",
    "Gcaron": "\u01e6",
    "Gcedilla": "\u0122",
    "Gcircle": "\u24bc",
    "Gcircumflex": "\u011c",
    "Gcommaaccent": "\u0122",
    "Gdot": "\u0120",
    "Gdotaccent": "\u0120",
    "Gecyrillic": "\u0413",
    "Ghadarmenian": "\u0542",
    "Ghemiddlehookcyrillic": "\u0494",
    "Ghestrokecyrillic": "\u0492",
    "Gheupturncyrillic": "\u0490",
    "Ghook": "\u0193",
    "Gimarmenian": "\u0533",
    "Gjecyrillic": "\u0403",
    "Gmacron": "\u1e20",
    "Gmonospace": "\uff27",
    "Grave": "\uf6ce",
    "Gravesmall": "\uf760",
    "Gsmall": "\uf767",
    "Gsmallhook": "\u029b",
    "Gstroke": "\u01e4",
    "H": "\u0048",
    "H18533": "\u25cf",
    "H18543": "\u25aa",
    "H18551": "\u25ab",
    "H22073": "\u25a1",
    "HPsquare": "\u33cb",
    "Haabkhasiancyrillic": "\u04a8",
    "Hadescendercyrillic": "\u04b2",
    "Hardsigncyrillic": "\u042a",
    "Hbar": "\u0126",
    "Hbrevebelow": "\u1e2a",
    "Hcedilla": "\u1e28",
    "Hcircle": "\u24bd",
    "Hcircumflex": "\u0124",
    "Hdieresis": "\u1e26",
    "Hdotaccent": "\u1e22",
    "Hdotbelow": "\u1e24",
    "Hmonospace": "\uff28",
    "Hoarmenian": "\u0540",
    "Horicoptic": "\u03e8",
    "Hsmall": "\uf768",
    "Hungarumlaut": "\uf6cf",
    "Hungarumlautsmall": "\uf6f8",
    "Hzsquare": "\u3390",
    "I": "\u0049",
    "IAcyrillic": "\u042f",
    "IJ": "\u0132",
    "IUcyrillic": "\u042e",
    "Iacute": "\u00cd",
    "Iacutesmall": "\uf7ed",
    "Ibreve": "\u012c",
    "Icaron": "\u01cf",
    "Icircle": "\u24be",
    "Icircumflex": "\u00ce",
    "Icircumflexsmall": "\uf7ee",
    "Icyrillic": "\u0406",
    "Idblgrave": "\u0208",
    "Idieresis": "\u00cf",
    "Idieresisacute": "\u1e2e",
    "Idieresiscyrillic": "\u04e4",
    "Idieresissmall": "\uf7ef",
    "Idot": "\u0130",
    "Idotaccent": "\u0130",
    "Idotbelow": "\u1eca",
    "Iebrevecyrillic": "\u04d6",
    "Iecyrillic": "\u0415",
    "Ifraktur": "\u2111",
    "Igrave": "\u00cc",
    "Igravesmall": "\uf7ec",
    "Ihookabove": "\u1ec8",
    "Iicyrillic": "\u0418",
    "Iinvertedbreve": "\u020a",
    "Iishortcyrillic": "\u0419",
    "Imacron": "\u012a",
    "Imacroncyrillic": "\u04e2",
    "Imonospace": "\uff29",
    "Iniarmenian": "\u053b",
    "Iocyrillic": "\u0401",
    "Iogonek": "\u012e",
    "Iota": "\u0399",
    "Iotaafrican": "\u0196",
    "Iotadieresis": "\u03aa",
    "Iotatonos": "\u038a",
    "Ismall": "\uf769",
    "Istroke": "\u0197",
    "Itilde": "\u0128",
    "Itildebelow": "\u1e2c",
    "Izhitsacyrillic": "\u0474",
    "Izhitsadblgravecyrillic": "\u0476",
    "J": "\u004a",
    "Jaarmenian": "\u0541",
    "Jcircle": "\u24bf",
    "Jcircumflex": "\u0134",
    "Jecyrillic": "\u0408",
    "Jheharmenian": "\u054b",
    "Jmonospace": "\uff2a",
    "Jsmall": "\uf76a",
    "K": "\u004b",
    "KBsquare": "\u3385",
    "KKsquare": "\u33cd",
    "Kabashkircyrillic": "\u04a0",
    "Kacute": "\u1e30",
    "Kacyrillic": "\u041a",
    "Kadescendercyrillic": "\u049a",
    "Kahookcyrillic": "\u04c3",
    "Kappa": "\u039a",
    "Kastrokecyrillic": "\u049e",
    "Kaverticalstrokecyrillic": "\u049c",
    "Kcaron": "\u01e8",
    "Kcedilla": "\u0136",
    "Kcircle": "\u24c0",
    "Kcommaaccent": "\u0136",
    "Kdotbelow": "\u1e32",
    "Keharmenian": "\u0554",
    "Kenarmenian": "\u053f",
    "Khacyrillic": "\u0425",
    "Kheicoptic": "\u03e6",
    "Khook": "\u0198",
    "Kjecyrillic": "\u040c",
    "Klinebelow": "\u1e34",
    "Kmonospace": "\uff2b",
    "Koppacyrillic": "\u0480",
    "Koppagreek": "\u03de",
    "Ksicyrillic": "\u046e",
    "Ksmall": "\uf76b",
    "L": "\u004c",
    "LJ": "\u01c7",
    "LL": "\uf6bf",
    "Lacute": "\u0139",
    "Lambda": "\u039b",
    "Lcaron": "\u013d",
    "Lcedilla": "\u013b",
    "Lcircle": "\u24c1",
    "Lcircumflexbelow": "\u1e3c",
    "Lcommaaccent": "\u013b",
    "Ldot": "\u013f",
    "Ldotaccent": "\u013f",
    "Ldotbelow": "\u1e36",
    "Ldotbelowmacron": "\u1e38",
    "Liwnarmenian": "\u053c",
    "Lj": "\u01c8",
    "Ljecyrillic": "\u0409",
    "Llinebelow": "\u1e3a",
    "Lmonospace": "\uff2c",
    "Lslash": "\u0141",
    "Lslashsmall": "\uf6f9",
    "Lsmall": "\uf76c",
    "M": "\u004d",
    "MBsquare": "\u3386",
    "Macron": "\uf6d0",
    "Macronsmall": "\uf7af",
    "Macute": "\u1e3e",
    "Mcircle": "\u24c2",
    "Mdotaccent": "\u1e40",
    "Mdotbelow": "\u1e42",
    "Menarmenian": "\u0544",
    "Mmonospace": "\uff2d",
    "Msmall": "\uf76d",
    "Mturned": "\u019c",
    "Mu": "\u039c",
    "N": "\u004e",
    "NJ": "\u01ca",
    "Nacute": "\u0143",
    "Ncaron": "\u0147",
    "Ncedilla": "\u0145",
    "Ncircle": "\u24c3",
    "Ncircumflexbelow": "\u1e4a",
    "Ncommaaccent": "\u0145",
    "Ndotaccent": "\u1e44",
    "Ndotbelow": "\u1e46",
    "Nhookleft": "\u019d",
    "Nineroman": "\u2168",
    "Nj": "\u01cb",
    "Njecyrillic": "\u040a",
    "Nlinebelow": "\u1e48",
    "Nmonospace": "\uff2e",
    "Nowarmenian": "\u0546",
    "Nsmall": "\uf76e",
    "Ntilde": "\u00d1",
    "Ntildesmall": "\uf7f1",
    "Nu": "\u039d",
    "O": "\u004f",
    "OE": "\u0152",
    "OEsmall": "\uf6fa",
    "Oacute": "\u00d3",
    "Oacutesmall": "\uf7f3",
    "Obarredcyrillic": "\u04e8",
    "Obarreddieresiscyrillic": "\u04ea",
    "Obreve": "\u014e",
    "Ocaron": "\u01d1",
    "Ocenteredtilde": "\u019f",
    "Ocircle": "\u24c4",
    "Ocircumflex": "\u00d4",
    "Ocircumflexacute": "\u1ed0",
    "Ocircumflexdotbelow": "\u1ed8",
    "Ocircumflexgrave": "\u1ed2",
    "Ocircumflexhookabove": "\u1ed4",
    "Ocircumflexsmall": "\uf7f4",
    "Ocircumflextilde": "\u1ed6",
    "Ocyrillic": "\u041e",
    "Odblacute": "\u0150",
    "Odblgrave": "\u020c",
    "Odieresis": "\u00d6",
    "Odieresiscyrillic": "\u04e6",
    "Odieresissmall": "\uf7f6",
    "Odotbelow": "\u1ecc",
    "Ogoneksmall": "\uf6fb",
    "Ograve": "\u00d2",
    "Ogravesmall": "\uf7f2",
    "Oharmenian": "\u0555",
    "Ohm": "\u2126",
    "Ohookabove": "\u1ece",
    "Ohorn": "\u01a0",
    "Ohornacute": "\u1eda",
    "Ohorndotbelow": "\u1ee2",
    "Ohorngrave": "\u1edc",
    "Ohornhookabove": "\u1ede",
    "Ohorntilde": "\u1ee0",
    "Ohungarumlaut": "\u0150",
    "Oi": "\u01a2",
    "Oinvertedbreve": "\u020e",
    "Omacron": "\u014c",
    "Omacronacute": "\u1e52",
    "Omacrongrave": "\u1e50",
    "Omega": "\u2126",
    "Omegacyrillic": "\u0460",
    "Omegagreek": "\u03a9",
    "Omegaroundcyrillic": "\u047a",
    "Omegatitlocyrillic": "\u047c",
    "Omegatonos": "\u038f",
    "Omicron": "\u039f",
    "Omicrontonos": "\u038c",
    "Omonospace": "\uff2f",
    "Oneroman": "\u2160",
    "Oogonek": "\u01ea",
    "Oogonekmacron": "\u01ec",
    "Oopen": "\u0186",
    "Oslash": "\u00d8",
    "Oslashacute": "\u01fe",
    "Oslashsmall": "\uf7f8",
    "Osmall": "\uf76f",
    "Ostrokeacute": "\u01fe",
    "Otcyrillic": "\u047e",
    "Otilde": "\u00d5",
    "Otildeacute": "\u1e4c",
    "Otildedieresis": "\u1e4e",
    "Otildesmall": "\uf7f5",
    "P": "\u0050",
    "Pacute": "\u1e54",
    "Pcircle": "\u24c5",
    "Pdotaccent": "\u1e56",
    "Pecyrillic": "\u041f",
    "Peharmenian": "\u054a",
    "Pemiddlehookcyrillic": "\u04a6",
    "Phi": "\u03a6",
    "Phook": "\u01a4",
    "Pi": "\u03a0",
    "Piwrarmenian": "\u0553",
    "Pmonospace": "\uff30",
    "Psi": "\u03a8",
    "Psicyrillic": "\u0470",
    "Psmall": "\uf770",
    "Q": "\u0051",
    "Qcircle": "\u24c6",
    "Qmonospace": "\uff31",
    "Qsmall": "\uf771",
    "R": "\u0052",
    "Raarmenian": "\u054c",
    "Racute": "\u0154",
    "Rcaron": "\u0158",
    "Rcedilla": "\u0156",
    "Rcircle": "\u24c7",
    "Rcommaaccent": "\u0156",
    "Rdblgrave": "\u0210",
    "Rdotaccent": "\u1e58",
    "Rdotbelow": "\u1e5a",
    "Rdotbelowmacron": "\u1e5c",
    "Reharmenian": "\u0550",
    "Rfraktur": "\u211c",
    "Rho": "\u03a1",
    "Ringsmall": "\uf6fc",
    "Rinvertedbreve": "\u0212",
    "Rlinebelow": "\u1e5e",
    "Rmonospace": "\uff32",
    "Rsmall": "\uf772",
    "Rsmallinverted": "\u0281",
    "Rsmallinvertedsuperior": "\u02b6",
    "S": "\u0053",
    "SF010000": "\u250c",
    "SF020000": "\u2514",
    "SF030000": "\u2510",
    "SF040000": "\u2518",
    "SF050000": "\u253c",
    "SF060000": "\u252c",
    "SF070000": "\u2534",
    "SF080000": "\u251c",
    "SF090000": "\u2524",
    "SF100000": "\u2500",
    "SF110000": "\u2502",
    "SF190000": "\u2561",
    "SF200000": "\u2562",
    "SF210000": "\u2556",
    "SF220000": "\u2555",
    "SF230000": "\u2563",
    "SF240000": "\u2551",
    "SF250000": "\u2557",
    "SF260000": "\u255d",
    "SF270000": "\u255c",
    "SF280000": "\u255b",
    "SF360000": "\u255e",
    "SF370000": "\u255f",
    "SF380000": "\u255a",
    "SF390000": "\u2554",
    "SF400000": "\u2569",
    "SF410000": "\u2566",
    "SF420000": "\u2560",
    "SF430000": "\u2550",
    "SF440000": "\u256c",
    "SF450000": "\u2567",
    "SF460000": "\u2568",
    "SF470000": "\u2564",
    "SF480000": "\u2565",
    "SF490000": "\u2559",
    "SF500000": "\u2558",
    "SF510000": "\u2552",
    "SF520000": "\u2553",
    "SF530000": "\u256b",
    "SF540000": "\u256a",
    "Sacute": "\u015a",
    "Sacutedotaccent": "\u1e64",
    "Sampigreek": "\u03e0",
    "Scaron": "\u0160",
    "Scarondotaccent": "\u1e66",
    "Scaronsmall": "\uf6fd",
    "Scedilla": "\u015e",
    "Schwa": "\u018f",
    "Schwacyrillic": "\u04d8",
    "Schwadieresiscyrillic": "\u04da",
    "Scircle": "\u24c8",
    "Scircumflex": "\u015c",
    "Scommaaccent": "\u0218",
    "Sdotaccent": "\u1e60",
    "Sdotbelow": "\u1e62",
    "Sdotbelowdotaccent": "\u1e68",
    "Seharmenian": "\u054d",
    "Sevenroman": "\u2166",
    "Shaarmenian": "\u0547",
    "Shacyrillic": "\u0428",
    "Shchacyrillic": "\u0429",
    "Sheicoptic": "\u03e2",
    "Shhacyrillic": "\u04ba",
    "Shimacoptic": "\u03ec",
    "Sigma": "\u03a3",
    "Sixroman": "\u2165",
    "Smonospace": "\uff33",
    "Softsigncyrillic": "\u042c",
    "Ssmall": "\uf773",
    "Stigmagreek": "\u03da",
    "T": "\u0054",
    "Tau": "\u03a4",
    "Tbar": "\u0166",
    "Tcaron": "\u0164",
    "Tcedilla": "\u0162",
    "Tcircle": "\u24c9",
    "Tcircumflexbelow": "\u1e70",
    "Tcommaaccent": "\u0162",
    "Tdotaccent": "\u1e6a",
    "Tdotbelow": "\u1e6c",
    "Tecyrillic": "\u0422",
    "Tedescendercyrillic": "\u04ac",
    "Tenroman": "\u2169",
    "Tetsecyrillic": "\u04b4",
    "Theta": "\u0398",
    "Thook": "\u01ac",
    "Thorn": "\u00de",
    "Thornsmall": "\uf7fe",
    "Threeroman": "\u2162",
    "Tildesmall": "\uf6fe",
    "Tiwnarmenian": "\u054f",
    "Tlinebelow": "\u1e6e",
    "Tmonospace": "\uff34",
    "Toarmenian": "\u0539",
    "Tonefive": "\u01bc",
    "Tonesix": "\u0184",
    "Tonetwo": "\u01a7",
    "Tretroflexhook": "\u01ae",
    "Tsecyrillic": "\u0426",
    "Tshecyrillic": "\u040b",
    "Tsmall": "\uf774",
    "Twelveroman": "\u216b",
    "Tworoman": "\u2161",
    "U": "\u0055",
    "Uacute": "\u00da",
    "Uacutesmall": "\uf7fa",
    "Ubreve": "\u016c",
    "Ucaron": "\u01d3",
    "Ucircle": "\u24ca",
    "Ucircumflex": "\u00db",
    "Ucircumflexbelow": "\u1e76",
    "Ucircumflexsmall": "\uf7fb",
    "Ucyrillic": "\u0423",
    "Udblacute": "\u0170",
    "Udblgrave": "\u0214",
    "Udieresis": "\u00dc",
    "Udieresisacute": "\u01d7",
    "Udieresisbelow": "\u1e72",
    "Udieresiscaron": "\u01d9",
    "Udieresiscyrillic": "\u04f0",
    "Udieresisgrave": "\u01db",
    "Udieresismacron": "\u01d5",
    "Udieresissmall": "\uf7fc",
    "Udotbelow": "\u1ee4",
    "Ugrave": "\u00d9",
    "Ugravesmall": "\uf7f9",
    "Uhookabove": "\u1ee6",
    "Uhorn": "\u01af",
    "Uhornacute": "\u1ee8",
    "Uhorndotbelow": "\u1ef0",
    "Uhorngrave": "\u1eea",
    "Uhornhookabove": "\u1eec",
    "Uhorntilde": "\u1eee",
    "Uhungarumlaut": "\u0170",
    "Uhungarumlautcyrillic": "\u04f2",
    "Uinvertedbreve": "\u0216",
    "Ukcyrillic": "\u0478",
    "Umacron": "\u016a",
    "Umacroncyrillic": "\u04ee",
    "Umacrondieresis": "\u1e7a",
    "Umonospace": "\uff35",
    "Uogonek": "\u0172",
    "Upsilon": "\u03a5",
    "Upsilon1": "\u03d2",
    "Upsilonacutehooksymbolgreek": "\u03d3",
    "Upsilonafrican": "\u01b1",
    "Upsilondieresis": "\u03ab",
    "Upsilondieresishooksymbolgreek": "\u03d4",
    "Upsilonhooksymbol": "\u03d2",
    "Upsilontonos": "\u038e",
    "Uring": "\u016e",
    "Ushortcyrillic": "\u040e",
    "Usmall": "\uf775",
    "Ustraightcyrillic": "\u04ae",
    "Ustraightstrokecyrillic": "\u04b0",
    "Utilde": "\u0168",
    "Utildeacute": "\u1e78",
    "Utildebelow": "\u1e74",
    "V": "\u0056",
    "Vcircle": "\u24cb",
    "Vdotbelow": "\u1e7e",
    "Vecyrillic": "\u0412",
    "Vewarmenian": "\u054e",
    "Vhook": "\u01b2",
    "Vmonospace": "\uff36",
    "Voarmenian": "\u0548",
    "Vsmall": "\uf776",
    "Vtilde": "\u1e7c",
    "W": "\u0057",
    "Wacute": "\u1e82",
    "Wcircle": "\u24cc",
    "Wcircumflex": "\u0174",
    "Wdieresis": "\u1e84",
    "Wdotaccent": "\u1e86",
    "Wdotbelow": "\u1e88",
    "Wgrave": "\u1e80",
    "Wmonospace": "\uff37",
    "Wsmall": "\uf777",
    "X": "\u0058",
    "Xcircle": "\u24cd",
    "Xdieresis": "\u1e8c",
    "Xdotaccent": "\u1e8a",
    "Xeharmenian": "\u053d",
    "Xi": "\u039e",
    "Xmonospace": "\uff38",
    "Xsmall": "\uf778",
    "Y": "\u0059",
    "Yacute": "\u00dd",
    "Yacutesmall": "\uf7fd",
    "Yatcyrillic": "\u0462",
    "Ycircle": "\u24ce",
    "Ycircumflex": "\u0176",
    "Ydieresis": "\u0178",
    "Ydieresissmall": "\uf7ff",
    "Ydotaccent": "\u1e8e",
    "Ydotbelow": "\u1ef4",
    "Yericyrillic": "\u042b",
    "Yerudieresiscyrillic": "\u04f8",
    "Ygrave": "\u1ef2",
    "Yhook": "\u01b3",
    "Yhookabove": "\u1ef6",
    "Yiarmenian": "\u0545",
    "Yicyrillic": "\u0407",
    "Yiwnarmenian": "\u0552",
    "Ymonospace": "\uff39",
    "Ysmall": "\uf779",
    "Ytilde": "\u1ef8",
    "Yusbigcyrillic": "\u046a",
    "Yusbigiotifiedcyrillic": "\u046c",
    "Yuslittlecyrillic": "\u0466",
    "Yuslittleiotifiedcyrillic": "\u0468",
    "Z": "\u005a",
    "Zaarmenian": "\u0536",
    "Zacute": "\u0179",
    "Zcaron": "\u017d",
    "Zcaronsmall": "\uf6ff",
    "Zcircle": "\u24cf",
    "Zcircumflex": "\u1e90",
    "Zdot": "\u017b",
    "Zdotaccent": "\u017b",
    "Zdotbelow": "\u1e92",
    "Zecyrillic": "\u0417",
    "Zedescendercyrillic": "\u0498",
    "Zedieresiscyrillic": "\u04de",
    "Zeta": "\u0396",
    "Zhearmenian": "\u053a",
    "Zhebrevecyrillic": "\u04c1",
    "Zhecyrillic": "\u0416",
    "Zhedescendercyrillic": "\u0496",
    "Zhedieresiscyrillic": "\u04dc",
    "Zlinebelow": "\u1e94",
    "Zmonospace": "\uff3a",
    "Zsmall": "\uf77a",
    "Zstroke": "\u01b5",
    "a": "\u0061",
    "aabengali": "\u0986",
    "aacute": "\u00e1",
    "aadeva": "\u0906",
    "aagujarati": "\u0a86",
    "aagurmukhi": "\u0a06",
    "aamatragurmukhi": "\u0a3e",
    "aarusquare": "\u3303",
    "aavowelsignbengali": "\u09be",
    "aavowelsigndeva": "\u093e",
    "aavowelsigngujarati": "\u0abe",
    "abbreviationmarkarmenian": "\u055f",
    "abbreviationsigndeva": "\u0970",
    "abengali": "\u0985",
    "abopomofo": "\u311a",
    "abreve": "\u0103",
    "abreveacute": "\u1eaf",
    "abrevecyrillic": "\u04d1",
    "abrevedotbelow": "\u1eb7",
    "abrevegrave": "\u1eb1",
    "abrevehookabove": "\u1eb3",
    "abrevetilde": "\u1eb5",
    "acaron": "\u01ce",
    "acircle": "\u24d0",
    "acircumflex": "\u00e2",
    "acircumflexacute": "\u1ea5",
    "acircumflexdotbelow": "\u1ead",
    "acircumflexgrave": "\u1ea7",
    "acircumflexhookabove": "\u1ea9",
    "acircumflextilde": "\u1eab",
    "acute": "\u00b4",
    "acutebelowcmb": "\u0317",
    "acutecmb": "\u0301",
    "acutecomb": "\u0301",
    "acutedeva": "\u0954",
    "acutelowmod": "\u02cf",
    "acutetonecmb": "\u0341",
    "acyrillic": "\u0430",
    "adblgrave": "\u0201",
    "addakgurmukhi": "\u0a71",
    "adeva": "\u0905",
    "adieresis": "\u00e4",
    "adieresiscyrillic": "\u04d3",
    "adieresismacron": "\u01df",
    "adotbelow": "\u1ea1",
    "adotmacron": "\u01e1",
    "ae": "\u00e6",
    "aeacute": "\u01fd",
    "aekorean": "\u3150",
    "aemacron": "\u01e3",
    "afii00208": "\u2015",
    "afii08941": "\u20a4",
    "afii10017": "\u0410",
    "afii10018": "\u0411",
    "afii10019": "\u0412",
    "afii10020": "\u0413",
    "afii10021": "\u0414",
    "afii10022": "\u0415",
    "afii10023": "\u0401",
    "afii10024": "\u0416",
    "afii10025": "\u0417",
    "afii10026": "\u0418",
    "afii10027": "\u0419",
    "afii10028": "\u041a",
    "afii10029": "\u041b",
    "afii10030": "\u041c",
    "afii10031": "\u041d",
    "afii10032": "\u041e",
    "afii10033": "\u041f",
    "afii10034": "\u0420",
    "afii10035": "\u0421",
    "afii10036": "\u0422",
    "afii10037": "\u0423",
    "afii10038": "\u0424",
    "afii10039": "\u0425",
    "afii10040": "\u0426",
    "afii10041": "\u0427",
    "afii10042": "\u0428",
    "afii10043": "\u0429",
    "afii10044": "\u042a",
    "afii10045": "\u042b",
    "afii10046": "\u042c",
    "afii10047": "\u042d",
    "afii10048": "\u042e",
    "afii10049": "\u042f",
    "afii10050": "\u0490",
    "afii10051": "\u0402",
    "afii10052": "\u0403",
    "afii10053": "\u0404",
    "afii10054": "\u0405",
    "afii10055": "\u0406",
    "afii10056": "\u0407",
    "afii10057": "\u0408",
    "afii10058": "\u0409",
    "afii10059": "\u040a",
    "afii10060": "\u040b",
    "afii10061": "\u040c",
    "afii10062": "\u040e",
    "afii10063": "\uf6c4",
    "afii10064": "\uf6c5",
    "afii10065": "\u0430",
    "afii10066": "\u0431",
    "afii10067": "\u0432",
    "afii10068": "\u0433",
    "afii10069": "\u0434",
    "afii10070": "\u0435",
    "afii10071": "\u0451",
    "afii10072": "\u0436",
    "afii10073": "\u0437",
    "afii10074": "\u0438",
    "afii10075": "\u0439",
    "afii10076": "\u043a",
    "afii10077": "\u043b",
    "afii10078": "\u043c",
    "afii10079": "\u043d",
    "afii10080": "\u043e",
    "afii10081": "\u043f",
    "afii10082": "\u0440",
    "afii10083": "\u0441",
    "afii10084": "\u0442",
    "afii10085": "\u0443",
    "afii10086": "\u0444",
    "afii10087": "\u0445",
    "afii10088": "\u0446",
    "afii10089": "\u0447",
    "afii10090": "\u0448",
    "afii10091": "\u0449",
    "afii10092": "\u044a",
    "afii10093": "\u044b",
    "afii10094": "\u044c",
    "afii10095": "\u044d",
    "afii10096": "\u044e",
    "afii10097": "\u044f",
    "afii10098": "\u0491",
    "afii10099": "\u0452",
    "afii10100": "\u0453",
    "afii10101": "\u0454",
    "afii10102": "\u0455",
    "afii10103": "\u0456",
    "afii10104": "\u0457",
    "afii10105": "\u0458",
    "afii10106": "\u0459",
    "afii10107": "\u045a",
    "afii10108": "\u045b",
    "afii10109": "\u045c",
    "afii10110": "\u045e",
    "afii10145": "\u040f",
    "afii10146": "\u0462",
    "afii10147": "\u0472",
    "afii10148": "\u0474",
    "afii10192": "\uf6c6",
    "afii10193": "\u045f",
    "afii10194": "\u0463",
    "afii10195": "\u0473",
    "afii10196": "\u0475",
    "afii10831": "\uf6c7",
    "afii10832": "\uf6c8",
    "afii10846": "\u04d9",
    "afii299": "\u200e",
    "afii300": "\u200f",
    "afii301": "\u200d",
    "afii57381": "\u066a",
    "afii57388": "\u060c",
    "afii57392": "\u0660",
    "afii57393": "\u0661",
    "afii57394": "\u0662",
    "afii57395": "\u0663",
    "afii57396": "\u0664",
    "afii57397": "\u0665",
    "afii57398": "\u0666",
    "afii57399": "\u0667",
    "afii57400": "\u0668",
    "afii57401": "\u0669",
    "afii57403": "\u061b",
    "afii57407": "\u061f",
    "afii57409": "\u0621",
    "afii57410": "\u0622",
    "afii57411": "\u0623",
    "afii57412": "\u0624",
    "afii57413": "\u0625",
    "afii57414": "\u0626",
    "afii57415": "\u0627",
    "afii57416": "\u0628",
    "afii57417": "\u0629",
    "afii57418": "\u062a",
    "afii57419": "\u062b",
    "afii57420": "\u062c",
    "afii57421": "\u062d",
    "afii57422": "\u062e",
    "afii57423": "\u062f",
    "afii57424": "\u0630",
    "afii57425": "\u0631",
    "afii57426": "\u0632",
    "afii57427": "\u0633",
    "afii57428": "\u0634",
    "afii57429": "\u0635",
    "afii57430": "\u0636",
    "afii57431": "\u0637",
    "afii57432": "\u0638",
    "afii57433": "\u0639",
    "afii57434": "\u063a",
    "afii57440": "\u0640",
    "afii57441": "\u0641",
    "afii57442": "\u0642",
    "afii57443": "\u0643",
    "afii57444": "\u0644",
    "afii57445": "\u0645",
    "afii57446": "\u0646",
    "afii57448": "\u0648",
    "afii57449": "\u0649",
    "afii57450": "\u064a",
    "afii57451": "\u064b",
    "afii57452": "\u064c",
    "afii57453": "\u064d",
    "afii57454": "\u064e",
    "afii57455": "\u064f",
    "afii57456": "\u0650",
    "afii57457": "\u0651",
    "afii57458": "\u0652",
    "afii57470": "\u0647",
    "afii57505": "\u06a4",
    "afii57506": "\u067e",
    "afii57507": "\u0686",
    "afii57508": "\u0698",
    "afii57509": "\u06af",
    "afii57511": "\u0679",
    "afii57512": "\u0688",
    "afii57513": "\u0691",
    "afii57514": "\u06ba",
    "afii57519": "\u06d2",
    "afii57534": "\u06d5",
    "afii57636": "\u20aa",
    "afii57645": "\u05be",
    "afii57658": "\u05c3",
    "afii57664": "\u05d0",
    "afii57665": "\u05d1",
    "afii57666": "\u05d2",
    "afii57667": "\u05d3",
    "afii57668": "\u05d4",
    "afii57669": "\u05d5",
    "afii57670": "\u05d6",
    "afii57671": "\u05d7",
    "afii57672": "\u05d8",
    "afii57673": "\u05d9",
    "afii57674": "\u05da",
    "afii57675": "\u05db",
    "afii57676": "\u05dc",
    "afii57677": "\u05dd",
    "afii57678": "\u05de",
    "afii57679": "\u05df",
    "afii57680": "\u05e0",
    "afii57681": "\u05e1",
    "afii57682": "\u05e2",
    "afii57683": "\u05e3",
    "afii57684": "\u05e4",
    "afii57685": "\u05e5",
    "afii57686": "\u05e6",
    "afii57687": "\u05e7",
    "afii57688": "\u05e8",
    "afii57689": "\u05e9",
    "afii57690": "\u05ea",
    "afii57694": "\ufb2a",
    "afii57695": "\ufb2b",
    "afii57700": "\ufb4b",
    "afii57705": "\ufb1f",
    "afii57716": "\u05f0",
    "afii57717": "\u05f1",
    "afii57718": "\u05f2",
    "afii57723": "\ufb35",
    "afii57793": "\u05b4",
    "afii57794": "\u05b5",
    "afii57795": "\u05b6",
    "afii57796": "\u05bb",
    "afii57797": "\u05b8",
    "afii57798": "\u05b7",
    "afii57799": "\u05b0",
    "afii57800": "\u05b2",
    "afii57801": "\u05b1",
    "afii57802": "\u05b3",
    "afii57803": "\u05c2",
    "afii57804": "\u05c1",
    "afii57806": "\u05b9",
    "afii57807": "\u05bc",
    "afii57839": "\u05bd",
    "afii57841": "\u05bf",
    "afii57842": "\u05c0",
    "afii57929": "\u02bc",
    "afii61248": "\u2105",
    "afii61289": "\u2113",
    "afii61352": "\u2116",
    "afii61573": "\u202c",
    "afii61574": "\u202d",
    "afii61575": "\u202e",
    "afii61664": "\u200c",
    "afii63167": "\u066d",
    "afii64937": "\u02bd",
    "agrave": "\u00e0",
    "agujarati": "\u0a85",
    "agurmukhi": "\u0a05",
    "ahiragana": "\u3042",
    "ahookabove": "\u1ea3",
    "aibengali": "\u0990",
    "aibopomofo": "\u311e",
    "aideva": "\u0910",
    "aiecyrillic": "\u04d5",
    "aigujarati": "\u0a90",
    "aigurmukhi": "\u0a10",
    "aimatragurmukhi": "\u0a48",
    "ainarabic": "\u0639",
    "ainfinalarabic": "\ufeca",
    "aininitialarabic": "\ufecb",
    "ainmedialarabic": "\ufecc",
    "ainvertedbreve": "\u0203",
    "aivowelsignbengali": "\u09c8",
    "aivowelsigndeva": "\u0948",
    "aivowelsigngujarati": "\u0ac8",
    "akatakana": "\u30a2",
    "akatakanahalfwidth": "\uff71",
    "akorean": "\u314f",
    "alef": "\u05d0",
    "alefarabic": "\u0627",
    "alefdageshhebrew": "\ufb30",
    "aleffinalarabic": "\ufe8e",
    "alefhamzaabovearabic": "\u0623",
    "alefhamzaabovefinalarabic": "\ufe84",
    "alefhamzabelowarabic": "\u0625",
    "alefhamzabelowfinalarabic": "\ufe88",
    "alefhebrew": "\u05d0",
    "aleflamedhebrew": "\ufb4f",
    "alefmaddaabovearabic": "\u0622",
    "alefmaddaabovefinalarabic": "\ufe82",
    "alefmaksuraarabic": "\u0649",
    "alefmaksurafinalarabic": "\ufef0",
    "alefmaksurainitialarabic": "\ufef3",
    "alefmaksuramedialarabic": "\ufef4",
    "alefpatahhebrew": "\ufb2e",
    "alefqamatshebrew": "\ufb2f",
    "aleph": "\u2135",
    "allequal": "\u224c",
    "alpha": "\u03b1",
    "alphatonos": "\u03ac",
    "amacron": "\u0101",
    "amonospace": "\uff41",
    "ampersand": "\u0026",
    "ampersandmonospace": "\uff06",
    "ampersandsmall": "\uf726",
    "amsquare": "\u33c2",
    "anbopomofo": "\u3122",
    "angbopomofo": "\u3124",
    "angkhankhuthai": "\u0e5a",
    "angle": "\u2220",
    "anglebracketleft": "\u3008",
    "anglebracketleftvertical": "\ufe3f",
    "anglebracketright": "\u3009",
    "anglebracketrightvertical": "\ufe40",
    "angleleft": "\u2329",
    "angleright": "\u232a",
    "angstrom": "\u212b",
    "anoteleia": "\u0387",
    "anudattadeva": "\u0952",
    "anusvarabengali": "\u0982",
    "anusvaradeva": "\u0902",
    "anusvaragujarati": "\u0a82",
    "aogonek": "\u0105",
    "apaatosquare": "\u3300",
    "aparen": "\u249c",
    "apostrophearmenian": "\u055a",
    "apostrophemod": "\u02bc",
    "apple": "\uf8ff",
    "approaches": "\u2250",
    "approxequal": "\u2248",
    "approxequalorimage": "\u2252",
    "approximatelyequal": "\u2245",
    "araeaekorean": "\u318e",
    "araeakorean": "\u318d",
    "arc": "\u2312",
    "arighthalfring": "\u1e9a",
    "aring": "\u00e5",
    "aringacute": "\u01fb",
    "aringbelow": "\u1e01",
    "arrowboth": "\u2194",
    "arrowdashdown": "\u21e3",
    "arrowdashleft": "\u21e0",
    "arrowdashright": "\u21e2",
    "arrowdashup": "\u21e1",
    "arrowdblboth": "\u21d4",
    "arrowdbldown": "\u21d3",
    "arrowdblleft": "\u21d0",
    "arrowdblright": "\u21d2",
    "arrowdblup": "\u21d1",
    "arrowdown": "\u2193",
    "arrowdownleft": "\u2199",
    "arrowdownright": "\u2198",
    "arrowdownwhite": "\u21e9",
    "arrowheaddownmod": "\u02c5",
    "arrowheadleftmod": "\u02c2",
    "arrowheadrightmod": "\u02c3",
    "arrowheadupmod": "\u02c4",
    "arrowhorizex": "\uf8e7",
    "arrowleft": "\u2190",
    "arrowleftdbl": "\u21d0",
    "arrowleftdblstroke": "\u21cd",
    "arrowleftoverright": "\u21c6",
    "arrowleftwhite": "\u21e6",
    "arrowright": "\u2192",
    "arrowrightdblstroke": "\u21cf",
    "arrowrightheavy": "\u279e",
    "arrowrightoverleft": "\u21c4",
    "arrowrightwhite": "\u21e8",
    "arrowtableft": "\u21e4",
    "arrowtabright": "\u21e5",
    "arrowup": "\u2191",
    "arrowupdn": "\u2195",
    "arrowupdnbse": "\u21a8",
    "arrowupdownbase": "\u21a8",
    "arrowupleft": "\u2196",
    "arrowupleftofdown": "\u21c5",
    "arrowupright": "\u2197",
    "arrowupwhite": "\u21e7",
    "arrowvertex": "\uf8e6",
    "asciicircum": "\u005e",
    "asciicircummonospace": "\uff3e",
    "asciitilde": "\u007e",
    "asciitildemonospace": "\uff5e",
    "ascript": "\u0251",
    "ascriptturned": "\u0252",
    "asmallhiragana": "\u3041",
    "asmallkatakana": "\u30a1",
    "asmallkatakanahalfwidth": "\uff67",
    "asterisk": "\u002a",
    "asteriskaltonearabic": "\u066d",
    "asteriskarabic": "\u066d",
    "asteriskmath": "\u2217",
    "asteriskmonospace": "\uff0a",
    "asterisksmall": "\ufe61",
    "asterism": "\u2042",
    "asuperior": "\uf6e9",
    "asymptoticallyequal": "\u2243",
    "at": "\u0040",
    "atilde": "\u00e3",
    "atmonospace": "\uff20",
    "atsmall": "\ufe6b",
    "aturned": "\u0250",
    "aubengali": "\u0994",
    "aubopomofo": "\u3120",
    "audeva": "\u0914",
    "augujarati": "\u0a94",
    "augurmukhi": "\u0a14",
    "aulengthmarkbengali": "\u09d7",
    "aumatragurmukhi": "\u0a4c",
    "auvowelsignbengali": "\u09cc",
    "auvowelsigndeva": "\u094c",
    "auvowelsigngujarati": "\u0acc",
    "avagrahadeva": "\u093d",
    "aybarmenian": "\u0561",
    "ayin": "\u05e2",
    "ayinaltonehebrew": "\ufb20",
    "ayinhebrew": "\u05e2",
    "b": "\u0062",
    "babengali": "\u09ac",
    "backslash": "\u005c",
    "backslashmonospace": "\uff3c",
    "badeva": "\u092c",
    "bagujarati": "\u0aac",
    "bagurmukhi": "\u0a2c",
    "bahiragana": "\u3070",
    "bahtthai": "\u0e3f",
    "bakatakana": "\u30d0",
    "bar": "\u007c",
    "barmonospace": "\uff5c",
    "bbopomofo": "\u3105",
    "bcircle": "\u24d1",
    "bdotaccent": "\u1e03",
    "bdotbelow": "\u1e05",
    "beamedsixteenthnotes": "\u266c",
    "because": "\u2235",
    "becyrillic": "\u0431",
    "beharabic": "\u0628",
    "behfinalarabic": "\ufe90",
    "behinitialarabic": "\ufe91",
    "behiragana": "\u3079",
    "behmedialarabic": "\ufe92",
    "behmeeminitialarabic": "\ufc9f",
    "behmeemisolatedarabic": "\ufc08",
    "behnoonfinalarabic": "\ufc6d",
    "bekatakana": "\u30d9",
    "benarmenian": "\u0562",
    "bet": "\u05d1",
    "beta": "\u03b2",
    "betasymbolgreek": "\u03d0",
    "betdagesh": "\ufb31",
    "betdageshhebrew": "\ufb31",
    "bethebrew": "\u05d1",
    "betrafehebrew": "\ufb4c",
    "bhabengali": "\u09ad",
    "bhadeva": "\u092d",
    "bhagujarati": "\u0aad",
    "bhagurmukhi": "\u0a2d",
    "bhook": "\u0253",
    "bihiragana": "\u3073",
    "bikatakana": "\u30d3",
    "bilabialclick": "\u0298",
    "bindigurmukhi": "\u0a02",
    "birusquare": "\u3331",
    "blackcircle": "\u25cf",
    "blackdiamond": "\u25c6",
    "blackdownpointingtriangle": "\u25bc",
    "blackleftpointingpointer": "\u25c4",
    "blackleftpointingtriangle": "\u25c0",
    "blacklenticularbracketleft": "\u3010",
    "blacklenticularbracketleftvertical": "\ufe3b",
    "blacklenticularbracketright": "\u3011",
    "blacklenticularbracketrightvertical": "\ufe3c",
    "blacklowerlefttriangle": "\u25e3",
    "blacklowerrighttriangle": "\u25e2",
    "blackrectangle": "\u25ac",
    "blackrightpointingpointer": "\u25ba",
    "blackrightpointingtriangle": "\u25b6",
    "blacksmallsquare": "\u25aa",
    "blacksmilingface": "\u263b",
    "blacksquare": "\u25a0",
    "blackstar": "\u2605",
    "blackupperlefttriangle": "\u25e4",
    "blackupperrighttriangle": "\u25e5",
    "blackuppointingsmalltriangle": "\u25b4",
    "blackuppointingtriangle": "\u25b2",
    "blank": "\u2423",
    "blinebelow": "\u1e07",
    "block": "\u2588",
    "bmonospace": "\uff42",
    "bobaimaithai": "\u0e1a",
    "bohiragana": "\u307c",
    "bokatakana": "\u30dc",
    "bparen": "\u249d",
    "bqsquare": "\u33c3",
    "braceex": "\uf8f4",
    "braceleft": "\u007b",
    "braceleftbt": "\uf8f3",
    "braceleftmid": "\uf8f2",
    "braceleftmonospace": "\uff5b",
    "braceleftsmall": "\ufe5b",
    "bracelefttp": "\uf8f1",
    "braceleftvertical": "\ufe37",
    "braceright": "\u007d",
    "bracerightbt": "\uf8fe",
    "bracerightmid": "\uf8fd",
    "bracerightmonospace": "\uff5d",
    "bracerightsmall": "\ufe5c",
    "bracerighttp": "\uf8fc",
    "bracerightvertical": "\ufe38",
    "bracketleft": "\u005b",
    "bracketleftbt": "\uf8f0",
    "bracketleftex": "\uf8ef",
    "bracketleftmonospace": "\uff3b",
    "bracketlefttp": "\uf8ee",
    "bracketright": "\u005d",
    "bracketrightbt": "\uf8fb",
    "bracketrightex": "\uf8fa",
    "bracketrightmonospace": "\uff3d",
    "bracketrighttp": "\uf8f9",
    "breve": "\u02d8",
    "brevebelowcmb": "\u032e",
    "brevecmb": "\u0306",
    "breveinvertedbelowcmb": "\u032f",
    "breveinvertedcmb": "\u0311",
    "breveinverteddoublecmb": "\u0361",
    "bridgebelowcmb": "\u032a",
    "bridgeinvertedbelowcmb": "\u033a",
    "brokenbar": "\u00a6",
    "bstroke": "\u0180",
    "bsuperior": "\uf6ea",
    "btopbar": "\u0183",
    "buhiragana": "\u3076",
    "bukatakana": "\u30d6",
    "bullet": "\u2022",
    "bulletinverse": "\u25d8",
    "bulletoperator": "\u2219",
    "bullseye": "\u25ce",
    "c": "\u0063",
    "caarmenian": "\u056e",
    "cabengali": "\u099a",
    "cacute": "\u0107",
    "cadeva": "\u091a",
    "cagujarati": "\u0a9a",
    "cagurmukhi": "\u0a1a",
    "calsquare": "\u3388",
    "candrabindubengali": "\u0981",
    "candrabinducmb": "\u0310",
    "candrabindudeva": "\u0901",
    "candrabindugujarati": "\u0a81",
    "capslock": "\u21ea",
    "careof": "\u2105",
    "caron": "\u02c7",
    "caronbelowcmb": "\u032c",
    "caroncmb": "\u030c",
    "carriagereturn": "\u21b5",
    "cbopomofo": "\u3118",
    "ccaron": "\u010d",
    "ccedilla": "\u00e7",
    "ccedillaacute": "\u1e09",
    "ccircle": "\u24d2",
    "ccircumflex": "\u0109",
    "ccurl": "\u0255",
    "cdot": "\u010b",
    "cdotaccent": "\u010b",
    "cdsquare": "\u33c5",
    "cedilla": "\u00b8",
    "cedillacmb": "\u0327",
    "cent": "\u00a2",
    "centigrade": "\u2103",
    "centinferior": "\uf6df",
    "centmonospace": "\uffe0",
    "centoldstyle": "\uf7a2",
    "centsuperior": "\uf6e0",
    "chaarmenian": "\u0579",
    "chabengali": "\u099b",
    "chadeva": "\u091b",
    "chagujarati": "\u0a9b",
    "chagurmukhi": "\u0a1b",
    "chbopomofo": "\u3114",
    "cheabkhasiancyrillic": "\u04bd",
    "checkmark": "\u2713",
    "checyrillic": "\u0447",
    "chedescenderabkhasiancyrillic": "\u04bf",
    "chedescendercyrillic": "\u04b7",
    "chedieresiscyrillic": "\u04f5",
    "cheharmenian": "\u0573",
    "chekhakassiancyrillic": "\u04cc",
    "cheverticalstrokecyrillic": "\u04b9",
    "chi": "\u03c7",
    "chieuchacirclekorean": "\u3277",
    "chieuchaparenkorean": "\u3217",
    "chieuchcirclekorean": "\u3269",
    "chieuchkorean": "\u314a",
    "chieuchparenkorean": "\u3209",
    "chochangthai": "\u0e0a",
    "chochanthai": "\u0e08",
    "chochingthai": "\u0e09",
    "chochoethai": "\u0e0c",
    "chook": "\u0188",
    "cieucacirclekorean": "\u3276",
    "cieucaparenkorean": "\u3216",
    "cieuccirclekorean": "\u3268",
    "cieuckorean": "\u3148",
    "cieucparenkorean": "\u3208",
    "cieucuparenkorean": "\u321c",
    "circle": "\u25cb",
    "circlemultiply": "\u2297",
    "circleot": "\u2299",
    "circleplus": "\u2295",
    "circlepostalmark": "\u3036",
    "circlewithlefthalfblack": "\u25d0",
    "circlewithrighthalfblack": "\u25d1",
    "circumflex": "\u02c6",
    "circumflexbelowcmb": "\u032d",
    "circumflexcmb": "\u0302",
    "clear": "\u2327",
    "clickalveolar": "\u01c2",
    "clickdental": "\u01c0",
    "clicklateral": "\u01c1",
    "clickretroflex": "\u01c3",
    "club": "\u2663",
    "clubsuitblack": "\u2663",
    "clubsuitwhite": "\u2667",
    "cmcubedsquare": "\u33a4",
    "cmonospace": "\uff43",
    "cmsquaredsquare": "\u33a0",
    "coarmenian": "\u0581",
    "colon": "\u003a",
    "colonmonetary": "\u20a1",
    "colonmonospace": "\uff1a",
    "colonsign": "\u20a1",
    "colonsmall": "\ufe55",
    "colontriangularhalfmod": "\u02d1",
    "colontriangularmod": "\u02d0",
    "comma": "\u002c",
    "commaabovecmb": "\u0313",
    "commaaboverightcmb": "\u0315",
    "commaaccent": "\uf6c3",
    "commaarabic": "\u060c",
    "commaarmenian": "\u055d",
    "commainferior": "\uf6e1",
    "commamonospace": "\uff0c",
    "commareversedabovecmb": "\u0314",
    "commareversedmod": "\u02bd",
    "commasmall": "\ufe50",
    "commasuperior": "\uf6e2",
    "commaturnedabovecmb": "\u0312",
    "commaturnedmod": "\u02bb",
    "compass": "\u263c",
    "congruent": "\u2245",
    "contourintegral": "\u222e",
    "control": "\u2303",
    "controlACK": "\u0006",
    "controlBEL": "\u0007",
    "controlBS": "\u0008",
    "controlCAN": "\u0018",
    "controlCR": "\u000d",
    "controlDC1": "\u0011",
    "controlDC2": "\u0012",
    "controlDC3": "\u0013",
    "controlDC4": "\u0014",
    "controlDEL": "\u007f",
    "controlDLE": "\u0010",
    "controlEM": "\u0019",
    "controlENQ": "\u0005",
    "controlEOT": "\u0004",
    "controlESC": "\u001b",
    "controlETB": "\u0017",
    "controlETX": "\u0003",
    "controlFF": "\u000c",
    "controlFS": "\u001c",
    "controlGS": "\u001d",
    "controlHT": "\u0009",
    "controlLF": "\u000a",
    "controlNAK": "\u0015",
    "controlRS": "\u001e",
    "controlSI": "\u000f",
    "controlSO": "\u000e",
    "controlSOT": "\u0002",
    "controlSTX": "\u0001",
    "controlSUB": "\u001a",
    "controlSYN": "\u0016",
    "controlUS": "\u001f",
    "controlVT": "\u000b",
    "copyright": "\u00a9",
    "copyrightsans": "\uf8e9",
    "copyrightserif": "\uf6d9",
    "cornerbracketleft": "\u300c",
    "cornerbracketlefthalfwidth": "\uff62",
    "cornerbracketleftvertical": "\ufe41",
    "cornerbracketright": "\u300d",
    "cornerbracketrighthalfwidth": "\uff63",
    "cornerbracketrightvertical": "\ufe42",
    "corporationsquare": "\u337f",
    "cosquare": "\u33c7",
    "coverkgsquare": "\u33c6",
    "cparen": "\u249e",
    "cruzeiro": "\u20a2",
    "cstretched": "\u0297",
    "curlyand": "\u22cf",
    "curlyor": "\u22ce",
    "currency": "\u00a4",
    "cyrBreve": "\uf6d1",
    "cyrFlex": "\uf6d2",
    "cyrbreve": "\uf6d4",
    "cyrflex": "\uf6d5",
    "d": "\u0064",
    "daarmenian": "\u0564",
    "dabengali": "\u09a6",
    "dadarabic": "\u0636",
    "dadeva": "\u0926",
    "dadfinalarabic": "\ufebe",
    "dadinitialarabic": "\ufebf",
    "dadmedialarabic": "\ufec0",
    "dagesh": "\u05bc",
    "dageshhebrew": "\u05bc",
    "dagger": "\u2020",
    "daggerdbl": "\u2021",
    "dagujarati": "\u0aa6",
    "dagurmukhi": "\u0a26",
    "dahiragana": "\u3060",
    "dakatakana": "\u30c0",
    "dalarabic": "\u062f",
    "dalet": "\u05d3",
    "daletdagesh": "\ufb33",
    "daletdageshhebrew": "\ufb33",
    "dalethatafpatah": "\u05d3\u05b2",
    "dalethatafpatahhebrew": "\u05d3\u05b2",
    "dalethatafsegol": "\u05d3\u05b1",
    "dalethatafsegolhebrew": "\u05d3\u05b1",
    "dalethebrew": "\u05d3",
    "dalethiriq": "\u05d3\u05b4",
    "dalethiriqhebrew": "\u05d3\u05b4",
    "daletholam": "\u05d3\u05b9",
    "daletholamhebrew": "\u05d3\u05b9",
    "daletpatah": "\u05d3\u05b7",
    "daletpatahhebrew": "\u05d3\u05b7",
    "daletqamats": "\u05d3\u05b8",
    "daletqamatshebrew": "\u05d3\u05b8",
    "daletqubuts": "\u05d3\u05bb",
    "daletqubutshebrew": "\u05d3\u05bb",
    "daletsegol": "\u05d3\u05b6",
    "daletsegolhebrew": "\u05d3\u05b6",
    "daletsheva": "\u05d3\u05b0",
    "daletshevahebrew": "\u05d3\u05b0",
    "dalettsere": "\u05d3\u05b5",
    "dalettserehebrew": "\u05d3\u05b5",
    "dalfinalarabic": "\ufeaa",
    "dammaarabic": "\u064f",
    "dammalowarabic": "\u064f",
    "dammatanaltonearabic": "\u064c",
    "dammatanarabic": "\u064c",
    "danda": "\u0964",
    "dargahebrew": "\u05a7",
    "dargalefthebrew": "\u05a7",
    "dasiapneumatacyrilliccmb": "\u0485",
    "dblGrave": "\uf6d3",
    "dblanglebracketleft": "\u300a",
    "dblanglebracketleftvertical": "\ufe3d",
    "dblanglebracketright": "\u300b",
    "dblanglebracketrightvertical": "\ufe3e",
    "dblarchinvertedbelowcmb": "\u032b",
    "dblarrowleft": "\u21d4",
    "dblarrowright": "\u21d2",
    "dbldanda": "\u0965",
    "dblgrave": "\uf6d6",
    "dblgravecmb": "\u030f",
    "dblintegral": "\u222c",
    "dbllowline": "\u2017",
    "dbllowlinecmb": "\u0333",
    "dbloverlinecmb": "\u033f",
    "dblprimemod": "\u02ba",
    "dblverticalbar": "\u2016",
    "dblverticallineabovecmb": "\u030e",
    "dbopomofo": "\u3109",
    "dbsquare": "\u33c8",
    "dcaron": "\u010f",
    "dcedilla": "\u1e11",
    "dcircle": "\u24d3",
    "dcircumflexbelow": "\u1e13",
    "dcroat": "\u0111",
    "ddabengali": "\u09a1",
    "ddadeva": "\u0921",
    "ddagujarati": "\u0aa1",
    "ddagurmukhi": "\u0a21",
    "ddalarabic": "\u0688",
    "ddalfinalarabic": "\ufb89",
    "dddhadeva": "\u095c",
    "ddhabengali": "\u09a2",
    "ddhadeva": "\u0922",
    "ddhagujarati": "\u0aa2",
    "ddhagurmukhi": "\u0a22",
    "ddotaccent": "\u1e0b",
    "ddotbelow": "\u1e0d",
    "decimalseparatorarabic": "\u066b",
    "decimalseparatorpersian": "\u066b",
    "decyrillic": "\u0434",
    "degree": "\u00b0",
    "dehihebrew": "\u05ad",
    "dehiragana": "\u3067",
    "deicoptic": "\u03ef",
    "dekatakana": "\u30c7",
    "deleteleft": "\u232b",
    "deleteright": "\u2326",
    "delta": "\u03b4",
    "deltaturned": "\u018d",
    "denominatorminusonenumeratorbengali": "\u09f8",
    "dezh": "\u02a4",
    "dhabengali": "\u09a7",
    "dhadeva": "\u0927",
    "dhagujarati": "\u0aa7",
    "dhagurmukhi": "\u0a27",
    "dhook": "\u0257",
    "dialytikatonos": "\u0385",
    "dialytikatonoscmb": "\u0344",
    "diamond": "\u2666",
    "diamondsuitwhite": "\u2662",
    "dieresis": "\u00a8",
    "dieresisacute": "\uf6d7",
    "dieresisbelowcmb": "\u0324",
    "dieresiscmb": "\u0308",
    "dieresisgrave": "\uf6d8",
    "dieresistonos": "\u0385",
    "dihiragana": "\u3062",
    "dikatakana": "\u30c2",
    "dittomark": "\u3003",
    "divide": "\u00f7",
    "divides": "\u2223",
    "divisionslash": "\u2215",
    "djecyrillic": "\u0452",
    "dkshade": "\u2593",
    "dlinebelow": "\u1e0f",
    "dlsquare": "\u3397",
    "dmacron": "\u0111",
    "dmonospace": "\uff44",
    "dnblock": "\u2584",
    "dochadathai": "\u0e0e",
    "dodekthai": "\u0e14",
    "dohiragana": "\u3069",
    "dokatakana": "\u30c9",
    "dollar": "\u0024",
    "dollarinferior": "\uf6e3",
    "dollarmonospace": "\uff04",
    "dollaroldstyle": "\uf724",
    "dollarsmall": "\ufe69",
    "dollarsuperior": "\uf6e4",
    "dong": "\u20ab",
    "dorusquare": "\u3326",
    "dotaccent": "\u02d9",
    "dotaccentcmb": "\u0307",
    "dotbelowcmb": "\u0323",
    "dotbelowcomb": "\u0323",
    "dotkatakana": "\u30fb",
    "dotlessi": "\u0131",
    "dotlessj": "\uf6be",
    "dotlessjstrokehook": "\u0284",
    "dotmath": "\u22c5",
    "dottedcircle": "\u25cc",
    "doubleyodpatah": "\ufb1f",
    "doubleyodpatahhebrew": "\ufb1f",
    "downtackbelowcmb": "\u031e",
    "downtackmod": "\u02d5",
    "dparen": "\u249f",
    "dsuperior": "\uf6eb",
    "dtail": "\u0256",
    "dtopbar": "\u018c",
    "duhiragana": "\u3065",
    "dukatakana": "\u30c5",
    "dz": "\u01f3",
    "dzaltone": "\u02a3",
    "dzcaron": "\u01c6",
    "dzcurl": "\u02a5",
    "dzeabkhasiancyrillic": "\u04e1",
    "dzecyrillic": "\u0455",
    "dzhecyrillic": "\u045f",
    "e": "\u0065",
    "eacute": "\u00e9",
    "earth": "\u2641",
    "ebengali": "\u098f",
    "ebopomofo": "\u311c",
    "ebreve": "\u0115",
    "ecandradeva": "\u090d",
    "ecandragujarati": "\u0a8d",
    "ecandravowelsigndeva": "\u0945",
    "ecandravowelsigngujarati": "\u0ac5",
    "ecaron": "\u011b",
    "ecedillabreve": "\u1e1d",
    "echarmenian": "\u0565",
    "echyiwnarmenian": "\u0587",
    "ecircle": "\u24d4",
    "ecircumflex": "\u00ea",
    "ecircumflexacute": "\u1ebf",
    "ecircumflexbelow": "\u1e19",
    "ecircumflexdotbelow": "\u1ec7",
    "ecircumflexgrave": "\u1ec1",
    "ecircumflexhookabove": "\u1ec3",
    "ecircumflextilde": "\u1ec5",
    "ecyrillic": "\u0454",
    "edblgrave": "\u0205",
    "edeva": "\u090f",
    "edieresis": "\u00eb",
    "edot": "\u0117",
    "edotaccent": "\u0117",
    "edotbelow": "\u1eb9",
    "eegurmukhi": "\u0a0f",
    "eematragurmukhi": "\u0a47",
    "efcyrillic": "\u0444",
    "egrave": "\u00e8",
    "egujarati": "\u0a8f",
    "eharmenian": "\u0567",
    "ehbopomofo": "\u311d",
    "ehiragana": "\u3048",
    "ehookabove": "\u1ebb",
    "eibopomofo": "\u311f",
    "eight": "\u0038",
    "eightarabic": "\u0668",
    "eightbengali": "\u09ee",
    "eightcircle": "\u2467",
    "eightcircleinversesansserif": "\u2791",
    "eightdeva": "\u096e",
    "eighteencircle": "\u2471",
    "eighteenparen": "\u2485",
    "eighteenperiod": "\u2499",
    "eightgujarati": "\u0aee",
    "eightgurmukhi": "\u0a6e",
    "eighthackarabic": "\u0668",
    "eighthangzhou": "\u3028",
    "eighthnotebeamed": "\u266b",
    "eightideographicparen": "\u3227",
    "eightinferior": "\u2088",
    "eightmonospace": "\uff18",
    "eightoldstyle": "\uf738",
    "eightparen": "\u247b",
    "eightperiod": "\u248f",
    "eightpersian": "\u06f8",
    "eightroman": "\u2177",
    "eightsuperior": "\u2078",
    "eightthai": "\u0e58",
    "einvertedbreve": "\u0207",
    "eiotifiedcyrillic": "\u0465",
    "ekatakana": "\u30a8",
    "ekatakanahalfwidth": "\uff74",
    "ekonkargurmukhi": "\u0a74",
    "ekorean": "\u3154",
    "elcyrillic": "\u043b",
    "element": "\u2208",
    "elevencircle": "\u246a",
    "elevenparen": "\u247e",
    "elevenperiod": "\u2492",
    "elevenroman": "\u217a",
    "ellipsis": "\u2026",
    "ellipsisvertical": "\u22ee",
    "emacron": "\u0113",
    "emacronacute": "\u1e17",
    "emacrongrave": "\u1e15",
    "emcyrillic": "\u043c",
    "emdash": "\u2014",
    "emdashvertical": "\ufe31",
    "emonospace": "\uff45",
    "emphasismarkarmenian": "\u055b",
    "emptyset": "\u2205",
    "enbopomofo": "\u3123",
    "encyrillic": "\u043d",
    "endash": "\u2013",
    "endashvertical": "\ufe32",
    "endescendercyrillic": "\u04a3",
    "eng": "\u014b",
    "engbopomofo": "\u3125",
    "enghecyrillic": "\u04a5",
    "enhookcyrillic": "\u04c8",
    "enspace": "\u2002",
    "eogonek": "\u0119",
    "eokorean": "\u3153",
    "eopen": "\u025b",
    "eopenclosed": "\u029a",
    "eopenreversed": "\u025c",
    "eopenreversedclosed": "\u025e",
    "eopenreversedhook": "\u025d",
    "eparen": "\u24a0",
    "epsilon": "\u03b5",
    "epsilontonos": "\u03ad",
    "equal": "\u003d",
    "equalmonospace": "\uff1d",
    "equalsmall": "\ufe66",
    "equalsuperior": "\u207c",
    "equivalence": "\u2261",
    "erbopomofo": "\u3126",
    "ercyrillic": "\u0440",
    "ereversed": "\u0258",
    "ereversedcyrillic": "\u044d",
    "escyrillic": "\u0441",
    "esdescendercyrillic": "\u04ab",
    "esh": "\u0283",
    "eshcurl": "\u0286",
    "eshortdeva": "\u090e",
    "eshortvowelsigndeva": "\u0946",
    "eshreversedloop": "\u01aa",
    "eshsquatreversed": "\u0285",
    "esmallhiragana": "\u3047",
    "esmallkatakana": "\u30a7",
    "esmallkatakanahalfwidth": "\uff6a",
    "estimated": "\u212e",
    "esuperior": "\uf6ec",
    "eta": "\u03b7",
    "etarmenian": "\u0568",
    "etatonos": "\u03ae",
    "eth": "\u00f0",
    "etilde": "\u1ebd",
    "etildebelow": "\u1e1b",
    "etnahtafoukhhebrew": "\u0591",
    "etnahtafoukhlefthebrew": "\u0591",
    "etnahtahebrew": "\u0591",
    "etnahtalefthebrew": "\u0591",
    "eturned": "\u01dd",
    "eukorean": "\u3161",
    "euro": "\u20ac",
    "evowelsignbengali": "\u09c7",
    "evowelsigndeva": "\u0947",
    "evowelsigngujarati": "\u0ac7",
    "exclam": "\u0021",
    "exclamarmenian": "\u055c",
    "exclamdbl": "\u203c",
    "exclamdown": "\u00a1",
    "exclamdownsmall": "\uf7a1",
    "exclammonospace": "\uff01",
    "exclamsmall": "\uf721",
    "existential": "\u2203",
    "ezh": "\u0292",
    "ezhcaron": "\u01ef",
    "ezhcurl": "\u0293",
    "ezhreversed": "\u01b9",
    "ezhtail": "\u01ba",
    "f": "\u0066",
    "fadeva": "\u095e",
    "fagurmukhi": "\u0a5e",
    "fahrenheit": "\u2109",
    "fathaarabic": "\u064e",
    "fathalowarabic": "\u064e",
    "fathatanarabic": "\u064b",
    "fbopomofo": "\u3108",
    "fcircle": "\u24d5",
    "fdotaccent": "\u1e1f",
    "feharabic": "\u0641",
    "feharmenian": "\u0586",
    "fehfinalarabic": "\ufed2",
    "fehinitialarabic": "\ufed3",
    "fehmedialarabic": "\ufed4",
    "feicoptic": "\u03e5",
    "female": "\u2640",
    "ff": "\ufb00",
    "ffi": "\ufb03",
    "ffl": "\ufb04",
    "fi": "\ufb01",
    "fifteencircle": "\u246e",
    "fifteenparen": "\u2482",
    "fifteenperiod": "\u2496",
    "figuredash": "\u2012",
    "filledbox": "\u25a0",
    "filledrect": "\u25ac",
    "finalkaf": "\u05da",
    "finalkafdagesh": "\ufb3a",
    "finalkafdageshhebrew": "\ufb3a",
    "finalkafhebrew": "\u05da",
    "finalkafqamats": "\u05da\u05b8",
    "finalkafqamatshebrew": "\u05da\u05b8",
    "finalkafsheva": "\u05da\u05b0",
    "finalkafshevahebrew": "\u05da\u05b0",
    "finalmem": "\u05dd",
    "finalmemhebrew": "\u05dd",
    "finalnun": "\u05df",
    "finalnunhebrew": "\u05df",
    "finalpe": "\u05e3",
    "finalpehebrew": "\u05e3",
    "finaltsadi": "\u05e5",
    "finaltsadihebrew": "\u05e5",
    "firsttonechinese": "\u02c9",
    "fisheye": "\u25c9",
    "fitacyrillic": "\u0473",
    "five": "\u0035",
    "fivearabic": "\u0665",
    "fivebengali": "\u09eb",
    "fivecircle": "\u2464",
    "fivecircleinversesansserif": "\u278e",
    "fivedeva": "\u096b",
    "fiveeighths": "\u215d",
    "fivegujarati": "\u0aeb",
    "fivegurmukhi": "\u0a6b",
    "fivehackarabic": "\u0665",
    "fivehangzhou": "\u3025",
    "fiveideographicparen": "\u3224",
    "fiveinferior": "\u2085",
    "fivemonospace": "\uff15",
    "fiveoldstyle": "\uf735",
    "fiveparen": "\u2478",
    "fiveperiod": "\u248c",
    "fivepersian": "\u06f5",
    "fiveroman": "\u2174",
    "fivesuperior": "\u2075",
    "fivethai": "\u0e55",
    "fl": "\ufb02",
    "florin": "\u0192",
    "fmonospace": "\uff46",
    "fmsquare": "\u3399",
    "fofanthai": "\u0e1f",
    "fofathai": "\u0e1d",
    "fongmanthai": "\u0e4f",
    "forall": "\u2200",
    "four": "\u0034",
    "fourarabic": "\u0664",
    "fourbengali": "\u09ea",
    "fourcircle": "\u2463",
    "fourcircleinversesansserif": "\u278d",
    "fourdeva": "\u096a",
    "fourgujarati": "\u0aea",
    "fourgurmukhi": "\u0a6a",
    "fourhackarabic": "\u0664",
    "fourhangzhou": "\u3024",
    "fourideographicparen": "\u3223",
    "fourinferior": "\u2084",
    "fourmonospace": "\uff14",
    "fournumeratorbengali": "\u09f7",
    "fouroldstyle": "\uf734",
    "fourparen": "\u2477",
    "fourperiod": "\u248b",
    "fourpersian": "\u06f4",
    "fourroman": "\u2173",
    "foursuperior": "\u2074",
    "fourteencircle": "\u246d",
    "fourteenparen": "\u2481",
    "fourteenperiod": "\u2495",
    "fourthai": "\u0e54",
    "fourthtonechinese": "\u02cb",
    "fparen": "\u24a1",
    "fraction": "\u2044",
    "franc": "\u20a3",
    "g": "\u0067",
    "gabengali": "\u0997",
    "gacute": "\u01f5",
    "gadeva": "\u0917",
    "gafarabic": "\u06af",
    "gaffinalarabic": "\ufb93",
    "gafinitialarabic": "\ufb94",
    "gafmedialarabic": "\ufb95",
    "gagujarati": "\u0a97",
    "gagurmukhi": "\u0a17",
    "gahiragana": "\u304c",
    "gakatakana": "\u30ac",
    "gamma": "\u03b3",
    "gammalatinsmall": "\u0263",
    "gammasuperior": "\u02e0",
    "gangiacoptic": "\u03eb",
    "gbopomofo": "\u310d",
    "gbreve": "\u011f",
    "gcaron": "\u01e7",
    "gcedilla": "\u0123",
    "gcircle": "\u24d6",
    "gcircumflex": "\u011d",
    "gcommaaccent": "\u0123",
    "gdot": "\u0121",
    "gdotaccent": "\u0121",
    "gecyrillic": "\u0433",
    "gehiragana": "\u3052",
    "gekatakana": "\u30b2",
    "geometricallyequal": "\u2251",
    "gereshaccenthebrew": "\u059c",
    "gereshhebrew": "\u05f3",
    "gereshmuqdamhebrew": "\u059d",
    "germandbls": "\u00df",
    "gershayimaccenthebrew": "\u059e",
    "gershayimhebrew": "\u05f4",
    "getamark": "\u3013",
    "ghabengali": "\u0998",
    "ghadarmenian": "\u0572",
    "ghadeva": "\u0918",
    "ghagujarati": "\u0a98",
    "ghagurmukhi": "\u0a18",
    "ghainarabic": "\u063a",
    "ghainfinalarabic": "\ufece",
    "ghaininitialarabic": "\ufecf",
    "ghainmedialarabic": "\ufed0",
    "ghemiddlehookcyrillic": "\u0495",
    "ghestrokecyrillic": "\u0493",
    "gheupturncyrillic": "\u0491",
    "ghhadeva": "\u095a",
    "ghhagurmukhi": "\u0a5a",
    "ghook": "\u0260",
    "ghzsquare": "\u3393",
    "gihiragana": "\u304e",
    "gikatakana": "\u30ae",
    "gimarmenian": "\u0563",
    "gimel": "\u05d2",
    "gimeldagesh": "\ufb32",
    "gimeldageshhebrew": "\ufb32",
    "gimelhebrew": "\u05d2",
    "gjecyrillic": "\u0453",
    "glottalinvertedstroke": "\u01be",
    "glottalstop": "\u0294",
    "glottalstopinverted": "\u0296",
    "glottalstopmod": "\u02c0",
    "glottalstopreversed": "\u0295",
    "glottalstopreversedmod": "\u02c1",
    "glottalstopreversedsuperior": "\u02e4",
    "glottalstopstroke": "\u02a1",
    "glottalstopstrokereversed": "\u02a2",
    "gmacron": "\u1e21",
    "gmonospace": "\uff47",
    "gohiragana": "\u3054",
    "gokatakana": "\u30b4",
    "gparen": "\u24a2",
    "gpasquare": "\u33ac",
    "gradient": "\u2207",
    "grave": "\u0060",
    "gravebelowcmb": "\u0316",
    "gravecmb": "\u0300",
    "gravecomb": "\u0300",
    "gravedeva": "\u0953",
    "gravelowmod": "\u02ce",
    "gravemonospace": "\uff40",
    "gravetonecmb": "\u0340",
    "greater": "\u003e",
    "greaterequal": "\u2265",
    "greaterequalorless": "\u22db",
    "greatermonospace": "\uff1e",
    "greaterorequivalent": "\u2273",
    "greaterorless": "\u2277",
    "greateroverequal": "\u2267",
    "greatersmall": "\ufe65",
    "gscript": "\u0261",
    "gstroke": "\u01e5",
    "guhiragana": "\u3050",
    "guillemotleft": "\u00ab",
    "guillemotright": "\u00bb",
    "guilsinglleft": "\u2039",
    "guilsinglright": "\u203a",
    "gukatakana": "\u30b0",
    "guramusquare": "\u3318",
    "gysquare": "\u33c9",
    "h": "\u0068",
    "haabkhasiancyrillic": "\u04a9",
    "haaltonearabic": "\u06c1",
    "habengali": "\u09b9",
    "hadescendercyrillic": "\u04b3",
    "hadeva": "\u0939",
    "hagujarati": "\u0ab9",
    "hagurmukhi": "\u0a39",
    "haharabic": "\u062d",
    "hahfinalarabic": "\ufea2",
    "hahinitialarabic": "\ufea3",
    "hahiragana": "\u306f",
    "hahmedialarabic": "\ufea4",
    "haitusquare": "\u332a",
    "hakatakana": "\u30cf",
    "hakatakanahalfwidth": "\uff8a",
    "halantgurmukhi": "\u0a4d",
    "hamzaarabic": "\u0621",
    "hamzadammaarabic": "\u0621\u064f",
    "hamzadammatanarabic": "\u0621\u064c",
    "hamzafathaarabic": "\u0621\u064e",
    "hamzafathatanarabic": "\u0621\u064b",
    "hamzalowarabic": "\u0621",
    "hamzalowkasraarabic": "\u0621\u0650",
    "hamzalowkasratanarabic": "\u0621\u064d",
    "hamzasukunarabic": "\u0621\u0652",
    "hangulfiller": "\u3164",
    "hardsigncyrillic": "\u044a",
    "harpoonleftbarbup": "\u21bc",
    "harpoonrightbarbup": "\u21c0",
    "hasquare": "\u33ca",
    "hatafpatah": "\u05b2",
    "hatafpatah16": "\u05b2",
    "hatafpatah23": "\u05b2",
    "hatafpatah2f": "\u05b2",
    "hatafpatahhebrew": "\u05b2",
    "hatafpatahnarrowhebrew": "\u05b2",
    "hatafpatahquarterhebrew": "\u05b2",
    "hatafpatahwidehebrew": "\u05b2",
    "hatafqamats": "\u05b3",
    "hatafqamats1b": "\u05b3",
    "hatafqamats28": "\u05b3",
    "hatafqamats34": "\u05b3",
    "hatafqamatshebrew": "\u05b3",
    "hatafqamatsnarrowhebrew": "\u05b3",
    "hatafqamatsquarterhebrew": "\u05b3",
    "hatafqamatswidehebrew": "\u05b3",
    "hatafsegol": "\u05b1",
    "hatafsegol17": "\u05b1",
    "hatafsegol24": "\u05b1",
    "hatafsegol30": "\u05b1",
    "hatafsegolhebrew": "\u05b1",
    "hatafsegolnarrowhebrew": "\u05b1",
    "hatafsegolquarterhebrew": "\u05b1",
    "hatafsegolwidehebrew": "\u05b1",
    "hbar": "\u0127",
    "hbopomofo": "\u310f",
    "hbrevebelow": "\u1e2b",
    "hcedilla": "\u1e29",
    "hcircle": "\u24d7",
    "hcircumflex": "\u0125",
    "hdieresis": "\u1e27",
    "hdotaccent": "\u1e23",
    "hdotbelow": "\u1e25",
    "he": "\u05d4",
    "heart": "\u2665",
    "heartsuitblack": "\u2665",
    "heartsuitwhite": "\u2661",
    "hedagesh": "\ufb34",
    "hedageshhebrew": "\ufb34",
    "hehaltonearabic": "\u06c1",
    "heharabic": "\u0647",
    "hehebrew": "\u05d4",
    "hehfinalaltonearabic": "\ufba7",
    "hehfinalalttwoarabic": "\ufeea",
    "hehfinalarabic": "\ufeea",
    "hehhamzaabovefinalarabic": "\ufba5",
    "hehhamzaaboveisolatedarabic": "\ufba4",
    "hehinitialaltonearabic": "\ufba8",
    "hehinitialarabic": "\ufeeb",
    "hehiragana": "\u3078",
    "hehmedialaltonearabic": "\ufba9",
    "hehmedialarabic": "\ufeec",
    "heiseierasquare": "\u337b",
    "hekatakana": "\u30d8",
    "hekatakanahalfwidth": "\uff8d",
    "hekutaarusquare": "\u3336",
    "henghook": "\u0267",
    "herutusquare": "\u3339",
    "het": "\u05d7",
    "hethebrew": "\u05d7",
    "hhook": "\u0266",
    "hhooksuperior": "\u02b1",
    "hieuhacirclekorean": "\u327b",
    "hieuhaparenkorean": "\u321b",
    "hieuhcirclekorean": "\u326d",
    "hieuhkorean": "\u314e",
    "hieuhparenkorean": "\u320d",
    "hihiragana": "\u3072",
    "hikatakana": "\u30d2",
    "hikatakanahalfwidth": "\uff8b",
    "hiriq": "\u05b4",
    "hiriq14": "\u05b4",
    "hiriq21": "\u05b4",
    "hiriq2d": "\u05b4",
    "hiriqhebrew": "\u05b4",
    "hiriqnarrowhebrew": "\u05b4",
    "hiriqquarterhebrew": "\u05b4",
    "hiriqwidehebrew": "\u05b4",
    "hlinebelow": "\u1e96",
    "hmonospace": "\uff48",
    "hoarmenian": "\u0570",
    "hohipthai": "\u0e2b",
    "hohiragana": "\u307b",
    "hokatakana": "\u30db",
    "hokatakanahalfwidth": "\uff8e",
    "holam": "\u05b9",
    "holam19": "\u05b9",
    "holam26": "\u05b9",
    "holam32": "\u05b9",
    "holamhebrew": "\u05b9",
    "holamnarrowhebrew": "\u05b9",
    "holamquarterhebrew": "\u05b9",
    "holamwidehebrew": "\u05b9",
    "honokhukthai": "\u0e2e",
    "hookabovecomb": "\u0309",
    "hookcmb": "\u0309",
    "hookpalatalizedbelowcmb": "\u0321",
    "hookretroflexbelowcmb": "\u0322",
    "hoonsquare": "\u3342",
    "horicoptic": "\u03e9",
    "horizontalbar": "\u2015",
    "horncmb": "\u031b",
    "hotsprings": "\u2668",
    "house": "\u2302",
    "hparen": "\u24a3",
    "hsuperior": "\u02b0",
    "hturned": "\u0265",
    "huhiragana": "\u3075",
    "huiitosquare": "\u3333",
    "hukatakana": "\u30d5",
    "hukatakanahalfwidth": "\uff8c",
    "hungarumlaut": "\u02dd",
    "hungarumlautcmb": "\u030b",
    "hv": "\u0195",
    "hyphen": "\u002d",
    "hypheninferior": "\uf6e5",
    "hyphenmonospace": "\uff0d",
    "hyphensmall": "\ufe63",
    "hyphensuperior": "\uf6e6",
    "hyphentwo": "\u2010",
    "i": "\u0069",
    "iacute": "\u00ed",
    "iacyrillic": "\u044f",
    "ibengali": "\u0987",
    "ibopomofo": "\u3127",
    "ibreve": "\u012d",
    "icaron": "\u01d0",
    "icircle": "\u24d8",
    "icircumflex": "\u00ee",
    "icyrillic": "\u0456",
    "idblgrave": "\u0209",
    "ideographearthcircle": "\u328f",
    "ideographfirecircle": "\u328b",
    "ideographicallianceparen": "\u323f",
    "ideographiccallparen": "\u323a",
    "ideographiccentrecircle": "\u32a5",
    "ideographicclose": "\u3006",
    "ideographiccomma": "\u3001",
    "ideographiccommaleft": "\uff64",
    "ideographiccongratulationparen": "\u3237",
    "ideographiccorrectcircle": "\u32a3",
    "ideographicearthparen": "\u322f",
    "ideographicenterpriseparen": "\u323d",
    "ideographicexcellentcircle": "\u329d",
    "ideographicfestivalparen": "\u3240",
    "ideographicfinancialcircle": "\u3296",
    "ideographicfinancialparen": "\u3236",
    "ideographicfireparen": "\u322b",
    "ideographichaveparen": "\u3232",
    "ideographichighcircle": "\u32a4",
    "ideographiciterationmark": "\u3005",
    "ideographiclaborcircle": "\u3298",
    "ideographiclaborparen": "\u3238",
    "ideographicleftcircle": "\u32a7",
    "ideographiclowcircle": "\u32a6",
    "ideographicmedicinecircle": "\u32a9",
    "ideographicmetalparen": "\u322e",
    "ideographicmoonparen": "\u322a",
    "ideographicnameparen": "\u3234",
    "ideographicperiod": "\u3002",
    "ideographicprintcircle": "\u329e",
    "ideographicreachparen": "\u3243",
    "ideographicrepresentparen": "\u3239",
    "ideographicresourceparen": "\u323e",
    "ideographicrightcircle": "\u32a8",
    "ideographicsecretcircle": "\u3299",
    "ideographicselfparen": "\u3242",
    "ideographicsocietyparen": "\u3233",
    "ideographicspace": "\u3000",
    "ideographicspecialparen": "\u3235",
    "ideographicstockparen": "\u3231",
    "ideographicstudyparen": "\u323b",
    "ideographicsunparen": "\u3230",
    "ideographicsuperviseparen": "\u323c",
    "ideographicwaterparen": "\u322c",
    "ideographicwoodparen": "\u322d",
    "ideographiczero": "\u3007",
    "ideographmetalcircle": "\u328e",
    "ideographmooncircle": "\u328a",
    "ideographnamecircle": "\u3294",
    "ideographsuncircle": "\u3290",
    "ideographwatercircle": "\u328c",
    "ideographwoodcircle": "\u328d",
    "ideva": "\u0907",
    "idieresis": "\u00ef",
    "idieresisacute": "\u1e2f",
    "idieresiscyrillic": "\u04e5",
    "idotbelow": "\u1ecb",
    "iebrevecyrillic": "\u04d7",
    "iecyrillic": "\u0435",
    "ieungacirclekorean": "\u3275",
    "ieungaparenkorean": "\u3215",
    "ieungcirclekorean": "\u3267",
    "ieungkorean": "\u3147",
    "ieungparenkorean": "\u3207",
    "igrave": "\u00ec",
    "igujarati": "\u0a87",
    "igurmukhi": "\u0a07",
    "ihiragana": "\u3044",
    "ihookabove": "\u1ec9",
    "iibengali": "\u0988",
    "iicyrillic": "\u0438",
    "iideva": "\u0908",
    "iigujarati": "\u0a88",
    "iigurmukhi": "\u0a08",
    "iimatragurmukhi": "\u0a40",
    "iinvertedbreve": "\u020b",
    "iishortcyrillic": "\u0439",
    "iivowelsignbengali": "\u09c0",
    "iivowelsigndeva": "\u0940",
    "iivowelsigngujarati": "\u0ac0",
    "ij": "\u0133",
    "ikatakana": "\u30a4",
    "ikatakanahalfwidth": "\uff72",
    "ikorean": "\u3163",
    "ilde": "\u02dc",
    "iluyhebrew": "\u05ac",
    "imacron": "\u012b",
    "imacroncyrillic": "\u04e3",
    "imageorapproximatelyequal": "\u2253",
    "imatragurmukhi": "\u0a3f",
    "imonospace": "\uff49",
    "increment": "\u2206",
    "infinity": "\u221e",
    "iniarmenian": "\u056b",
    "integral": "\u222b",
    "integralbottom": "\u2321",
    "integralbt": "\u2321",
    "integralex": "\uf8f5",
    "integraltop": "\u2320",
    "integraltp": "\u2320",
    "intersection": "\u2229",
    "intisquare": "\u3305",
    "invbullet": "\u25d8",
    "invcircle": "\u25d9",
    "invsmileface": "\u263b",
    "iocyrillic": "\u0451",
    "iogonek": "\u012f",
    "iota": "\u03b9",
    "iotadieresis": "\u03ca",
    "iotadieresistonos": "\u0390",
    "iotalatin": "\u0269",
    "iotatonos": "\u03af",
    "iparen": "\u24a4",
    "irigurmukhi": "\u0a72",
    "ismallhiragana": "\u3043",
    "ismallkatakana": "\u30a3",
    "ismallkatakanahalfwidth": "\uff68",
    "issharbengali": "\u09fa",
    "istroke": "\u0268",
    "isuperior": "\uf6ed",
    "iterationhiragana": "\u309d",
    "iterationkatakana": "\u30fd",
    "itilde": "\u0129",
    "itildebelow": "\u1e2d",
    "iubopomofo": "\u3129",
    "iucyrillic": "\u044e",
    "ivowelsignbengali": "\u09bf",
    "ivowelsigndeva": "\u093f",
    "ivowelsigngujarati": "\u0abf",
    "izhitsacyrillic": "\u0475",
    "izhitsadblgravecyrillic": "\u0477",
    "j": "\u006a",
    "jaarmenian": "\u0571",
    "jabengali": "\u099c",
    "jadeva": "\u091c",
    "jagujarati": "\u0a9c",
    "jagurmukhi": "\u0a1c",
    "jbopomofo": "\u3110",
    "jcaron": "\u01f0",
    "jcircle": "\u24d9",
    "jcircumflex": "\u0135",
    "jcrossedtail": "\u029d",
    "jdotlessstroke": "\u025f",
    "jecyrillic": "\u0458",
    "jeemarabic": "\u062c",
    "jeemfinalarabic": "\ufe9e",
    "jeeminitialarabic": "\ufe9f",
    "jeemmedialarabic": "\ufea0",
    "jeharabic": "\u0698",
    "jehfinalarabic": "\ufb8b",
    "jhabengali": "\u099d",
    "jhadeva": "\u091d",
    "jhagujarati": "\u0a9d",
    "jhagurmukhi": "\u0a1d",
    "jheharmenian": "\u057b",
    "jis": "\u3004",
    "jmonospace": "\uff4a",
    "jparen": "\u24a5",
    "jsuperior": "\u02b2",
    "k": "\u006b",
    "kabashkircyrillic": "\u04a1",
    "kabengali": "\u0995",
    "kacute": "\u1e31",
    "kacyrillic": "\u043a",
    "kadescendercyrillic": "\u049b",
    "kadeva": "\u0915",
    "kaf": "\u05db",
    "kafarabic": "\u0643",
    "kafdagesh": "\ufb3b",
    "kafdageshhebrew": "\ufb3b",
    "kaffinalarabic": "\ufeda",
    "kafhebrew": "\u05db",
    "kafinitialarabic": "\ufedb",
    "kafmedialarabic": "\ufedc",
    "kafrafehebrew": "\ufb4d",
    "kagujarati": "\u0a95",
    "kagurmukhi": "\u0a15",
    "kahiragana": "\u304b",
    "kahookcyrillic": "\u04c4",
    "kakatakana": "\u30ab",
    "kakatakanahalfwidth": "\uff76",
    "kappa": "\u03ba",
    "kappasymbolgreek": "\u03f0",
    "kapyeounmieumkorean": "\u3171",
    "kapyeounphieuphkorean": "\u3184",
    "kapyeounpieupkorean": "\u3178",
    "kapyeounssangpieupkorean": "\u3179",
    "karoriisquare": "\u330d",
    "kashidaautoarabic": "\u0640",
    "kashidaautonosidebearingarabic": "\u0640",
    "kasmallkatakana": "\u30f5",
    "kasquare": "\u3384",
    "kasraarabic": "\u0650",
    "kasratanarabic": "\u064d",
    "kastrokecyrillic": "\u049f",
    "katahiraprolongmarkhalfwidth": "\uff70",
    "kaverticalstrokecyrillic": "\u049d",
    "kbopomofo": "\u310e",
    "kcalsquare": "\u3389",
    "kcaron": "\u01e9",
    "kcedilla": "\u0137",
    "kcircle": "\u24da",
    "kcommaaccent": "\u0137",
    "kdotbelow": "\u1e33",
    "keharmenian": "\u0584",
    "kehiragana": "\u3051",
    "kekatakana": "\u30b1",
    "kekatakanahalfwidth": "\uff79",
    "kenarmenian": "\u056f",
    "kesmallkatakana": "\u30f6",
    "kgreenlandic": "\u0138",
    "khabengali": "\u0996",
    "khacyrillic": "\u0445",
    "khadeva": "\u0916",
    "khagujarati": "\u0a96",
    "khagurmukhi": "\u0a16",
    "khaharabic": "\u062e",
    "khahfinalarabic": "\ufea6",
    "khahinitialarabic": "\ufea7",
    "khahmedialarabic": "\ufea8",
    "kheicoptic": "\u03e7",
    "khhadeva": "\u0959",
    "khhagurmukhi": "\u0a59",
    "khieukhacirclekorean": "\u3278",
    "khieukhaparenkorean": "\u3218",
    "khieukhcirclekorean": "\u326a",
    "khieukhkorean": "\u314b",
    "khieukhparenkorean": "\u320a",
    "khokhaithai": "\u0e02",
    "khokhonthai": "\u0e05",
    "khokhuatthai": "\u0e03",
    "khokhwaithai": "\u0e04",
    "khomutthai": "\u0e5b",
    "khook": "\u0199",
    "khorakhangthai": "\u0e06",
    "khzsquare": "\u3391",
    "kihiragana": "\u304d",
    "kikatakana": "\u30ad",
    "kikatakanahalfwidth": "\uff77",
    "kiroguramusquare": "\u3315",
    "kiromeetorusquare": "\u3316",
    "kirosquare": "\u3314",
    "kiyeokacirclekorean": "\u326e",
    "kiyeokaparenkorean": "\u320e",
    "kiyeokcirclekorean": "\u3260",
    "kiyeokkorean": "\u3131",
    "kiyeokparenkorean": "\u3200",
    "kiyeoksioskorean": "\u3133",
    "kjecyrillic": "\u045c",
    "klinebelow": "\u1e35",
    "klsquare": "\u3398",
    "kmcubedsquare": "\u33a6",
    "kmonospace": "\uff4b",
    "kmsquaredsquare": "\u33a2",
    "kohiragana": "\u3053",
    "kohmsquare": "\u33c0",
    "kokaithai": "\u0e01",
    "kokatakana": "\u30b3",
    "kokatakanahalfwidth": "\uff7a",
    "kooposquare": "\u331e",
    "koppacyrillic": "\u0481",
    "koreanstandardsymbol": "\u327f",
    "koroniscmb": "\u0343",
    "kparen": "\u24a6",
    "kpasquare": "\u33aa",
    "ksicyrillic": "\u046f",
    "ktsquare": "\u33cf",
    "kturned": "\u029e",
    "kuhiragana": "\u304f",
    "kukatakana": "\u30af",
    "kukatakanahalfwidth": "\uff78",
    "kvsquare": "\u33b8",
    "kwsquare": "\u33be",
    "l": "\u006c",
    "labengali": "\u09b2",
    "lacute": "\u013a",
    "ladeva": "\u0932",
    "lagujarati": "\u0ab2",
    "lagurmukhi": "\u0a32",
    "lakkhangyaothai": "\u0e45",
    "lamaleffinalarabic": "\ufefc",
    "lamalefhamzaabovefinalarabic": "\ufef8",
    "lamalefhamzaaboveisolatedarabic": "\ufef7",
    "lamalefhamzabelowfinalarabic": "\ufefa",
    "lamalefhamzabelowisolatedarabic": "\ufef9",
    "lamalefisolatedarabic": "\ufefb",
    "lamalefmaddaabovefinalarabic": "\ufef6",
    "lamalefmaddaaboveisolatedarabic": "\ufef5",
    "lamarabic": "\u0644",
    "lambda": "\u03bb",
    "lambdastroke": "\u019b",
    "lamed": "\u05dc",
    "lameddagesh": "\ufb3c",
    "lameddageshhebrew": "\ufb3c",
    "lamedhebrew": "\u05dc",
    "lamedholam": "\u05dc\u05b9",
    "lamedholamdagesh": "\u05dc\u05b9\u05bc",
    "lamedholamdageshhebrew": "\u05dc\u05b9\u05bc",
    "lamedholamhebrew": "\u05dc\u05b9",
    "lamfinalarabic": "\ufede",
    "lamhahinitialarabic": "\ufcca",
    "laminitialarabic": "\ufedf",
    "lamjeeminitialarabic": "\ufcc9",
    "lamkhahinitialarabic": "\ufccb",
    "lamlamhehisolatedarabic": "\ufdf2",
    "lammedialarabic": "\ufee0",
    "lammeemhahinitialarabic": "\ufd88",
    "lammeeminitialarabic": "\ufccc",
    "lammeemjeeminitialarabic": "\ufedf\ufee4\ufea0",
    "lammeemkhahinitialarabic": "\ufedf\ufee4\ufea8",
    "largecircle": "\u25ef",
    "lbar": "\u019a",
    "lbelt": "\u026c",
    "lbopomofo": "\u310c",
    "lcaron": "\u013e",
    "lcedilla": "\u013c",
    "lcircle": "\u24db",
    "lcircumflexbelow": "\u1e3d",
    "lcommaaccent": "\u013c",
    "ldot": "\u0140",
    "ldotaccent": "\u0140",
    "ldotbelow": "\u1e37",
    "ldotbelowmacron": "\u1e39",
    "leftangleabovecmb": "\u031a",
    "lefttackbelowcmb": "\u0318",
    "less": "\u003c",
    "lessequal": "\u2264",
    "lessequalorgreater": "\u22da",
    "lessmonospace": "\uff1c",
    "lessorequivalent": "\u2272",
    "lessorgreater": "\u2276",
    "lessoverequal": "\u2266",
    "lesssmall": "\ufe64",
    "lezh": "\u026e",
    "lfblock": "\u258c",
    "lhookretroflex": "\u026d",
    "lira": "\u20a4",
    "liwnarmenian": "\u056c",
    "lj": "\u01c9",
    "ljecyrillic": "\u0459",
    "ll": "\uf6c0",
    "lladeva": "\u0933",
    "llagujarati": "\u0ab3",
    "llinebelow": "\u1e3b",
    "llladeva": "\u0934",
    "llvocalicbengali": "\u09e1",
    "llvocalicdeva": "\u0961",
    "llvocalicvowelsignbengali": "\u09e3",
    "llvocalicvowelsigndeva": "\u0963",
    "lmiddletilde": "\u026b",
    "lmonospace": "\uff4c",
    "lmsquare": "\u33d0",
    "lochulathai": "\u0e2c",
    "logicaland": "\u2227",
    "logicalnot": "\u00ac",
    "logicalnotreversed": "\u2310",
    "logicalor": "\u2228",
    "lolingthai": "\u0e25",
    "longs": "\u017f",
    "lowlinecenterline": "\ufe4e",
    "lowlinecmb": "\u0332",
    "lowlinedashed": "\ufe4d",
    "lozenge": "\u25ca",
    "lparen": "\u24a7",
    "lslash": "\u0142",
    "lsquare": "\u2113",
    "lsuperior": "\uf6ee",
    "ltshade": "\u2591",
    "luthai": "\u0e26",
    "lvocalicbengali": "\u098c",
    "lvocalicdeva": "\u090c",
    "lvocalicvowelsignbengali": "\u09e2",
    "lvocalicvowelsigndeva": "\u0962",
    "lxsquare": "\u33d3",
    "m": "\u006d",
    "mabengali": "\u09ae",
    "macron": "\u00af",
    "macronbelowcmb": "\u0331",
    "macroncmb": "\u0304",
    "macronlowmod": "\u02cd",
    "macronmonospace": "\uffe3",
    "macute": "\u1e3f",
    "madeva": "\u092e",
    "magujarati": "\u0aae",
    "magurmukhi": "\u0a2e",
    "mahapakhhebrew": "\u05a4",
    "mahapakhlefthebrew": "\u05a4",
    "mahiragana": "\u307e",
    "maichattawalowleftthai": "\uf895",
    "maichattawalowrightthai": "\uf894",
    "maichattawathai": "\u0e4b",
    "maichattawaupperleftthai": "\uf893",
    "maieklowleftthai": "\uf88c",
    "maieklowrightthai": "\uf88b",
    "maiekthai": "\u0e48",
    "maiekupperleftthai": "\uf88a",
    "maihanakatleftthai": "\uf884",
    "maihanakatthai": "\u0e31",
    "maitaikhuleftthai": "\uf889",
    "maitaikhuthai": "\u0e47",
    "maitholowleftthai": "\uf88f",
    "maitholowrightthai": "\uf88e",
    "maithothai": "\u0e49",
    "maithoupperleftthai": "\uf88d",
    "maitrilowleftthai": "\uf892",
    "maitrilowrightthai": "\uf891",
    "maitrithai": "\u0e4a",
    "maitriupperleftthai": "\uf890",
    "maiyamokthai": "\u0e46",
    "makatakana": "\u30de",
    "makatakanahalfwidth": "\uff8f",
    "male": "\u2642",
    "mansyonsquare": "\u3347",
    "maqafhebrew": "\u05be",
    "mars": "\u2642",
    "masoracirclehebrew": "\u05af",
    "masquare": "\u3383",
    "mbopomofo": "\u3107",
    "mbsquare": "\u33d4",
    "mcircle": "\u24dc",
    "mcubedsquare": "\u33a5",
    "mdotaccent": "\u1e41",
    "mdotbelow": "\u1e43",
    "meemarabic": "\u0645",
    "meemfinalarabic": "\ufee2",
    "meeminitialarabic": "\ufee3",
    "meemmedialarabic": "\ufee4",
    "meemmeeminitialarabic": "\ufcd1",
    "meemmeemisolatedarabic": "\ufc48",
    "meetorusquare": "\u334d",
    "mehiragana": "\u3081",
    "meizierasquare": "\u337e",
    "mekatakana": "\u30e1",
    "mekatakanahalfwidth": "\uff92",
    "mem": "\u05de",
    "memdagesh": "\ufb3e",
    "memdageshhebrew": "\ufb3e",
    "memhebrew": "\u05de",
    "menarmenian": "\u0574",
    "merkhahebrew": "\u05a5",
    "merkhakefulahebrew": "\u05a6",
    "merkhakefulalefthebrew": "\u05a6",
    "merkhalefthebrew": "\u05a5",
    "mhook": "\u0271",
    "mhzsquare": "\u3392",
    "middledotkatakanahalfwidth": "\uff65",
    "middot": "\u00b7",
    "mieumacirclekorean": "\u3272",
    "mieumaparenkorean": "\u3212",
    "mieumcirclekorean": "\u3264",
    "mieumkorean": "\u3141",
    "mieumpansioskorean": "\u3170",
    "mieumparenkorean": "\u3204",
    "mieumpieupkorean": "\u316e",
    "mieumsioskorean": "\u316f",
    "mihiragana": "\u307f",
    "mikatakana": "\u30df",
    "mikatakanahalfwidth": "\uff90",
    "minus": "\u2212",
    "minusbelowcmb": "\u0320",
    "minuscircle": "\u2296",
    "minusmod": "\u02d7",
    "minusplus": "\u2213",
    "minute": "\u2032",
    "miribaarusquare": "\u334a",
    "mirisquare": "\u3349",
    "mlonglegturned": "\u0270",
    "mlsquare": "\u3396",
    "mmcubedsquare": "\u33a3",
    "mmonospace": "\uff4d",
    "mmsquaredsquare": "\u339f",
    "mohiragana": "\u3082",
    "mohmsquare": "\u33c1",
    "mokatakana": "\u30e2",
    "mokatakanahalfwidth": "\uff93",
    "molsquare": "\u33d6",
    "momathai": "\u0e21",
    "moverssquare": "\u33a7",
    "moverssquaredsquare": "\u33a8",
    "mparen": "\u24a8",
    "mpasquare": "\u33ab",
    "mssquare": "\u33b3",
    "msuperior": "\uf6ef",
    "mturned": "\u026f",
    "mu": "\u00b5",
    "mu1": "\u00b5",
    "muasquare": "\u3382",
    "muchgreater": "\u226b",
    "muchless": "\u226a",
    "mufsquare": "\u338c",
    "mugreek": "\u03bc",
    "mugsquare": "\u338d",
    "muhiragana": "\u3080",
    "mukatakana": "\u30e0",
    "mukatakanahalfwidth": "\uff91",
    "mulsquare": "\u3395",
    "multiply": "\u00d7",
    "mumsquare": "\u339b",
    "munahhebrew": "\u05a3",
    "munahlefthebrew": "\u05a3",
    "musicalnote": "\u266a",
    "musicalnotedbl": "\u266b",
    "musicflatsign": "\u266d",
    "musicsharpsign": "\u266f",
    "mussquare": "\u33b2",
    "muvsquare": "\u33b6",
    "muwsquare": "\u33bc",
    "mvmegasquare": "\u33b9",
    "mvsquare": "\u33b7",
    "mwmegasquare": "\u33bf",
    "mwsquare": "\u33bd",
    "n": "\u006e",
    "nabengali": "\u09a8",
    "nabla": "\u2207",
    "nacute": "\u0144",
    "nadeva": "\u0928",
    "nagujarati": "\u0aa8",
    "nagurmukhi": "\u0a28",
    "nahiragana": "\u306a",
    "nakatakana": "\u30ca",
    "nakatakanahalfwidth": "\uff85",
    "napostrophe": "\u0149",
    "nasquare": "\u3381",
    "nbopomofo": "\u310b",
    "nbspace": "\u00a0",
    "ncaron": "\u0148",
    "ncedilla": "\u0146",
    "ncircle": "\u24dd",
    "ncircumflexbelow": "\u1e4b",
    "ncommaaccent": "\u0146",
    "ndotaccent": "\u1e45",
    "ndotbelow": "\u1e47",
    "nehiragana": "\u306d",
    "nekatakana": "\u30cd",
    "nekatakanahalfwidth": "\uff88",
    "newsheqelsign": "\u20aa",
    "nfsquare": "\u338b",
    "ngabengali": "\u0999",
    "ngadeva": "\u0919",
    "ngagujarati": "\u0a99",
    "ngagurmukhi": "\u0a19",
    "ngonguthai": "\u0e07",
    "nhiragana": "\u3093",
    "nhookleft": "\u0272",
    "nhookretroflex": "\u0273",
    "nieunacirclekorean": "\u326f",
    "nieunaparenkorean": "\u320f",
    "nieuncieuckorean": "\u3135",
    "nieuncirclekorean": "\u3261",
    "nieunhieuhkorean": "\u3136",
    "nieunkorean": "\u3134",
    "nieunpansioskorean": "\u3168",
    "nieunparenkorean": "\u3201",
    "nieunsioskorean": "\u3167",
    "nieuntikeutkorean": "\u3166",
    "nihiragana": "\u306b",
    "nikatakana": "\u30cb",
    "nikatakanahalfwidth": "\uff86",
    "nikhahitleftthai": "\uf899",
    "nikhahitthai": "\u0e4d",
    "nine": "\u0039",
    "ninearabic": "\u0669",
    "ninebengali": "\u09ef",
    "ninecircle": "\u2468",
    "ninecircleinversesansserif": "\u2792",
    "ninedeva": "\u096f",
    "ninegujarati": "\u0aef",
    "ninegurmukhi": "\u0a6f",
    "ninehackarabic": "\u0669",
    "ninehangzhou": "\u3029",
    "nineideographicparen": "\u3228",
    "nineinferior": "\u2089",
    "ninemonospace": "\uff19",
    "nineoldstyle": "\uf739",
    "nineparen": "\u247c",
    "nineperiod": "\u2490",
    "ninepersian": "\u06f9",
    "nineroman": "\u2178",
    "ninesuperior": "\u2079",
    "nineteencircle": "\u2472",
    "nineteenparen": "\u2486",
    "nineteenperiod": "\u249a",
    "ninethai": "\u0e59",
    "nj": "\u01cc",
    "njecyrillic": "\u045a",
    "nkatakana": "\u30f3",
    "nkatakanahalfwidth": "\uff9d",
    "nlegrightlong": "\u019e",
    "nlinebelow": "\u1e49",
    "nmonospace": "\uff4e",
    "nmsquare": "\u339a",
    "nnabengali": "\u09a3",
    "nnadeva": "\u0923",
    "nnagujarati": "\u0aa3",
    "nnagurmukhi": "\u0a23",
    "nnnadeva": "\u0929",
    "nohiragana": "\u306e",
    "nokatakana": "\u30ce",
    "nokatakanahalfwidth": "\uff89",
    "nonbreakingspace": "\u00a0",
    "nonenthai": "\u0e13",
    "nonuthai": "\u0e19",
    "noonarabic": "\u0646",
    "noonfinalarabic": "\ufee6",
    "noonghunnaarabic": "\u06ba",
    "noonghunnafinalarabic": "\ufb9f",
    "noonhehinitialarabic": "\ufee7\ufeec",
    "nooninitialarabic": "\ufee7",
    "noonjeeminitialarabic": "\ufcd2",
    "noonjeemisolatedarabic": "\ufc4b",
    "noonmedialarabic": "\ufee8",
    "noonmeeminitialarabic": "\ufcd5",
    "noonmeemisolatedarabic": "\ufc4e",
    "noonnoonfinalarabic": "\ufc8d",
    "notcontains": "\u220c",
    "notelement": "\u2209",
    "notelementof": "\u2209",
    "notequal": "\u2260",
    "notgreater": "\u226f",
    "notgreaternorequal": "\u2271",
    "notgreaternorless": "\u2279",
    "notidentical": "\u2262",
    "notless": "\u226e",
    "notlessnorequal": "\u2270",
    "notparallel": "\u2226",
    "notprecedes": "\u2280",
    "notsubset": "\u2284",
    "notsucceeds": "\u2281",
    "notsuperset": "\u2285",
    "nowarmenian": "\u0576",
    "nparen": "\u24a9",
    "nssquare": "\u33b1",
    "nsuperior": "\u207f",
    "ntilde": "\u00f1",
    "nu": "\u03bd",
    "nuhiragana": "\u306c",
    "nukatakana": "\u30cc",
    "nukatakanahalfwidth": "\uff87",
    "nuktabengali": "\u09bc",
    "nuktadeva": "\u093c",
    "nuktagujarati": "\u0abc",
    "nuktagurmukhi": "\u0a3c",
    "numbersign": "\u0023",
    "numbersignmonospace": "\uff03",
    "numbersignsmall": "\ufe5f",
    "numeralsigngreek": "\u0374",
    "numeralsignlowergreek": "\u0375",
    "numero": "\u2116",
    "nun": "\u05e0",
    "nundagesh": "\ufb40",
    "nundageshhebrew": "\ufb40",
    "nunhebrew": "\u05e0",
    "nvsquare": "\u33b5",
    "nwsquare": "\u33bb",
    "nyabengali": "\u099e",
    "nyadeva": "\u091e",
    "nyagujarati": "\u0a9e",
    "nyagurmukhi": "\u0a1e",
    "o": "\u006f",
    "oacute": "\u00f3",
    "oangthai": "\u0e2d",
    "obarred": "\u0275",
    "obarredcyrillic": "\u04e9",
    "obarreddieresiscyrillic": "\u04eb",
    "obengali": "\u0993",
    "obopomofo": "\u311b",
    "obreve": "\u014f",
    "ocandradeva": "\u0911",
    "ocandragujarati": "\u0a91",
    "ocandravowelsigndeva": "\u0949",
    "ocandravowelsigngujarati": "\u0ac9",
    "ocaron": "\u01d2",
    "ocircle": "\u24de",
    "ocircumflex": "\u00f4",
    "ocircumflexacute": "\u1ed1",
    "ocircumflexdotbelow": "\u1ed9",
    "ocircumflexgrave": "\u1ed3",
    "ocircumflexhookabove": "\u1ed5",
    "ocircumflextilde": "\u1ed7",
    "ocyrillic": "\u043e",
    "odblacute": "\u0151",
    "odblgrave": "\u020d",
    "odeva": "\u0913",
    "odieresis": "\u00f6",
    "odieresiscyrillic": "\u04e7",
    "odotbelow": "\u1ecd",
    "oe": "\u0153",
    "oekorean": "\u315a",
    "ogonek": "\u02db",
    "ogonekcmb": "\u0328",
    "ograve": "\u00f2",
    "ogujarati": "\u0a93",
    "oharmenian": "\u0585",
    "ohiragana": "\u304a",
    "ohookabove": "\u1ecf",
    "ohorn": "\u01a1",
    "ohornacute": "\u1edb",
    "ohorndotbelow": "\u1ee3",
    "ohorngrave": "\u1edd",
    "ohornhookabove": "\u1edf",
    "ohorntilde": "\u1ee1",
    "ohungarumlaut": "\u0151",
    "oi": "\u01a3",
    "oinvertedbreve": "\u020f",
    "okatakana": "\u30aa",
    "okatakanahalfwidth": "\uff75",
    "okorean": "\u3157",
    "olehebrew": "\u05ab",
    "omacron": "\u014d",
    "omacronacute": "\u1e53",
    "omacrongrave": "\u1e51",
    "omdeva": "\u0950",
    "omega": "\u03c9",
    "omega1": "\u03d6",
    "omegacyrillic": "\u0461",
    "omegalatinclosed": "\u0277",
    "omegaroundcyrillic": "\u047b",
    "omegatitlocyrillic": "\u047d",
    "omegatonos": "\u03ce",
    "omgujarati": "\u0ad0",
    "omicron": "\u03bf",
    "omicrontonos": "\u03cc",
    "omonospace": "\uff4f",
    "one": "\u0031",
    "onearabic": "\u0661",
    "onebengali": "\u09e7",
    "onecircle": "\u2460",
    "onecircleinversesansserif": "\u278a",
    "onedeva": "\u0967",
    "onedotenleader": "\u2024",
    "oneeighth": "\u215b",
    "onefitted": "\uf6dc",
    "onegujarati": "\u0ae7",
    "onegurmukhi": "\u0a67",
    "onehackarabic": "\u0661",
    "onehalf": "\u00bd",
    "onehangzhou": "\u3021",
    "oneideographicparen": "\u3220",
    "oneinferior": "\u2081",
    "onemonospace": "\uff11",
    "onenumeratorbengali": "\u09f4",
    "oneoldstyle": "\uf731",
    "oneparen": "\u2474",
    "oneperiod": "\u2488",
    "onepersian": "\u06f1",
    "onequarter": "\u00bc",
    "oneroman": "\u2170",
    "onesuperior": "\u00b9",
    "onethai": "\u0e51",
    "onethird": "\u2153",
    "oogonek": "\u01eb",
    "oogonekmacron": "\u01ed",
    "oogurmukhi": "\u0a13",
    "oomatragurmukhi": "\u0a4b",
    "oopen": "\u0254",
    "oparen": "\u24aa",
    "openbullet": "\u25e6",
    "option": "\u2325",
    "ordfeminine": "\u00aa",
    "ordmasculine": "\u00ba",
    "orthogonal": "\u221f",
    "oshortdeva": "\u0912",
    "oshortvowelsigndeva": "\u094a",
    "oslash": "\u00f8",
    "oslashacute": "\u01ff",
    "osmallhiragana": "\u3049",
    "osmallkatakana": "\u30a9",
    "osmallkatakanahalfwidth": "\uff6b",
    "ostrokeacute": "\u01ff",
    "osuperior": "\uf6f0",
    "otcyrillic": "\u047f",
    "otilde": "\u00f5",
    "otildeacute": "\u1e4d",
    "otildedieresis": "\u1e4f",
    "oubopomofo": "\u3121",
    "overline": "\u203e",
    "overlinecenterline": "\ufe4a",
    "overlinecmb": "\u0305",
    "overlinedashed": "\ufe49",
    "overlinedblwavy": "\ufe4c",
    "overlinewavy": "\ufe4b",
    "overscore": "\u00af",
    "ovowelsignbengali": "\u09cb",
    "ovowelsigndeva": "\u094b",
    "ovowelsigngujarati": "\u0acb",
    "p": "\u0070",
    "paampssquare": "\u3380",
    "paasentosquare": "\u332b",
    "pabengali": "\u09aa",
    "pacute": "\u1e55",
    "padeva": "\u092a",
    "pagedown": "\u21df",
    "pageup": "\u21de",
    "pagujarati": "\u0aaa",
    "pagurmukhi": "\u0a2a",
    "pahiragana": "\u3071",
    "paiyannoithai": "\u0e2f",
    "pakatakana": "\u30d1",
    "palatalizationcyrilliccmb": "\u0484",
    "palochkacyrillic": "\u04c0",
    "pansioskorean": "\u317f",
    "paragraph": "\u00b6",
    "parallel": "\u2225",
    "parenleft": "\u0028",
    "parenleftaltonearabic": "\ufd3e",
    "parenleftbt": "\uf8ed",
    "parenleftex": "\uf8ec",
    "parenleftinferior": "\u208d",
    "parenleftmonospace": "\uff08",
    "parenleftsmall": "\ufe59",
    "parenleftsuperior": "\u207d",
    "parenlefttp": "\uf8eb",
    "parenleftvertical": "\ufe35",
    "parenright": "\u0029",
    "parenrightaltonearabic": "\ufd3f",
    "parenrightbt": "\uf8f8",
    "parenrightex": "\uf8f7",
    "parenrightinferior": "\u208e",
    "parenrightmonospace": "\uff09",
    "parenrightsmall": "\ufe5a",
    "parenrightsuperior": "\u207e",
    "parenrighttp": "\uf8f6",
    "parenrightvertical": "\ufe36",
    "partialdiff": "\u2202",
    "paseqhebrew": "\u05c0",
    "pashtahebrew": "\u0599",
    "pasquare": "\u33a9",
    "patah": "\u05b7",
    "patah11": "\u05b7",
    "patah1d": "\u05b7",
    "patah2a": "\u05b7",
    "patahhebrew": "\u05b7",
    "patahnarrowhebrew": "\u05b7",
    "patahquarterhebrew": "\u05b7",
    "patahwidehebrew": "\u05b7",
    "pazerhebrew": "\u05a1",
    "pbopomofo": "\u3106",
    "pcircle": "\u24df",
    "pdotaccent": "\u1e57",
    "pe": "\u05e4",
    "pecyrillic": "\u043f",
    "pedagesh": "\ufb44",
    "pedageshhebrew": "\ufb44",
    "peezisquare": "\u333b",
    "pefinaldageshhebrew": "\ufb43",
    "peharabic": "\u067e",
    "peharmenian": "\u057a",
    "pehebrew": "\u05e4",
    "pehfinalarabic": "\ufb57",
    "pehinitialarabic": "\ufb58",
    "pehiragana": "\u307a",
    "pehmedialarabic": "\ufb59",
    "pekatakana": "\u30da",
    "pemiddlehookcyrillic": "\u04a7",
    "perafehebrew": "\ufb4e",
    "percent": "\u0025",
    "percentarabic": "\u066a",
    "percentmonospace": "\uff05",
    "percentsmall": "\ufe6a",
    "period": "\u002e",
    "periodarmenian": "\u0589",
    "periodcentered": "\u00b7",
    "periodhalfwidth": "\uff61",
    "periodinferior": "\uf6e7",
    "periodmonospace": "\uff0e",
    "periodsmall": "\ufe52",
    "periodsuperior": "\uf6e8",
    "perispomenigreekcmb": "\u0342",
    "perpendicular": "\u22a5",
    "perthousand": "\u2030",
    "peseta": "\u20a7",
    "pfsquare": "\u338a",
    "phabengali": "\u09ab",
    "phadeva": "\u092b",
    "phagujarati": "\u0aab",
    "phagurmukhi": "\u0a2b",
    "phi": "\u03c6",
    "phi1": "\u03d5",
    "phieuphacirclekorean": "\u327a",
    "phieuphaparenkorean": "\u321a",
    "phieuphcirclekorean": "\u326c",
    "phieuphkorean": "\u314d",
    "phieuphparenkorean": "\u320c",
    "philatin": "\u0278",
    "phinthuthai": "\u0e3a",
    "phisymbolgreek": "\u03d5",
    "phook": "\u01a5",
    "phophanthai": "\u0e1e",
    "phophungthai": "\u0e1c",
    "phosamphaothai": "\u0e20",
    "pi": "\u03c0",
    "pieupacirclekorean": "\u3273",
    "pieupaparenkorean": "\u3213",
    "pieupcieuckorean": "\u3176",
    "pieupcirclekorean": "\u3265",
    "pieupkiyeokkorean": "\u3172",
    "pieupkorean": "\u3142",
    "pieupparenkorean": "\u3205",
    "pieupsioskiyeokkorean": "\u3174",
    "pieupsioskorean": "\u3144",
    "pieupsiostikeutkorean": "\u3175",
    "pieupthieuthkorean": "\u3177",
    "pieuptikeutkorean": "\u3173",
    "pihiragana": "\u3074",
    "pikatakana": "\u30d4",
    "pisymbolgreek": "\u03d6",
    "piwrarmenian": "\u0583",
    "plus": "\u002b",
    "plusbelowcmb": "\u031f",
    "pluscircle": "\u2295",
    "plusminus": "\u00b1",
    "plusmod": "\u02d6",
    "plusmonospace": "\uff0b",
    "plussmall": "\ufe62",
    "plussuperior": "\u207a",
    "pmonospace": "\uff50",
    "pmsquare": "\u33d8",
    "pohiragana": "\u307d",
    "pointingindexdownwhite": "\u261f",
    "pointingindexleftwhite": "\u261c",
    "pointingindexrightwhite": "\u261e",
    "pointingindexupwhite": "\u261d",
    "pokatakana": "\u30dd",
    "poplathai": "\u0e1b",
    "postalmark": "\u3012",
    "postalmarkface": "\u3020",
    "pparen": "\u24ab",
    "precedes": "\u227a",
    "prescription": "\u211e",
    "primemod": "\u02b9",
    "primereversed": "\u2035",
    "product": "\u220f",
    "projective": "\u2305",
    "prolongedkana": "\u30fc",
    "propellor": "\u2318",
    "propersubset": "\u2282",
    "propersuperset": "\u2283",
    "proportion": "\u2237",
    "proportional": "\u221d",
    "psi": "\u03c8",
    "psicyrillic": "\u0471",
    "psilipneumatacyrilliccmb": "\u0486",
    "pssquare": "\u33b0",
    "puhiragana": "\u3077",
    "pukatakana": "\u30d7",
    "pvsquare": "\u33b4",
    "pwsquare": "\u33ba",
    "q": "\u0071",
    "qadeva": "\u0958",
    "qadmahebrew": "\u05a8",
    "qafarabic": "\u0642",
    "qaffinalarabic": "\ufed6",
    "qafinitialarabic": "\ufed7",
    "qafmedialarabic": "\ufed8",
    "qamats": "\u05b8",
    "qamats10": "\u05b8",
    "qamats1a": "\u05b8",
    "qamats1c": "\u05b8",
    "qamats27": "\u05b8",
    "qamats29": "\u05b8",
    "qamats33": "\u05b8",
    "qamatsde": "\u05b8",
    "qamatshebrew": "\u05b8",
    "qamatsnarrowhebrew": "\u05b8",
    "qamatsqatanhebrew": "\u05b8",
    "qamatsqatannarrowhebrew": "\u05b8",
    "qamatsqatanquarterhebrew": "\u05b8",
    "qamatsqatanwidehebrew": "\u05b8",
    "qamatsquarterhebrew": "\u05b8",
    "qamatswidehebrew": "\u05b8",
    "qarneyparahebrew": "\u059f",
    "qbopomofo": "\u3111",
    "qcircle": "\u24e0",
    "qhook": "\u02a0",
    "qmonospace": "\uff51",
    "qof": "\u05e7",
    "qofdagesh": "\ufb47",
    "qofdageshhebrew": "\ufb47",
    "qofhatafpatah": "\u05e7\u05b2",
    "qofhatafpatahhebrew": "\u05e7\u05b2",
    "qofhatafsegol": "\u05e7\u05b1",
    "qofhatafsegolhebrew": "\u05e7\u05b1",
    "qofhebrew": "\u05e7",
    "qofhiriq": "\u05e7\u05b4",
    "qofhiriqhebrew": "\u05e7\u05b4",
    "qofholam": "\u05e7\u05b9",
    "qofholamhebrew": "\u05e7\u05b9",
    "qofpatah": "\u05e7\u05b7",
    "qofpatahhebrew": "\u05e7\u05b7",
    "qofqamats": "\u05e7\u05b8",
    "qofqamatshebrew": "\u05e7\u05b8",
    "qofqubuts": "\u05e7\u05bb",
    "qofqubutshebrew": "\u05e7\u05bb",
    "qofsegol": "\u05e7\u05b6",
    "qofsegolhebrew": "\u05e7\u05b6",
    "qofsheva": "\u05e7\u05b0",
    "qofshevahebrew": "\u05e7\u05b0",
    "qoftsere": "\u05e7\u05b5",
    "qoftserehebrew": "\u05e7\u05b5",
    "qparen": "\u24ac",
    "quarternote": "\u2669",
    "qubuts": "\u05bb",
    "qubuts18": "\u05bb",
    "qubuts25": "\u05bb",
    "qubuts31": "\u05bb",
    "qubutshebrew": "\u05bb",
    "qubutsnarrowhebrew": "\u05bb",
    "qubutsquarterhebrew": "\u05bb",
    "qubutswidehebrew": "\u05bb",
    "question": "\u003f",
    "questionarabic": "\u061f",
    "questionarmenian": "\u055e",
    "questiondown": "\u00bf",
    "questiondownsmall": "\uf7bf",
    "questiongreek": "\u037e",
    "questionmonospace": "\uff1f",
    "questionsmall": "\uf73f",
    "quotedbl": "\u0022",
    "quotedblbase": "\u201e",
    "quotedblleft": "\u201c",
    "quotedblmonospace": "\uff02",
    "quotedblprime": "\u301e",
    "quotedblprimereversed": "\u301d",
    "quotedblright": "\u201d",
    "quoteleft": "\u2018",
    "quoteleftreversed": "\u201b",
    "quotereversed": "\u201b",
    "quoteright": "\u2019",
    "quoterightn": "\u0149",
    "quotesinglbase": "\u201a",
    "quotesingle": "\u0027",
    "quotesinglemonospace": "\uff07",
    "r": "\u0072",
    "raarmenian": "\u057c",
    "rabengali": "\u09b0",
    "racute": "\u0155",
    "radeva": "\u0930",
    "radical": "\u221a",
    "radicalex": "\uf8e5",
    "radoverssquare": "\u33ae",
    "radoverssquaredsquare": "\u33af",
    "radsquare": "\u33ad",
    "rafe": "\u05bf",
    "rafehebrew": "\u05bf",
    "ragujarati": "\u0ab0",
    "ragurmukhi": "\u0a30",
    "rahiragana": "\u3089",
    "rakatakana": "\u30e9",
    "rakatakanahalfwidth": "\uff97",
    "ralowerdiagonalbengali": "\u09f1",
    "ramiddlediagonalbengali": "\u09f0",
    "ramshorn": "\u0264",
    "ratio": "\u2236",
    "rbopomofo": "\u3116",
    "rcaron": "\u0159",
    "rcedilla": "\u0157",
    "rcircle": "\u24e1",
    "rcommaaccent": "\u0157",
    "rdblgrave": "\u0211",
    "rdotaccent": "\u1e59",
    "rdotbelow": "\u1e5b",
    "rdotbelowmacron": "\u1e5d",
    "referencemark": "\u203b",
    "reflexsubset": "\u2286",
    "reflexsuperset": "\u2287",
    "registered": "\u00ae",
    "registersans": "\uf8e8",
    "registerserif": "\uf6da",
    "reharabic": "\u0631",
    "reharmenian": "\u0580",
    "rehfinalarabic": "\ufeae",
    "rehiragana": "\u308c",
    "rehyehaleflamarabic": "\u0631\ufef3\ufe8e\u0644",
    "rekatakana": "\u30ec",
    "rekatakanahalfwidth": "\uff9a",
    "resh": "\u05e8",
    "reshdageshhebrew": "\ufb48",
    "reshhatafpatah": "\u05e8\u05b2",
    "reshhatafpatahhebrew": "\u05e8\u05b2",
    "reshhatafsegol": "\u05e8\u05b1",
    "reshhatafsegolhebrew": "\u05e8\u05b1",
    "reshhebrew": "\u05e8",
    "reshhiriq": "\u05e8\u05b4",
    "reshhiriqhebrew": "\u05e8\u05b4",
    "reshholam": "\u05e8\u05b9",
    "reshholamhebrew": "\u05e8\u05b9",
    "reshpatah": "\u05e8\u05b7",
    "reshpatahhebrew": "\u05e8\u05b7",
    "reshqamats": "\u05e8\u05b8",
    "reshqamatshebrew": "\u05e8\u05b8",
    "reshqubuts": "\u05e8\u05bb",
    "reshqubutshebrew": "\u05e8\u05bb",
    "reshsegol": "\u05e8\u05b6",
    "reshsegolhebrew": "\u05e8\u05b6",
    "reshsheva": "\u05e8\u05b0",
    "reshshevahebrew": "\u05e8\u05b0",
    "reshtsere": "\u05e8\u05b5",
    "reshtserehebrew": "\u05e8\u05b5",
    "reversedtilde": "\u223d",
    "reviahebrew": "\u0597",
    "reviamugrashhebrew": "\u0597",
    "revlogicalnot": "\u2310",
    "rfishhook": "\u027e",
    "rfishhookreversed": "\u027f",
    "rhabengali": "\u09dd",
    "rhadeva": "\u095d",
    "rho": "\u03c1",
    "rhook": "\u027d",
    "rhookturned": "\u027b",
    "rhookturnedsuperior": "\u02b5",
    "rhosymbolgreek": "\u03f1",
    "rhotichookmod": "\u02de",
    "rieulacirclekorean": "\u3271",
    "rieulaparenkorean": "\u3211",
    "rieulcirclekorean": "\u3263",
    "rieulhieuhkorean": "\u3140",
    "rieulkiyeokkorean": "\u313a",
    "rieulkiyeoksioskorean": "\u3169",
    "rieulkorean": "\u3139",
    "rieulmieumkorean": "\u313b",
    "rieulpansioskorean": "\u316c",
    "rieulparenkorean": "\u3203",
    "rieulphieuphkorean": "\u313f",
    "rieulpieupkorean": "\u313c",
    "rieulpieupsioskorean": "\u316b",
    "rieulsioskorean": "\u313d",
    "rieulthieuthkorean": "\u313e",
    "rieultikeutkorean": "\u316a",
    "rieulyeorinhieuhkorean": "\u316d",
    "rightangle": "\u221f",
    "righttackbelowcmb": "\u0319",
    "righttriangle": "\u22bf",
    "rihiragana": "\u308a",
    "rikatakana": "\u30ea",
    "rikatakanahalfwidth": "\uff98",
    "ring": "\u02da",
    "ringbelowcmb": "\u0325",
    "ringcmb": "\u030a",
    "ringhalfleft": "\u02bf",
    "ringhalfleftarmenian": "\u0559",
    "ringhalfleftbelowcmb": "\u031c",
    "ringhalfleftcentered": "\u02d3",
    "ringhalfright": "\u02be",
    "ringhalfrightbelowcmb": "\u0339",
    "ringhalfrightcentered": "\u02d2",
    "rinvertedbreve": "\u0213",
    "rittorusquare": "\u3351",
    "rlinebelow": "\u1e5f",
    "rlongleg": "\u027c",
    "rlonglegturned": "\u027a",
    "rmonospace": "\uff52",
    "rohiragana": "\u308d",
    "rokatakana": "\u30ed",
    "rokatakanahalfwidth": "\uff9b",
    "roruathai": "\u0e23",
    "rparen": "\u24ad",
    "rrabengali": "\u09dc",
    "rradeva": "\u0931",
    "rragurmukhi": "\u0a5c",
    "rreharabic": "\u0691",
    "rrehfinalarabic": "\ufb8d",
    "rrvocalicbengali": "\u09e0",
    "rrvocalicdeva": "\u0960",
    "rrvocalicgujarati": "\u0ae0",
    "rrvocalicvowelsignbengali": "\u09c4",
    "rrvocalicvowelsigndeva": "\u0944",
    "rrvocalicvowelsigngujarati": "\u0ac4",
    "rsuperior": "\uf6f1",
    "rtblock": "\u2590",
    "rturned": "\u0279",
    "rturnedsuperior": "\u02b4",
    "ruhiragana": "\u308b",
    "rukatakana": "\u30eb",
    "rukatakanahalfwidth": "\uff99",
    "rupeemarkbengali": "\u09f2",
    "rupeesignbengali": "\u09f3",
    "rupiah": "\uf6dd",
    "ruthai": "\u0e24",
    "rvocalicbengali": "\u098b",
    "rvocalicdeva": "\u090b",
    "rvocalicgujarati": "\u0a8b",
    "rvocalicvowelsignbengali": "\u09c3",
    "rvocalicvowelsigndeva": "\u0943",
    "rvocalicvowelsigngujarati": "\u0ac3",
    "s": "\u0073",
    "sabengali": "\u09b8",
    "sacute": "\u015b",
    "sacutedotaccent": "\u1e65",
    "sadarabic": "\u0635",
    "sadeva": "\u0938",
    "sadfinalarabic": "\ufeba",
    "sadinitialarabic": "\ufebb",
    "sadmedialarabic": "\ufebc",
    "sagujarati": "\u0ab8",
    "sagurmukhi": "\u0a38",
    "sahiragana": "\u3055",
    "sakatakana": "\u30b5",
    "sakatakanahalfwidth": "\uff7b",
    "sallallahoualayhewasallamarabic": "\ufdfa",
    "samekh": "\u05e1",
    "samekhdagesh": "\ufb41",
    "samekhdageshhebrew": "\ufb41",
    "samekhhebrew": "\u05e1",
    "saraaathai": "\u0e32",
    "saraaethai": "\u0e41",
    "saraaimaimalaithai": "\u0e44",
    "saraaimaimuanthai": "\u0e43",
    "saraamthai": "\u0e33",
    "saraathai": "\u0e30",
    "saraethai": "\u0e40",
    "saraiileftthai": "\uf886",
    "saraiithai": "\u0e35",
    "saraileftthai": "\uf885",
    "saraithai": "\u0e34",
    "saraothai": "\u0e42",
    "saraueeleftthai": "\uf888",
    "saraueethai": "\u0e37",
    "saraueleftthai": "\uf887",
    "sarauethai": "\u0e36",
    "sarauthai": "\u0e38",
    "sarauuthai": "\u0e39",
    "sbopomofo": "\u3119",
    "scaron": "\u0161",
    "scarondotaccent": "\u1e67",
    "scedilla": "\u015f",
    "schwa": "\u0259",
    "schwacyrillic": "\u04d9",
    "schwadieresiscyrillic": "\u04db",
    "schwahook": "\u025a",
    "scircle": "\u24e2",
    "scircumflex": "\u015d",
    "scommaaccent": "\u0219",
    "sdotaccent": "\u1e61",
    "sdotbelow": "\u1e63",
    "sdotbelowdotaccent": "\u1e69",
    "seagullbelowcmb": "\u033c",
    "second": "\u2033",
    "secondtonechinese": "\u02ca",
    "section": "\u00a7",
    "seenarabic": "\u0633",
    "seenfinalarabic": "\ufeb2",
    "seeninitialarabic": "\ufeb3",
    "seenmedialarabic": "\ufeb4",
    "segol": "\u05b6",
    "segol13": "\u05b6",
    "segol1f": "\u05b6",
    "segol2c": "\u05b6",
    "segolhebrew": "\u05b6",
    "segolnarrowhebrew": "\u05b6",
    "segolquarterhebrew": "\u05b6",
    "segoltahebrew": "\u0592",
    "segolwidehebrew": "\u05b6",
    "seharmenian": "\u057d",
    "sehiragana": "\u305b",
    "sekatakana": "\u30bb",
    "sekatakanahalfwidth": "\uff7e",
    "semicolon": "\u003b",
    "semicolonarabic": "\u061b",
    "semicolonmonospace": "\uff1b",
    "semicolonsmall": "\ufe54",
    "semivoicedmarkkana": "\u309c",
    "semivoicedmarkkanahalfwidth": "\uff9f",
    "sentisquare": "\u3322",
    "sentosquare": "\u3323",
    "seven": "\u0037",
    "sevenarabic": "\u0667",
    "sevenbengali": "\u09ed",
    "sevencircle": "\u2466",
    "sevencircleinversesansserif": "\u2790",
    "sevendeva": "\u096d",
    "seveneighths": "\u215e",
    "sevengujarati": "\u0aed",
    "sevengurmukhi": "\u0a6d",
    "sevenhackarabic": "\u0667",
    "sevenhangzhou": "\u3027",
    "sevenideographicparen": "\u3226",
    "seveninferior": "\u2087",
    "sevenmonospace": "\uff17",
    "sevenoldstyle": "\uf737",
    "sevenparen": "\u247a",
    "sevenperiod": "\u248e",
    "sevenpersian": "\u06f7",
    "sevenroman": "\u2176",
    "sevensuperior": "\u2077",
    "seventeencircle": "\u2470",
    "seventeenparen": "\u2484",
    "seventeenperiod": "\u2498",
    "seventhai": "\u0e57",
    "sfthyphen": "\u00ad",
    "shaarmenian": "\u0577",
    "shabengali": "\u09b6",
    "shacyrillic": "\u0448",
    "shaddaarabic": "\u0651",
    "shaddadammaarabic": "\ufc61",
    "shaddadammatanarabic": "\ufc5e",
    "shaddafathaarabic": "\ufc60",
    "shaddafathatanarabic": "\u0651\u064b",
    "shaddakasraarabic": "\ufc62",
    "shaddakasratanarabic": "\ufc5f",
    "shade": "\u2592",
    "shadedark": "\u2593",
    "shadelight": "\u2591",
    "shademedium": "\u2592",
    "shadeva": "\u0936",
    "shagujarati": "\u0ab6",
    "shagurmukhi": "\u0a36",
    "shalshelethebrew": "\u0593",
    "shbopomofo": "\u3115",
    "shchacyrillic": "\u0449",
    "sheenarabic": "\u0634",
    "sheenfinalarabic": "\ufeb6",
    "sheeninitialarabic": "\ufeb7",
    "sheenmedialarabic": "\ufeb8",
    "sheicoptic": "\u03e3",
    "sheqel": "\u20aa",
    "sheqelhebrew": "\u20aa",
    "sheva": "\u05b0",
    "sheva115": "\u05b0",
    "sheva15": "\u05b0",
    "sheva22": "\u05b0",
    "sheva2e": "\u05b0",
    "shevahebrew": "\u05b0",
    "shevanarrowhebrew": "\u05b0",
    "shevaquarterhebrew": "\u05b0",
    "shevawidehebrew": "\u05b0",
    "shhacyrillic": "\u04bb",
    "shimacoptic": "\u03ed",
    "shin": "\u05e9",
    "shindagesh": "\ufb49",
    "shindageshhebrew": "\ufb49",
    "shindageshshindot": "\ufb2c",
    "shindageshshindothebrew": "\ufb2c",
    "shindageshsindot": "\ufb2d",
    "shindageshsindothebrew": "\ufb2d",
    "shindothebrew": "\u05c1",
    "shinhebrew": "\u05e9",
    "shinshindot": "\ufb2a",
    "shinshindothebrew": "\ufb2a",
    "shinsindot": "\ufb2b",
    "shinsindothebrew": "\ufb2b",
    "shook": "\u0282",
    "sigma": "\u03c3",
    "sigma1": "\u03c2",
    "sigmafinal": "\u03c2",
    "sigmalunatesymbolgreek": "\u03f2",
    "sihiragana": "\u3057",
    "sikatakana": "\u30b7",
    "sikatakanahalfwidth": "\uff7c",
    "siluqhebrew": "\u05bd",
    "siluqlefthebrew": "\u05bd",
    "similar": "\u223c",
    "sindothebrew": "\u05c2",
    "siosacirclekorean": "\u3274",
    "siosaparenkorean": "\u3214",
    "sioscieuckorean": "\u317e",
    "sioscirclekorean": "\u3266",
    "sioskiyeokkorean": "\u317a",
    "sioskorean": "\u3145",
    "siosnieunkorean": "\u317b",
    "siosparenkorean": "\u3206",
    "siospieupkorean": "\u317d",
    "siostikeutkorean": "\u317c",
    "six": "\u0036",
    "sixarabic": "\u0666",
    "sixbengali": "\u09ec",
    "sixcircle": "\u2465",
    "sixcircleinversesansserif": "\u278f",
    "sixdeva": "\u096c",
    "sixgujarati": "\u0aec",
    "sixgurmukhi": "\u0a6c",
    "sixhackarabic": "\u0666",
    "sixhangzhou": "\u3026",
    "sixideographicparen": "\u3225",
    "sixinferior": "\u2086",
    "sixmonospace": "\uff16",
    "sixoldstyle": "\uf736",
    "sixparen": "\u2479",
    "sixperiod": "\u248d",
    "sixpersian": "\u06f6",
    "sixroman": "\u2175",
    "sixsuperior": "\u2076",
    "sixteencircle": "\u246f",
    "sixteencurrencydenominatorbengali": "\u09f9",
    "sixteenparen": "\u2483",
    "sixteenperiod": "\u2497",
    "sixthai": "\u0e56",
    "slash": "\u002f",
    "slashmonospace": "\uff0f",
    "slong": "\u017f",
    "slongdotaccent": "\u1e9b",
    "smileface": "\u263a",
    "smonospace": "\uff53",
    "sofpasuqhebrew": "\u05c3",
    "softhyphen": "\u00ad",
    "softsigncyrillic": "\u044c",
    "sohiragana": "\u305d",
    "sokatakana": "\u30bd",
    "sokatakanahalfwidth": "\uff7f",
    "soliduslongoverlaycmb": "\u0338",
    "solidusshortoverlaycmb": "\u0337",
    "sorusithai": "\u0e29",
    "sosalathai": "\u0e28",
    "sosothai": "\u0e0b",
    "sosuathai": "\u0e2a",
    "space": "\u0020",
    "spacehackarabic": "\u0020",
    "spade": "\u2660",
    "spadesuitblack": "\u2660",
    "spadesuitwhite": "\u2664",
    "sparen": "\u24ae",
    "squarebelowcmb": "\u033b",
    "squarecc": "\u33c4",
    "squarecm": "\u339d",
    "squarediagonalcrosshatchfill": "\u25a9",
    "squarehorizontalfill": "\u25a4",
    "squarekg": "\u338f",
    "squarekm": "\u339e",
    "squarekmcapital": "\u33ce",
    "squareln": "\u33d1",
    "squarelog": "\u33d2",
    "squaremg": "\u338e",
    "squaremil": "\u33d5",
    "squaremm": "\u339c",
    "squaremsquared": "\u33a1",
    "squareorthogonalcrosshatchfill": "\u25a6",
    "squareupperlefttolowerrightfill": "\u25a7",
    "squareupperrighttolowerleftfill": "\u25a8",
    "squareverticalfill": "\u25a5",
    "squarewhitewithsmallblack": "\u25a3",
    "srsquare": "\u33db",
    "ssabengali": "\u09b7",
    "ssadeva": "\u0937",
    "ssagujarati": "\u0ab7",
    "ssangcieuckorean": "\u3149",
    "ssanghieuhkorean": "\u3185",
    "ssangieungkorean": "\u3180",
    "ssangkiyeokkorean": "\u3132",
    "ssangnieunkorean": "\u3165",
    "ssangpieupkorean": "\u3143",
    "ssangsioskorean": "\u3146",
    "ssangtikeutkorean": "\u3138",
    "ssuperior": "\uf6f2",
    "sterling": "\u00a3",
    "sterlingmonospace": "\uffe1",
    "strokelongoverlaycmb": "\u0336",
    "strokeshortoverlaycmb": "\u0335",
    "subset": "\u2282",
    "subsetnotequal": "\u228a",
    "subsetorequal": "\u2286",
    "succeeds": "\u227b",
    "suchthat": "\u220b",
    "suhiragana": "\u3059",
    "sukatakana": "\u30b9",
    "sukatakanahalfwidth": "\uff7d",
    "sukunarabic": "\u0652",
    "summation": "\u2211",
    "sun": "\u263c",
    "superset": "\u2283",
    "supersetnotequal": "\u228b",
    "supersetorequal": "\u2287",
    "svsquare": "\u33dc",
    "syouwaerasquare": "\u337c",
    "t": "\u0074",
    "tabengali": "\u09a4",
    "tackdown": "\u22a4",
    "tackleft": "\u22a3",
    "tadeva": "\u0924",
    "tagujarati": "\u0aa4",
    "tagurmukhi": "\u0a24",
    "taharabic": "\u0637",
    "tahfinalarabic": "\ufec2",
    "tahinitialarabic": "\ufec3",
    "tahiragana": "\u305f",
    "tahmedialarabic": "\ufec4",
    "taisyouerasquare": "\u337d",
    "takatakana": "\u30bf",
    "takatakanahalfwidth": "\uff80",
    "tatweelarabic": "\u0640",
    "tau": "\u03c4",
    "tav": "\u05ea",
    "tavdages": "\ufb4a",
    "tavdagesh": "\ufb4a",
    "tavdageshhebrew": "\ufb4a",
    "tavhebrew": "\u05ea",
    "tbar": "\u0167",
    "tbopomofo": "\u310a",
    "tcaron": "\u0165",
    "tccurl": "\u02a8",
    "tcedilla": "\u0163",
    "tcheharabic": "\u0686",
    "tchehfinalarabic": "\ufb7b",
    "tchehinitialarabic": "\ufb7c",
    "tchehmedialarabic": "\ufb7d",
    "tchehmeeminitialarabic": "\ufb7c\ufee4",
    "tcircle": "\u24e3",
    "tcircumflexbelow": "\u1e71",
    "tcommaaccent": "\u0163",
    "tdieresis": "\u1e97",
    "tdotaccent": "\u1e6b",
    "tdotbelow": "\u1e6d",
    "tecyrillic": "\u0442",
    "tedescendercyrillic": "\u04ad",
    "teharabic": "\u062a",
    "tehfinalarabic": "\ufe96",
    "tehhahinitialarabic": "\ufca2",
    "tehhahisolatedarabic": "\ufc0c",
    "tehinitialarabic": "\ufe97",
    "tehiragana": "\u3066",
    "tehjeeminitialarabic": "\ufca1",
    "tehjeemisolatedarabic": "\ufc0b",
    "tehmarbutaarabic": "\u0629",
    "tehmarbutafinalarabic": "\ufe94",
    "tehmedialarabic": "\ufe98",
    "tehmeeminitialarabic": "\ufca4",
    "tehmeemisolatedarabic": "\ufc0e",
    "tehnoonfinalarabic": "\ufc73",
    "tekatakana": "\u30c6",
    "tekatakanahalfwidth": "\uff83",
    "telephone": "\u2121",
    "telephoneblack": "\u260e",
    "telishagedolahebrew": "\u05a0",
    "telishaqetanahebrew": "\u05a9",
    "tencircle": "\u2469",
    "tenideographicparen": "\u3229",
    "tenparen": "\u247d",
    "tenperiod": "\u2491",
    "tenroman": "\u2179",
    "tesh": "\u02a7",
    "tet": "\u05d8",
    "tetdagesh": "\ufb38",
    "tetdageshhebrew": "\ufb38",
    "tethebrew": "\u05d8",
    "tetsecyrillic": "\u04b5",
    "tevirhebrew": "\u059b",
    "tevirlefthebrew": "\u059b",
    "thabengali": "\u09a5",
    "thadeva": "\u0925",
    "thagujarati": "\u0aa5",
    "thagurmukhi": "\u0a25",
    "thalarabic": "\u0630",
    "thalfinalarabic": "\ufeac",
    "thanthakhatlowleftthai": "\uf898",
    "thanthakhatlowrightthai": "\uf897",
    "thanthakhatthai": "\u0e4c",
    "thanthakhatupperleftthai": "\uf896",
    "theharabic": "\u062b",
    "thehfinalarabic": "\ufe9a",
    "thehinitialarabic": "\ufe9b",
    "thehmedialarabic": "\ufe9c",
    "thereexists": "\u2203",
    "therefore": "\u2234",
    "theta": "\u03b8",
    "theta1": "\u03d1",
    "thetasymbolgreek": "\u03d1",
    "thieuthacirclekorean": "\u3279",
    "thieuthaparenkorean": "\u3219",
    "thieuthcirclekorean": "\u326b",
    "thieuthkorean": "\u314c",
    "thieuthparenkorean": "\u320b",
    "thirteencircle": "\u246c",
    "thirteenparen": "\u2480",
    "thirteenperiod": "\u2494",
    "thonangmonthothai": "\u0e11",
    "thook": "\u01ad",
    "thophuthaothai": "\u0e12",
    "thorn": "\u00fe",
    "thothahanthai": "\u0e17",
    "thothanthai": "\u0e10",
    "thothongthai": "\u0e18",
    "thothungthai": "\u0e16",
    "thousandcyrillic": "\u0482",
    "thousandsseparatorarabic": "\u066c",
    "thousandsseparatorpersian": "\u066c",
    "three": "\u0033",
    "threearabic": "\u0663",
    "threebengali": "\u09e9",
    "threecircle": "\u2462",
    "threecircleinversesansserif": "\u278c",
    "threedeva": "\u0969",
    "threeeighths": "\u215c",
    "threegujarati": "\u0ae9",
    "threegurmukhi": "\u0a69",
    "threehackarabic": "\u0663",
    "threehangzhou": "\u3023",
    "threeideographicparen": "\u3222",
    "threeinferior": "\u2083",
    "threemonospace": "\uff13",
    "threenumeratorbengali": "\u09f6",
    "threeoldstyle": "\uf733",
    "threeparen": "\u2476",
    "threeperiod": "\u248a",
    "threepersian": "\u06f3",
    "threequarters": "\u00be",
    "threequartersemdash": "\uf6de",
    "threeroman": "\u2172",
    "threesuperior": "\u00b3",
    "threethai": "\u0e53",
    "thzsquare": "\u3394",
    "tihiragana": "\u3061",
    "tikatakana": "\u30c1",
    "tikatakanahalfwidth": "\uff81",
    "tikeutacirclekorean": "\u3270",
    "tikeutaparenkorean": "\u3210",
    "tikeutcirclekorean": "\u3262",
    "tikeutkorean": "\u3137",
    "tikeutparenkorean": "\u3202",
    "tilde": "\u02dc",
    "tildebelowcmb": "\u0330",
    "tildecmb": "\u0303",
    "tildecomb": "\u0303",
    "tildedoublecmb": "\u0360",
    "tildeoperator": "\u223c",
    "tildeoverlaycmb": "\u0334",
    "tildeverticalcmb": "\u033e",
    "timescircle": "\u2297",
    "tipehahebrew": "\u0596",
    "tipehalefthebrew": "\u0596",
    "tippigurmukhi": "\u0a70",
    "titlocyrilliccmb": "\u0483",
    "tiwnarmenian": "\u057f",
    "tlinebelow": "\u1e6f",
    "tmonospace": "\uff54",
    "toarmenian": "\u0569",
    "tohiragana": "\u3068",
    "tokatakana": "\u30c8",
    "tokatakanahalfwidth": "\uff84",
    "tonebarextrahighmod": "\u02e5",
    "tonebarextralowmod": "\u02e9",
    "tonebarhighmod": "\u02e6",
    "tonebarlowmod": "\u02e8",
    "tonebarmidmod": "\u02e7",
    "tonefive": "\u01bd",
    "tonesix": "\u0185",
    "tonetwo": "\u01a8",
    "tonos": "\u0384",
    "tonsquare": "\u3327",
    "topatakthai": "\u0e0f",
    "tortoiseshellbracketleft": "\u3014",
    "tortoiseshellbracketleftsmall": "\ufe5d",
    "tortoiseshellbracketleftvertical": "\ufe39",
    "tortoiseshellbracketright": "\u3015",
    "tortoiseshellbracketrightsmall": "\ufe5e",
    "tortoiseshellbracketrightvertical": "\ufe3a",
    "totaothai": "\u0e15",
    "tpalatalhook": "\u01ab",
    "tparen": "\u24af",
    "trademark": "\u2122",
    "trademarksans": "\uf8ea",
    "trademarkserif": "\uf6db",
    "tretroflexhook": "\u0288",
    "triagdn": "\u25bc",
    "triaglf": "\u25c4",
    "triagrt": "\u25ba",
    "triagup": "\u25b2",
    "ts": "\u02a6",
    "tsadi": "\u05e6",
    "tsadidagesh": "\ufb46",
    "tsadidageshhebrew": "\ufb46",
    "tsadihebrew": "\u05e6",
    "tsecyrillic": "\u0446",
    "tsere": "\u05b5",
    "tsere12": "\u05b5",
    "tsere1e": "\u05b5",
    "tsere2b": "\u05b5",
    "tserehebrew": "\u05b5",
    "tserenarrowhebrew": "\u05b5",
    "tserequarterhebrew": "\u05b5",
    "tserewidehebrew": "\u05b5",
    "tshecyrillic": "\u045b",
    "tsuperior": "\uf6f3",
    "ttabengali": "\u099f",
    "ttadeva": "\u091f",
    "ttagujarati": "\u0a9f",
    "ttagurmukhi": "\u0a1f",
    "tteharabic": "\u0679",
    "ttehfinalarabic": "\ufb67",
    "ttehinitialarabic": "\ufb68",
    "ttehmedialarabic": "\ufb69",
    "tthabengali": "\u09a0",
    "tthadeva": "\u0920",
    "tthagujarati": "\u0aa0",
    "tthagurmukhi": "\u0a20",
    "tturned": "\u0287",
    "tuhiragana": "\u3064",
    "tukatakana": "\u30c4",
    "tukatakanahalfwidth": "\uff82",
    "tusmallhiragana": "\u3063",
    "tusmallkatakana": "\u30c3",
    "tusmallkatakanahalfwidth": "\uff6f",
    "twelvecircle": "\u246b",
    "twelveparen": "\u247f",
    "twelveperiod": "\u2493",
    "twelveroman": "\u217b",
    "twentycircle": "\u2473",
    "twentyhangzhou": "\u5344",
    "twentyparen": "\u2487",
    "twentyperiod": "\u249b",
    "two": "\u0032",
    "twoarabic": "\u0662",
    "twobengali": "\u09e8",
    "twocircle": "\u2461",
    "twocircleinversesansserif": "\u278b",
    "twodeva": "\u0968",
    "twodotenleader": "\u2025",
    "twodotleader": "\u2025",
    "twodotleadervertical": "\ufe30",
    "twogujarati": "\u0ae8",
    "twogurmukhi": "\u0a68",
    "twohackarabic": "\u0662",
    "twohangzhou": "\u3022",
    "twoideographicparen": "\u3221",
    "twoinferior": "\u2082",
    "twomonospace": "\uff12",
    "twonumeratorbengali": "\u09f5",
    "twooldstyle": "\uf732",
    "twoparen": "\u2475",
    "twoperiod": "\u2489",
    "twopersian": "\u06f2",
    "tworoman": "\u2171",
    "twostroke": "\u01bb",
    "twosuperior": "\u00b2",
    "twothai": "\u0e52",
    "twothirds": "\u2154",
    "u": "\u0075",
    "uacute": "\u00fa",
    "ubar": "\u0289",
    "ubengali": "\u0989",
    "ubopomofo": "\u3128",
    "ubreve": "\u016d",
    "ucaron": "\u01d4",
    "ucircle": "\u24e4",
    "ucircumflex": "\u00fb",
    "ucircumflexbelow": "\u1e77",
    "ucyrillic": "\u0443",
    "udattadeva": "\u0951",
    "udblacute": "\u0171",
    "udblgrave": "\u0215",
    "udeva": "\u0909",
    "udieresis": "\u00fc",
    "udieresisacute": "\u01d8",
    "udieresisbelow": "\u1e73",
    "udieresiscaron": "\u01da",
    "udieresiscyrillic": "\u04f1",
    "udieresisgrave": "\u01dc",
    "udieresismacron": "\u01d6",
    "udotbelow": "\u1ee5",
    "ugrave": "\u00f9",
    "ugujarati": "\u0a89",
    "ugurmukhi": "\u0a09",
    "uhiragana": "\u3046",
    "uhookabove": "\u1ee7",
    "uhorn": "\u01b0",
    "uhornacute": "\u1ee9",
    "uhorndotbelow": "\u1ef1",
    "uhorngrave": "\u1eeb",
    "uhornhookabove": "\u1eed",
    "uhorntilde": "\u1eef",
    "uhungarumlaut": "\u0171",
    "uhungarumlautcyrillic": "\u04f3",
    "uinvertedbreve": "\u0217",
    "ukatakana": "\u30a6",
    "ukatakanahalfwidth": "\uff73",
    "ukcyrillic": "\u0479",
    "ukorean": "\u315c",
    "umacron": "\u016b",
    "umacroncyrillic": "\u04ef",
    "umacrondieresis": "\u1e7b",
    "umatragurmukhi": "\u0a41",
    "umonospace": "\uff55",
    "underscore": "\u005f",
    "underscoredbl": "\u2017",
    "underscoremonospace": "\uff3f",
    "underscorevertical": "\ufe33",
    "underscorewavy": "\ufe4f",
    "union": "\u222a",
    "universal": "\u2200",
    "uogonek": "\u0173",
    "uparen": "\u24b0",
    "upblock": "\u2580",
    "upperdothebrew": "\u05c4",
    "upsilon": "\u03c5",
    "upsilondieresis": "\u03cb",
    "upsilondieresistonos": "\u03b0",
    "upsilonlatin": "\u028a",
    "upsilontonos": "\u03cd",
    "uptackbelowcmb": "\u031d",
    "uptackmod": "\u02d4",
    "uragurmukhi": "\u0a73",
    "uring": "\u016f",
    "ushortcyrillic": "\u045e",
    "usmallhiragana": "\u3045",
    "usmallkatakana": "\u30a5",
    "usmallkatakanahalfwidth": "\uff69",
    "ustraightcyrillic": "\u04af",
    "ustraightstrokecyrillic": "\u04b1",
    "utilde": "\u0169",
    "utildeacute": "\u1e79",
    "utildebelow": "\u1e75",
    "uubengali": "\u098a",
    "uudeva": "\u090a",
    "uugujarati": "\u0a8a",
    "uugurmukhi": "\u0a0a",
    "uumatragurmukhi": "\u0a42",
    "uuvowelsignbengali": "\u09c2",
    "uuvowelsigndeva": "\u0942",
    "uuvowelsigngujarati": "\u0ac2",
    "uvowelsignbengali": "\u09c1",
    "uvowelsigndeva": "\u0941",
    "uvowelsigngujarati": "\u0ac1",
    "v": "\u0076",
    "vadeva": "\u0935",
    "vagujarati": "\u0ab5",
    "vagurmukhi": "\u0a35",
    "vakatakana": "\u30f7",
    "vav": "\u05d5",
    "vavdagesh": "\ufb35",
    "vavdagesh65": "\ufb35",
    "vavdageshhebrew": "\ufb35",
    "vavhebrew": "\u05d5",
    "vavholam": "\ufb4b",
    "vavholamhebrew": "\ufb4b",
    "vavvavhebrew": "\u05f0",
    "vavyodhebrew": "\u05f1",
    "vcircle": "\u24e5",
    "vdotbelow": "\u1e7f",
    "vecyrillic": "\u0432",
    "veharabic": "\u06a4",
    "vehfinalarabic": "\ufb6b",
    "vehinitialarabic": "\ufb6c",
    "vehmedialarabic": "\ufb6d",
    "vekatakana": "\u30f9",
    "venus": "\u2640",
    "verticalbar": "\u007c",
    "verticallineabovecmb": "\u030d",
    "verticallinebelowcmb": "\u0329",
    "verticallinelowmod": "\u02cc",
    "verticallinemod": "\u02c8",
    "vewarmenian": "\u057e",
    "vhook": "\u028b",
    "vikatakana": "\u30f8",
    "viramabengali": "\u09cd",
    "viramadeva": "\u094d",
    "viramagujarati": "\u0acd",
    "visargabengali": "\u0983",
    "visargadeva": "\u0903",
    "visargagujarati": "\u0a83",
    "vmonospace": "\uff56",
    "voarmenian": "\u0578",
    "voicediterationhiragana": "\u309e",
    "voicediterationkatakana": "\u30fe",
    "voicedmarkkana": "\u309b",
    "voicedmarkkanahalfwidth": "\uff9e",
    "vokatakana": "\u30fa",
    "vparen": "\u24b1",
    "vtilde": "\u1e7d",
    "vturned": "\u028c",
    "vuhiragana": "\u3094",
    "vukatakana": "\u30f4",
    "w": "\u0077",
    "wacute": "\u1e83",
    "waekorean": "\u3159",
    "wahiragana": "\u308f",
    "wakatakana": "\u30ef",
    "wakatakanahalfwidth": "\uff9c",
    "wakorean": "\u3158",
    "wasmallhiragana": "\u308e",
    "wasmallkatakana": "\u30ee",
    "wattosquare": "\u3357",
    "wavedash": "\u301c",
    "wavyunderscorevertical": "\ufe34",
    "wawarabic": "\u0648",
    "wawfinalarabic": "\ufeee",
    "wawhamzaabovearabic": "\u0624",
    "wawhamzaabovefinalarabic": "\ufe86",
    "wbsquare": "\u33dd",
    "wcircle": "\u24e6",
    "wcircumflex": "\u0175",
    "wdieresis": "\u1e85",
    "wdotaccent": "\u1e87",
    "wdotbelow": "\u1e89",
    "wehiragana": "\u3091",
    "weierstrass": "\u2118",
    "wekatakana": "\u30f1",
    "wekorean": "\u315e",
    "weokorean": "\u315d",
    "wgrave": "\u1e81",
    "whitebullet": "\u25e6",
    "whitecircle": "\u25cb",
    "whitecircleinverse": "\u25d9",
    "whitecornerbracketleft": "\u300e",
    "whitecornerbracketleftvertical": "\ufe43",
    "whitecornerbracketright": "\u300f",
    "whitecornerbracketrightvertical": "\ufe44",
    "whitediamond": "\u25c7",
    "whitediamondcontainingblacksmalldiamond": "\u25c8",
    "whitedownpointingsmalltriangle": "\u25bf",
    "whitedownpointingtriangle": "\u25bd",
    "whiteleftpointingsmalltriangle": "\u25c3",
    "whiteleftpointingtriangle": "\u25c1",
    "whitelenticularbracketleft": "\u3016",
    "whitelenticularbracketright": "\u3017",
    "whiterightpointingsmalltriangle": "\u25b9",
    "whiterightpointingtriangle": "\u25b7",
    "whitesmallsquare": "\u25ab",
    "whitesmilingface": "\u263a",
    "whitesquare": "\u25a1",
    "whitestar": "\u2606",
    "whitetelephone": "\u260f",
    "whitetortoiseshellbracketleft": "\u3018",
    "whitetortoiseshellbracketright": "\u3019",
    "whiteuppointingsmalltriangle": "\u25b5",
    "whiteuppointingtriangle": "\u25b3",
    "wihiragana": "\u3090",
    "wikatakana": "\u30f0",
    "wikorean": "\u315f",
    "wmonospace": "\uff57",
    "wohiragana": "\u3092",
    "wokatakana": "\u30f2",
    "wokatakanahalfwidth": "\uff66",
    "won": "\u20a9",
    "wonmonospace": "\uffe6",
    "wowaenthai": "\u0e27",
    "wparen": "\u24b2",
    "wring": "\u1e98",
    "wsuperior": "\u02b7",
    "wturned": "\u028d",
    "wynn": "\u01bf",
    "x": "\u0078",
    "xabovecmb": "\u033d",
    "xbopomofo": "\u3112",
    "xcircle": "\u24e7",
    "xdieresis": "\u1e8d",
    "xdotaccent": "\u1e8b",
    "xeharmenian": "\u056d",
    "xi": "\u03be",
    "xmonospace": "\uff58",
    "xparen": "\u24b3",
    "xsuperior": "\u02e3",
    "y": "\u0079",
    "yaadosquare": "\u334e",
    "yabengali": "\u09af",
    "yacute": "\u00fd",
    "yadeva": "\u092f",
    "yaekorean": "\u3152",
    "yagujarati": "\u0aaf",
    "yagurmukhi": "\u0a2f",
    "yahiragana": "\u3084",
    "yakatakana": "\u30e4",
    "yakatakanahalfwidth": "\uff94",
    "yakorean": "\u3151",
    "yamakkanthai": "\u0e4e",
    "yasmallhiragana": "\u3083",
    "yasmallkatakana": "\u30e3",
    "yasmallkatakanahalfwidth": "\uff6c",
    "yatcyrillic": "\u0463",
    "ycircle": "\u24e8",
    "ycircumflex": "\u0177",
    "ydieresis": "\u00ff",
    "ydotaccent": "\u1e8f",
    "ydotbelow": "\u1ef5",
    "yeharabic": "\u064a",
    "yehbarreearabic": "\u06d2",
    "yehbarreefinalarabic": "\ufbaf",
    "yehfinalarabic": "\ufef2",
    "yehhamzaabovearabic": "\u0626",
    "yehhamzaabovefinalarabic": "\ufe8a",
    "yehhamzaaboveinitialarabic": "\ufe8b",
    "yehhamzaabovemedialarabic": "\ufe8c",
    "yehinitialarabic": "\ufef3",
    "yehmedialarabic": "\ufef4",
    "yehmeeminitialarabic": "\ufcdd",
    "yehmeemisolatedarabic": "\ufc58",
    "yehnoonfinalarabic": "\ufc94",
    "yehthreedotsbelowarabic": "\u06d1",
    "yekorean": "\u3156",
    "yen": "\u00a5",
    "yenmonospace": "\uffe5",
    "yeokorean": "\u3155",
    "yeorinhieuhkorean": "\u3186",
    "yerahbenyomohebrew": "\u05aa",
    "yerahbenyomolefthebrew": "\u05aa",
    "yericyrillic": "\u044b",
    "yerudieresiscyrillic": "\u04f9",
    "yesieungkorean": "\u3181",
    "yesieungpansioskorean": "\u3183",
    "yesieungsioskorean": "\u3182",
    "yetivhebrew": "\u059a",
    "ygrave": "\u1ef3",
    "yhook": "\u01b4",
    "yhookabove": "\u1ef7",
    "yiarmenian": "\u0575",
    "yicyrillic": "\u0457",
    "yikorean": "\u3162",
    "yinyang": "\u262f",
    "yiwnarmenian": "\u0582",
    "ymonospace": "\uff59",
    "yod": "\u05d9",
    "yoddagesh": "\ufb39",
    "yoddageshhebrew": "\ufb39",
    "yodhebrew": "\u05d9",
    "yodyodhebrew": "\u05f2",
    "yodyodpatahhebrew": "\ufb1f",
    "yohiragana": "\u3088",
    "yoikorean": "\u3189",
    "yokatakana": "\u30e8",
    "yokatakanahalfwidth": "\uff96",
    "yokorean": "\u315b",
    "yosmallhiragana": "\u3087",
    "yosmallkatakana": "\u30e7",
    "yosmallkatakanahalfwidth": "\uff6e",
    "yotgreek": "\u03f3",
    "yoyaekorean": "\u3188",
    "yoyakorean": "\u3187",
    "yoyakthai": "\u0e22",
    "yoyingthai": "\u0e0d",
    "yparen": "\u24b4",
    "ypogegrammeni": "\u037a",
    "ypogegrammenigreekcmb": "\u0345",
    "yr": "\u01a6",
    "yring": "\u1e99",
    "ysuperior": "\u02b8",
    "ytilde": "\u1ef9",
    "yturned": "\u028e",
    "yuhiragana": "\u3086",
    "yuikorean": "\u318c",
    "yukatakana": "\u30e6",
    "yukatakanahalfwidth": "\uff95",
    "yukorean": "\u3160",
    "yusbigcyrillic": "\u046b",
    "yusbigiotifiedcyrillic": "\u046d",
    "yuslittlecyrillic": "\u0467",
    "yuslittleiotifiedcyrillic": "\u0469",
    "yusmallhiragana": "\u3085",
    "yusmallkatakana": "\u30e5",
    "yusmallkatakanahalfwidth": "\uff6d",
    "yuyekorean": "\u318b",
    "yuyeokorean": "\u318a",
    "yyabengali": "\u09df",
    "yyadeva": "\u095f",
    "z": "\u007a",
    "zaarmenian": "\u0566",
    "zacute": "\u017a",
    "zadeva": "\u095b",
    "zagurmukhi": "\u0a5b",
    "zaharabic": "\u0638",
    "zahfinalarabic": "\ufec6",
    "zahinitialarabic": "\ufec7",
    "zahiragana": "\u3056",
    "zahmedialarabic": "\ufec8",
    "zainarabic": "\u0632",
    "zainfinalarabic": "\ufeb0",
    "zakatakana": "\u30b6",
    "zaqefgadolhebrew": "\u0595",
    "zaqefqatanhebrew": "\u0594",
    "zarqahebrew": "\u0598",
    "zayin": "\u05d6",
    "zayindagesh": "\ufb36",
    "zayindageshhebrew": "\ufb36",
    "zayinhebrew": "\u05d6",
    "zbopomofo": "\u3117",
    "zcaron": "\u017e",
    "zcircle": "\u24e9",
    "zcircumflex": "\u1e91",
    "zcurl": "\u0291",
    "zdot": "\u017c",
    "zdotaccent": "\u017c",
    "zdotbelow": "\u1e93",
    "zecyrillic": "\u0437",
    "zedescendercyrillic": "\u0499",
    "zedieresiscyrillic": "\u04df",
    "zehiragana": "\u305c",
    "zekatakana": "\u30bc",
    "zero": "\u0030",
    "zeroarabic": "\u0660",
    "zerobengali": "\u09e6",
    "zerodeva": "\u0966",
    "zerogujarati": "\u0ae6",
    "zerogurmukhi": "\u0a66",
    "zerohackarabic": "\u0660",
    "zeroinferior": "\u2080",
    "zeromonospace": "\uff10",
    "zerooldstyle": "\uf730",
    "zeropersian": "\u06f0",
    "zerosuperior": "\u2070",
    "zerothai": "\u0e50",
    "zerowidthjoiner": "\ufeff",
    "zerowidthnonjoiner": "\u200c",
    "zerowidthspace": "\u200b",
    "zeta": "\u03b6",
    "zhbopomofo": "\u3113",
    "zhearmenian": "\u056a",
    "zhebrevecyrillic": "\u04c2",
    "zhecyrillic": "\u0436",
    "zhedescendercyrillic": "\u0497",
    "zhedieresiscyrillic": "\u04dd",
    "zihiragana": "\u3058",
    "zikatakana": "\u30b8",
    "zinorhebrew": "\u05ae",
    "zlinebelow": "\u1e95",
    "zmonospace": "\uff5a",
    "zohiragana": "\u305e",
    "zokatakana": "\u30be",
    "zparen": "\u24b5",
    "zretroflexhook": "\u0290",
    "zstroke": "\u01b6",
    "zuhiragana": "\u305a",
    "zukatakana": "\u30ba",
}
# --end


================================================
FILE: babeldoc/pdfminer/high_level.py
================================================
"""Functions that can be used for the most common use-cases for pdfminer.six"""

import logging
import sys
from collections.abc import Container
from collections.abc import Iterator
from io import StringIO
from typing import Any
from typing import BinaryIO
from typing import cast

from babeldoc.pdfminer.converter import HOCRConverter
from babeldoc.pdfminer.converter import HTMLConverter
from babeldoc.pdfminer.converter import PDFPageAggregator
from babeldoc.pdfminer.converter import TextConverter
from babeldoc.pdfminer.converter import XMLConverter
from babeldoc.pdfminer.image import ImageWriter
from babeldoc.pdfminer.layout import LAParams
from babeldoc.pdfminer.layout import LTPage
from babeldoc.pdfminer.pdfdevice import PDFDevice
from babeldoc.pdfminer.pdfdevice import TagExtractor
from babeldoc.pdfminer.pdfexceptions import PDFValueError
from babeldoc.pdfminer.pdfinterp import PDFPageInterpreter
from babeldoc.pdfminer.pdfinterp import PDFResourceManager
from babeldoc.pdfminer.pdfpage import PDFPage
from babeldoc.pdfminer.utils import AnyIO
from babeldoc.pdfminer.utils import FileOrName
from babeldoc.pdfminer.utils import open_filename


def extract_text_to_fp(
    inf: BinaryIO,
    outfp: AnyIO,
    output_type: str = "text",
    codec: str = "utf-8",
    laparams: LAParams | None = None,
    maxpages: int = 0,
    page_numbers: Container[int] | None = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: str | None = None,
    strip_control: bool = False,
    debug: bool = False,
    disable_caching: bool = False,
    **kwargs: Any,
) -> None:
    """Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to. When it is
        `sys.stdout` and `output_type` is not 'text', `sys.stdout.buffer`
        is used instead.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from babeldoc.pdfminer.layout. Default is None
        but may not layout correctly.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor (only passed to the 'html' converter)
    :param rotation: Rotation in degrees added to each page's own rotation
        (the sum is taken modulo 360)
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter
    :param output_dir: If given, creates an ImageWriter for extracted images.
    :param strip_control: Passed as `stripcontrol` to the 'xml' and 'hocr'
        converters
    :param debug: If true, sets the root logger level to DEBUG
    :param disable_caching: If true, resources and pages are processed with
        caching turned off
    :param kwargs: Additional keyword arguments are accepted but not used
        by this function.
    :raises PDFValueError: if `output_type` is not one of the supported
        values.
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    """
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    rsrcmgr = PDFResourceManager(caching=not disable_caching)
    device: PDFDevice | None = None

    # Every non-text converter is handed the binary buffer behind stdout.
    if output_type != "text" and outfp == sys.stdout:
        outfp = sys.stdout.buffer

    if output_type == "text":
        device = TextConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "xml":
        device = XMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
            stripcontrol=strip_control,
        )

    elif output_type == "html":
        device = HTMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            scale=scale,
            layoutmode=layoutmode,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "hocr":
        device = HOCRConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            stripcontrol=strip_control,
        )

    elif output_type == "tag":
        # Binary I/O is required, but we have no good way to test it here.
        device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)

    else:
        # Keep this list in sync with the branches above ('hocr' included).
        msg = (
            "Output type can be text, html, xml, hocr or tag "
            f"but is {output_type}"
        )
        raise PDFValueError(msg)

    assert device is not None
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(
        inf,
        page_numbers,
        maxpages=maxpages,
        password=password,
        caching=not disable_caching,
    ):
        # Apply the caller-requested extra rotation on top of the page's own.
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

    device.close()


def extract_text(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Container[int] | None = None,
    maxpages: int = 0,
    caching: bool = True,
    codec: str = "utf-8",
    laparams: LAParams | None = None,
) -> str:
    """Run a PDF through a text converter and return everything it wrote.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from babeldoc.pdfminer.layout. If None,
        a default-constructed LAParams is used.
    :return: a string containing all of the text extracted.
    """
    layout_params = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as raw_fp, StringIO() as text_buffer:
        # The file was opened in binary mode, so narrow the type accordingly.
        binary_fp = cast(BinaryIO, raw_fp)

        manager = PDFResourceManager(caching=caching)
        converter = TextConverter(
            manager,
            text_buffer,
            codec=codec,
            laparams=layout_params,
        )
        interpreter = PDFPageInterpreter(manager, converter)

        pages = PDFPage.get_pages(
            binary_fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for current_page in pages:
            interpreter.process_page(current_page)

        # Read the buffer before the context manager closes it.
        extracted = text_buffer.getvalue()
        return extracted


def extract_pages(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Container[int] | None = None,
    maxpages: int = 0,
    caching: bool = True,
    laparams: LAParams | None = None,
) -> Iterator[LTPage]:
    """Lazily yield the aggregated layout of each processed page.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from babeldoc.pdfminer.layout. If None,
        a default-constructed LAParams is used.
    :return: LTPage objects, one per processed page
    """
    layout_params = laparams if laparams is not None else LAParams()

    with open_filename(pdf_file, "rb") as raw_fp:
        # The file was opened in binary mode, so narrow the type accordingly.
        binary_fp = cast(BinaryIO, raw_fp)

        rsrcmgr = PDFResourceManager(caching=caching)
        aggregator = PDFPageAggregator(rsrcmgr, laparams=layout_params)
        interpreter = PDFPageInterpreter(rsrcmgr, aggregator)

        pages = PDFPage.get_pages(
            binary_fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for current_page in pages:
            interpreter.process_page(current_page)
            # The aggregator holds the result of the page just processed.
            yield aggregator.get_result()


================================================
FILE: babeldoc/pdfminer/image.py
================================================
import os
import os.path
import struct
from io import BytesIO
from typing import BinaryIO
from typing import Literal

from babeldoc.pdfminer.jbig2 import JBIG2StreamReader
from babeldoc.pdfminer.jbig2 import JBIG2StreamWriter
from babeldoc.pdfminer.layout import LTImage
from babeldoc.pdfminer.pdfcolor import LITERAL_DEVICE_CMYK
from babeldoc.pdfminer.pdfcolor import LITERAL_DEVICE_GRAY
from babeldoc.pdfminer.pdfcolor import LITERAL_DEVICE_RGB
from babeldoc.pdfminer.pdfcolor import LITERAL_INLINE_DEVICE_GRAY
from babeldoc.pdfminer.pdfcolor import LITERAL_INLINE_DEVICE_RGB
from babeldoc.pdfminer.pdfexceptions import PDFValueError
from babeldoc.pdfminer.pdftypes import LITERALS_DCT_DECODE
from babeldoc.pdfminer.pdftypes import LITERALS_FLATE_DECODE
from babeldoc.pdfminer.pdftypes import LITERALS_JBIG2_DECODE
from babeldoc.pdfminer.pdftypes import LITERALS_JPX_DECODE

# Message used when the optional Pillow dependency cannot be imported.
PIL_ERROR_MESSAGE = (
    "Could not import Pillow. This dependency of pdfminer.six is not "
    "installed by default. You need it to save jpg images to a file. Install it "
    "with `pip install 'pdfminer.six[image]'`"
)


def align32(x: int) -> int:
    """Round *x* up to the nearest multiple of 4."""
    padded = x + 3
    # Dropping the remainder floors the padded value to a multiple of 4.
    return padded - padded % 4


class BMPWriter:
    """Write a BMP file header to a stream and place pixel rows after it.

    Constructing the writer immediately emits the 14-byte file header, the
    40-byte info header and, for 1-bit and 8-bit images, a gray palette.
    Rows are then stored with :meth:`write_line`.
    """

    def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None:
        """Emit the BMP headers for an image of the given geometry.

        :param fp: Seekable binary stream the image is written to.
        :param bits: Bits per pixel; 1, 8 and 24 are accepted.
        :param width: Image width, as stored in the info header.
        :param height: Image height, as stored in the info header.
        :raises PDFValueError: if ``bits`` is not 1, 8 or 24.
        """
        self.fp = fp
        self.bits = bits
        self.width = width
        self.height = height

        # Map the bit depth to the number of palette entries.
        for depth, palette_entries in ((1, 2), (8, 256), (24, 0)):
            if bits == depth:
                ncols = palette_entries
                break
        else:
            raise PDFValueError(bits)

        # Each row is padded to a multiple of four bytes.
        row_bytes = (self.width * self.bits + 7) // 8
        self.linesize = align32(row_bytes)
        self.datasize = self.linesize * self.height
        headersize = 14 + 40 + ncols * 4

        header = struct.pack(
            "<ccIHHI",
            b"B",
            b"M",
            headersize + self.datasize,
            0,
            0,
            headersize,
        )
        assert len(header) == 14, str(len(header))

        info = struct.pack(
            "<IiiHHIIIIII",
            40,
            self.width,
            self.height,
            1,
            self.bits,
            0,
            self.datasize,
            0,
            0,
            ncols,
            0,
        )
        assert len(info) == 40, str(len(info))

        self.fp.write(header)
        self.fp.write(info)

        # Two entries give a black & white table, 256 a grayscale ramp and
        # zero (24-bit) no table at all.
        gray_levels = (0, 255) if ncols == 2 else range(ncols)
        for level in gray_levels:
            self.fp.write(struct.pack("BBBx", level, level, level))

        # pos0/pos1 delimit the pixel-data region that follows the headers.
        self.pos0 = self.fp.tell()
        self.pos1 = self.pos0 + self.datasize

    def write_line(self, y: int, data: bytes) -> None:
        """Store row ``y``; row 0 lands at the end of the pixel-data region."""
        row_offset = self.pos1 - (y + 1) * self.linesize
        self.fp.seek(row_offset)
        self.fp.write(data)


class ImageWriter:
    """Write image to a file

    Supports various image types: JPEG, JBIG2 and bitmaps
    """

    def __init__(self, outdir: str) -> None:
        """Remember the output directory and create it when it is missing.

        :param outdir: directory that receives the exported images.
        """
        self.outdir = outdir
        if not os.path.exists(self.outdir):
            # exist_ok: the directory may be created by someone else between
            # the existence check and this call.
            os.makedirs(self.outdir, exist_ok=True)

    def export_image(self, image: LTImage) -> str:
        """Save an LTImage to disk

        The writer is chosen from the stream's filters, the bit depth and the
        color space; anything unrecognised is dumped as raw bytes.

        :param image: the image to export.
        :return: name of the written file, relative to ``outdir``.
        """
        (width, height) = image.srcsize

        filters = image.stream.get_filters()
        # A stream without any filter must not be indexed with [-1]; it falls
        # through to the bit-depth based writers below.
        last_filter = filters[-1][0] if filters else None

        if last_filter is not None and last_filter in LITERALS_DCT_DECODE:
            name = self._save_jpeg(image)

        elif last_filter is not None and last_filter in LITERALS_JPX_DECODE:
            name = self._save_jpeg2000(image)

        elif self._is_jbig2_iamge(image):
            name = self._save_jbig2(image)

        elif image.bits == 1:
            name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits)

        elif image.bits == 8 and (
            LITERAL_DEVICE_RGB in image.colorspace
            or LITERAL_INLINE_DEVICE_RGB in image.colorspace
        ):
            name = self._save_bmp(image, width, height, width * 3, image.bits * 3)

        elif image.bits == 8 and (
            LITERAL_DEVICE_GRAY in image.colorspace
            or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
        ):
            name = self._save_bmp(image, width, height, width, image.bits)

        elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
            name = self._save_bytes(image)

        else:
            name = self._save_raw(image)

        return name

    def _save_jpeg(self, image: LTImage) -> str:
        """Save a JPEG encoded image

        CMYK images are inverted and converted to RGB with Pillow; all other
        images are written unchanged.

        :raises ImportError: if a CMYK image is met and Pillow is missing.
        """
        data = image.stream.get_data()
        is_cmyk = LITERAL_DEVICE_CMYK in image.colorspace

        if is_cmyk:
            # Import before creating the file so that a missing Pillow does
            # not leave an empty .jpg behind.
            try:
                from PIL import Image  # type: ignore[import]
                from PIL import ImageChops  # type: ignore[import]
            except ImportError as err:
                raise ImportError(PIL_ERROR_MESSAGE) from err

        name, path = self._create_unique_image_name(image, ".jpg")
        with open(path, "wb") as fp:
            if is_cmyk:
                ifp = BytesIO(data)
                i = Image.open(ifp)
                i = ImageChops.invert(i)
                i = i.convert("RGB")
                i.save(fp, "JPEG")
            else:
                fp.write(data)

        return name

    def _save_jpeg2000(self, image: LTImage) -> str:
        """Save a JPEG 2000 encoded image

        :raises ImportError: if Pillow is missing.
        """
        # Import before creating the file so that a missing Pillow does not
        # leave an empty .jp2 behind.
        try:
            from PIL import Image  # type: ignore[import]
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        data = image.stream.get_data()

        name, path = self._create_unique_image_name(image, ".jp2")
        with open(path, "wb") as fp:
            # if we just write the raw data, most image programs
            # that I have tried cannot open the file. However,
            # open and saving with PIL produces a file that
            # seems to be easily opened by other programs
            ifp = BytesIO(data)
            i = Image.open(ifp)
            i.save(fp, "JPEG2000")
        return name

    def _save_jbig2(self, image: LTImage) -> str:
        """Save a JBIG2 encoded image

        The optional global segments and the image stream are concatenated,
        parsed into segments and rewritten as a standalone JBIG2 file.

        :raises PDFValueError: if more than one JBIG2Globals stream is found.
        """
        global_streams = []
        for filter_name, params in image.stream.get_filters():
            if filter_name not in LITERALS_JBIG2_DECODE:
                continue
            # The filter parameters may carry no JBIG2Globals entry; only
            # resolve the reference when it is actually present.
            globals_ref = params.get("JBIG2Globals") if params else None
            if globals_ref is not None:
                global_streams.append(globals_ref.resolve())

        if len(global_streams) > 1:
            msg = (
                "There should never be more than one JBIG2Globals "
                "associated with a JBIG2 embedded image"
            )
            raise PDFValueError(msg)

        input_stream = BytesIO()
        if len(global_streams) == 1:
            input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
        input_stream.write(image.stream.get_data())
        input_stream.seek(0)
        # Parse before creating the output file so that a parse failure does
        # not leave an empty .jb2 behind.
        segments = JBIG2StreamReader(input_stream).get_segments()

        name, path = self._create_unique_image_name(image, ".jb2")
        with open(path, "wb") as fp:
            writer = JBIG2StreamWriter(fp)
            writer.write_file(segments)
        return name

    def _save_bmp(
        self,
        image: LTImage,
        width: int,
        height: int,
        bytes_per_line: int,
        bits: int,
    ) -> str:
        """Save a BMP encoded image

        :param bytes_per_line: number of stream bytes that make up one row.
        :param bits: bits per pixel passed on to ``BMPWriter``.
        """
        name, path = self._create_unique_image_name(image, ".bmp")
        with open(path, "wb") as fp:
            bmp = BMPWriter(fp, bits, width, height)
            data = image.stream.get_data()
            for y in range(height):
                start = y * bytes_per_line
                bmp.write_line(y, data[start : start + bytes_per_line])
        return name

    def _save_bytes(self, image: LTImage) -> str:
        """Save an image without encoding, just bytes

        The Pillow mode is derived from the bit depth and the number of
        channels computed from the data length.

        :raises ImportError: if Pillow is missing.
        :raises PDFValueError: if no Pillow mode matches the image.
        """
        try:
            from PIL import Image  # type: ignore[import]
            from PIL import ImageOps
        except ImportError as err:
            raise ImportError(PIL_ERROR_MESSAGE) from err

        data = image.stream.get_data()
        width, height = image.srcsize
        channels = len(data) / width / height / (image.bits / 8)

        mode: Literal["1", "L", "RGB", "CMYK"]
        if image.bits == 1:
            mode = "1"
        elif image.bits == 8 and channels == 1:
            mode = "L"
        elif image.bits == 8 and channels == 3:
            mode = "RGB"
        elif image.bits == 8 and channels == 4:
            mode = "CMYK"
        else:
            # Previously ``mode`` stayed unbound here and the call below
            # failed with an UnboundLocalError.
            raise PDFValueError(
                f"Unsupported image: bits={image.bits}, channels={channels}"
            )

        name, path = self._create_unique_image_name(image, ".jpg")
        with open(path, "wb") as fp:
            img = Image.frombytes(mode, image.srcsize, data, "raw")
            if mode == "L":
                img = ImageOps.invert(img)

            img.save(fp)

        return name

    def _save_raw(self, image: LTImage) -> str:
        """Save an image with unknown encoding

        The bit depth and the source size are encoded in the file extension.
        """
        ext = ".%d.%dx%d.img" % (image.bits, image.srcsize[0], image.srcsize[1])
        name, path = self._create_unique_image_name(image, ext)

        with open(path, "wb") as fp:
            fp.write(image.stream.get_data())
        return name

    @staticmethod
    def _is_jbig2_iamge(image: LTImage) -> bool:
        """Return True if any filter of the image stream is a JBIG2 filter."""
        return any(
            filter_name in LITERALS_JBIG2_DECODE
            for filter_name, _params in image.stream.get_filters()
        )

    def _create_unique_image_name(self, image: LTImage, ext: str) -> tuple[str, str]:
        """Pick a file name in ``outdir`` that does not exist yet.

        ``<name><ext>`` is tried first, then ``<name>.0<ext>``,
        ``<name>.1<ext>`` and so on.

        :return: ``(name, path)`` where ``path`` is ``outdir`` joined with
            ``name``.
        """
        name = image.name + ext
        path = os.path.join(self.outdir, name)
        img_index = 0
        while os.path.exists(path):
            name = "%s.%d%s" % (image.name, img_index, ext)
            path = os.path.join(self.outdir, name)
            img_index += 1
        return name, path


================================================
FILE: babeldoc/pdfminer/jbig2.py
================================================
import math
import os
from collections.abc import Iterable
from struct import calcsize
from struct import pack
from struct import unpack
from typing import BinaryIO
from typing import cast

from babeldoc.pdfminer.pdfexceptions import PDFValueError

# segment structure base
# (struct format, field name) pairs in the order the header fields are read
# and written. For each name the reader looks up an optional ``parse_<name>``
# method and the writer an optional ``encode_<name>`` method.
SEG_STRUCT = [
    (">L", "number"),
    (">B", "flags"),
    (">B", "retention_flags"),
    (">B", "page_assoc"),
    (">L", "data_length"),
]

# segment header literals
# Bits of the "flags" byte.
HEADER_FLAG_DEFERRED = 0b10000000
HEADER_FLAG_PAGE_ASSOC_LONG = 0b01000000

# Bits of the "flags" byte that hold the segment type.
SEG_TYPE_MASK = 0b00111111

# Bits of the first retention byte that hold the short reference count.
REF_COUNT_SHORT_MASK = 0b11100000
# Bits of the four-byte long form that hold the reference count.
REF_COUNT_LONG_MASK = 0x1FFFFFFF
# Short count value from which the reader switches to the long form.
REF_COUNT_LONG = 7

# Data length that, for an immediate generic region segment, makes the reader
# raise NotImplementedError.
DATA_LEN_UNKNOWN = 0xFFFFFFFF

# segment types
SEG_TYPE_IMMEDIATE_GEN_REGION = 38
SEG_TYPE_END_OF_PAGE = 49
SEG_TYPE_END_OF_FILE = 51

# file literals
# Written at the very start of a file by JBIG2StreamWriter.write_file,
# followed by one flags byte and the page count.
FILE_HEADER_ID = b"\x97\x4a\x42\x32\x0d\x0a\x1a\x0a"
FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001


def bit_set(bit_pos: int, value: int) -> bool:
    """Tell whether bit number ``bit_pos`` (0 = lowest) of ``value`` is 1."""
    shifted = value >> bit_pos
    return (shifted & 1) == 1


def check_flag(flag: int, value: int) -> bool:
    """Tell whether ``value`` has at least one of the bits of ``flag`` set."""
    common_bits = flag & value
    return common_bits != 0


def masked_value(mask: int, value: int) -> int:
    """Extract the field selected by ``mask`` from ``value``.

    The bits of ``value`` covered by ``mask`` are shifted right so that the
    lowest set bit of ``mask`` ends up at bit 0.

    :param mask: bit mask selecting the field; must have at least one bit set.
    :param value: integer the field is read from.
    :return: the field value, right-aligned.
    :raises PDFValueError: if ``mask`` is zero.
    """
    if mask == 0:
        raise PDFValueError("Invalid mask or value")

    # ``mask & -mask`` isolates the lowest set bit. Unlike a scan limited to
    # bits 0..30 this also accepts masks whose lowest set bit is bit 31.
    shift = (mask & -mask).bit_length() - 1
    return (value & mask) >> shift


def mask_value(mask: int, value: int) -> int:
    """Place ``value`` into the field selected by ``mask``.

    ``value`` is limited to the width of the mask and shifted left so that its
    bit 0 lines up with the lowest set bit of ``mask``.

    :param mask: bit mask selecting the field; must have at least one bit set.
    :param value: right-aligned field value.
    :return: ``value`` moved into the position of the field.
    :raises PDFValueError: if ``mask`` is zero.
    """
    if mask == 0:
        raise PDFValueError("Invalid mask or value")

    # ``mask & -mask`` isolates the lowest set bit. Unlike a scan limited to
    # bits 0..30 this also accepts masks whose lowest set bit is bit 31.
    shift = (mask & -mask).bit_length() - 1
    return (value & (mask >> shift)) << shift


def unpack_int(format: str, buffer: bytes) -> int:
    """Decode ``buffer`` as one big-endian unsigned integer.

    Only the formats ``">B"``, ``">I"`` and ``">L"`` are accepted.
    """
    assert format in {">B", ">I", ">L"}
    values = cast(tuple[int], unpack(format, buffer))
    (number,) = values
    return number


# Parsed "flags" header byte; the reader fills the keys "deferred",
# "page_assoc_long" and "type".
JBIG2SegmentFlags = dict[str, int | bool]
# Parsed retention flags; the reader fills the keys "ref_count",
# "retain_segments" and "ref_segments".
JBIG2RetentionFlags = dict[str, int | list[int] | list[bool]]
# One segment: the header fields named in SEG_STRUCT plus, when the segment
# carries data, its payload under "raw_data".
JBIG2Segment = dict[
    str,
    bool | int | bytes | JBIG2SegmentFlags | JBIG2RetentionFlags,
]


class JBIG2StreamReader:
    """Read segments from a JBIG2 byte stream"""

    def __init__(self, stream: BinaryIO) -> None:
        """:param stream: seekable binary stream positioned at a segment header."""
        self.stream = stream

    def get_segments(self) -> list[JBIG2Segment]:
        """Parse every segment up to the end of the stream.

        The header fields listed in ``SEG_STRUCT`` are read in order; a field
        with a matching ``parse_<name>`` method is post-processed by it. A
        segment whose header is cut short by the end of the stream is dropped.

        :return: the parsed segments in stream order.
        """
        segments: list[JBIG2Segment] = []
        while not self.is_eof():
            segment: JBIG2Segment = {}
            for field_format, name in SEG_STRUCT:
                field_len = calcsize(field_format)
                field = self.stream.read(field_len)
                if len(field) < field_len:
                    segment["_error"] = True
                    break
                value = unpack_int(field_format, field)
                parser = getattr(self, "parse_%s" % name, None)
                if callable(parser):
                    value = parser(segment, value, field)
                segment[name] = value

            if not segment.get("_error"):
                segments.append(segment)
        return segments

    def is_eof(self) -> bool:
        """Return True at the end of the stream without consuming any byte."""
        if self.stream.read(1) == b"":
            return True
        else:
            # Undo the one-byte probe.
            self.stream.seek(-1, os.SEEK_CUR)
            return False

    def parse_flags(
        self,
        segment: JBIG2Segment,
        flags: int,
        field: bytes,
    ) -> JBIG2SegmentFlags:
        """Split the flags byte into the deferred bit, the long page
        association bit and the segment type."""
        return {
            "deferred": check_flag(HEADER_FLAG_DEFERRED, flags),
            "page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags),
            "type": masked_value(SEG_TYPE_MASK, flags),
        }

    def parse_retention_flags(
        self,
        segment: JBIG2Segment,
        flags: int,
        field: bytes,
    ) -> JBIG2RetentionFlags:
        """Read the reference count, the retention bits and the numbers of the
        referred-to segments.

        :param segment: segment being built; its ``"number"`` selects the
            width of the referred-to segment numbers.
        :param flags: first byte of the retention flags.
        :param field: raw bytes of that first byte.
        :return: dict with ``ref_count``, ``retain_segments`` and
            ``ref_segments``.
        """
        ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
        retain_segments = []
        ref_segments = []

        if ref_count < REF_COUNT_LONG:
            # Short form: the five low bits of the same byte are the flags.
            for bit_pos in range(5):
                retain_segments.append(bit_set(bit_pos, flags))
        else:
            # Long form: the count spans four bytes and is followed by
            # ceil((count + 1) / 8) bytes of retention flags.
            field += self.stream.read(3)
            ref_count = unpack_int(">L", field)
            ref_count = masked_value(REF_COUNT_LONG_MASK, ref_count)
            ret_bytes_count = int(math.ceil((ref_count + 1) / 8))
            for _ in range(ret_bytes_count):
                ret_byte = unpack_int(">B", self.stream.read(1))
                # All eight bits of each byte are flags; the writer packs
                # eight per byte as well.
                for bit_pos in range(8):
                    retain_segments.append(bit_set(bit_pos, ret_byte))

        seg_num = segment["number"]
        assert isinstance(seg_num, int)
        # Referred-to segment numbers take 1, 2 or 4 bytes depending on the
        # number of the current segment.
        if seg_num <= 256:
            ref_format = ">B"
        elif seg_num <= 65536:
            ref_format = ">H"
        else:
            ref_format = ">L"

        ref_size = calcsize(ref_format)

        for _ in range(ref_count):
            ref_data = self.stream.read(ref_size)
            (ref,) = unpack(ref_format, ref_data)
            ref_segments.append(ref)

        return {
            "ref_count": ref_count,
            "retain_segments": retain_segments,
            "ref_segments": ref_segments,
        }

    def parse_page_assoc(self, segment: JBIG2Segment, page: int, field: bytes) -> int:
        """Return the page association, reading three more bytes when the
        long form is flagged in the segment's flags."""
        if cast(JBIG2SegmentFlags, segment["flags"])["page_assoc_long"]:
            field += self.stream.read(3)
            page = unpack_int(">L", field)
        return page

    def parse_data_length(
        self,
        segment: JBIG2Segment,
        length: int,
        field: bytes,
    ) -> int:
        """Read the segment payload into ``segment["raw_data"]``.

        Nothing is stored when ``length`` is zero.

        :raises NotImplementedError: for an immediate generic region segment
            of unknown length.
        """
        if length:
            if (
                cast(JBIG2SegmentFlags, segment["flags"])["type"]
                == SEG_TYPE_IMMEDIATE_GEN_REGION
            ) and (length == DATA_LEN_UNKNOWN):
                raise NotImplementedError(
                    "Working with unknown segment length is not implemented yet",
                )
            else:
                segment["raw_data"] = self.stream.read(length)

        return length


class JBIG2StreamWriter:
    """Write JBIG2 segments to a file in JBIG2 format"""

    # Retention flags of the end-of-page / end-of-file segments built below.
    EMPTY_RETENTION_FLAGS: JBIG2RetentionFlags = {
        "ref_count": 0,
        "ref_segments": cast(list[int], []),
        "retain_segments": cast(list[bool], []),
    }

    def __init__(self, stream: BinaryIO) -> None:
        """:param stream: binary stream the encoded segments are written to."""
        self.stream = stream

    def write_segments(
        self,
        segments: Iterable[JBIG2Segment],
        fix_last_page: bool = True,
    ) -> int:
        """Encode and write ``segments``.

        :param segments: segments to write, iterated once.
        :param fix_last_page: when true, append an end-of-page segment if the
            last page seen was not closed by one.
        :return: number of bytes written.
        """
        data_len = 0
        current_page: int | None = None
        seg_num: int | None = None

        for segment in segments:
            data = self.encode_segment(segment)
            self.stream.write(data)
            data_len += len(data)

            seg_num = cast(int | None, segment["number"])

            if fix_last_page:
                seg_page = cast(int, segment.get("page_assoc"))

                if (
                    cast(JBIG2SegmentFlags, segment["flags"])["type"]
                    == SEG_TYPE_END_OF_PAGE
                ):
                    current_page = None
                elif seg_page:
                    current_page = seg_page

        if fix_last_page and current_page and (seg_num is not None):
            segment = self.get_eop_segment(seg_num + 1, current_page)
            data = self.encode_segment(segment)
            self.stream.write(data)
            data_len += len(data)

        return data_len

    def write_file(
        self,
        segments: Iterable[JBIG2Segment],
        fix_last_page: bool = True,
    ) -> int:
        """Write a complete file: file header, ``segments`` and a final
        end-of-file segment.

        :param segments: segments to write; any iterable, including one that
            can only be consumed once.
        :param fix_last_page: passed on to :meth:`write_segments`.
        :return: number of bytes written.
        """
        # The segments are needed twice (to write them and to find the last
        # segment number), so a one-shot iterable is materialised first.
        segments = list(segments)

        header = FILE_HEADER_ID
        header_flags = FILE_HEAD_FLAG_SEQUENTIAL
        header += pack(">B", header_flags)
        # The embedded JBIG2 files in a PDF always
        # only have one page
        number_of_pages = pack(">L", 1)
        header += number_of_pages
        self.stream.write(header)
        data_len = len(header)

        data_len += self.write_segments(segments, fix_last_page)

        seg_num = cast(int, segments[-1]["number"]) if segments else 0

        # Leave room for the end-of-page segment write_segments may have added.
        if fix_last_page:
            seg_num_offset = 2
        else:
            seg_num_offset = 1
        eof_segment = self.get_eof_segment(seg_num + seg_num_offset)
        data = self.encode_segment(eof_segment)

        self.stream.write(data)
        data_len += len(data)

        return data_len

    def encode_segment(self, segment: JBIG2Segment) -> bytes:
        """Encode one segment.

        The fields of ``SEG_STRUCT`` are emitted in order; a field with a
        matching ``encode_<name>`` method is encoded by it, any other field is
        packed with its struct format.
        """
        parts = []
        for field_format, name in SEG_STRUCT:
            value = segment.get(name)
            encoder = getattr(self, "encode_%s" % name, None)
            if callable(encoder):
                field = encoder(value, segment)
            else:
                field = pack(field_format, value)
            parts.append(field)
        return b"".join(parts)

    @staticmethod
    def _page_assoc_is_long(flags: JBIG2SegmentFlags, segment: JBIG2Segment) -> bool:
        """Decide whether the page association takes four bytes.

        An explicit ``"page_assoc_long"`` entry wins; otherwise the long form
        is used when the page number does not fit into one byte.
        """
        if "page_assoc_long" in flags:
            return bool(flags["page_assoc_long"])
        return cast(int, segment.get("page_assoc") or 0) > 255

    def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment) -> bytes:
        """Encode the flags byte from the deferred bit, the long page
        association bit and the segment type."""
        flags = 0
        if value.get("deferred"):
            flags |= HEADER_FLAG_DEFERRED

        # The page number is stored under "page_assoc" (the SEG_STRUCT field
        # name); the helper reads that key.
        if self._page_assoc_is_long(value, segment):
            flags |= HEADER_FLAG_PAGE_ASSOC_LONG

        flags |= mask_value(SEG_TYPE_MASK, value["type"])

        return pack(">B", flags)

    def encode_retention_flags(
        self,
        value: JBIG2RetentionFlags,
        segment: JBIG2Segment,
    ) -> bytes:
        """Encode the reference count, the retention bits and the numbers of
        the referred-to segments.

        Up to four references use the one-byte short form; more use the long
        form, a four-byte count followed by ceil((count + 1) / 8) flag bytes.
        """
        flags = []
        flags_format = ">B"
        ref_count = value["ref_count"]
        assert isinstance(ref_count, int)
        retain_segments = cast(list[bool], value.get("retain_segments", []))

        if ref_count <= 4:
            flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
            for ref_index, ref_retain in enumerate(retain_segments):
                if ref_retain:
                    flags_byte |= 1 << ref_index
            flags.append(flags_byte)
        else:
            bytes_count = math.ceil((ref_count + 1) / 8)
            flags_format = ">L" + ("B" * bytes_count)
            # The top three bits mark the long form; the remaining 29 bits
            # carry the count, which the reader extracts with
            # REF_COUNT_LONG_MASK.
            flags_dword = mask_value(REF_COUNT_SHORT_MASK, REF_COUNT_LONG) << 24
            flags_dword |= ref_count & REF_COUNT_LONG_MASK
            flags.append(flags_dword)

            for byte_index in range(bytes_count):
                ret_byte = 0
                ret_part = retain_segments[byte_index * 8 : byte_index * 8 + 8]
                for bit_pos, ret_seg in enumerate(ret_part):
                    if ret_seg:
                        ret_byte |= 1 << bit_pos

                flags.append(ret_byte)

        ref_segments = cast(list[int], value.get("ref_segments", []))

        # Referred-to segment numbers take 1, 2 or 4 bytes depending on the
        # number of the current segment.
        seg_num = cast(int, segment["number"])
        if seg_num <= 256:
            ref_format = "B"
        elif seg_num <= 65536:
            ref_format = "H"
        else:
            ref_format = "L"

        for ref in ref_segments:
            flags_format += ref_format
            flags.append(ref)

        return pack(flags_format, *flags)

    def encode_page_assoc(self, value: int, segment: JBIG2Segment) -> bytes:
        """Encode the page association with the width announced in the flags
        byte: four bytes for the long form, one byte otherwise."""
        flags = cast(JBIG2SegmentFlags, segment["flags"])
        page_format = ">L" if self._page_assoc_is_long(flags, segment) else ">B"
        return pack(page_format, value)

    def encode_data_length(self, value: int, segment: JBIG2Segment) -> bytes:
        """Encode the data length followed by the segment payload.

        A segment without a ``"raw_data"`` entry (the reader stores none for
        zero-length segments) is written with an empty payload.
        """
        data = pack(">L", value)
        data += cast(bytes, segment.get("raw_data", b""))
        return data

    def get_eop_segment(self, seg_number: int, page_number: int) -> JBIG2Segment:
        """Build an empty end-of-page segment for ``page_number``."""
        return {
            "data_length": 0,
            "flags": {"deferred": False, "type": SEG_TYPE_END_OF_PAGE},
            "number": seg_number,
            "page_assoc": page_number,
            "raw_data": b"",
            "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
        }

    def get_eof_segment(self, seg_number: int) -> JBIG2Segment:
        """Build an empty end-of-file segment."""
        return {
            "data_length": 0,
            "flags": {"deferred": False, "type": SEG_TYPE_END_OF_FILE},
            "number": seg_number,
            "page_assoc": 0,
            "raw_data": b"",
            "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
        }


================================================
FILE: babeldoc/pdfminer/latin_enc.py
================================================
"""Standard encoding tables used in PDF.

This table is extracted from PDF Reference Manual 1.6, pp.925
  "D.1 Latin Character Set and Encodings"

"""

# One table row: the glyph name followed by its code in the std, mac, win and
# pdf columns. None presumably marks a glyph that has no code in that
# encoding -- TODO confirm against the consumer of this table.
EncodingRow = tuple[str, int | None, int | None, int | None, int | None]

ENCODING: list[EncodingRow] = [
    # (name, std, mac, win, pdf)
    ("A", 65, 65, 65, 65),
    ("AE", 225, 174, 198, 198),
    ("Aacute", None, 231, 193, 193),
    ("Acircumflex", None, 229, 194, 194),
    ("Adieresis", None, 128, 196, 196),
    ("Agrave", None, 203, 192, 192),
    ("Aring", None, 129, 197, 197),
    ("Atilde", None, 204, 195, 195),
    ("B", 66, 66, 66, 66),
    ("C", 67, 67, 67, 67),
    ("Ccedilla", None, 130, 199, 199),
    ("D", 68, 68, 68, 68),
    ("E", 69, 69, 69, 69),
    ("Eacute", None, 131, 201, 201),
    ("Ecircumflex", None, 230, 202, 202),
    ("Edieresis", None, 232, 203, 203),
    ("Egrave", None, 233, 200, 200),
    ("Eth", None, None, 208, 208),
    ("Euro", None, None, 128, 160),
    ("F", 70, 70, 70, 70),
    ("G", 71, 71, 71, 71),
    ("H", 72, 72, 72, 72),
    ("I", 73, 73, 73, 73),
    ("Iacute", None, 234, 205, 205),
    ("Icircumflex", None, 235, 206, 206),
    ("Idieresis", None, 236, 207, 207),
    ("Igrave", None, 237, 204, 204),
    ("J", 74, 74, 74, 74),
    ("K", 75, 75, 75, 75),
    ("L", 76, 76, 76, 76),
    ("Lslash", 232, None, None, 149),
    ("M", 77, 77, 77, 77),
    ("N", 78, 78, 78, 78),
    ("Ntilde", None, 132, 209, 209),
    ("O", 79, 79, 79, 79),
    ("OE", 234, 206, 140, 150),
    ("Oacute", None, 238, 211, 211),
    ("Ocircumflex", None, 239, 212, 212),
    ("Odieresis", None, 133, 214, 214),
    ("Ograve", None, 241, 210, 210),
    ("Oslash", 233, 175, 216, 216),
    ("Otilde", None, 205, 213, 213),
    ("P", 80, 80, 80, 80),
    ("Q", 81, 81, 81, 81),
    ("R", 82, 82, 82, 82),
    ("S", 83, 83, 83, 83),
    ("Scaron", None, None, 138, 151),
    ("T", 84, 84, 84, 84),
    ("Thorn", None, None, 222, 222),
    ("U", 85, 85, 85, 85),
    ("Uacute", None, 242, 218, 218),
    ("Ucircumflex", None, 243, 219, 219),
    ("Udieresis", None, 134, 220, 220),
    ("Ugrave", None, 244, 217, 217),
    ("V", 86, 86, 86, 86),
    ("W", 87, 87, 87, 87),
    ("X", 88, 88, 88, 88),
    ("Y", 89, 89, 89, 89),
    ("Yacute", None, None, 221, 221),
    ("Ydieresis", None, 217, 159, 152),
    ("Z", 90, 90, 90, 90),
    ("Zcaron", None, None, 142, 153),
    ("a", 97, 97, 97, 97),
    ("aacute", None, 135, 225, 225),
    ("acircumflex", None, 137, 226, 226),
    ("acute", 194, 171, 180, 180),
    ("adieresis", None, 138, 228, 228),
    ("ae", 241, 190, 230, 230),
    ("agrave", None, 136, 224, 224),
    ("ampersand", 38, 38, 38, 38),
    ("aring", None, 140, 229, 229),
    ("asciicircum", 94, 94, 94, 94),
    ("asciitilde", 126, 126, 126, 126),
    ("asterisk", 42, 42, 42, 42),
    ("at", 64, 64, 64, 64),
    ("atilde", None, 139, 227, 227),
    ("b", 98, 98, 98, 98),
    ("backslash", 92, 92, 92, 92),
    ("bar", 124, 124, 124, 124),
    ("braceleft", 123, 123, 123, 123),
    ("braceright", 125, 125, 125, 125),
    ("bracketleft", 91, 91, 91, 91),
    ("bracketright", 93, 93, 93, 93),
    ("breve", 198, 249, None, 24),
    ("brokenbar", None, None, 166, 166),
    ("bullet", 183, 165, 149, 128),
    ("c", 99, 99, 99, 99),
    ("caron", 207, 255, None, 25),
    ("ccedilla", None, 141, 231, 231),
    ("cedilla", 203, 252, 184, 184),
    ("cent", 162, 162, 162, 162),
    ("circumflex", 195, 246, 136, 26),
    ("colon", 58, 58, 58, 58),
    ("comma", 44, 44, 44, 44),
    ("copyright", None, 169, 169, 169),
    ("currency", 168, 219, 164, 164),
    ("d", 100, 100, 100, 100),
    ("dagger", 178, 160, 134, 129),
    ("daggerdbl", 179, 224, 135, 130),
    ("degree", None, 161, 176, 176),
    ("dieresis", 200, 172, 168, 168),
    ("divide", None, 214, 247, 247),
    ("dollar", 36, 36, 36, 36),
    ("dotaccent", 199, 250, None, 27),
    ("dotlessi", 245, 245, None, 154),
    ("e", 101, 101, 101, 101),
    ("eacute", None, 142, 233, 233),
    ("ecircumflex", None, 144, 234, 234),
    ("edieresis", None, 145, 235, 235),
    ("egrave", None, 143, 232, 232),
    ("eight", 56, 56, 56, 56),
    ("ellipsis", 188, 201, 133, 131),
    ("emdash", 208, 209, 151, 132),
    ("endash", 177, 208, 150, 133),
    ("equal", 61, 61, 61, 61),
    ("eth", None, None, 240, 240),
    ("exclam", 33, 33, 33, 33),
    ("exclamdown", 161, 193, 161, 161),
    ("f", 102, 102, 102, 102),
    ("fi", 174, 222, None, 147),
    ("five", 53, 53, 53, 53),
    ("fl", 175, 223, None, 148),
    ("florin", 166, 196, 131, 134),
    ("four", 52, 52, 52, 52),
    ("fraction", 164, 218, None, 135),
    ("g", 103, 103, 103, 103),
    ("germandbls", 251, 167, 223, 223),
    ("grave", 193, 96, 96, 96),
    ("greater", 62, 62, 62, 62),
    ("guillemotleft", 171, 199, 171, 171),
    ("guillemotright", 187, 200, 187, 187),
    ("guilsinglleft", 172, 220, 139, 136),
    ("guilsinglright", 173, 221, 155, 137),
    ("h", 104, 104, 104, 104),
    ("hungarumlaut", 205, 253, None, 28),
    ("hyphen", 45, 45, 45, 45),
    ("i", 105, 105, 105, 105),
    ("iacute", None, 146, 237, 237),
    ("icircumflex", None, 148, 238, 238),
    ("idieresis", None, 149, 239, 239),
    ("igrave", None, 147, 236, 236),
    ("j", 106, 106, 106, 106),
    ("k", 107, 107, 107, 107),
    ("l", 108, 108, 108, 108),
    ("less", 60, 60, 60, 60),
    ("logicalnot", None, 194, 172, 172),
    ("lslash", 248, None, None, 155),
    ("m", 109, 109, 109, 109),
    ("macron", 197, 248, 175, 175),
    ("minus", None, None, None, 138),
    ("mu", None, 181, 181, 181),
    ("multiply", None, None, 215, 215),
    ("n", 110, 110, 110, 110),
    ("nbspace", None, 202, 160, None),
    ("nine", 57, 57, 57, 57),
    ("ntilde", None, 150, 241, 241),
    ("numbersign", 35, 35, 35, 35),
    ("o", 111, 111, 111, 111),
    ("oacute", None, 151, 243, 243),
    ("ocircumflex", None, 153, 244, 244),
    ("odieresis", None, 154, 246, 246),
    ("oe", 250, 207, 156, 156),
    ("ogonek", 206, 254, None, 29),
    ("ograve", None, 152, 242, 242),
    ("one", 49, 49, 49, 49),
    ("onehalf", None, None, 189, 189),
    ("onequarter", None, None, 188, 188),
    ("onesuperior", None, None, 185, 185),
    ("ordfeminine", 227, 187, 170, 170),
    ("ordmasculine", 235, 188, 186, 186),
    ("oslash", 249, 191, 248, 248),
    ("otilde", None, 155, 245, 245),
    ("p", 112, 112, 112, 112),
    ("paragraph", 182, 166, 182, 182),
    ("parenleft", 40, 40, 40, 40),
    ("parenright", 41, 41, 41, 41),
    ("percent", 37, 37, 37, 37),
    ("period", 46, 46, 46, 46),
    ("periodcentered", 180, 225, 183, 183),
    ("perthousand", 189, 228, 137, 139),
    ("plus", 43, 43, 43, 43),
    ("plusminus", None, 177, 177, 177),
    ("q", 113, 113, 113, 113),
    ("question", 63, 63, 63, 63),
    ("questiondown", 191, 192, 191, 191),
    ("quotedbl", 34, 34, 34, 34),
    ("quotedblbase", 185, 227, 132, 140),
    ("quotedblleft", 170, 210, 147, 141),
    ("quotedblright", 186, 211, 148, 142),
    ("quoteleft", 96, 212, 145, 143),
    ("quoteright", 39, 213, 146, 144),
    ("quotesinglbase", 184, 226, 130, 145),
    ("quotesingle", 169, 39, 39, 39),
    ("r", 114, 114, 114, 114),
    ("registered", None, 168, 174, 174),
    ("ring", 202, 251, None, 30),
    ("s", 115, 115, 115, 115),
    ("scaron", None, None, 154, 157),
    ("section", 167, 164, 167, 167),
    ("semicolon", 59, 59, 59, 59),
    ("seven", 55, 55, 55, 55),
    ("six", 54, 54, 54, 54),
    ("slash", 47, 47, 47, 47),
    ("space", 32, 32, 32, 32),
    ("space", None, 202, 160, None),
    ("space", None, 202, 173, None),
    ("sterling", 163, 163, 163, 163),
    ("t", 116, 116, 116, 116),
    ("thorn", None, None, 254, 254),
    ("three", 51, 51, 51, 51),
    ("threequarters", None, None, 190, 190),
    ("threesuperior", None, None, 179, 179),
    ("tilde", 196, 247, 152, 31),
    ("trademark", None, 170, 153, 146),
    ("two", 50, 50, 50, 50),
    ("twosuperior", None, None, 178, 178),
    ("u", 117, 117, 117, 117),
    ("uacute", None, 156, 250, 250),
    ("ucircumflex", None, 158, 251, 251),
    ("udieresis", None, 159, 252, 252),
    ("ugrave", None, 157, 249, 249),
    ("underscore", 95, 95, 95, 95),
    ("v", 118, 118, 118, 118),
    ("w", 119, 119, 119, 119),
    ("x", 120, 120, 120, 120),
    ("y", 121, 121, 121, 121),
    ("yacute", None, None, 253, 253),
    ("ydieresis", None, 216, 255, 255),
    ("yen", 165, 180, 165, 165),
    ("z", 122, 122, 122, 122),
    ("zcaron", None, None, 158, 158),
    ("zero", 48, 48, 48, 48),
]


================================================
FILE: babeldoc/pdfminer/layout.py
================================================
import heapq
import logging
from collections.abc import Iterable
from collections.abc import Iterator
from collections.abc import Sequence
from typing import Generic
from typing import TypeVar
from typing import Union
from typing import cast

from babeldoc.format.pdf.babelpdf.utils import guarded_bbox
from babeldoc.pdfminer.pdfcolor import PDFColorSpace
from babeldoc.pdfminer.pdfexceptions import PDFTypeError
from babeldoc.pdfminer.pdfexceptions import PDFValueError
from babeldoc.pdfminer.pdffont import PDFFont
from babeldoc.pdfminer.pdfinterp import Color
from babeldoc.pdfminer.pdfinterp import PDFGraphicState
from babeldoc.pdfminer.pdftypes import PDFStream
from babeldoc.pdfminer.utils import INF
from babeldoc.pdfminer.utils import LTComponentT
from babeldoc.pdfminer.utils import Matrix
from babeldoc.pdfminer.utils import PathSegment
from babeldoc.pdfminer.utils import Plane
from babeldoc.pdfminer.utils import Point
from babeldoc.pdfminer.utils import Rect
from babeldoc.pdfminer.utils import apply_matrix_pt
from babeldoc.pdfminer.utils import bbox2str
from babeldoc.pdfminer.utils import fsplit
from babeldoc.pdfminer.utils import get_bound
from babeldoc.pdfminer.utils import matrix2str
from babeldoc.pdfminer.utils import uniq

logger = logging.getLogger(__name__)


class IndexAssigner:
    """Number text boxes consecutively, descending into text groups."""

    def __init__(self, index: int = 0) -> None:
        # Index handed to the next text box.
        self.index = index

    def run(self, obj: "LTItem") -> None:
        """Assign the next index to ``obj`` if it is a text box, or visit the
        children of ``obj`` if it is a text group; ignore anything else."""
        if isinstance(obj, LTTextBox):
            obj.index = self.index
            self.index = self.index + 1
            return
        if isinstance(obj, LTTextGroup):
            for child in obj:
                self.run(child)


class LAParams:
    """Parameters for layout analysis

    :param line_overlap: If two characters have more overlap than this they
        are considered to be on the same line. The overlap is specified
        relative to the minimum height of both characters.
    :param char_margin: If two characters are closer together than this
        margin they are considered part of the same line. The margin is
        specified relative to the width of the character.
    :param word_margin: If two characters on the same line are further apart
        than this margin then they are considered to be two separate words, and
        an intermediate space will be added for readability. The margin is
        specified relative to the width of the character.
    :param line_margin: If two lines are are close together they are
        considered to be part of the same paragraph. The margin is
        specified relative to the height of a line.
    :param boxes_flow: Specifies how much a horizontal and vertical position
        of a text matters when determining the order of text boxes. The value
        should be within the range of -1.0 (only horizontal position
        matters) to +1.0 (only vertical position matters). You can also pass
        `None` to disable advanced layout analysis, and instead return text
        based on the position of the bottom left corner of the text box.
    :param detect_vertical: If vertical text should be considered during
        layout analysis
    :param all_texts: If layout analysis should be performed on text in
        figures.
    """

    def __init__(
        self,
        line_overlap: float = 0.5,
        char_margin: float = 2.0,
        line_margin: float = 0.5,
        word_margin: float = 0.1,
        boxes_flow: float | None = 0.5,
        detect_vertical: bool = False,
        all_texts: bool = False,
    ) -> None:
        self.line_overlap = line_overlap
        self.char_margin = char_margin
        self.line_margin = line_margin
        self.word_margin = word_margin
        self.boxes_flow = boxes_flow
        self.detect_vertical = detect_vertical
        self.all_texts = all_texts

        self._validate()

    def _validate(self) -> None:
        if self.boxes_flow is not None:
            boxes_flow_err_msg = (
                "LAParam boxes_flow should be None, or a number between -1 and +1"
            )
            if not (
                isinstance(self.boxes_flow, int) or isinstance(self.boxes_flow, float)
            ):
                raise PDFTypeError(boxes_flow_err_msg)
            if not -1 <= self.boxes_flow <= 1:
                raise PDFValueError(boxes_flow_err_msg)

    def __repr__(self) -> str:
        return (
            "<LAParams: char_margin=%.1f, line_margin=%.1f, "
            "word_margin=%.1f all_texts=%r>"
            % (self.char_margin, self.line_margin, self.word_margin, self.all_texts)
        )


class LTItem:
    """Base interface for objects that take part in layout analysis."""

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis on this item.

        The base implementation does nothing; subclasses override it.
        """
        return None


class LTText:
    """Interface for layout objects that carry text."""

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f"<{class_name} {self.get_text()!r}>"

    def get_text(self) -> str:
        """Return the text held by this object.

        :raises NotImplementedError: always, in this base class; subclasses
            must provide the implementation.
        """
        raise NotImplementedError


class LTComponent(LTItem):
    """A layout item that has a rectangular bounding box."""

    def __init__(self, bbox: Rect) -> None:
        LTItem.__init__(self)
        self.set_bbox(bbox)

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f"<{class_name} {bbox2str(self.bbox)}>"

    # Ordering comparisons are deliberately unsupported: each one raises.
    def __lt__(self, _: object) -> bool:
        raise PDFValueError

    def __le__(self, _: object) -> bool:
        raise PDFValueError

    def __gt__(self, _: object) -> bool:
        raise PDFValueError

    def __ge__(self, _: object) -> bool:
        raise PDFValueError

    def set_bbox(self, bbox: Rect) -> None:
        """Set the bounding box and the derived `width` and `height`.

        :param bbox: a 4-tuple unpacked as (x0, y0, x1, y1).
        """
        (left, bottom, right, top) = bbox
        self.x0 = left
        self.y0 = bottom
        self.x1 = right
        self.y1 = top
        self.width = right - left
        self.height = top - bottom
        self.bbox = bbox

    def is_empty(self) -> bool:
        """Return True when the box has non-positive width or height."""
        return self.width <= 0 or self.height <= 0

    def is_hoverlap(self, obj: "LTComponent") -> bool:
        """Return True when the x-ranges of `self` and `obj` intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.x0 <= self.x1 and self.x0 <= obj.x1

    def hdistance(self, obj: "LTComponent") -> float:
        """Return the horizontal gap to `obj`, or 0 if the x-ranges intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_hoverlap(obj):
            return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))
        return 0

    def hoverlap(self, obj: "LTComponent") -> float:
        """Return a horizontal overlap measure, or 0 if the x-ranges are disjoint.

        The measure is the smaller of the two opposite-edge distances.
        """
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def is_voverlap(self, obj: "LTComponent") -> bool:
        """Return True when the y-ranges of `self` and `obj` intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        return obj.y0 <= self.y1 and self.y0 <= obj.y1

    def vdistance(self, obj: "LTComponent") -> float:
        """Return the vertical gap to `obj`, or 0 if the y-ranges intersect."""
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_voverlap(obj):
            return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
        return 0

    def voverlap(self, obj: "LTComponent") -> float:
        """Return a vertical overlap measure, or 0 if the y-ranges are disjoint.

        The measure is the smaller of the two opposite-edge distances.
        """
        assert isinstance(obj, LTComponent), str(type(obj))
        if not self.is_voverlap(obj):
            return 0
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))


class LTCurve(LTComponent):
    """A generic Bezier curve.

    `original_path` holds the original path information from the PDF
    (e.g. for reconstructing Bezier curves), and `dashing_style` holds the
    dashing information, if any. The bounding box is computed from `pts`.
    """

    def __init__(
        self,
        linewidth: float,
        pts: list[Point],
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Color | None = None,
        non_stroking_color: Color | None = None,
        original_path: list[PathSegment] | None = None,
        dashing_style: tuple[object, object] | None = None,
    ) -> None:
        bounding_box = get_bound(pts)
        LTComponent.__init__(self, bounding_box)
        self.pts = pts
        self.linewidth = linewidth
        self.stroke = stroke
        self.fill = fill
        self.evenodd = evenodd
        self.stroking_color = stroking_color
        self.non_stroking_color = non_stroking_color
        self.original_path = original_path
        self.dashing_style = dashing_style

    def get_pts(self) -> str:
        """Return all points as one comma-separated string, 3 decimals each."""
        formatted = ("%.3f,%.3f" % point for point in self.pts)
        return ",".join(formatted)


class LTLine(LTCurve):
    """A single straight line, stored as a two-point curve.

    Could be used for separating text or figures.
    """

    def __init__(
        self,
        linewidth: float,
        p0: Point,
        p1: Point,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Color | None = None,
        non_stroking_color: Color | None = None,
        original_path: list[PathSegment] | None = None,
        dashing_style: tuple[object, object] | None = None,
    ) -> None:
        endpoints = [p0, p1]
        LTCurve.__init__(
            self,
            linewidth,
            endpoints,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )


class LTRect(LTCurve):
    """A rectangle, stored as a four-point curve built from its bbox.

    Could be used for framing other pictures or figures.
    """

    def __init__(
        self,
        linewidth: float,
        bbox: Rect,
        stroke: bool = False,
        fill: bool = False,
        evenodd: bool = False,
        stroking_color: Color | None = None,
        non_stroking_color: Color | None = None,
        original_path: list[PathSegment] | None = None,
        dashing_style: tuple[object, object] | None = None,
    ) -> None:
        (left, bottom, right, top) = bbox
        # Corner order: (x0, y0) -> (x1, y0) -> (x1, y1) -> (x0, y1).
        corners = [(left, bottom), (right, bottom), (right, top), (left, top)]
        LTCurve.__init__(
            self,
            linewidth,
            corners,
            stroke=stroke,
            fill=fill,
            evenodd=evenodd,
            stroking_color=stroking_color,
            non_stroking_color=non_stroking_color,
            original_path=original_path,
            dashing_style=dashing_style,
        )


class LTImage(LTComponent):
    """An image object.

    Embedded images can be in JPEG, Bitmap or JBIG2. The image attributes
    are read from the stream, looking up each abbreviated key alongside its
    full name.
    """

    def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        self.name = name
        self.stream = stream
        src_width = stream.get_any(("W", "Width"))
        src_height = stream.get_any(("H", "Height"))
        self.srcsize = (src_width, src_height)
        self.imagemask = stream.get_any(("IM", "ImageMask"))
        self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1)
        colorspace = stream.get_any(("CS", "ColorSpace"))
        # Always expose the color space as a list.
        self.colorspace = colorspace if isinstance(colorspace, list) else [colorspace]

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        box = bbox2str(self.bbox)
        return f"<{class_name}({self.name}) {box} {self.srcsize!r}>"


class LTAnno(LTItem, LTText):
    """A "virtual" character expressed as a Unicode string.

    Unlike an LTChar, an LTAnno has no boundaries of its own: it is inserted
    by a layout analyzer according to the relationship between two
    characters (e.g. a space).
    """

    def __init__(self, text: str) -> None:
        self._text = text

    def get_text(self) -> str:
        """Return the text given at construction."""
        return self._text


class LTChar(LTComponent, LTText):
    """An actual letter in the text, as a Unicode string with a bounding box."""

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: float | tuple[float | None, float],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
    ) -> None:
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.adv = textwidth * fontsize * scaling

        # Two opposite corners of the glyph box, before `matrix` is applied.
        if font.is_vertical():
            # Vertical writing: textdisp is expected to be a (vx, vy) pair.
            assert isinstance(textdisp, tuple)
            (disp_x, disp_y) = textdisp
            if disp_x is not None:
                disp_x = disp_x * fontsize * 0.001
            else:
                disp_x = fontsize * 0.5
            disp_y = (1000 - disp_y) * fontsize * 0.001
            corner_a = (-disp_x, disp_y + rise + self.adv)
            corner_b = (-disp_x + fontsize, disp_y + rise)
        else:
            # Horizontal writing: the box runs from the descent line upward.
            drop = font.get_descent() * fontsize
            corner_a = (0, drop + rise)
            corner_b = (self.adv, drop + rise + fontsize)

        (a, b, c, d, _e, _f) = self.matrix
        self.upright = a * d * scaling > 0 and b * c <= 0

        (x0, y0) = apply_matrix_pt(self.matrix, corner_a)
        (x1, y1) = apply_matrix_pt(self.matrix, corner_b)
        # Normalize so that (x0, y0) is the lower-left corner.
        if x1 < x0:
            x0, x1 = x1, x0
        if y1 < y0:
            y0, y1 = y1, y0
        LTComponent.__init__(self, (x0, y0, x1, y1))

        # `size` is the box extent across the writing direction.
        self.size = self.width if font.is_vertical() else self.height

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        box = bbox2str(self.bbox)
        mat = matrix2str(self.matrix)
        return f"<{class_name} {box} matrix={mat} font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>"

    def get_text(self) -> str:
        """Return the text given at construction."""
        return self._text


# Type variable for the element type of a generic container; bounded by LTItem.
LTItemT = TypeVar("LTItemT", bound=LTItem)


class LTContainer(LTComponent, Generic[LTItemT]):
    """A component holding an ordered list of child items.

    It can be iterated, extended and analyzed.
    """

    def __init__(self, bbox: Rect) -> None:
        LTComponent.__init__(self, bbox)
        self._objs: list[LTItemT] = []

    def __iter__(self) -> Iterator[LTItemT]:
        return iter(self._objs)

    def __len__(self) -> int:
        return len(self._objs)

    def add(self, obj: LTItemT) -> None:
        """Append `obj` to the children."""
        self._objs.append(obj)

    def extend(self, objs: Iterable[LTItemT]) -> None:
        """Add every item of `objs` through `add`, so overrides are honored."""
        for item in objs:
            self.add(item)

    def analyze(self, laparams: LAParams) -> None:
        """Analyze each child in order with the same parameters."""
        for child in self._objs:
            child.analyze(laparams)


class LTExpandableContainer(LTContainer[LTItemT]):
    """Container whose bounding box grows to enclose each added component."""

    def __init__(self) -> None:
        # Start from an inverted infinite box so the first add() defines the bbox.
        LTContainer.__init__(self, (+INF, +INF, -INF, -INF))

    # Incompatible override: this takes an LTComponent (with a bounding box),
    # whereas the LTContainer base only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj` and expand the bounding box to cover it."""
        LTContainer.add(self, cast(LTItemT, obj))
        grown = (
            min(self.x0, obj.x0),
            min(self.y0, obj.y0),
            max(self.x1, obj.x1),
            max(self.y1, obj.y1),
        )
        self.set_bbox(grown)


class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
    """Expandable container whose text is the concatenation of its children's."""

    def __init__(self) -> None:
        LTText.__init__(self)
        LTExpandableContainer.__init__(self)

    def get_text(self) -> str:
        """Join the text of every child that is an LTText, in order."""
        pieces = [
            cast(LTText, child).get_text()
            for child in self
            if isinstance(child, LTText)
        ]
        return "".join(pieces)


# Element types an LTTextLine holds: LTChar or LTAnno.
TextLineElement = Union[LTChar, LTAnno]


class LTTextLine(LTTextContainer[TextLineElement]):
    """Contains a list of LTChar objects that represent a single text line.

    The characters are aligned either horizontally or vertically, depending
    on the text's writing mode.
    """

    def __init__(self, word_margin: float) -> None:
        super().__init__()
        self.word_margin = word_margin

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f"<{class_name} {bbox2str(self.bbox)} {self.get_text()!r}>"

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the children, then terminate the line with a newline."""
        LTContainer.analyze(self, laparams)
        # Added through LTContainer.add so the bounding box is left untouched.
        LTContainer.add(self, LTAnno("\n"))

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list["LTTextLine"]:
        """Find nearby lines in `plane`; must be implemented by subclasses."""
        raise NotImplementedError

    def is_empty(self) -> bool:
        """Return True for a degenerate box or whitespace-only text."""
        if super().is_empty():
            return True
        return self.get_text().isspace()


class LTTextLineHorizontal(LTTextLine):
    """A text line whose characters are laid out horizontally."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Right edge of the most recently added object.
        self._x1: float = +INF

    # Incompatible override: this takes an LTComponent (with a bounding box),
    # whereas LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj`, preceded by a space if it starts beyond the word margin."""
        if isinstance(obj, LTChar) and self.word_margin:
            gap_allowed = self.word_margin * max(obj.width, obj.height)
            if self._x1 < obj.x0 - gap_allowed:
                LTContainer.add(self, LTAnno(" "))
        self._x1 = obj.x1
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list[LTTextLine]:
        """Finds neighboring LTTextLineHorizontals in the plane.

        Returns a list of other LTTextLineHorizontals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned
        objects will be the same height as self, and also either left-,
        right-, or centrally-aligned.
        """
        d = ratio * self.height
        candidates = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
        neighbors: list[LTTextLine] = []
        for candidate in candidates:
            if not isinstance(candidate, LTTextLineHorizontal):
                continue
            if not self._is_same_height_as(candidate, tolerance=d):
                continue
            if (
                self._is_left_aligned_with(candidate, tolerance=d)
                or self._is_right_aligned_with(candidate, tolerance=d)
                or self._is_centrally_aligned_with(candidate, tolerance=d)
            ):
                neighbors.append(candidate)
        return neighbors

    def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the left-hand edge of `other` is within `tolerance`."""
        offset = other.x0 - self.x0
        return abs(offset) <= tolerance

    def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the right-hand edge of `other` is within `tolerance`."""
        offset = other.x1 - self.x1
        return abs(offset) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the horizontal center of `other` is within `tolerance`."""
        other_center = (other.x0 + other.x1) / 2
        own_center = (self.x0 + self.x1) / 2
        return abs(other_center - own_center) <= tolerance

    def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the height of `other` is within `tolerance` of ours."""
        difference = other.height - self.height
        return abs(difference) <= tolerance


class LTTextLineVertical(LTTextLine):
    """A text line whose characters are laid out vertically."""

    def __init__(self, word_margin: float) -> None:
        LTTextLine.__init__(self, word_margin)
        # Bottom edge of the most recently added object.
        self._y0: float = -INF

    # Incompatible override: this takes an LTComponent (with a bounding box),
    # whereas LTContainer only considers LTItem (no bounding box).
    def add(self, obj: LTComponent) -> None:  # type: ignore[override]
        """Append `obj`, preceded by a space if it starts beyond the word margin."""
        if isinstance(obj, LTChar) and self.word_margin:
            gap_allowed = self.word_margin * max(obj.width, obj.height)
            if obj.y1 + gap_allowed < self._y0:
                LTContainer.add(self, LTAnno(" "))
        self._y0 = obj.y0
        super().add(obj)

    def find_neighbors(
        self,
        plane: Plane[LTComponentT],
        ratio: float,
    ) -> list[LTTextLine]:
        """Finds neighboring LTTextLineVerticals in the plane.

        Returns a list of other LTTextLineVerticals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned
        objects will be the same width as self, and also either upper-,
        lower-, or centrally-aligned.
        """
        d = ratio * self.width
        candidates = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
        neighbors: list[LTTextLine] = []
        for candidate in candidates:
            if not isinstance(candidate, LTTextLineVertical):
                continue
            if not self._is_same_width_as(candidate, tolerance=d):
                continue
            if (
                self._is_lower_aligned_with(candidate, tolerance=d)
                or self._is_upper_aligned_with(candidate, tolerance=d)
                or self._is_centrally_aligned_with(candidate, tolerance=d)
            ):
                neighbors.append(candidate)
        return neighbors

    def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the lower edge of `other` is within `tolerance`."""
        offset = other.y0 - self.y0
        return abs(offset) <= tolerance

    def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
        """Whether the upper edge of `other` is within `tolerance`."""
        offset = other.y1 - self.y1
        return abs(offset) <= tolerance

    def _is_centrally_aligned_with(
        self,
        other: LTComponent,
        tolerance: float = 0,
    ) -> bool:
        """Whether the vertical center of `other` is within `tolerance`."""
        other_center = (other.y0 + other.y1) / 2
        own_center = (self.y0 + self.y1) / 2
        return abs(other_center - own_center) <= tolerance

    def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool:
        """Whether the width of `other` is within `tolerance` of ours."""
        difference = other.width - self.width
        return abs(difference) <= tolerance


class LTTextBox(LTTextContainer[LTTextLine]):
    """Represents a group of text chunks in a rectangular area.

    Note that this box is created by geometric analysis and does not
    necessarily represent a logical boundary of the text. It contains a list
    of LTTextLine objects.
    """

    def __init__(self) -> None:
        LTTextContainer.__init__(self)
        # -1 until an index is assigned from outside this class.
        self.index: int = -1

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        box = bbox2str(self.bbox)
        return f"<{class_name}({self.index}) {box} {self.get_text()!r}>"

    def get_writing_mode(self) -> str:
        """Return the writing-mode string; must be implemented by subclasses."""
        raise NotImplementedError


class LTTextBoxHorizontal(LTTextBox):
    """Text box made of horizontal lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending top edge."""
        super().analyze(laparams)

        def negated_top(line: LTTextLine) -> float:
            return -line.y1

        self._objs.sort(key=negated_top)

    def get_writing_mode(self) -> str:
        """Return "lr-tb"."""
        return "lr-tb"


class LTTextBoxVertical(LTTextBox):
    """Text box made of vertical lines."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the lines, then order them by descending right edge."""
        super().analyze(laparams)

        def negated_right(line: LTTextLine) -> float:
            return -line.x1

        self._objs.sort(key=negated_right)

    def get_writing_mode(self) -> str:
        """Return "tb-rl"."""
        return "tb-rl"


# Element types an LTTextGroup holds: text boxes or nested groups.
# "LTTextGroup" is a string forward reference; the class is defined below.
TextGroupElement = Union[LTTextBox, "LTTextGroup"]


class LTTextGroup(LTTextContainer[TextGroupElement]):
    """A group of text boxes and/or nested text groups."""

    def __init__(self, objs: Iterable[TextGroupElement]) -> None:
        super().__init__()
        # extend() adds the members one by one, growing the bounding box.
        self.extend(objs)


class LTTextGroupLRTB(LTTextGroup):
    """Text group whose members are ordered from top-left to bottom-right."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using `boxes_flow` as weight."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        flow = laparams.boxes_flow

        def reading_order(member: TextGroupElement) -> float:
            # Smaller x0 and larger y sort first.
            return (1 - flow) * member.x0 - (1 + flow) * (member.y0 + member.y1)

        self._objs.sort(key=reading_order)


class LTTextGroupTBRL(LTTextGroup):
    """Text group whose members are ordered from top-right to bottom-left."""

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the members, then sort them using `boxes_flow` as weight."""
        super().analyze(laparams)
        assert laparams.boxes_flow is not None
        flow = laparams.boxes_flow

        def reading_order(member: TextGroupElement) -> float:
            # Larger x and larger y1 sort first.
            return -(1 + flow) * (member.x0 + member.x1) - (1 - flow) * member.y1

        self._objs.sort(key=reading_order)


class LTLayoutContainer(LTContainer[LTComponent]):
    """Container that performs layout analysis on the characters it holds.

    `analyze` groups characters into text lines, lines into text boxes and,
    unless `boxes_flow` is None, boxes into a hierarchy of text groups that
    is stored in `self.groups`.
    """

    def __init__(self, bbox: Rect) -> None:
        LTContainer.__init__(self, bbox)
        # Set by analyze() when boxes_flow is not None; otherwise stays None.
        self.groups: list[LTTextGroup] | None = None

    # group_objects: group text object to textlines.
    def group_objects(
        self,
        laparams: LAParams,
        objs: Iterable[LTComponent],
    ) -> Iterator[LTTextLine]:
        """Group consecutive objects into text lines.

        Each object is compared with the one before it; the pair is treated
        as horizontally and/or vertically aligned according to
        `line_overlap` and `char_margin` (vertical alignment only when
        `detect_vertical` is set).

        :param laparams: layout parameters supplying the thresholds.
        :param objs: objects to group, in order; must not be empty.
        :return: an iterator over the text lines produced.
        """
        obj0 = None
        line = None
        for obj1 in objs:
            if obj0 is not None:
                # halign: obj0 and obj1 is horizontally aligned.
                #
                #   +------+ - - -
                #   | obj0 | - - +------+   -
                #   |      |     | obj1 |   | (line_overlap)
                #   +------+ - - |      |   -
                #          - - - +------+
                #
                #          |<--->|
                #        (char_margin)
                halign = (
                    obj0.is_voverlap(obj1)
                    and min(obj0.height, obj1.height) * laparams.line_overlap
                    < obj0.voverlap(obj1)
                    and obj0.hdistance(obj1)
                    < max(obj0.width, obj1.width) * laparams.char_margin
                )

                # valign: obj0 and obj1 is vertically aligned.
                #
                #   +------+
                #   | obj0 |
                #   |      |
                #   +------+ - - -
                #     |    |     | (char_margin)
                #     +------+ - -
                #     | obj1 |
                #     |      |
                #     +------+
                #
                #     |<-->|
                #   (line_overlap)
                valign = (
                    laparams.detect_vertical
                    and obj0.is_hoverlap(obj1)
                    and min(obj0.width, obj1.width) * laparams.line_overlap
                    < obj0.hoverlap(obj1)
                    and obj0.vdistance(obj1)
                    < max(obj0.height, obj1.height) * laparams.char_margin
                )

                if (halign and isinstance(line, LTTextLineHorizontal)) or (
                    valign and isinstance(line, LTTextLineVertical)
                ):
                    # obj1 continues the line currently being built.
                    line.add(obj1)
                elif line is not None:
                    # obj1 does not fit the current line: emit it.
                    yield line
                    line = None
                elif valign and not halign:
                    line = LTTextLineVertical(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                elif halign and not valign:
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    line.add(obj1)
                else:
                    # No line is open and the pair is either unaligned or
                    # aligned both ways: obj0 becomes a line on its own.
                    line = LTTextLineHorizontal(laparams.word_margin)
                    line.add(obj0)
                    yield line
                    line = None
            obj0 = obj1
        if line is None:
            # The last object was not placed on a line yet.
            line = LTTextLineHorizontal(laparams.word_margin)
            assert obj0 is not None
            line.add(obj0)
        yield line

    def group_textlines(
        self,
        laparams: LAParams,
        lines: Iterable[LTTextLine],
    ) -> Iterator[LTTextBox]:
        """Group neighboring lines to textboxes.

        :param laparams: layout parameters; `line_margin` is the ratio
            passed to each line's `find_neighbors`.
        :param lines: the text lines to group. Any iterable is accepted.
        :return: an iterator over the non-empty text boxes, each yielded once.
        """
        # `lines` is traversed three times below. Materialize it so that a
        # one-shot iterator is not used up by the first traversal.
        lines = list(lines)
        plane: Plane[LTTextLine] = Plane(self.bbox)
        plane.extend(lines)
        boxes: dict[LTTextLine, LTTextBox] = {}
        for line in lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            members = [line]
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # Absorb the box the neighbor already belongs to.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box: LTTextBox = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        done = set()
        for line in lines:
            if line not in boxes:
                continue
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            if not box.is_empty():
                yield box

    def group_textboxes(
        self,
        laparams: LAParams,
        boxes: Sequence[LTTextBox],
    ) -> list[LTTextGroup]:
        """Group textboxes hierarchically.

        Get pair-wise distances, via dist func defined below, and then merge
        from the closest textbox pair. Once obj1 and obj2 are merged /
        grouped, the resulting group is considered as a new object, and its
        distances to other objects & groups are added to the process queue.

        For performance reason, pair-wise distances and object pair info are
        maintained in a heap of
        (skip_isany, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures
        quick access to the smallest element. `skip_isany` is False for a
        fresh pair and True for a pair that was re-queued after another
        object was found between its members. Note that since comparison
        operators, e.g., __lt__, are disabled for LTComponent, id(obj) has
        to appear before obj in element tuples.

        :param laparams: LAParams object.
        :param boxes: All textbox objects to be grouped.
        :return: the objects left in the plane once every pair is processed.
        """
        ElementT = Union[LTTextBox, LTTextGroup]
        plane: Plane[ElementT] = Plane(self.bbox)

        def dist(obj1: LTComponent, obj2: LTComponent) -> float:
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return (
                (x1 - x0) * (y1 - y0)
                - obj1.width * obj1.height
                - obj2.width * obj2.height
            )

        def isany(obj1: ElementT, obj2: ElementT) -> set[ElementT]:
            """Check if there's any other object between obj1 and obj2."""
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        dists: list[tuple[bool, float, int, int, ElementT, ElementT]] = []
        for i in range(len(boxes)):
            box1 = boxes[i]
            for j in range(i + 1, len(boxes)):
                box2 = boxes[j]
                dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
        heapq.heapify(dists)

        plane.extend(boxes)
        done = set()
        while len(dists) > 0:
            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
            # Skip objects that are already merged
            if (id1 not in done) and (id2 not in done):
                if not skip_isany and isany(obj1, obj2):
                    # Something lies between the pair: re-queue it with the
                    # flag set, so it sorts after every unflagged pair.
                    heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                    continue
                if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
                    obj2,
                    (LTTextBoxVertical, LTTextGroupTBRL),
                ):
                    group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
                else:
                    group = LTTextGroupLRTB([obj1, obj2])
                plane.remove(obj1)
                plane.remove(obj2)
                done.update([id1, id2])

                for other in plane:
                    heapq.heappush(
                        dists,
                        (False, dist(group, other), id(group), id(other), group, other),
                    )
                plane.add(group)
        # By now only groups are in the plane
        return [cast(LTTextGroup, g) for g in plane]

    def analyze(self, laparams: LAParams) -> None:
        """Run layout analysis and replace the children with the result.

        Non-character children are analyzed in place. Characters are grouped
        into lines and boxes. After analysis the children are the text
        boxes, then the other objects, then the empty text lines. Nothing is
        regrouped when the container holds no characters.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
        for obj in otherobjs:
            obj.analyze(laparams)
        if not textobjs:
            return
        textlines = list(self.group_objects(laparams, textobjs))
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        for obj in empties:
            obj.analyze(laparams)
        textboxes = list(self.group_textlines(laparams, textlines))
        if laparams.boxes_flow is None:
            # No hierarchical grouping: order the boxes by position only.
            for textbox in textboxes:
                textbox.analyze(laparams)

            def getkey(box: LTTextBox) -> tuple[int, float, float]:
                if isinstance(box, LTTextBoxVertical):
                    return (0, -box.x1, -box.y0)
                else:
                    return (1, -box.y0, box.x0)

            textboxes.sort(key=getkey)
        else:
            self.groups = self.group_textboxes(laparams, textboxes)
            assigner = IndexAssigner()
            for group in self.groups:
                group.analyze(laparams)
                assigner.run(group)
            textboxes.sort(key=lambda box: box.index)
        self._objs = (
            cast(list[LTComponent], textboxes)
            + otherobjs
            + cast(list[LTComponent], empties)
        )


class LTFigure(LTLayoutContainer):
    """Represents an area used by PDF Form objects.

    PDF Forms can be used to present figures or pictures by embedding yet
    another PDF document within a page. Note that LTFigure objects can
    appear recursively.
    """

    def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        self.name = name
        self.matrix = matrix
        # The guarded bbox is unpacked as an origin plus a width and height.
        (x, y, w, h) = guarded_bbox(bbox)
        corners = ((x, y), (x + w, y), (x, y + h), (x + w, y + h))
        # The figure's own bbox bounds the four corners after `matrix`.
        transformed = (apply_matrix_pt(matrix, (p, q)) for (p, q) in corners)
        LTLayoutContainer.__init__(self, get_bound(transformed))

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        box = bbox2str(self.bbox)
        return f"<{class_name}({self.name}) {box} matrix={matrix2str(self.matrix)}>"

    def analyze(self, laparams: LAParams) -> None:
        """Analyze the figure's contents only when `all_texts` is enabled."""
        if laparams.all_texts:
            LTLayoutContainer.analyze(self, laparams)


class LTPage(LTLayoutContainer):
    """Represents an entire page.

    Like any other LTLayoutContainer, an LTPage can be iterated to obtain
    child objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and
    LTLine.
    """

    def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None:
        LTLayoutContainer.__init__(self, bbox)
        self.pageid = pageid
        self.rotate = rotate

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        box = bbox2str(self.bbox)
        return f"<{class_name}({self.pageid!r}) {box} rotate={self.rotate!r}>"


================================================
FILE: babeldoc/pdfminer/lzw.py
================================================
import logging
from collections.abc import Iterator
from io import BytesIO
from typing import BinaryIO
from typing import cast

from babeldoc.pdfminer.pdfexceptions import PDFEOFError
from babeldoc.pdfminer.pdfexceptions import PDFException

logger = logging.getLogger(__name__)


class CorruptDataError(PDFException):
    """Raised by the LZW decoder when it meets a code it cannot resolve."""


class LZWDecoder:
    """Incremental LZW decoder reading variable-width codes from a stream.

    Codes start at 9 bits and widen to 10, 11 and 12 bits as the table
    grows. Code 256 resets the table; code 257 produces no output.
    """

    def __init__(self, fp: BinaryIO) -> None:
        """Prepare to decode from `fp`, a binary file-like object."""
        self.fp = fp
        # Current input byte and how many of its bits are already consumed.
        # bpos == 8 means the buffer is exhausted, forcing a read first.
        self.buff = 0
        self.bpos = 8
        self.nbits = 9
        # NB: self.table stores None only in indices 256 and 257
        self.table: list[bytes | None] = []
        self.prevbuf: bytes | None = None

    def readbits(self, bits: int) -> int:
        """Read the next `bits` bits, most significant first.

        :raises PDFEOFError: if the input ends before enough bits are read.
        """
        v = 0
        while True:
            # the number of remaining bits we can get from the current buffer.
            r = 8 - self.bpos
            if bits <= r:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |----r----|
                v = (v << bits) | ((self.buff >> (r - bits)) & ((1 << bits) - 1))
                self.bpos += bits
                break
            else:
                # |-----8-bits-----|
                # |-bpos-|---bits----...
                # |      |----r----|
                v = (v << r) | (self.buff & ((1 << r) - 1))
                bits -= r
                x = self.fp.read(1)
                if not x:
                    raise PDFEOFError
                self.buff = ord(x)
                self.bpos = 0
        return v

    def feed(self, code: int) -> bytes:
        """Process one code and return the bytes it decodes to.

        :param code: the code value read from the stream.
        :return: the decoded bytes; empty for codes 256 and 257.
        :raises CorruptDataError: if `code` refers to a table entry that
            does not exist and cannot be derived.
        """
        x = b""
        if code == 256:
            # Reset: rebuild the single-byte entries and the two reserved slots.
            self.table = [bytes((c,)) for c in range(256)]  # 0-255
            self.table.append(None)  # 256
            self.table.append(None)  # 257
            self.prevbuf = b""
            self.nbits = 9
        elif code == 257:
            pass
        elif not self.prevbuf:
            # First code after a reset, or a stream that never sent a reset.
            # An out-of-range code here is corrupt data, not an IndexError.
            if code >= len(self.table):
                raise CorruptDataError
            x = self.prevbuf = cast(bytes, self.table[code])  # assume not None
        else:
            if code < len(self.table):
                x = cast(bytes, self.table[code])  # assume not None
                self.table.append(self.prevbuf + x[:1])
            elif code == len(self.table):
                # The code refers to the entry that is being defined right now.
                self.table.append(self.prevbuf + self.prevbuf[:1])
                x = cast(bytes, self.table[code])
            else:
                raise CorruptDataError
            # Widen the code size just before the table outgrows it.
            table_length = len(self.table)
            if table_length == 511:
                self.nbits = 10
            elif table_length == 1023:
                self.nbits = 11
            elif table_length == 2047:
                self.nbits = 12
            self.prevbuf = x
        return x

    def run(self) -> Iterator[bytes]:
        """Yield decoded chunks until the input ends or corrupt data is met."""
        while True:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            try:
                x = self.feed(code)
            except CorruptDataError:
                # just ignore corrupt data and stop yielding there
                break
            yield x

            # Slicing the table copies it, so only do that when the message
            # will actually be emitted.
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(
                    "nbits=%d, code=%d, output=%r, table=%r",
                    self.nbits,
                    code,
                    x,
                    self.table[258:],
                )


def lzwdecode(data: bytes) -> bytes:
    """Decode LZW-compressed *data* and return the concatenated output."""
    decoder = LZWDecoder(BytesIO(data))
    return b"".join(decoder.run())


================================================
FILE: babeldoc/pdfminer/pdfcolor.py
================================================
import collections

from babeldoc.pdfminer.psparser import LIT

# Literal names of the device color spaces (full names)
LITERAL_DEVICE_GRAY = LIT("DeviceGray")
LITERAL_DEVICE_RGB = LIT("DeviceRGB")
LITERAL_DEVICE_CMYK = LIT("DeviceCMYK")
# Abbreviations for inline images
LITERAL_INLINE_DEVICE_GRAY = LIT("G")
LITERAL_INLINE_DEVICE_RGB = LIT("RGB")
LITERAL_INLINE_DEVICE_CMYK = LIT("CMYK")


class PDFColorSpace:
    """A color space described by its name and its number of components."""

    def __init__(self, name: str, ncomponents: int) -> None:
        self.name, self.ncomponents = name, ncomponents

    def __repr__(self) -> str:
        fields = (self.name, self.ncomponents)
        return "<PDFColorSpace: %s, ncomponents=%d>" % fields


PREDEFINED_COLORSPACE: dict[str, PDFColorSpace] = collections.OrderedDict()

# Register the built-in color spaces by name, each with its component count.
# Insertion order is kept, so "DeviceGray" stays the first (default) entry.
for name, n in {
    "DeviceGray": 1,  # default value first
    "CalRGB": 3,
    "CalGray": 1,
    "Lab": 3,
    "DeviceRGB": 3,
    "DeviceCMYK": 4,
    "Separation": 1,
    "Indexed": 1,
    "Pattern": 1,
}.items():
    PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n)


================================================
FILE: babeldoc/pdfminer/pdfdevice.py
================================================
import logging
from collections.abc import Iterable
from collections.abc import Sequence
from typing import TYPE_CHECKING
from typing import BinaryIO
from typing import Optional
from typing import cast

from babeldoc.pdfminer.pdfcolor import PDFColorSpace
from babeldoc.pdfminer.pdffont import PDFFont
from babeldoc.pdfminer.pdffont import PDFUnicodeNotDefined
from babeldoc.pdfminer.pdfpage import PDFPage
from babeldoc.pdfminer.pdftypes import PDFStream
from babeldoc.pdfminer.psparser import PSLiteral
from babeldoc.pdfminer.utils import Matrix
from babeldoc.pdfminer.utils import PathSegment
from babeldoc.pdfminer.utils import Point
from babeldoc.pdfminer.utils import Rect
from babeldoc.pdfminer import utils

if TYPE_CHECKING:
    from babeldoc.pdfminer.pdfinterp import PDFGraphicState
    from babeldoc.pdfminer.pdfinterp import PDFResourceManager
    from babeldoc.pdfminer.pdfinterp import PDFStackT
    from babeldoc.pdfminer.pdfinterp import PDFTextState


# Operand sequence handed to ``render_string``: numbers are position
# adjustments and ``bytes`` items are character codes decoded by the font.
PDFTextSeq = Iterable[int | float | bytes]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class PDFDevice:
    """Translate the output of PDFPageInterpreter to the output that is needed

    This base class stores the resource manager and the current
    transformation matrix; every rendering hook is a no-op meant to be
    overridden.  It can be used as a context manager, in which case
    ``close`` is called on exit.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
        self.ctm: Matrix | None = None
        self.rsrcmgr = rsrcmgr

    def __repr__(self) -> str:
        return "<PDFDevice>"

    def __enter__(self) -> "PDFDevice":
        return self

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        # Exceptions are not suppressed: the method returns None.
        self.close()

    def close(self) -> None:
        """Finish with the device; the base implementation does nothing."""

    def set_ctm(self, ctm: Matrix) -> None:
        """Remember *ctm* as the current transformation matrix."""
        self.ctm = ctm

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for the start of a tagged section; no-op here."""

    def end_tag(self) -> None:
        """Hook for the end of a tagged section; no-op here."""

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Hook for a stand-alone tag; no-op here."""

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Hook for the start of a page; no-op here."""

    def end_page(self, page: PDFPage) -> None:
        """Hook for the end of a page; no-op here."""

    def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
        """Hook for the start of a figure; no-op here."""

    def end_figure(self, name: str) -> None:
        """Hook for the end of a figure; no-op here."""

    def paint_path(
        self,
        graphicstate: "PDFGraphicState",
        stroke: bool,
        fill: bool,
        evenodd: bool,
        path: Sequence[PathSegment],
    ) -> None:
        """Hook for painting a path; no-op here."""

    def render_image(self, name: str, stream: PDFStream) -> None:
        """Hook for rendering an image stream; no-op here."""

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Hook for rendering a text sequence; no-op here."""


class PDFTextDevice(PDFDevice):
    """Device that walks a text sequence glyph by glyph.

    ``render_string`` derives the spacing parameters from the text state and
    delegates to the horizontal or vertical walker, which in turn calls
    ``render_char`` for every decoded character id.  Subclasses override
    ``render_char``; the version here renders nothing and reports a zero
    advance.
    """

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Render *seq* with the settings in *textstate*.

        The text state's ``linematrix`` is updated with the position reached
        after the last glyph.  ``self.ctm`` and ``textstate.font`` must both
        be set.
        """
        assert self.ctm is not None
        font = textstate.font
        # Check the font before its first use; otherwise a missing font fails
        # on the attribute assignment below instead of on this assertion.
        assert font is not None
        matrix = utils.mult_matrix(textstate.matrix, self.ctm)
        # Pass the text state's optional ``font_id`` along on the font object.
        font.font_id_temp = getattr(textstate, "font_id", None)
        fontsize = textstate.fontsize
        scaling = textstate.scaling * 0.01
        charspace = textstate.charspace * scaling
        wordspace = textstate.wordspace * scaling
        rise = textstate.rise
        if font.is_multibyte():
            # Word spacing is not applied for multibyte fonts.
            wordspace = 0
        dxscale = 0.001 * fontsize * scaling
        if font.is_vertical():
            walk = self.render_string_vertical
        else:
            walk = self.render_string_horizontal
        textstate.linematrix = walk(
            seq,
            matrix,
            textstate.linematrix,
            font,
            fontsize,
            scaling,
            charspace,
            wordspace,
            rise,
            dxscale,
            ncs,
            graphicstate,
        )

    def render_string_horizontal(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Walk *seq* along the x axis and return the final ``(x, y)``.

        Numbers move ``x`` back by ``obj * dxscale``.  ``bytes`` items are
        decoded by the font and each character id is passed to
        ``render_char``, whose return value advances ``x``.  ``charspace`` is
        added before every glyph except when it is the very first item, and
        ``wordspace`` after character id 32.  Other item types are logged and
        skipped.
        """
        (x, y) = pos
        needcharspace = False
        for obj in seq:
            if isinstance(obj, (int, float)):
                x -= obj * dxscale
                needcharspace = True
            elif isinstance(obj, bytes):
                for cid in font.decode(obj):
                    if needcharspace:
                        x += charspace
                    x += self.render_char(
                        utils.translate_matrix(matrix, (x, y)),
                        font,
                        fontsize,
                        scaling,
                        rise,
                        cid,
                        ncs,
                        graphicstate,
                    )
                    if cid == 32 and wordspace:
                        x += wordspace
                    needcharspace = True
            else:
                logger.warning(
                    f"Cannot render horizontal string because {obj!r} is not a valid int, float or bytes."
                )
        return (x, y)

    def render_string_vertical(
        self,
        seq: PDFTextSeq,
        matrix: Matrix,
        pos: Point,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        charspace: float,
        wordspace: float,
        rise: float,
        dxscale: float,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> Point:
        """Walk *seq* along the y axis and return the final ``(x, y)``.

        Same procedure as ``render_string_horizontal`` with every adjustment
        applied to ``y`` instead of ``x``.
        """
        (x, y) = pos
        needcharspace = False
        for obj in seq:
            if isinstance(obj, (int, float)):
                y -= obj * dxscale
                needcharspace = True
            elif isinstance(obj, bytes):
                for cid in font.decode(obj):
                    if needcharspace:
                        y += charspace
                    y += self.render_char(
                        utils.translate_matrix(matrix, (x, y)),
                        font,
                        fontsize,
                        scaling,
                        rise,
                        cid,
                        ncs,
                        graphicstate,
                    )
                    if cid == 32 and wordspace:
                        y += wordspace
                    needcharspace = True
            else:
                logger.warning(
                    f"Cannot render vertical string because {obj!r} is not a valid int, float or bytes."
                )
        return (x, y)

    def render_char(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> float:
        """Render one character id and return its advance; always 0 here."""
        return 0


class TagExtractor(PDFDevice):
    """Device that writes page and tag markup plus decoded text to a stream.

    Everything is written to ``outfp`` encoded with ``codec``.  Open tags are
    tracked on a stack so that ``end_tag`` can emit the matching close tag.
    """

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        outfp: BinaryIO,
        codec: str = "utf-8",
    ) -> None:
        PDFDevice.__init__(self, rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0
        self._stack: list[PSLiteral] = []

    def render_string(
        self,
        textstate: "PDFTextState",
        seq: PDFTextSeq,
        ncs: PDFColorSpace,
        graphicstate: "PDFGraphicState",
    ) -> None:
        """Write the text of *seq*, skipping characters without a Unicode mapping.

        ``str`` items are first converted to bytes; items that are neither
        ``str`` nor ``bytes`` are ignored.
        """
        font = textstate.font
        assert font is not None
        pieces = []
        for item in seq:
            if isinstance(item, str):
                item = utils.make_compat_bytes(item)
            if not isinstance(item, bytes):
                continue
            for cid in font.decode(item):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    continue
        self._write(utils.enc("".join(pieces)))

    def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
        """Write the opening ``<page>`` element for *page*."""
        attributes = (self.pageno, utils.bbox2str(page.mediabox), page.rotate)
        self._write('<page id="%s" bbox="%s" rotate="%d">' % attributes)

    def end_page(self, page: PDFPage) -> None:
        """Write the closing ``</page>`` element and advance the page counter."""
        self._write("</page>\n")
        self.pageno += 1

    def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write an opening tag for *tag* and push it on the stack.

        When *props* is a dict, its items are written as attributes in
        sorted order.
        """
        s = ""
        if isinstance(props, dict):
            rendered = []
            for k, v in sorted(props.items()):
                rendered.append(f' {utils.enc(k)}="{utils.make_compat_str(v)}"')
            s = "".join(rendered)
        self._write(f"<{utils.enc(cast(str, tag.name))}{s}>")
        self._stack.append(tag)

    def end_tag(self) -> None:
        """Pop the innermost open tag and write its closing tag."""
        assert self._stack, str(self.pageno)
        closing = self._stack.pop()
        self._write("</%s>" % utils.enc(cast(str, closing.name)))

    def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
        """Write *tag* as an opening tag without leaving it on the stack."""
        self.begin_tag(tag, props)
        self._stack.pop()

    def _write(self, s: str) -> None:
        """Encode *s* with the configured codec and write it to ``outfp``."""
        encoded = s.encode(self.codec)
        self.outfp.write(encoded)


================================================
FILE: babeldoc/pdfminer/pdfdocument.py
================================================
import itertools
import logging
import re
import struct
from collections.abc import Callable
from collections.abc import Iterable
from collections.abc import Iterator
from collections.abc import KeysView
from collections.abc import Sequence
from hashlib import md5
from hashlib import sha256
from hashlib import sha384
from hashlib import sha512
from typing import Any
from typing import cast

from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.ciphers import Cipher
from cryptography.hazmat.primitives.ciphers import algorithms
from cryptography.hazmat.primitives.ciphers import modes

from babeldoc.pdfminer.arcfour import Arcfour
from babeldoc.pdfminer.casting import safe_int
from babeldoc.pdfminer.data_structures import NumberTree
from babeldoc.pdfminer.pdfexceptions import PDFException
from babeldoc.pdfminer.pdfexceptions import PDFKeyError
from babeldoc.pdfminer.pdfexceptions import PDFObjectNotFound
from babeldoc.pdfminer.pdfexceptions import PDFTypeError
from babeldoc.pdfminer.pdfparser import PDFParser
from babeldoc.pdfminer.pdfparser import PDFStreamParser
from babeldoc.pdfminer.pdfparser import PDFSyntaxError
from babeldoc.pdfminer.pdftypes import DecipherCallable
from babeldoc.pdfminer.pdftypes import PDFStream
from babeldoc.pdfminer.pdftypes import decipher_all
from babeldoc.pdfminer.pdftypes import dict_value
from babeldoc.pdfminer.pdftypes import int_value
from babeldoc.pdfminer.pdftypes import list_value
from babeldoc.pdfminer.pdftypes import str_value
from babeldoc.pdfminer.pdftypes import stream_value
from babeldoc.pdfminer.pdftypes import uint_value
from babeldoc.pdfminer.psexceptions import PSEOF
from babeldoc.pdfminer.psparser import KWD
from babeldoc.pdfminer.psparser import LIT
from babeldoc.pdfminer.psparser import literal_name
from babeldoc.pdfminer.utils import choplist
from babeldoc.pdfminer.utils import decode_text
from babeldoc.pdfminer.utils import format_int_alpha
from babeldoc.pdfminer.utils import format_int_roman
from babeldoc.pdfminer.utils import nunpack
from babeldoc.pdfminer import settings

# Module-level logger, named after this module.
log = logging.getLogger(__name__)


class PDFNoValidXRef(PDFSyntaxError):
    # Raised by the xref loaders in this module when a cross-reference table
    # or stream cannot be read (malformed line, unexpected EOF, wrong type).
    pass


class PDFNoValidXRefWarning(SyntaxWarning):
    """Legacy warning for missing xref.

    Not used anymore because warnings.warn is replaced by logger.Logger.warn.
    """


class PDFNoOutlines(PDFException):
    # NOTE(review): presumably signals a document without outlines; the raise
    # site is outside this excerpt -- confirm.
    pass


class PDFNoPageLabels(PDFException):
    # NOTE(review): presumably signals a document without page labels; the
    # raise site is outside this excerpt -- confirm.
    pass


class PDFDestinationNotFound(PDFException):
    # NOTE(review): presumably signals a failed destination lookup; the raise
    # site is outside this excerpt -- confirm.
    pass


class PDFEncryptionError(PDFException):
    # Raised by the security handlers for unsupported revisions and for
    # unsupported, unknown or undefined crypt filters.
    pass


class PDFPasswordIncorrect(PDFEncryptionError):
    # Raised by PDFStandardSecurityHandler.init_key when authentication
    # yields no key.
    pass


class PDFEncryptionWarning(UserWarning):
    """Legacy warning for failed decryption.

    Not used anymore because warnings.warn is replaced by logger.Logger.warn.
    """


class PDFTextExtractionNotAllowedWarning(UserWarning):
    """Legacy warning for PDF that does not allow extraction.

    Not used anymore because warnings.warn is replaced by logger.Logger.warn.
    """


class PDFTextExtractionNotAllowed(PDFEncryptionError):
    # NOTE(review): presumably raised when a document forbids text
    # extraction; the raise site is outside this excerpt -- confirm.
    pass


# Predefined literals compared (by identity) against the "Type" entry of
# parsed objects.
LITERAL_OBJSTM = LIT("ObjStm")
LITERAL_XREF = LIT("XRef")
LITERAL_CATALOG = LIT("Catalog")


class PDFBaseXRef:
    """Interface shared by the cross-reference implementations.

    The defaults describe an empty table: no object ids, and every lookup
    fails.  ``get_trailer`` and ``load`` must be provided by subclasses.
    """

    def get_trailer(self) -> dict[str, Any]:
        """Return the trailer dictionary; not implemented here."""
        raise NotImplementedError

    def get_objids(self) -> Iterable[int]:
        """Return the ids of the objects this table knows about."""
        return []

    def get_pos(self, objid: int) -> tuple[int | None, int, int]:
        """Locate *objid*.

        Must return
            (strmid, index, genno)
         or (None, pos, genno)

        Raises:
            PDFKeyError: always, in this base implementation.
        """
        raise PDFKeyError(objid)

    def load(self, parser: PDFParser) -> None:
        """Read the table from *parser*; not implemented here."""
        raise NotImplementedError


class PDFXRef(PDFBaseXRef):
    """Cross-reference table read from a plain-text ``xref`` section."""

    def __init__(self) -> None:
        # objid -> (None, byte position, generation number)
        self.offsets: dict[int, tuple[int | None, int, int]] = {}
        self.trailer: dict[str, Any] = {}

    def __repr__(self) -> str:
        return "<PDFXRef: offsets=%r>" % (self.offsets.keys())

    def load(self, parser: PDFParser) -> None:
        """Read subsection headers and entries until the ``trailer`` line.

        Only in-use entries (``n``) are recorded.  Entries whose position or
        generation number cannot be parsed are skipped with a warning.

        Raises:
            PDFNoValidXRef: on EOF or on a line that does not have the
                expected number of fields.
        """
        while True:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
            line = line.strip()
            if not line:
                continue
            if line.startswith(b"trailer"):
                # Rewind so that load_trailer sees the keyword again.
                parser.seek(pos)
                break

            # Subsection header: "<first object id> <number of entries>".
            f = line.split(b" ")
            if len(f) != 2:
                raise PDFNoValidXRef(f"Trailer not found: {parser!r}: line={line!r}")
            try:
                start, nobjs = int(f[0]), int(f[1])
            except ValueError:
                raise PDFNoValidXRef(f"Invalid line: {parser!r}: line={line!r}")

            for objid in range(start, start + nobjs):
                try:
                    (_, line) = parser.nextline()
                except PSEOF:
                    raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
                line = line.strip()
                f = line.split(b" ")
                if len(f) != 3:
                    raise PDFNoValidXRef(
                        f"Invalid XRef format: {parser!r}, line={line!r}"
                    )
                (pos_b, genno_b, use_b) = f
                if use_b != b"n":
                    continue

                pos_i = safe_int(pos_b)
                genno_i = safe_int(genno_b)
                if pos_i is None or genno_i is None:
                    log.warning(
                        f"Not adding object {objid} to xref because position {pos_b!r} "
                        f"or generation number {genno_b!r} cannot be parsed as an int"
                    )
                    continue
                self.offsets[objid] = (None, pos_i, genno_i)

        log.debug("xref objects: %r", self.offsets)
        self.load_trailer(parser)

    def load_trailer(self, parser: PDFParser) -> None:
        """Read the ``trailer`` keyword and merge its dictionary into ``trailer``.

        If EOF is hit first, the last object left on the parser stack is used
        instead.

        Raises:
            PDFNoValidXRef: if EOF is hit and the parser stack is empty.
        """
        try:
            (_, keyword) = parser.nexttoken()
            assert keyword is KWD(b"trailer"), str(keyword)
            (_, trailer_obj) = parser.nextobject()
        except PSEOF:
            leftovers = parser.pop(1)
            if not leftovers:
                raise PDFNoValidXRef("Unexpected EOF - file corrupted")
            (_, trailer_obj) = leftovers[0]
        self.trailer.update(dict_value(trailer_obj))
        log.debug("trailer=%r", self.trailer)

    def get_trailer(self) -> dict[str, Any]:
        """Return the trailer dictionary collected so far."""
        return self.trailer

    def get_objids(self) -> KeysView[int]:
        """Return a view of the recorded object ids."""
        return self.offsets.keys()

    def get_pos(self, objid: int) -> tuple[int | None, int, int]:
        """Return the recorded location of *objid* (``KeyError`` if unknown)."""
        return self.offsets[objid]


class PDFXRefFallback(PDFXRef):
    """Cross-reference table rebuilt by scanning the file for ``N G obj`` lines."""

    def __repr__(self) -> str:
        return "<PDFXRefFallback: offsets=%r>" % (self.offsets.keys())

    # Matches the "<objid> <genno> obj" cue at the start of a line.
    PDFOBJ_CUE = re.compile(r"^(\d+)\s+(\d+)\s+obj\b")

    def load(self, parser: PDFParser) -> None:
        """Scan from the start of the file, recording every object found.

        Scanning stops at EOF or at the first ``trailer`` line, which is then
        loaded.  Objects contained in object streams are recorded as well,
        keyed to the stream that holds them.
        """
        parser.seek(0)
        while True:
            try:
                (pos, line_bytes) = parser.nextline()
            except PSEOF:
                break
            if line_bytes.startswith(b"trailer"):
                parser.seek(pos)
                self.load_trailer(parser)
                log.debug("trailer: %r", self.trailer)
                break
            # default pdf encoding
            cue = self.PDFOBJ_CUE.match(line_bytes.decode("latin-1"))
            if cue is None:
                continue
            objid = int(cue.group(1))
            genno = int(cue.group(2))
            self.offsets[objid] = (None, pos, genno)

            # expand ObjStm.
            parser.seek(pos)
            (_, obj) = parser.nextobject()
            if not (isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM):
                continue
            stream = stream_value(obj)
            try:
                n = stream["N"]
            except KeyError:
                if settings.STRICT:
                    raise PDFSyntaxError("N is not defined: %r" % stream)
                n = 0
            # Collect every object in the stream data until it runs out.
            inner_parser = PDFStreamParser(stream.get_data())
            objs: list[int] = []
            try:
                while True:
                    (_, inner) = inner_parser.nextobject()
                    objs.append(cast(int, inner))
            except PSEOF:
                pass
            # The collected values are read as pairs whose first member is
            # the id of a contained object; never trust N beyond what exists.
            n = min(n, len(objs) // 2)
            for index in range(n):
                self.offsets[objs[2 * index]] = (objid, index, 0)


class PDFXRefStream(PDFBaseXRef):
    """Cross-reference table stored in a stream object of type ``XRef``.

    The stream data is a sequence of fixed-size entries.  Each entry has
    three fields whose byte widths come from the stream's ``W`` array, and
    ``ranges`` lists the ``(first object id, count)`` subsections that the
    entries belong to, in order.
    """

    def __init__(self) -> None:
        self.data: bytes | None = None
        self.entlen: int | None = None
        self.fl1: int | None = None
        self.fl2: int | None = None
        self.fl3: int | None = None
        self.ranges: list[tuple[int, int]] = []
        # Set up front, as PDFXRef does, so that get_trailer is usable before
        # load has run.
        self.trailer: dict[str, Any] = {}

    def __repr__(self) -> str:
        return "<PDFXRefStream: ranges=%r>" % (self.ranges)

    def load(self, parser: PDFParser) -> None:
        """Read the xref stream object that *parser* is positioned at.

        Raises:
            PDFNoValidXRef: if the object is not a stream of type ``XRef``.
            PDFSyntaxError: if the ``Index`` array has an odd length.
        """
        # Three tokens precede the stream object itself; their values are
        # not needed.
        parser.nexttoken()  # ignored
        parser.nexttoken()  # ignored
        parser.nexttoken()  # ignored
        (_, stream) = parser.nextobject()
        if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF:
            raise PDFNoValidXRef("Invalid PDF stream spec.")
        size = stream["Size"]
        # Without an Index entry a single range covering 0..Size-1 is used.
        index_array = stream.get("Index", (0, size))
        if len(index_array) % 2 != 0:
            raise PDFSyntaxError("Invalid index number")
        self.ranges.extend(cast(Iterator[tuple[int, int]], choplist(2, index_array)))
        (self.fl1, self.fl2, self.fl3) = stream["W"]
        assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
        self.data = stream.get_data()
        self.entlen = self.fl1 + self.fl2 + self.fl3
        self.trailer = stream.attrs
        log.debug(
            "xref stream: objid=%s, fields=%d,%d,%d",
            ", ".join(map(repr, self.ranges)),
            self.fl1,
            self.fl2,
            self.fl3,
        )

    def get_trailer(self) -> dict[str, Any]:
        """Return the stream's attribute dictionary (empty before ``load``)."""
        return self.trailer

    def get_objids(self) -> Iterator[int]:
        """Yield the ids of all entries whose type field is 1 or 2."""
        # Entries of all ranges are stored back to back, so the position of
        # an entry depends on how many entries the earlier ranges hold.  This
        # mirrors the index computation in get_pos.
        index = 0
        for start, nobjs in self.ranges:
            for i in range(nobjs):
                assert self.entlen is not None
                assert self.data is not None
                offset = self.entlen * (index + i)
                ent = self.data[offset : offset + self.entlen]
                f1 = nunpack(ent[: self.fl1], 1)
                if f1 == 1 or f1 == 2:
                    yield start + i
            index += nobjs

    def get_pos(self, objid: int) -> tuple[int | None, int, int]:
        """Locate *objid*.

        Returns ``(None, f2, f3)`` for a type 1 entry and ``(f2, f3, 0)`` for
        a type 2 entry, where ``f2`` and ``f3`` are the entry's second and
        third fields.

        Raises:
            PDFKeyError: if *objid* is in no range or its entry is free.
        """
        index = 0
        for start, nobjs in self.ranges:
            if start <= objid and objid < start + nobjs:
                index += objid - start
                break
            else:
                index += nobjs
        else:
            raise PDFKeyError(objid)
        assert self.entlen is not None
        assert self.data is not None
        assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
        offset = self.entlen * index
        ent = self.data[offset : offset + self.entlen]
        f1 = nunpack(ent[: self.fl1], 1)
        f2 = nunpack(ent[self.fl1 : self.fl1 + self.fl2])
        f3 = nunpack(ent[self.fl1 + self.fl2 :])
        if f1 == 1:
            return (None, f2, f3)
        elif f1 == 2:
            return (f2, f3, 0)
        else:
            # this is a free object
            raise PDFKeyError(objid)


class PDFStandardSecurityHandler:
    PASSWORD_PADDING = (
        b"(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz"
    )
    supported_revisions: tuple[int, ...] = (2, 3)

    def __init__(
        self,
        docid: Sequence[bytes],
        param: dict[str, Any],
        password: str = "",
    ) -> None:
        self.docid = docid
        self.param = param
        self.password = password
        self.init()

    def init(self) -> None:
        self.init_params()
        if self.r not in self.supported_revisions:
            error_msg = "Unsupported revision: param=%r" % self.param
            raise PDFEncryptionError(error_msg)
        self.init_key()

    def init_params(self) -> None:
        self.v = int_value(self.param.get("V", 0))
        self.r = int_value(self.param["R"])
        self.p = uint_value(self.param["P"], 32)
        self.o = str_value(self.param["O"])
        self.u = str_value(self.param["U"])
        self.length = int_value(self.param.get("Length", 40))

    def init_key(self) -> None:
        self.key = self.authenticate(self.password)
        if self.key is None:
            raise PDFPasswordIncorrect

    def is_printable(self) -> bool:
        return bool(self.p & 4)

    def is_modifiable(self) -> bool:
        return bool(self.p & 8)

    def is_extractable(self) -> bool:
        return bool(self.p & 16)

    def compute_u(self, key: bytes) -> bytes:
        if self.r == 2:
            # Algorithm 3.4
            return Arcfour(key).encrypt(self.PASSWORD_PADDING)  # 2
        else:
            # Algorithm 3.5
            hash = md5(self.PASSWORD_PADDING)  # 2
            hash.update(self.docid[0])  # 3
            result = Arcfour(key).encrypt(hash.digest())  # 4
            for i in range(1, 20):  # 5
                k = b"".join(bytes((c ^ i,)) for c in iter(key))
                result = Arcfour(k).encrypt(result)
            result += result  # 6
            return result

    def compute_encryption_key(self, password: bytes) -> bytes:
        # Algorithm 3.2
        password = (password + self.PASSWORD_PADDING)[:32]  # 1
        hash = md5(password)  # 2
        hash.update(self.o)  # 3
        # See https://github.com/pdfminer/pdfminer.six/issues/186
        hash.update(struct.pack("<L", self.p))  # 4
        hash.update(self.docid[0])  # 5
        if self.r >= 4:
            if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata:
                hash.update(b"\xff\xff\xff\xff")
        result = hash.digest()
        n = 5
        if self.r >= 3:
            n = self.length // 8
            for _ in range(50):
                result = md5(result[:n]).digest()
        return result[:n]

    def authenticate(self, password: str) -> bytes | None:
        password_bytes = password.encode("latin1")
        key = self.authenticate_user_password(password_bytes)
        if key is None:
            key = self.authenticate_owner_password(password_bytes)
        return key

    def authenticate_user_password(self, password: bytes) -> bytes | None:
        key = self.compute_encryption_key(password)
        if self.verify_encryption_key(key):
            return key
        else:
            return None

    def verify_encryption_key(self, key: bytes) -> bool:
        # Algorithm 3.6
        u = self.compute_u(key)
        if self.r == 2:
            return u == self.u
        return u[:16] == self.u[:16]

    def authenticate_owner_password(self, password: bytes) -> bytes | None:
        # Algorithm 3.7
        password = (password + self.PASSWORD_PADDING)[:32]
        hash = md5(password)
        if self.r >= 3:
            for _ in range(50):
                hash = md5(hash.digest())
        n = 5
        if self.r >= 3:
            n = self.length // 8
        key = hash.digest()[:n]
        if self.r == 2:
            user_password = Arcfour(key).decrypt(self.o)
        else:
            user_password = self.o
            for i in range(19, -1, -1):
                k = b"".join(bytes((c ^ i,)) for c in iter(key))
                user_password = Arcfour(k).decrypt(user_password)
        return self.authenticate_user_password(user_password)

    def decrypt(
        self,
        objid: int,
        genno: int,
        data: bytes,
        attrs: dict[str, Any] | None = None,
    ) -> bytes:
        return self.decrypt_rc4(objid, genno, data)

    def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes:
        assert self.key is not None
        key = self.key + struct.pack("<L", objid)[:3] + struct.pack("<L", genno)[:2]
        hash = md5(key)
        key = hash.digest()[: min(len(key), 16)]
        return Arcfour(key).decrypt(data)


class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
    """Security handler for revision 4, which selects ciphers via crypt filters."""

    supported_revisions: tuple[int, ...] = (4,)

    def init_params(self) -> None:
        """Read the crypt-filter setup on top of the base parameters.

        Raises:
            PDFEncryptionError: if the stream and string filters differ, if a
                filter uses an unknown method, or if the selected filter is
                not defined.
        """
        super().init_params()
        self.length = 128
        params = self.param
        self.cf = dict_value(params.get("CF"))
        self.stmf = literal_name(params["StmF"])
        self.strf = literal_name(params["StrF"])
        self.encrypt_metadata = bool(params.get("EncryptMetadata", True))
        if self.stmf != self.strf:
            raise PDFEncryptionError("Unsupported crypt filter: param=%r" % self.param)
        # Map every declared filter name to the function that implements it.
        self.cfm = {}
        for filter_name, spec in self.cf.items():
            method = self.get_cfm(literal_name(spec["CFM"]))
            if method is None:
                raise PDFEncryptionError(
                    "Unknown crypt filter method: param=%r" % self.param
                )
            self.cfm[filter_name] = method
        # "Identity" is always available and always means "no decryption".
        self.cfm["Identity"] = self.decrypt_identity
        if self.strf not in self.cfm:
            raise PDFEncryptionError("Undefined crypt filter: param=%r" % self.param)

    def get_cfm(self, name: str) -> Callable[[int, int, bytes], bytes] | None:
        """Return the decrypt function for method *name*, or ``None`` if unknown."""
        if name == "V2":
            return self.decrypt_rc4
        if name == "AESV2":
            return self.decrypt_aes128
        return None

    def decrypt(
        self,
        objid: int,
        genno: int,
        data: bytes,
        attrs: dict[str, Any] | None = None,
        name: str | None = None,
    ) -> bytes:
        """Decrypt *data* with the crypt filter *name* (default: ``StrF``).

        Data of objects whose ``Type`` is ``Metadata`` is returned untouched
        when metadata is not encrypted.
        """
        if attrs is not None and not self.encrypt_metadata:
            obj_type = attrs.get("Type")
            if obj_type is not None and literal_name(obj_type) == "Metadata":
                return data
        selected = self.strf if name is None else name
        return self.cfm[selected](objid, genno, data)

    def decrypt_identity(self, objid: int, genno: int, data: bytes) -> bytes:
        """Return *data* unchanged."""
        return data

    def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes:
        """Decrypt *data* with AES-CBC; its first 16 bytes are the IV."""
        assert self.key is not None
        # Same derivation as for RC4, with the constant b"sAlT" appended.
        material = b"".join(
            (
                self.key,
                struct.pack("<L", objid)[:3],
                struct.pack("<L", genno)[:2],
                b"sAlT",
            )
        )
        object_key = md5(material).digest()[: min(len(material), 16)]
        initialization_vector, ciphertext = data[:16], data[16:]
        cipher = Cipher(
            algorithms.AES(object_key),
            modes.CBC(initialization_vector),
            backend=default_backend(),
        )  # type: ignore
        # NOTE(review): the decryptor is not finalized and the result is
        # returned without removing any padding -- confirm callers expect it.
        return cipher.decryptor().update(ciphertext)  # type: ignore


class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
    """Security handler for revisions 5 and 6, using the "AESV3" filter."""

    supported_revisions = (5, 6)

    def init_params(self) -> None:
        """Read /OE and /UE and split /O and /U into hash and salts.

        Each of ``self.o`` and ``self.u`` is cut into a 32-byte hash, an
        8-byte validation salt and the remaining bytes as the key salt.
        """
        super().init_params()
        self.length = 256
        self.oe = str_value(self.param["OE"])
        self.ue = str_value(self.param["UE"])
        owner, user = self.o, self.u
        self.o_hash = owner[:32]
        self.o_validation_salt = owner[32:40]
        self.o_key_salt = owner[40:]
        self.u_hash = user[:32]
        self.u_validation_salt = user[32:40]
        self.u_key_salt = user[40:]

    def get_cfm(self, name: str) -> Callable[[int, int, bytes], bytes] | None:
        """Return ``self.decrypt_aes256`` for "AESV3", otherwise ``None``."""
        return self.decrypt_aes256 if name == "AESV3" else None

    def authenticate(self, password: str) -> bytes | None:
        """Try ``password`` as owner password, then as user password.

        :returns: the AES-CBC (zero IV) decryption of ``self.oe`` or
            ``self.ue`` with the matching intermediate key, or ``None`` when
            neither validation hash matches
        """
        pw = self._normalize_password(password)

        # Owner check: the hash additionally covers the whole /U string.
        owner_check = self._password_hash(pw, self.o_validation_salt, self.u)
        if owner_check == self.o_hash:
            owner_key = self._password_hash(pw, self.o_key_salt, self.u)
            return self._decrypt_with_zero_iv(owner_key, self.oe)

        user_check = self._password_hash(pw, self.u_validation_salt)
        if user_check == self.u_hash:
            user_key = self._password_hash(pw, self.u_key_salt)
            return self._decrypt_with_zero_iv(user_key, self.ue)

        return None

    @staticmethod
    def _decrypt_with_zero_iv(key: bytes, blob: bytes) -> bytes:
        """AES-CBC decrypt ``blob`` under ``key`` with an all-zero IV."""
        cipher = Cipher(
            algorithms.AES(key),
            modes.CBC(b"\0" * 16),
            backend=default_backend(),
        )  # type: ignore
        return cipher.decryptor().update(blob)  # type: ignore

    def _normalize_password(self, password: str) -> bytes:
        """Encode ``password`` as UTF-8, truncated to 127 bytes.

        For revision 6 a non-empty password is first passed through
        ``saslprep``; an empty one yields ``b""`` directly.
        """
        if self.r != 6:
            return password.encode("utf-8")[:127]
        # saslprep expects non-empty strings, apparently
        if not password:
            return b""
        from babeldoc.pdfminer._saslprep import saslprep

        prepared = saslprep(password)
        return prepared.encode("utf-8")[:127]

    def _password_hash(
        self,
        password: bytes,
        salt: bytes,
        vector: bytes | None = None,
    ) -> bytes:
        """Compute password hash depending on revision number"""
        if self.r != 5:
            # Every revision other than 5 uses the first 8 salt bytes.
            return self._r6_password(password, salt[0:8], vector)
        return self._r5_password(password, salt, vector)

    def _r5_password(
        self,
        password: bytes,
        salt: bytes,
        vector: bytes | None = None,
    ) -> bytes:
        """Compute the password for revision 5"""
        digest = sha256(password)
        extra_parts = (salt,) if vector is None else (salt, vector)
        for part in extra_parts:
            digest.update(part)
        return digest.digest()

    def _r6_password(
        self,
        password: bytes,
        salt: bytes,
        vector: bytes | None = None,
    ) -> bytes:
        """Compute the password for revision 6"""
        seed = sha256(password)
        seed.update(salt)
        if vector is not None:
            seed.update(vector)
        k = seed.digest()

        hash_choices = (sha256, sha384, sha512)
        suffix = vector or b""
        rounds_done = 0
        # At least 64 rounds; afterwards keep going while the last byte of
        # the most recent ciphertext exceeds (rounds_done - 32).
        while True:
            block = (password + k + suffix) * 64
            encrypted = self._aes_cbc_encrypt(key=k[:16], iv=k[16:32], data=block)
            # compute the first 16 bytes of the ciphertext,
            # interpreted as an unsigned integer mod 3
            chosen = hash_choices[self._bytes_mod_3(encrypted[:16])]
            k = chosen(encrypted).digest()
            last_byte_val = encrypted[len(encrypted) - 1]
            rounds_done += 1
            if rounds_done >= 64 and last_byte_val <= rounds_done - 32:
                break
        return k[:32]

    @staticmethod
    def _bytes_mod_3(input_bytes: bytes) -> int:
        """Return the big integer spelled by ``input_bytes`` modulo 3."""
        # 256 is 1 mod 3, so the byte sum has the same remainder
        return sum(input_bytes) % 3

    def _aes_cbc_encrypt(self, key: bytes, iv: bytes, data: bytes) -> bytes:
        """AES-CBC encrypt ``data`` and return the full ciphertext."""
        encryptor = Cipher(algorithms.AES(key), modes.CBC(iv)).encryptor()  # type: ignore
        head = encryptor.update(data)
        return head + encryptor.finalize()  # type: ignore

    def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes:
        """AES-CBC decrypt ``data`` with ``self.key``.

        The first 16 bytes of ``data`` are the IV; ``objid`` and ``genno``
        are not used. The decryptor is not finalized.
        """
        iv, body = data[:16], data[16:]
        assert self.key is not None
        cipher = Cipher(
            algorithms.AES(self.key),
            modes.CBC(iv),
            backend=default_backend(),
        )  # type: ignore
        return cipher.decryptor().update(body)  # type: ignore


class PDFDocument:
    """PDFDocument object represents a PDF document.

    Since a PDF file can be very big, normally it is not loaded at
    once. So PDF document has to cooperate with a PDF parser in order to
    dynamically import the data as processing goes.

    Typical usage:
      doc = PDFDocument(parser, password)
      obj = doc.getobj(objid)

    """

    # Maps the /V entry of the encryption dictionary to the handler class
    # that _initialize_password() instantiates.
    security_handler_registry: dict[int, type[PDFStandardSecurityHandler]] = {
        1: PDFStandardSecurityHandler,
        2: PDFStandardSecurityHandler,
        4: PDFStandardSecurityHandlerV4,
        5: PDFStandardSecurityHandlerV5,
    }

    def __init__(
        self,
        parser: PDFParser,
        password: str = "",
        caching: bool = True,
        fallback: bool = True,
    ) -> None:
        """Set the document to use a given PDFParser object.

        :param parser: parser positioned on the PDF data; it is bound to this
            document via ``parser.set_document(self)``
        :param password: password handed to the security handler when a
            trailer carries /Encrypt
        :param caching: when true, objects and parsed object streams are
            memoised per object id
        :param fallback: when true and no valid xref can be located, a
            ``PDFXRefFallback`` is loaded from the parser instead
        :raises PDFSyntaxError: if no trailer contains /Root, or (only with
            ``settings.STRICT``) the catalog's /Type is not the catalog literal
        """
        self.caching = caching
        self.xrefs: list[PDFBaseXRef] = []
        self.info: list[dict[Any, Any]] = []
        self.catalog: dict[str, Any] = {}
        self.encryption: tuple[Any, Any] | None = None
        self.decipher: DecipherCallable | None = None
        self._cached_objs: dict[int, tuple[object, int]] = {}
        self._parsed_objs: dict[int, tuple[list[object], int]] = {}
        self._parser: PDFParser | None = parser
        parser.set_document(self)
        self.is_printable = self.is_modifiable = self.is_extractable = True
        # Retrieve the information of each header that was appended
        # (maybe multiple times) at the end of the document.
        try:
            pos = self.find_xref(parser)
            self.read_xref_from(parser, pos, self.xrefs)
        except PDFNoValidXRef:
            if fallback:
                parser.fallback = True
                newxref = PDFXRefFallback()
                newxref.load(parser)
                self.xrefs.append(newxref)

        for xref in self.xrefs:
            trailer = xref.get_trailer()
            if not trailer:
                continue
            # If there's an encryption info, remember it.
            if "Encrypt" in trailer:
                if "ID" in trailer:
                    id_value = list_value(trailer["ID"])
                else:
                    # Some documents may not have a /ID, use two empty
                    # byte strings instead. Solves
                    # https://github.com/pdfminer/pdfminer.six/issues/594
                    id_value = (b"", b"")
                self.encryption = (id_value, dict_value(trailer["Encrypt"]))
                self._initialize_password(password)
            if "Info" in trailer:
                self.info.append(dict_value(trailer["Info"]))
            if "Root" in trailer:
                # Every PDF file must have exactly one /Root dictionary.
                self.catalog = dict_value(trailer["Root"])
                break
        else:
            # Loop ended without hitting a trailer that has /Root.
            raise PDFSyntaxError("No /Root object! - Is this really a PDF?")
        if self.catalog.get("Type") is not LITERAL_CATALOG:
            if settings.STRICT:
                raise PDFSyntaxError("Catalog not found!")

    KEYWORD_OBJ = KWD(b"obj")

    def _initialize_password(self, password: str = "") -> None:
        """Perform the initialization with a given password.

        Picks the security handler registered for the /V value of the
        encryption dictionary, installs its ``decrypt`` as ``self.decipher``
        and copies the permission flags it reports.

        :raises PDFEncryptionError: if /Filter is not "Standard" or no
            handler is registered for /V
        """
        assert self.encryption is not None
        (docid, param) = self.encryption
        if literal_name(param.get("Filter")) != "Standard":
            raise PDFEncryptionError("Unknown filter: param=%r" % param)
        v = int_value(param.get("V", 0))
        factory = self.security_handler_registry.get(v)
        if factory is None:
            raise PDFEncryptionError("Unknown algorithm: param=%r" % param)
        handler = factory(docid, param, password)
        self.decipher = handler.decrypt
        self.is_printable = handler.is_printable()
        self.is_modifiable = handler.is_modifiable()
        self.is_extractable = handler.is_extractable()
        assert self._parser is not None
        self._parser.fallback = False  # need to read streams with exact length

    def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object:
        """Fetch the ``index``-th object stored inside an object stream.

        The parsed token list of a stream is cached in ``self._parsed_objs``
        (keyed by the stream's object id) when caching is enabled.

        :raises PDFSyntaxError: if ``index`` lies beyond the parsed tokens
        """
        if stream.objid in self._parsed_objs:
            (objs, n) = self._parsed_objs[stream.objid]
        else:
            (objs, n) = self._get_objects(stream)
            if self.caching:
                assert stream.objid is not None
                self._parsed_objs[stream.objid] = (objs, n)
        # The first 2*n tokens are skipped; the objects themselves follow.
        i = n * 2 + index
        try:
            obj = objs[i]
        except IndexError:
            raise PDFSyntaxError("index too big: %r" % index)
        return obj

    def _get_objects(self, stream: PDFStream) -> tuple[list[object], int]:
        """Parse every object in ``stream`` and return them with its /N.

        :returns: ``(objs, n)`` where ``objs`` is everything the stream
            parser yields until EOF and ``n`` is the stream's /N entry
            (0 when missing and not in strict mode)
        :raises PDFSyntaxError: in strict mode, if /Type is not the object
            stream literal or /N is missing
        """
        if stream.get("Type") is not LITERAL_OBJSTM:
            if settings.STRICT:
                raise PDFSyntaxError("Not a stream object: %r" % stream)
        try:
            n = cast(int, stream["N"])
        except KeyError:
            if settings.STRICT:
                raise PDFSyntaxError("N is not defined: %r" % stream)
            n = 0
        parser = PDFStreamParser(stream.get_data())
        parser.set_document(self)
        objs: list[object] = []
        try:
            while 1:
                (_, obj) = parser.nextobject()
                objs.append(obj)
        except PSEOF:
            # End of stream data: all objects have been collected.
            pass
        return (objs, n)

    def _getobj_parse(self, pos: int, objid: int) -> object:
        """Parse the indirect object expected at byte offset ``pos``.

        :raises PDFSyntaxError: if the object id found there does not match
            ``objid`` or the ``obj`` keyword is missing
        """
        assert self._parser is not None
        self._parser.seek(pos)
        (_, objid1) = self._parser.nexttoken()  # objid
        (_, genno) = self._parser.nexttoken()  # genno
        (_, kwd) = self._parser.nexttoken()
        # hack around malformed pdf files
        # copied from https://github.com/jaepil/pdfminer3k/blob/master/
        # pdfminer/pdfparser.py#L399
        # to solve https://github.com/pdfminer/pdfminer.six/issues/56
        # assert objid1 == objid, str((objid1, objid))
        if objid1 != objid:
            # Scan forward to the next `obj` keyword and take the token two
            # places before it as the object id.
            x = []
            while kwd is not self.KEYWORD_OBJ:
                (_, kwd) = self._parser.nexttoken()
                x.append(kwd)
            if len(x) >= 2:
                objid1 = x[-2]
        # #### end hack around malformed pdf files
        if objid1 != objid:
            raise PDFSyntaxError(f"objid mismatch: {objid1!r}={objid!r}")

        if kwd != self.KEYWORD_OBJ:
            raise PDFSyntaxError("Invalid object spec: offset=%r" % pos)
        (_, obj) = self._parser.nextobject()
        return obj

    # can raise PDFObjectNotFound
    def getobj(self, objid: int) -> object:
        """Get object from PDF

        Each xref section is tried in order; a section that does not know
        the id, or whose entry fails to parse, is skipped.

        :raises PDFException if PDFDocument is not initialized
        :raises PDFObjectNotFound if objid does not exist in PDF
        """
        if not self.xrefs:
            raise PDFException("PDFDocument is not initialized")
        log.debug("getobj: objid=%r", objid)
        if objid in self._cached_objs:
            (obj, genno) = self._cached_objs[objid]
        else:
            for xref in self.xrefs:
                try:
                    (strmid, index, genno) = xref.get_pos(objid)
                except KeyError:
                    continue
                try:
                    if strmid is not None:
                        # Object lives inside an object stream.
                        stream = stream_value(self.getobj(strmid))
                        obj = self._getobj_objstm(stream, index, objid)
                    else:
                        # `index` is a byte offset into the file here.
                        obj = self._getobj_parse(index, objid)
                        if self.decipher:
                            obj = decipher_all(self.decipher, objid, genno, obj)

                    if isinstance(obj, PDFStream):
                        obj.set_objid(objid, genno)
                    break
                except (PSEOF, PDFSyntaxError):
                    continue
            else:
                raise PDFObjectNotFound(objid)
            log.debug("register: objid=%r: %r", objid, obj)
            if self.caching:
                self._cached_objs[objid] = (obj, genno)
        return obj

    OutlineType = tuple[Any, Any, Any, Any, Any]

    def get_outlines(self) -> Iterator[OutlineType]:
        """Iterate over the outline entries of the document.

        :returns: an iterator of ``(level, title, dest, action, se)`` tuples
            for every entry that has /Title and either /A or /Dest
        :raises PDFNoOutlines: if the catalog has no /Outlines
        """
        if "Outlines" not in self.catalog:
            raise PDFNoOutlines

        def search(entry: object, level: int) -> Iterator[PDFDocument.OutlineType]:
            entry = dict_value(entry)
            if "Title" in entry:
                if "A" in entry or "Dest" in entry:
                    title = decode_text(str_value(entry["Title"]))
                    dest = entry.get("Dest")
                    action = entry.get("A")
                    se = entry.get("SE")
                    yield (level, title, dest, action, se)
            # Children first (one level deeper), then siblings.
            if "First" in entry and "Last" in entry:
                yield from search(entry["First"], level + 1)
            if "Next" in entry:
                yield from search(entry["Next"], level)

        return search(self.catalog["Outlines"], 0)

    def get_page_labels(self) -> Iterator[str]:
        """Generate page label strings for the PDF document.

        If the document includes page labels, generates strings, one per page.
        If not, raises PDFNoPageLabels.

        The resulting iteration is unbounded.
        """
        assert self.catalog is not None

        try:
            page_labels = PageLabels(self.catalog["PageLabels"])
        except (PDFTypeError, KeyError):
            raise PDFNoPageLabels

        return page_labels.labels

    def lookup_name(self, cat: str, key: str | bytes) -> Any:
        """Look up ``key`` in the name tree ``cat`` of the catalog's /Names.

        :param cat: entry of the /Names dictionary to search (e.g. "Dests")
        :param key: name to find
        :returns: the value stored for ``key``; ``None`` when the root node's
            /Limits exclude ``key``
        :raises PDFKeyError: if the catalog has no usable /Names, or no node
            of the tree yields ``key``
        :raises KeyError: if ``cat`` is missing from /Names, or a /Names leaf
            whose limits admit ``key`` does not contain it
        """
        try:
            names = dict_value(self.catalog["Names"])
        except (PDFTypeError, KeyError):
            raise PDFKeyError((cat, key))
        # may raise KeyError
        d0 = dict_value(names[cat])

        def lookup(d: dict[str, Any]) -> Any:
            if "Limits" in d:
                (k1, k2) = list_value(d["Limits"])
                if key < k1 or k2 < key:
                    return None
            if "Names" in d:
                objs = list_value(d["Names"])
                pairs = dict(
                    cast(Iterator[tuple[str | bytes, Any]], choplist(2, objs)),
                )
                return pairs[key]
            if "Kids" in d:
                for c in list_value(d["Kids"]):
                    v = lookup(dict_value(c))
                    # None means "not in this subtree"; any other value,
                    # including falsy ones such as 0 or [], is a real hit.
                    if v is not None:
                        return v
            raise PDFKeyError((cat, key))

        return lookup(d0)

    def get_dest(self, name: str | bytes) -> Any:
        """Resolve a named destination.

        Tries the "Dests" name tree first and falls back to the catalog's
        /Dests dictionary when that lookup raises ``KeyError``.

        :raises PDFDestinationNotFound: if the fallback dictionary is absent
            or does not contain ``name``
        """
        try:
            # PDF-1.2 or later
            obj = self.lookup_name("Dests", name)
        except KeyError:
            # PDF-1.1 or prior
            if "Dests" not in self.catalog:
                raise PDFDestinationNotFound(name)
            d0 = dict_value(self.catalog["Dests"])
            if name not in d0:
                raise PDFDestinationNotFound(name)
            obj = d0[name]
        return obj

    # find_xref
    def find_xref(self, parser: PDFParser) -> int:
        """Internal function used to locate the first XRef.

        Scans the file backwards for a ``startxref`` line and returns the
        number found on the non-empty line that follows it in the file.

        :raises PDFNoValidXRef: if that line is not all digits, or no
            ``startxref`` line is found
        """
        # search the last xref table by scanning the file backwards.
        prev = b""
        for line in parser.revreadlines():
            line = line.strip()
            log.debug("find_xref: %r", line)

            if line == b"startxref":
                log.debug("xref found: pos=%r", prev)

                # bytes.isdigit() accepts only non-empty runs of ASCII
                # digits, so the parsed offset can never be negative.
                if not prev.isdigit():
                    raise PDFNoValidXRef(f"Invalid xref position: {prev!r}")

                return int(prev)

            if line:
                prev = line

        raise PDFNoValidXRef("Unexpected EOF")

    # read xref table
    def read_xref_from(
        self,
        parser: PDFParser,
        start: int,
        xrefs: list[PDFBaseXRef],
        visited: set[int] | None = None,
    ) -> None:
        """Reads XRefs from the given location.

        The section at ``start`` is appended to ``xrefs``; afterwards the
        trailer's /XRefStm and /Prev offsets are followed recursively.

        :param parser: parser to read from
        :param start: byte offset of the xref section
        :param xrefs: list the loaded sections are appended to
        :param visited: offsets already loaded during this traversal; callers
            normally leave it unset. An offset seen twice is skipped so that
            a circular /Prev or /XRefStm chain cannot recurse without bound.
        :raises PDFNoValidXRef: if no token can be read at ``start``
        """
        if visited is None:
            visited = set()
        if start in visited:
            log.warning("read_xref_from: circular xref chain at pos=%d", start)
            return
        visited.add(start)

        parser.seek(start)
        parser.reset()
        try:
            (pos, token) = parser.nexttoken()
        except PSEOF:
            raise PDFNoValidXRef("Unexpected EOF")
        log.debug("read_xref_from: start=%d, token=%r", start, token)
        if isinstance(token, int):
            # XRefStream: PDF-1.5
            parser.seek(pos)
            parser.reset()
            xref: PDFBaseXRef = PDFXRefStream()
            xref.load(parser)
        else:
            if token is parser.KEYWORD_XREF:
                parser.nextline()
            xref = PDFXRef()
            xref.load(parser)
        xrefs.append(xref)
        trailer = xref.get_trailer()
        log.debug("trailer: %r", trailer)
        if "XRefStm" in trailer:
            pos = int_value(trailer["XRefStm"])
            self.read_xref_from(parser, pos, xrefs, visited)
        if "Prev" in trailer:
            # find previous xref
            pos = int_value(trailer["Prev"])
            self.read_xref_from(parser, pos, xrefs, visited)


class PageLabels(NumberTree):
    """PageLabels from the document catalog.

    See Section 8.3.1 in the PDF Reference.
    """

    @property
    def labels(self) -> Iterator[str]:
        """Generate the label of every page, in page order.

        Each entry of ``self.values`` is a ``(start, label_dict)`` pair that
        opens a range; the last range is continued indefinitely, so the
        iterator never ends on its own.

        :raises PDFSyntaxError: in strict mode, when the first range does not
            start at page index 0
        """
        ranges = self.values

        # The tree must begin with page index 0
        begins_at_zero = len(ranges) != 0 and ranges[0][0] == 0
        if not begins_at_zero:
            if settings.STRICT:
                raise PDFSyntaxError("PageLabels is missing page index 0")
            # Try to cope, by assuming empty labels for the initial pages
            ranges.insert(0, (0, {}))

        for index, (start, raw_label_dict) in enumerate(ranges):
            label_dict = dict_value(raw_label_dict)
            style = label_dict.get("S")
            prefix = decode_text(str_value(label_dict.get("P", b"")))
            first_value = int_value(label_dict.get("St", 1))

            following = index + 1
            values: Iterable[int]
            if following == len(ranges):
                # This is the last specified range. It continues until the end
                # of the document.
                values = itertools.count(first_value)
            else:
                end, _ = ranges[following]
                values = range(first_value, first_value + (end - start))

            for value in values:
                yield prefix + self._format_page_label(value, style)

    @staticmethod
    def _format_page_label(value: int, style: Any) -> str:
        """Format page label value in a specific style"""
        if style is None:
            return ""
        if style is LIT("D"):  # Decimal arabic numerals
            return str(value)
        if style is LIT("R"):  # Uppercase roman numerals
            return format_int_roman(value).upper()
        if style is LIT("r"):  # Lowercase roman numerals
            return format_int_roman(value)
        if style is LIT("A"):  # Uppercase letters A-Z, AA-ZZ...
            return format_int_alpha(value).upper()
        if style is LIT("a"):  # Lowercase letters a-z, aa-zz...
            return format_int_alpha(value)
        log.warning("Unknown page label style: %r", style)
        return ""


================================================
FILE: babeldoc/pdfminer/pdfexceptions.py
================================================
from babeldoc.pdfminer.psexceptions import PSException


class PDFException(PSException):
    """Base class of the PDF-level exceptions defined in this module."""


class PDFTypeError(PDFException, TypeError):
    """PDF exception that can also be caught as a ``TypeError``."""


class PDFValueError(PDFException, ValueError):
    """PDF exception that can also be caught as a ``ValueError``."""


class PDFObjectNotFound(PDFException):
    """PDF exception signalling that a requested object could not be found."""


class PDFNotImplementedError(PDFException, NotImplementedError):
    """PDF exception that can also be caught as a ``NotImplementedError``."""


class PDFKeyError(PDFException, KeyError):
    """PDF exception that can also be caught as a ``KeyError``."""


class PDFEOFError(PDFException, EOFError):
    """PDF exception that can also be caught as an ``EOFError``."""


class PDFIOError(PDFException, IOError):
    """PDF exception that can also be caught as an ``IOError``."""


================================================
FILE: babeldoc/pdfminer/pdffont.py
================================================
import logging
import struct
from collections.abc import Iterable
from collections.abc import Iterator
from collections.abc import Mapping
from io import BytesIO
from typing import TYPE_CHECKING
from typing import Any
from typing import BinaryIO
from typing import cast
import freetype

from babeldoc.pdfminer.casting import safe_float
from babeldoc.pdfminer.casting import safe_rect_list
from babeldoc.pdfminer.cmapdb import CMap
from babeldoc.pdfminer.cmapdb import CMapBase
from babeldoc.pdfminer.cmapdb import CMapDB
from babeldoc.pdfminer.cmapdb import CMapParser
from babeldoc.pdfminer.cmapdb import FileUnicodeMap
from babeldoc.pdfminer.cmapdb import IdentityUnicodeMap
from babeldoc.pdfminer.cmapdb import UnicodeMap
from babeldoc.pdfminer.encodingdb import EncodingDB
from babeldoc.pdfminer.encodingdb import name2unicode
from babeldoc.pdfminer.fontmetrics import FONT_METRICS
from babeldoc.pdfminer.pdfexceptions import PDFException
from babeldoc.pdfminer.pdfexceptions import PDFKeyError
from babeldoc.pdfminer.pdfexceptions import PDFValueError
from babeldoc.pdfminer.pdftypes import PDFStream
from babeldoc.pdfminer.pdftypes import dict_value
from babeldoc.pdfminer.pdftypes import int_value
from babeldoc.pdfminer.pdftypes import list_value
from babeldoc.pdfminer.pdftypes import num_value
from babeldoc.pdfminer.pdftypes import resolve1
from babeldoc.pdfminer.pdftypes import resolve_all
from babeldoc.pdfminer.pdftypes import stream_value
from babeldoc.pdfminer.psexceptions import PSEOF
from babeldoc.pdfminer.psparser import KWD
from babeldoc.pdfminer.psparser import LIT
from babeldoc.pdfminer.psparser import PSKeyword
from babeldoc.pdfminer.psparser import PSLiteral
from babeldoc.pdfminer.psparser import PSStackParser
from babeldoc.pdfminer.psparser import literal_name
from babeldoc.pdfminer.utils import Matrix
from babeldoc.pdfminer.utils import Point
from babeldoc.pdfminer.utils import Rect
from babeldoc.pdfminer.utils import apply_matrix_norm
from babeldoc.pdfminer.utils import choplist
from babeldoc.pdfminer.utils import nunpack
from babeldoc.pdfminer import settings
from babeldoc.format.pdf.babelpdf.cmap import CharacterMap

if TYPE_CHECKING:
    from babeldoc.pdfminer.pdfinterp import PDFResourceManager

log = logging.getLogger(__name__)


def get_widths(seq: Iterable[object]) -> dict[str | int, float]:
    """Build a mapping of character widths for horizontal writing.

    ``seq`` is consumed item by item (each passed through ``resolve1``).
    Numbers are buffered; two shapes are recognised:

    * ``c [w1 w2 ...]`` - a list following a buffered number assigns the
      listed widths to consecutive codes starting at that number;
    * ``c_first c_last w`` - three numbers assign ``w`` to every code in the
      inclusive range, provided both bounds are ints.

    Anything else is logged as a warning and skipped.
    """
    widths: dict[int, float] = {}
    pending: list[float] = []
    for item in seq:
        item = resolve1(item)
        if isinstance(item, list):
            if not pending:
                # A list with no preceding start code carries no information.
                continue
            first_code = cast(int, pending[-1])
            widths.update(
                {first_code + offset: width for offset, width in enumerate(item)}
            )
            pending = []
            continue
        if not isinstance(item, (int, float)):  # != utils.isnumber(item)
            log.warning(
                f"Skipping invalid font width specification for {item} because it is not a number or a list"
            )
            continue
        pending.append(item)
        if len(pending) != 3:
            continue
        low, high, width = pending
        pending = []
        if isinstance(low, int) and isinstance(high, int):
            widths.update(dict.fromkeys(range(low, high + 1), width))
        else:
            log.warning(
                f"Skipping invalid font width specification for {low} to {high} because either of them is not an int"
            )
    return cast(dict[str | int, float], widths)


def get_widths2(seq: Iterable[object]) -> dict[int, tuple[float, Point]]:
    """Build a mapping of character widths for vertical writing.

    ``seq`` is consumed item by item. Numbers are buffered; two shapes are
    recognised:

    * ``c [w1 vx1 vy1 w2 vx2 vy2 ...]`` - a list following a buffered number
      is chopped into triples that are assigned to consecutive codes starting
      at that number;
    * ``c_first c_last w vx vy`` - five numbers assign ``(w, (vx, vy))`` to
      every code in the inclusive range.

    Like ``get_widths``, every item is passed through ``resolve1`` first so
    that indirect references inside the array are honoured, and a range whose
    bounds are not both ints is skipped with a warning instead of crashing
    in ``range()``.

    :param seq: items of the width array
    :returns: mapping of character code to ``(w, (vx, vy))``
    """
    widths: dict[int, tuple[float, Point]] = {}
    r: list[float] = []
    for v in seq:
        v = resolve1(v)
        if isinstance(v, list):
            if r:
                char1 = r[-1]
                for i, (w, vx, vy) in enumerate(choplist(3, v)):
                    widths[cast(int, char1) + i] = (w, (vx, vy))
                r = []
        elif isinstance(v, (int, float)):  # == utils.isnumber(v)
            r.append(v)
            if len(r) == 5:
                (char1, char2, w, vx, vy) = r
                if isinstance(char1, int) and isinstance(char2, int):
                    for i in range(char1, char2 + 1):
                        widths[i] = (w, (vx, vy))
                else:
                    log.warning(
                        "Skipping invalid vertical font width specification "
                        "for %r to %r because either of them is not an int",
                        char1,
                        char2,
                    )
                r = []
    return widths


class FontMetricsDB:
    """Lookup facade over the ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname: str) -> tuple[dict[str, object], dict[str, int]]:
        """Return the ``FONT_METRICS`` entry stored under ``fontname``.

        :raises KeyError: if ``fontname`` is not a key of the table
        """
        metrics = FONT_METRICS[fontname]
        return metrics


# int here means that we're not extending PSStackParser with additional types.
class Type1FontHeaderParser(PSStackParser[int]):
    """Stack parser that collects ``code -> name`` pairs stored with ``put``."""

    KEYWORD_BEGIN = KWD(b"begin")
    KEYWORD_END = KWD(b"end")
    KEYWORD_DEF = KWD(b"def")
    KEYWORD_PUT = KWD(b"put")
    KEYWORD_DICT = KWD(b"dict")
    KEYWORD_ARRAY = KWD(b"array")
    KEYWORD_READONLY = KWD(b"readonly")
    KEYWORD_FOR = KWD(b"for")

    def __init__(self, data: BinaryIO) -> None:
        PSStackParser.__init__(self, data)
        self._cid2unicode: dict[int, str] = {}

    def get_encoding(self) -> dict[int, str]:
        """Parse the font encoding.

        The Type1 font encoding maps character codes to character names. These
        character names could either be standard Adobe glyph names, or
        character names associated with custom CharStrings for this font. A
        CharString is a sequence of operations that describe how the character
        should be drawn. Names that ``name2unicode`` rejects with a
        ``KeyError`` are logged at debug level and left out of the mapping.

        Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format

        :returns mapping of character identifiers (cid's) to unicode characters
        """
        mapping = self._cid2unicode
        while True:
            try:
                cid, name = self.nextobject()
            except PSEOF:
                # Input exhausted: hand back everything gathered so far.
                return mapping
            try:
                mapping[cid] = name2unicode(cast(str, name))
            except KeyError as e:
                log.debug(str(e))

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """On ``put``, emit ``(key, name)`` if the top two stack items fit.

        The two topmost stack entries are popped; a result is recorded only
        when the first is an int and the second a literal. Every other
        keyword is ignored.
        """
        if token is not self.KEYWORD_PUT:
            return
        (_, key), (_, value) = self.pop(2)
        if isinstance(key, int) and isinstance(value, PSLiteral):
            self.add_results((key, literal_name(value)))


# Text contributed by each 4-bit value 0-14 when getdict() decodes a real
# number; index 13 has no text and is rejected by an assertion there.
NIBBLES = (*"0123456789", ".", "e", "e-", None, "-")

# Mapping of cmap names. Original cmap name is kept if not in the mapping.
# (missing reference for why DLIdent is mapped to Identity)
IDENTITY_ENCODER = {
    f"DLIdent-{direction}": f"Identity-{direction}" for direction in ("H", "V")
}


def getdict(data: bytes) -> dict[int, list[float | int]]:
    d: dict[int, list[float | int]] = {}
    fp = BytesIO(data)
    stack: list[float | int] = []
    while 1:
        c = fp.read(1)
        if not c:
            break
        b0 = ord(c)
        if b0 <= 21:
            d[b0] = stack
            stack = []
            continue
        if b0 == 30:
            s = ""
            loop = True
            while loop:
                b = ord(fp.read(1))
                for n in (b >> 4, b & 15):
                    if n == 15:
                        loop = False
                    else:
                        nibble = NIBBLES[n]
                        assert nibble is not None
                        s += nibble
            value = float(s)
        elif b0 >= 32 and b0 <= 246:
            value = b0 - 139
        else:
            b1 = ord(fp.read(1))
            if b0 >= 247 and b0 <= 250:
                value = ((b0 - 247) << 8) + b1 + 108
            elif b0 >= 251 and b0 <= 254:
                value = -((b0 - 251) << 8) - b1 - 108
            else:
                b2 = ord(fp.read(1))
                if b1 >= 128:
                    b1 -= 256
                if b0 == 28:
                    value = b1 << 8 | b2
                else:
                    value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0]
        stack.append(value)
    return d


class CFFFont:
    STANDARD_STRINGS = (
        ".notdef",
        "space",
        "exclam",
        "quotedbl",
        "numbersign",
        "dollar",
        "percent",
        "ampersand",
        "quoteright",
        "parenleft",
        "parenright",
        "asterisk",
        "plus",
        "comma",
        "hyphen",
        "period",
        "slash",
        "zero",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "colon",
        "semicolon",
        "less",
        "equal",
        "greater",
        "question",
        "at",
        "A",
        "B",
        "C",
        "D",
        "E",
        "F",
        "G",
        "H",
        "I",
        "J",
        "K",
        "L",
        "M",
        "N",
        "O",
        "P",
        "Q",
        "R",
        "S",
        "T",
        "U",
        "V",
        "W",
        "X",
        "Y",
        "Z",
        "bracketleft",
        "backslash",
        "bracketright",
        "asciicircum",
        "underscore",
        "quoteleft",
        "a",
        "b",
        "c",
        "d",
        "e",
        "f",
        "g",
        "h",
        "i",
        "j",
        "k",
        "l",
        "m",
        "n",
        "o",
        "p",
        "q",
        "r",
        "s",
        "t",
        "u",
        "v",
        "w",
        "x",
        "y",
        "z",
        "braceleft",
        "bar",
        "braceright",
        "asciitilde",
        "exclamdown",
        "cent",
        "sterling",
        "fraction",
        "yen",
        "florin",
        "section",
        "currency",
        "quotesingle",
        "quotedblleft",
        "guillemotleft",
        "guilsinglleft",
        "guilsinglright",
        "fi",
        "fl",
        "endash",
        "dagger",
        "daggerdbl",
        "periodcentered",
        "paragraph",
        "bullet",
        "quotesinglbase",
        "quotedblbase",
        "quotedblright",
        "guillemotright",
        "ellipsis",
        "perthousand",
        "questiondown",
        "grave",
        "acute",
        "circumflex",
        "tilde",
        "macron",
        "breve",
        "dotaccent",
        "dieresis",
        "ring",
        "cedilla",
        "hungarumlaut",
        "ogonek",
        "caron",
        "emdash",
        "AE",
        "ordfeminine",
        "Lslash",
        "Oslash",
        "OE",
        "ordmasculine",
        "ae",
        "dotlessi",
        "lslash",
        "oslash",
        "oe",
        "germandbls",
        "onesuperior",
        "logicalnot",
        "mu",
        "trademark",
        "Eth",
        "onehalf",
        "plusminus",
        "Thorn",
        "onequarter",
        "divide",
        "brokenbar",
        "degree",
        "thorn",
        "threequarters",
        "twosuperior",
        "registered",
        "minus",
        "eth",
        "multiply",
        "threesuperior",
        "copyright",
        "Aacute",
        "Acircumflex",
        "Adieresis",
        "Agrave",
        "Aring",
        "Atilde",
        "Ccedilla",
        "Eacute",
        "Ecircumflex",
        "Edieresis",
        "Egrave",
        "Iacute",
        "Icircumflex",
        "Idieresis",
        "Igrave",
        "Ntilde",
        "Oacute",
        "Ocircumflex",
        "Odieresis",
        "Ograve",
        "Otilde",
        "Scaron",
        "Uacute",
        "Ucircumflex",
        "Udieresis",
        "Ugrave",
        "Yacute",
        "Ydieresis",
        "Zcaron",
        "aacute",
        "acircumflex",
        "adieresis",
        "agrave",
        "aring",
        "atilde",
        "ccedilla",
        "eacute",
        "ecircumflex",
        "edieresis",
        "egrave",
        "iacute",
        "icircumflex",
        "idieresis",
        "igrave",
        "ntilde",
        "oacute",
        "ocircumflex",
        "odieresis",
        "ograve",
        "otilde",
        "scaron",
        "uacute",
        "ucircumflex",
        "udieresis",
        "ugrave",
        "yacute",
        "ydieresis",
        "zcaron",
        "exclamsmall",
        "Hungarumlautsmall",
        "dollaroldstyle",
        "dollarsuperior",
        "ampersandsmall",
        "Acutesmall",
        "parenleftsuperior",
        "parenrightsuperior",
        "twodotenleader",
        "onedotenleader",
        "zerooldstyle",
        "oneoldstyle",
        "twooldstyle",
        "threeoldstyle",
        "fouroldstyle",
        "fiveoldstyle",
        "sixoldstyle",
        "sevenoldstyle",
        "eightoldstyle",
        "nineoldstyle",
        "commasuperior",
        "threequartersemdash",
        "periodsuperior",
        "questionsmall",
        "asuperior",
        "bsuperior",
        "centsuperior",
        "dsuperior",
        "esuperior",
        "isuperior",
        "lsuperior",
        "msuperior",
        "nsuperior",
        "osuperior",
        "rsuperior",
        "ssuperior",
        "tsuperior",
        "ff",
        "ffi",
        "ffl",
        "parenleftinferior",
        "parenrightinferior",
        "Circumflexsmall",
        "hyphensuperior",
        "Gravesmall",
        "Asmall",
        "Bsmall",
        "Csmall",
        "Dsmall",
        "Esmall",
        "Fsmall",
        "Gsmall",
        "Hsmall",
        "Ismall",
        "Jsmall",
        "Ksmall",
        "Lsmall",
        "Msmall",
        "Nsmall",
        "Osmall",
        "Psmall",
        "Qsmall",
        "Rsmall",
        "Ssmall",
        "Tsmall",
        "Usmall",
        "Vsmall",
        "Wsmall",
        "Xsmall",
        "Ysmall",
        "Zsmall",
        "colonmonetary",
        "onefitted",
        "rupiah",
        "Tildesmall",
        "exclamdownsmall",
        "centoldstyle",
        "Lslashsmall",
        "Scaronsmall",
        "Zcaronsmall",
        "Dieresissmall",
        "Brevesmall",
        "Caronsmall",
        "Dotaccentsmall",
        "Macronsmall",
        "figuredash",
        "hypheninferior",
        "Ogoneksmall",
        "Ringsmall",
        "Cedillasmall",
        "questiondownsmall",
        "oneeighth",
        "threeeighths",
        "fiveeighths",
        "seveneighths",
        "onethird",
        "twothirds",
        "zerosuperior",
        "foursuperior",
        "fivesuperior",
        "sixsuperior",
        "sevensuperior",
        "eightsuperior",
        "ninesuperior",
        "zeroinferior",
        "oneinferior",
        "twoinferior",
        "threeinferior",
        "fourinferior",
        "fiveinferior",
        "sixinferior",
        "seveninferior",
        "eightinferior",
        "nineinferior",
        "centinferior",
        "dollarinferior",
        "periodinferior",
        "commainferior",
        "Agravesmall",
        "Aacutesmall",
        "Acircumflexsmall",
        "Atildesmall",
        "Adieresissmall",
        "Aringsmall",
        "AEsmall",
        "Ccedillasmall",
        "Egravesmall",
        "Eacutesmall",
        "Ecircumflexsmall",
        "Edieresissmall",
        "Igravesmall",
        "Iacutesmall",
        "Icircumflexsmall",
        "Idieresissmall",
        "Ethsmall",
        "Ntildesmall",
        "Ogravesmall",
        "Oacutesmall",
        "Ocircumflexsmall",
        "Otildesmall",
        "Odieresissmall",
        "OEsmall",
        "Oslashsmall",
        "Ugravesmall",
        "Uacutesmall",
        "Ucircumflexsmall",
        "Udieresissmall",
        "Yacutesmall",
        "Thornsmall",
        "Ydieresissmall",
        "001.000",
        "001.001",
        "001.002",
        "001.003",
        "Black",
        "Bold",
        "Book",
        "Light",
        "Medium",
        "Regular",
        "Roman",
        "Semibold",
    )

    class INDEX:
        """Random-access view over one CFF INDEX structure read from ``fp``.

        The constructor parses the entry count and the offset array, remembers
        where the entry data starts, and leaves ``fp`` positioned just past
        the INDEX so the next structure can be read. Entries are fetched
        lazily by seeking ``fp``, so the stream must stay open and seekable.
        """

        def __init__(self, fp: BinaryIO) -> None:
            """Parse the INDEX that starts at the current position of ``fp``.

            Raises:
                struct.error: if the stream ends inside the INDEX header.
            """
            self.fp = fp
            self.offsets: list[int] = []
            (count,) = struct.unpack(">H", self.fp.read(2))
            if count == 0:
                # An empty INDEX consists of the 2-byte count only: it has no
                # offSize byte and no offset array, so nothing more may be
                # consumed from the stream. One sentinel offset of 1 keeps
                # len() == 0 and ``base`` consistent with the non-empty case.
                self.offsets.append(1)
                self.base = self.fp.tell() - 1
                return
            (offsize,) = struct.unpack("B", self.fp.read(1))
            for _ in range(count + 1):
                self.offsets.append(nunpack(self.fp.read(offsize)))
            # Stored offsets are relative to the byte preceding the data area.
            self.base = self.fp.tell() - 1
            # Skip over the entry data so the caller can read what follows.
            self.fp.seek(self.base + self.offsets[-1])

        def __repr__(self) -> str:
            return "<INDEX: size=%d>" % len(self)

        def __len__(self) -> int:
            """Number of entries (one fewer than the number of offsets)."""
            return len(self.offsets) - 1

        def __getitem__(self, i: int) -> bytes:
            """Return the raw bytes of entry ``i``; moves the position of ``fp``."""
            self.fp.seek(self.base + self.offsets[i])
            return self.fp.read(self.offsets[i + 1] - self.offsets[i])

        def __iter__(self) -> Iterator[bytes]:
            return iter(self[i] for i in range(len(self)))

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Parse a CFF font program from ``fp``.

        Reads the header, the Name / Top DICT / String / Global Subr INDEXes,
        the CharStrings INDEX, and then builds the code<->gid and name<->gid
        lookup tables from the encoding and charset data referenced by the
        Top DICT.

        Args:
            name: Font name, stored as ``self.name``.
            fp: Seekable binary stream positioned at the start of the CFF data.

        Raises:
            PDFValueError: if the encoding or charset uses a format other than
                the ones handled here (encoding 0/1, charset 0/1).
        """
        self.name = name
        self.fp = fp
        # Header
        (_major, _minor, hdrsize, _offsize) = struct.unpack("BBBB", self.fp.read(4))
        # Skip any header bytes beyond the four just read.
        self.fp.read(hdrsize - 4)
        # Name INDEX
        self.name_index = self.INDEX(self.fp)
        # Top DICT INDEX
        self.dict_index = self.INDEX(self.fp)
        # String INDEX
        self.string_index = self.INDEX(self.fp)
        # Global Subr INDEX
        self.subr_index = self.INDEX(self.fp)
        # Top DICT DATA
        self.top_dict = getdict(self.dict_index[0])
        # NOTE(review): a missing operator falls back to offset 0 and is then
        # parsed as in-file data; the CFF spec appears to reserve small offsets
        # for predefined charsets/encodings -- confirm before relying on this.
        (charset_pos,) = self.top_dict.get(15, [0])
        (encoding_pos,) = self.top_dict.get(16, [0])
        (charstring_pos,) = self.top_dict.get(17, [0])
        # CharStrings
        self.fp.seek(cast(int, charstring_pos))
        self.charstring = self.INDEX(self.fp)
        self.nglyphs = len(self.charstring)
        # Encodings
        self.code2gid = {}
        self.gid2code = {}
        self.fp.seek(cast(int, encoding_pos))
        fmt = self.fp.read(1)
        if fmt == b"\x00":
            # Format 0
            (n,) = struct.unpack("B", self.fp.read(1))
            for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):
                self.code2gid[code] = gid
                self.gid2code[gid] = code
        elif fmt == b"\x01":
            # Format 1
            (n,) = struct.unpack("B", self.fp.read(1))
            code = 0
            for _ in range(n):
                (first, nleft) = struct.unpack("BB", self.fp.read(2))
                for gid in range(first, first + nleft + 1):
                    self.code2gid[code] = gid
                    self.gid2code[gid] = code
                    code += 1
        else:
            raise PDFValueError("unsupported encoding format: %r" % fmt)
        # Charsets
        self.name2gid = {}
        self.gid2name = {}
        self.fp.seek(cast(int, charset_pos))
        fmt = self.fp.read(1)
        if fmt == b"\x00":
            # Format 0
            n = self.nglyphs - 1
            for gid, sid in enumerate(
                cast(
                    tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n))
                ),
            ):
                # Glyph ids start at 1 here; gid 0 gets no entry.
                gid += 1
                sidname = self.getstr(sid)
                self.name2gid[sidname] = gid
                self.gid2name[gid] = sidname
        elif fmt == b"\x01":
            # Format 1
            (n,) = struct.unpack("B", self.fp.read(1))
            sid = 0
            for _ in range(n):
                (first, nleft) = struct.unpack("BB", self.fp.read(2))
                for gid in range(first, first + nleft + 1):
                    sidname = self.getstr(sid)
                    self.name2gid[sidname] = gid
                    self.gid2name[gid] = sidname
                    sid += 1
        else:
            # Format 2 is not handled either. It is rejected with a real
            # exception rather than an ``assert``, which ``python -O`` strips
            # and which would leave the charset tables silently empty.
            raise PDFValueError("unsupported charset format: %r" % fmt)

    def getstr(self, sid: int) -> str | bytes:
        # This returns str for one of the STANDARD_STRINGS but bytes otherwise,
        # and appears to be a needless source of type complexity.
        if sid < len(self.STANDARD_STRINGS):
            return self.STANDARD_STRINGS[sid]
        return self.string_index[sid - len(self.STANDARD_STRINGS)]


class TrueTypeFont:
    """Minimal reader for an embedded TrueType font program.

    The constructor only records the table directory; the character map is
    extracted on demand by ``create_unicode_map``.
    """

    class CMapNotFound(PDFException):
        """Raised when no character map can be obtained from the font."""

    def __init__(self, name: str, fp: BinaryIO) -> None:
        """Read the font type tag and the table directory from ``fp``.

        Args:
            name: Font name, stored as ``self.name``.
            fp: Binary stream positioned at the start of the font program.
        """
        self.name = name
        self.fp = fp
        # Maps a 4-byte table tag to its (offset, length) pair.
        self.tables: dict[bytes, tuple[int, int]] = {}
        self.fonttype = fp.read(4)
        try:
            (ntables, _1, _2, _3) = cast(
                tuple[int, int, int, int],
                struct.unpack(">HHHH", fp.read(8)),
            )
            for _ in range(ntables):
                (name_bytes, tsum, offset, length) = cast(
                    tuple[bytes, int, int, int],
                    struct.unpack(">4sLLL", fp.read(16)),
                )
                self.tables[name_bytes] = (offset, length)
        except struct.error:
            # Do not fail if there are not enough bytes to read. Even for
            # corrupted PDFs we would like to get as much information as
            # possible, so continue.
            pass

    def create_unicode_map(self) -> FileUnicodeMap:
        """Build a unicode map from the character map FreeType reports.

        Returns:
            A ``FileUnicodeMap`` with one entry per (char, gid) pair returned
            by the FreeType face.

        Raises:
            TrueTypeFont.CMapNotFound: if the font has no ``cmap`` table or
                FreeType cannot load the font or enumerate its characters.
        """
        if b"cmap" not in self.tables:
            raise TrueTypeFont.CMapNotFound
        fp = self.fp
        try:
            # __init__ left the stream positioned after the table directory.
            # freetype-py reads a stream from its current position, so rewind
            # to hand it the complete font program rather than a headless tail.
            fp.seek(0)
            face = freetype.Face(fp)
            char2gid = list(face.get_chars())
        except Exception as exc:
            # Keep the best-effort contract (any failure means "no cmap") but
            # chain the cause so the real reason remains visible in tracebacks.
            raise TrueTypeFont.CMapNotFound from exc
        # create unicode map
        unicode_map = FileUnicodeMap()
        for char, gid in char2gid:
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map


class PDFFontError(PDFException):
    """Raised when a font dictionary is missing or has invalid required data."""


class PDFUnicodeNotDefined(PDFFontError):
    """Raised by ``to_unichr`` when a character id has no unicode mapping."""


# Name literals used when interpreting font dictionaries.
# LITERAL_STANDARD_ENCODING is the fallback for simple fonts whose spec has
# no /Encoding entry (or whose encoding dict has no /BaseEncoding).
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
LITERAL_TYPE1C = LIT("Type1C")

# Font widths are maintained in a dict type that maps from *either* unicode
# chars or integer character IDs.
FontWidthDict = dict[int | str, float]


class PDFFont:
    """Base class holding the metrics shared by every PDF font kind.

    Subclasses supply the character-id to unicode mapping (``to_unichr``) and
    may override ``decode``, ``is_vertical``, ``is_multibyte`` and
    ``char_disp``.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        default_width: float | None = None,
    ) -> None:
        """Populate font metrics from a descriptor and a width table.

        Args:
            descriptor: Font descriptor dictionary; missing numeric entries
                default to 0 and a missing ``FontName`` to ``"unknown"``.
            widths: Glyph widths keyed by character id or unicode string.
            default_width: Width for characters absent from ``widths``. When
                ``None``, the descriptor's ``MissingWidth`` (default 0) is
                used instead.
        """
        self.descriptor = descriptor
        self.widths: FontWidthDict = resolve_all(widths)
        self.fontname = resolve1(descriptor.get("FontName", "unknown"))
        if isinstance(self.fontname, PSLiteral):
            self.fontname = literal_name(self.fontname)
        self.flags = int_value(descriptor.get("Flags", 0))
        self.ascent = num_value(descriptor.get("Ascent", 0))
        self.descent = num_value(descriptor.get("Descent", 0))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
        if default_width is None:
            self.default_width = num_value(descriptor.get("MissingWidth", 0))
        else:
            self.default_width = default_width
        # A caller-supplied default may still be an indirect reference.
        self.default_width = resolve1(self.default_width)
        self.leading = num_value(descriptor.get("Leading", 0))
        self.bbox = self._parse_bbox(descriptor)
        # Scale factors applied to the stored metrics by the accessors below.
        self.hscale = self.vscale = 0.001

        # PDF RM 9.8.1 specifies /Descent should always be a negative number.
        # PScript5.dll seems to produce Descent with a positive number, but
        # text analysis will be wrong if this is taken as correct. So force
        # descent to negative.
        if self.descent > 0:
            self.descent = -self.descent

    def __repr__(self) -> str:
        return "<PDFFont>"

    def is_vertical(self) -> bool:
        """Whether the font is written vertically; always False here."""
        return False

    def is_multibyte(self) -> bool:
        """Whether character codes span several bytes; always False here."""
        return False

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Split a byte string into character ids, one id per byte."""
        return bytearray(bytes)  # map(ord, bytes)

    def get_ascent(self) -> float:
        """Ascent above the baseline, in text space units"""
        return self.ascent * self.vscale

    def get_descent(self) -> float:
        """Descent below the baseline, in text space units; always negative"""
        return self.descent * self.vscale

    def get_width(self) -> float:
        """Bounding-box width scaled by ``hscale``.

        When the box has zero width, the negated default width is used
        instead.
        """
        w = self.bbox[2] - self.bbox[0]
        if w == 0:
            w = -self.default_width
        return w * self.hscale

    def get_height(self) -> float:
        """Bounding-box height scaled by ``vscale``.

        When the box has zero height, ``ascent - descent`` is used instead.
        """
        h = self.bbox[3] - self.bbox[1]
        if h == 0:
            h = self.ascent - self.descent
        return h * self.vscale

    def char_width(self, cid: int) -> float:
        """Return the width of character ``cid`` scaled by ``hscale``.

        Falls back to ``default_width`` when neither the id nor its unicode
        string has an entry in ``widths``.
        """
        # Because character widths may be mapping either IDs or strings,
        # we try to lookup the character ID first, then its str equivalent.
        cid_width = safe_float(self.widths.get(cid))
        if cid_width is not None:
            return cid_width * self.hscale

        try:
            str_cid = self.to_unichr(cid)
            cid_width = safe_float(self.widths.get(str_cid))
            if cid_width is not None:
                return cid_width * self.hscale

        except PDFUnicodeNotDefined:
            pass

        return self.default_width * self.hscale

    def char_disp(self, cid: int) -> float | tuple[float | None, float]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return 0

    def string_width(self, s: bytes) -> float:
        """Sum of ``char_width`` over every character id decoded from ``s``."""
        return sum(self.char_width(cid) for cid in self.decode(s))

    def to_unichr(self, cid: int) -> str:
        """Map a character id to unicode; must be provided by subclasses."""
        raise NotImplementedError

    @staticmethod
    def _parse_bbox(descriptor: Mapping[str, Any]) -> Rect:
        """Parse FontBBox from the fonts descriptor

        Returns an all-zero box (and logs a warning) when the entry is missing
        or cannot be parsed as four floats.
        """
        font_bbox = resolve_all(descriptor.get("FontBBox"))
        bbox = safe_rect_list(font_bbox)
        if bbox is None:
            log.warning(
                "Could not get FontBBox from font descriptor because %r "
                "cannot be parsed as 4 floats",
                font_bbox,
            )
            return 0.0, 0.0, 0.0, 0.0
        return bbox


class PDFSimpleFont(PDFFont):
    """Single-byte font whose codes map to unicode through an encoding table,
    optionally overridden by a ``ToUnicode`` CMap.
    """

    def __init__(
        self,
        descriptor: Mapping[str, Any],
        widths: FontWidthDict,
        spec: Mapping[str, Any],
    ) -> None:
        """Build the encoding table and optional unicode map, then the metrics."""
        # The encoding is given either as the name of a built-in encoding or
        # as a dictionary listing differences from a base encoding; with no
        # entry at all, the standard encoding applies.
        encoding = (
            resolve1(spec["Encoding"])
            if "Encoding" in spec
            else LITERAL_STANDARD_ENCODING
        )
        if not isinstance(encoding, dict):
            self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
        else:
            base_name = literal_name(
                encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING)
            )
            differences = list_value(encoding.get("Differences", []))
            self.cid2unicode = EncodingDB.get_encoding(base_name, differences)

        self.unicode_map: UnicodeMap | None = None
        if "ToUnicode" in spec:
            tounicode_stream = stream_value(spec["ToUnicode"])
            self.unicode_map = FileUnicodeMap()
            cmap_source = BytesIO(tounicode_stream.get_data())
            CMapParser(self.unicode_map, cmap_source).run()

        PDFFont.__init__(self, descriptor, widths)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode string for ``cid``.

        The ``ToUnicode`` map is consulted first, then the encoding table.

        Raises:
            PDFUnicodeNotDefined: if neither source has an entry for ``cid``.
        """
        umap = self.unicode_map
        if umap:
            try:
                return umap.get_unichr(cid)
            except KeyError:
                # Not covered by ToUnicode: fall back to the encoding table.
                pass
        try:
            char = self.cid2unicode[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
        return char


class PDFType1Font(PDFSimpleFont):
    """Type 1 font; metrics come from the built-in metrics database when the
    base font is known there, otherwise from the font dictionary itself.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Create the font from its dictionary ``spec``.

        Raises:
            PDFFontError: in strict mode, if ``BaseFont`` is missing.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if settings.STRICT:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"

        widths: FontWidthDict
        try:
            (descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont)
            widths = cast(dict[str | int, float], int_widths)  # implicit int->float
        except KeyError:
            # Not a font with built-in metrics: use the dictionary's own
            # descriptor and its /Widths array, indexed from /FirstChar.
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            firstchar = int_value(spec.get("FirstChar", 0))
            # lastchar = int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get("Widths", [0] * 256))
            widths = {i + firstchar: resolve1(w) for (i, w) in enumerate(width_list)}
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        if "Encoding" not in spec and "FontFile" in descriptor:
            # try to recover the missing encoding info from the font file.
            self.fontfile = stream_value(descriptor.get("FontFile"))
            length1 = int_value(self.fontfile["Length1"])
            data = self.fontfile.get_data()[:length1]
            # awcm: quickfix for type 1 font which contains bad string literals
            # Start parsing at the /Encoding token when there is one. find()
            # returns -1 when the token is absent, in which case parsing
            # starts at the beginning instead of raising ValueError as
            # bytes.index() would.
            offset = max(data.find(b"/Encoding"), 0)
            parser = Type1FontHeaderParser(BytesIO(data[offset:]))
            self.cid2unicode = parser.get_encoding()

    def __repr__(self) -> str:
        return "<PDFType1Font: basefont=%r>" % self.basefont


class PDFTrueTypeFont(PDFType1Font):
    """TrueType simple font; constructed exactly like ``PDFType1Font``."""

    def __repr__(self) -> str:
        return f"<PDFTrueTypeFont: basefont={self.basefont!r}>"


class PDFType3Font(PDFSimpleFont):
    """Type 3 font: ascent/descent come from the bounding box and the scale
    factors from the font matrix.
    """

    def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
        """Create the font from its dictionary ``spec``."""
        first_code = int_value(spec.get("FirstChar", 0))
        # lastchar = int_value(spec.get('LastChar', 0))
        raw_widths = list_value(spec.get("Widths", [0] * 256))
        widths: dict[str | int, float] = {
            code: width for code, width in enumerate(raw_widths, first_code)
        }
        # With no descriptor present, synthesize one from the font's own
        # /FontBBox.
        if "FontDescriptor" not in spec:
            descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
        else:
            descriptor = dict_value(spec["FontDescriptor"])
        PDFSimpleFont.__init__(self, descriptor, widths, spec)
        matrix_values = list_value(spec.get("FontMatrix"))
        self.matrix = cast(Matrix, tuple(matrix_values))
        _x0, y0, _x1, y1 = self.bbox
        self.descent = y0
        self.ascent = y1
        scale = apply_matrix_norm(self.matrix, (1, 1))
        (self.hscale, self.vscale) = scale

    def __repr__(self) -> str:
        return "<PDFType3Font>"


class PDFCIDFont(PDFFont):
    """CID-keyed (multi-byte) font.

    Byte strings are turned into character ids by a CMap (or by an embedded
    encoding stream when one is present), and ids are turned into unicode by
    a ``ToUnicode`` map, an embedded TrueType cmap, or a predefined map chosen
    from the CID system info.
    """

    default_disp: float | tuple[float | None, float]

    def __init__(
        self,
        rsrcmgr: "PDFResourceManager",
        spec: Mapping[str, Any],
        strict: bool = settings.STRICT,
    ) -> None:
        """Create the font from its dictionary ``spec``.

        Args:
            rsrcmgr: Resource manager; not used by this constructor.
            spec: Font dictionary.
            strict: When true, missing ``BaseFont``, ``FontDescriptor`` or
                ``Encoding`` entries raise instead of falling back to defaults.

        Raises:
            PDFFontError: in strict mode, for the missing entries above or an
                unknown CMap.
        """
        try:
            self.basefont = literal_name(spec["BaseFont"])
        except KeyError:
            if strict:
                raise PDFFontError("BaseFont is missing")
            self.basefont = "unknown"
        self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
        cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode(
            "latin1",
        )
        cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode(
            "latin1",
        )
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
        self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            if strict:
                raise PDFFontError("FontDescriptor is missing")
            descriptor = {}
        ttf = None
        # An /Encoding given as a stream is parsed into a CharacterMap that
        # decode() tries before falling back to the regular CMap.
        self.has_encoding = False
        self.cid_encoding = None
        try:
            if "Encoding" in spec:
                encoding_part = resolve1(spec["Encoding"])
                if isinstance(encoding_part, PDFStream):
                    self.has_encoding = True
                    self.cid_encoding = CharacterMap(
                        encoding_part.get_data().decode("U8")
                    )
        except Exception as e:
            log.error(f"Error get cid_encoding from spec: {e}")
            self.has_encoding = False
            self.cid_encoding = None
        if "FontFile2" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
        self.unicode_map: UnicodeMap | None = None
        if "ToUnicode" in spec:
            if isinstance(spec["ToUnicode"], PDFStream):
                strm = stream_value(spec["ToUnicode"])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
            else:
                cmap_name = literal_name(spec["ToUnicode"])
                # /Encoding may be absent in non-strict mode (its absence is
                # tolerated when the CMap is chosen above), so do not let a
                # missing key abort font construction here.
                encoding = literal_name(spec["Encoding"]) if "Encoding" in spec else ""
                if (
                    "Identity" in cid_ordering
                    or "Identity" in cmap_name
                    or "Identity" in encoding
                ):
                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
            if ttf:
                try:
                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )
            except CMapDB.CMapNotFound:
                pass

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            widths2 = get_widths2(list_value(spec.get("W2", [])))
            self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
            (vy, w) = resolve1(spec.get("DW2", [880, -1000]))
            self.default_disp = (None, vy)
            widths: dict[str | int, float] = {
                cid: w for (cid, (w, _)) in widths2.items()
            }
            default_width = w
        else:
            # writing mode: horizontal
            self.disps = {}
            self.default_disp = 0
            widths = get_widths(list_value(spec.get("W", [])))
            default_width = spec.get("DW", 1000)
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are mentioned with different name
        such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.

        In non-strict mode an unknown CMap name yields an empty ``CMap``.
        """
        cmap_name = self._get_cmap_name(spec, strict)

        try:
            return CMapDB.get_cmap(cmap_name)
        except CMapDB.CMapNotFound as e:
            if strict:
                raise PDFFontError(e)
            return CMap()

    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification

        Returns ``"unknown"`` when no encoding is given in non-strict mode;
        the result is passed through ``IDENTITY_ENCODER`` to normalise
        alternative names.
        """
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            if hasattr(spec_encoding, "name"):
                cmap_name = literal_name(spec["Encoding"])
            else:
                cmap_name = literal_name(spec_encoding["CMapName"])
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified")

        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            elif strict:
                raise PDFFontError("CMapName unspecified for encoding")

        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def __repr__(self) -> str:
        return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def is_vertical(self) -> bool:
        """Whether the selected CMap uses vertical writing mode."""
        return self.vertical

    def is_multibyte(self) -> bool:
        """CID fonts always use multi-byte character codes."""
        return True

    def decode(self, bytes: bytes) -> Iterable[int]:
        """Decode a byte string into character ids.

        The embedded encoding stream, when present, is tried first and its
        result is used only if every id is positive; any failure is logged
        and the regular CMap is used instead.
        """
        try:
            if self.has_encoding:
                res = self.cid_encoding.decode(bytes)

                if res is not None and all(x > 0 for x in res):
                    return res
        except Exception as e:
            log.error(f"Error use cid_encoding to decode bytes: {e}")
        return self.cmap.decode(bytes)

    def char_disp(self, cid: int) -> float | tuple[float | None, float]:
        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid: int) -> str:
        """Return the unicode string for ``cid``.

        Raises:
            PDFUnicodeNotDefined: if there is no unicode map or it has no
                entry for ``cid``.
        """
        try:
            if not self.unicode_map:
                raise PDFKeyError(cid)
            return self.unicode_map.get_unichr(cid)
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)


================================================
FILE: babeldoc/pdfminer/pdfinterp.py
================================================
import logging
import re
from collections.abc import Mapping
from collections.abc import Sequence
from io import BytesIO
from typing import Union
from typing import cast

from babeldoc.pdfminer.casting import safe_cmyk
from babeldoc.pdfminer.casting import safe_float
from babeldoc.pdfminer.casting import safe_int
from babeldoc.pdfminer.casting import safe_matrix
from babeldoc.pdfminer.casting import safe_rgb
from babeldoc.pdfminer.cmapdb import CMap
from babeldoc.pdfminer.cmapdb import CMapBase
from babeldoc.pdfminer.cmapdb import CMapDB
from babeldoc.pdfminer.pdfcolor import PREDEFINED_COLORSPACE
from babeldoc.pdfminer.pdfcolor import PDFColorSpace
from babeldoc.pdfminer.pdfdevice import PDFDevice
from babeldoc.pdfminer.pdfdevice import PDFTextSeq
from babeldoc.pdfminer.pdfexceptions import PDFException
from babeldoc.pdfminer.pdfexceptions import PDFValueError
from babeldoc.pdfminer.pdffont import PDFCIDFont
from babeldoc.pdfminer.pdffont import PDFFont
from babeldoc.pdfminer.pdffont import PDFFontError
from babeldoc.pdfminer.pdffont import PDFTrueTypeFont
from babeldoc.pdfminer.pdffont import PDFType1Font
from babeldoc.pdfminer.pdffont import PDFType3Font
from babeldoc.pdfminer.pdfpage import PDFPage
from babeldoc.pdfminer.pdftypes import LITERALS_ASCII85_DECODE
from babeldoc.pdfminer.pdftypes import PDFObjRef
from babeldoc.pdfminer.pdftypes import PDFStream
from babeldoc.pdfminer.pdftypes import dict_value
from babeldoc.pdfminer.pdftypes import list_value
from babeldoc.pdfminer.pdftypes import resolve1
from babeldoc.pdfminer.pdftypes import stream_value
from babeldoc.pdfminer.psexceptions import PSEOF
from babeldoc.pdfminer.psexceptions import PSTypeError
from babeldoc.pdfminer.psparser import KWD
from babeldoc.pdfminer.psparser import LIT
from babeldoc.pdfminer.psparser import PSKeyword
from babeldoc.pdfminer.psparser import PSLiteral
from babeldoc.pdfminer.psparser import PSStackParser
from babeldoc.pdfminer.psparser import PSStackType
from babeldoc.pdfminer.psparser import keyword_name
from babeldoc.pdfminer.psparser import literal_name
from babeldoc.pdfminer.utils import MATRIX_IDENTITY, apply_matrix_pt
from babeldoc.pdfminer.utils import Matrix
from babeldoc.pdfminer.utils import PathSegment
from babeldoc.pdfminer.utils import Point
from babeldoc.pdfminer.utils import Rect
from babeldoc.pdfminer.utils import choplist
from babeldoc.pdfminer.utils import mult_matrix
from babeldoc.pdfminer import settings

# Module-level logger, named after this module.
log = logging.getLogger(__name__)


class PDFResourceError(PDFException):
    """PDF exception subtype reserved for resource-related failures."""


class PDFInterpreterError(PDFException):
    """PDF exception subtype reserved for content-interpreter failures."""


# Name literals this module compares against by identity (``is``), e.g. the
# procedure-set names in get_procset and the /Type check in get_font.
LITERAL_PDF = LIT("PDF")
LITERAL_TEXT = LIT("Text")
LITERAL_FONT = LIT("Font")
LITERAL_FORM = LIT("Form")
LITERAL_IMAGE = LIT("Image")


class PDFTextState:
    """Mutable bundle of text-state values (font, size, spacing, matrices).

    ``matrix`` and ``linematrix`` are (re)initialised by ``reset``; the other
    attributes keep their values across resets.
    """

    matrix: Matrix
    linematrix: Point

    def __init__(self) -> None:
        self.font: PDFFont | None = None
        self.fontsize: float = 0
        self.charspace: float = 0
        self.wordspace: float = 0
        self.scaling: float = 100
        self.leading: float = 0
        self.render: int = 0
        self.rise: float = 0
        # Declared up front so a fresh state exposes the same attributes as
        # one produced by copy(), which always carries ``font_id``.
        self.font_id: object | None = None
        self.reset()
        # self.matrix is set
        # self.linematrix is set

    def __repr__(self) -> str:
        return (
            "<PDFTextState: font=%r, fontsize=%r, charspace=%r, "
            "wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, "
            "matrix=%r, linematrix=%r>"
            % (
                self.font,
                self.fontsize,
                self.charspace,
                self.wordspace,
                self.scaling,
                self.leading,
                self.render,
                self.rise,
                self.matrix,
                self.linematrix,
            )
        )

    def copy(self) -> "PDFTextState":
        """Return a new state carrying the same values (a shallow copy)."""
        obj = PDFTextState()
        obj.font = self.font
        obj.fontsize = self.fontsize
        obj.charspace = self.charspace
        obj.wordspace = self.wordspace
        obj.scaling = self.scaling
        obj.leading = self.leading
        obj.render = self.render
        obj.rise = self.rise
        obj.matrix = self.matrix
        obj.linematrix = self.linematrix
        # getattr keeps this tolerant of instances created without __init__.
        obj.font_id = getattr(self, "font_id", None)
        return obj

    def reset(self) -> None:
        """Restore the text matrix to identity and the line matrix to (0, 0)."""
        self.matrix = MATRIX_IDENTITY
        self.linematrix = (0, 0)


# A colour value as stored in PDFGraphicState.scolor / ncolor: a single
# float or a 3- or 4-tuple of floats, depending on the colour space.
Color = Union[
    float,  # Greyscale
    tuple[float, float, float],  # R, G, B
    tuple[float, float, float, float],  # C, M, Y, K
]


class PDFGraphicState:
    """Plain container for the graphics-state values tracked per page:
    line style settings plus the stroking and non-stroking colours.
    """

    def __init__(self) -> None:
        # Line style parameters; unset until given a value.
        self.linewidth: float = 0
        self.linecap: object | None = None
        self.linejoin: object | None = None
        self.miterlimit: object | None = None
        self.dash: tuple[object, object] | None = None
        self.intent: object | None = None
        self.flatness: object | None = None

        # stroking color
        self.scolor: Color | None = None

        # non stroking color
        self.ncolor: Color | None = None

    def copy(self) -> "PDFGraphicState":
        """Return a new state holding the same values (a shallow copy)."""
        clone = PDFGraphicState()
        for field in (
            "linewidth",
            "linecap",
            "linejoin",
            "miterlimit",
            "dash",
            "intent",
            "flatness",
            "scolor",
            "ncolor",
        ):
            setattr(clone, field, getattr(self, field))
        return clone

    def __repr__(self) -> str:
        template = (
            "<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, "
            " miterlimit=%r, dash=%r, intent=%r, flatness=%r, "
            " stroking color=%r, non stroking color=%r>"
        )
        values = tuple(
            getattr(self, field)
            for field in (
                "linewidth",
                "linecap",
                "linejoin",
                "miterlimit",
                "dash",
                "intent",
                "flatness",
                "scolor",
                "ncolor",
            )
        )
        return template % values


class PDFResourceManager:
    """Shared store for resources that should be built only once.

    Font objects are kept in a per-manager dictionary keyed by object id, so a
    font that is requested again under the same id is returned from the cache
    instead of being constructed a second time.
    """

    def __init__(self, caching: bool = True) -> None:
        """Create a manager; ``caching`` controls whether new fonts are stored."""
        self.caching = caching
        self._cached_fonts: dict[object, PDFFont] = {}

    def get_procset(self, procs: Sequence[object]) -> None:
        """Walk the procedure-set names; no entry has any effect."""
        for proc in procs:
            if proc is not LITERAL_PDF and proc is not LITERAL_TEXT:
                # Other procedure sets are ignored as well.
                continue

    def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
        """Look up a CMap by name.

        :param cmapname: name handed to ``CMapDB.get_cmap``.
        :param strict: when true, a missing CMap re-raises
            ``CMapDB.CMapNotFound``; otherwise an empty ``CMap`` is returned.
        """
        try:
            cmap = CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if not strict:
                return CMap()
            raise
        return cmap

    def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
        """Return the font for ``spec``, reusing a cached instance when possible.

        :param objid: object id used as the cache key; a falsy id disables
            both the cache lookup and the cache store for this call.
        :param spec: the font dictionary.
        :raises PDFFontError: in strict mode, when ``Type`` is not the font
            literal, ``Subtype`` is missing, or the subtype is not recognised.
        """
        # Fast path: this object id has already been turned into a font.
        if objid and objid in self._cached_fonts:
            return self._cached_fonts[objid]

        log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
        if settings.STRICT and spec["Type"] is not LITERAL_FONT:
            raise PDFFontError("Type is not /Font")

        # Work out which kind of font object to build.
        if "Subtype" in spec:
            subtype = literal_name(spec["Subtype"])
        elif settings.STRICT:
            raise PDFFontError("Font Subtype is not specified.")
        else:
            subtype = "Type1"

        if subtype == "Type0":
            # Composite font: build the first descendant, carrying over the
            # parent's Encoding / ToUnicode entries when present.
            dfonts = list_value(spec["DescendantFonts"])
            assert dfonts
            subspec = dict_value(dfonts[0]).copy()
            for k in ("Encoding", "ToUnicode"):
                if k in spec:
                    subspec[k] = resolve1(spec[k])
            font = self.get_font(None, subspec)
        elif subtype in ("CIDFontType0", "CIDFontType2"):
            font = PDFCIDFont(self, spec)
        elif subtype == "Type3":
            font = PDFType3Font(self, spec)
        elif subtype == "TrueType":
            font = PDFTrueTypeFont(self, spec)
        elif subtype in ("Type1", "MMType1"):
            font = PDFType1Font(self, spec)
        else:
            if settings.STRICT:
                raise PDFFontError("Invalid Font spec: %r" % spec)
            font = PDFType1Font(self, spec)  # this is so wrong!

        if objid and self.caching:
            self._cached_fonts[objid] = font
        return font


class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
    """Stack parser that reads a sequence of content streams as one input.

    Streams are opened lazily, one after another, as the buffer runs dry.
    Inline images (``BI ... ID <data> EI``) are collapsed into a single
    ``PDFStream`` object that is pushed onto the parser stack.
    """

    def __init__(self, streams: Sequence[object]) -> None:
        """Remember the streams to parse; nothing is opened yet."""
        self.streams = streams
        self.istream = 0
        # PSStackParser.__init__(fp=None) is safe only because we've overloaded
        # all the methods that would attempt to access self.fp without first
        # calling self.fillfp().
        PSStackParser.__init__(self, None)  # type: ignore[arg-type]

    def fillfp(self) -> None:
        """Open the next stream as ``self.fp`` if no stream is currently open.

        :raises PSEOF: when all streams have already been consumed.
        """
        if not self.fp:
            if self.istream < len(self.streams):
                strm = stream_value(self.streams[self.istream])
                self.istream += 1
            else:
                raise PSEOF("Unexpected EOF, file truncated?")
            self.fp = BytesIO(strm.get_data())

    def seek(self, pos: int) -> None:
        """Make sure a stream is open, then delegate to the base ``seek``."""
        self.fillfp()
        PSStackParser.seek(self, pos)

    def fillbuf(self) -> None:
        """Refill the read buffer, moving on to the next stream when needed."""
        if self.charpos < len(self.buf):
            return
        while True:
            self.fillfp()
            self.bufpos = self.fp.tell()
            self.buf = self.fp.read(self.BUFSIZ)
            if self.buf:
                break
            # Current stream is exhausted: drop it so fillfp() opens the next.
            self.fp = None  # type: ignore[assignment]
        self.charpos = 0

    def get_inline_data(self, pos: int, target: bytes = b"EI") -> tuple[int, bytes]:
        """Collect raw inline-image bytes up to ``target`` plus one whitespace.

        ``i`` counts how far the terminator has been matched: ``0`` means
        "searching for the first byte of ``target``", ``1..len(target)`` means
        that many bytes of ``target`` have matched, and the loop ends once the
        byte following a complete ``target`` is whitespace.

        :param pos: position to seek to before scanning.
        :param target: terminator byte sequence.
        :returns: ``(pos, data)`` where ``data`` has the terminator, the
            whitespace byte after it, and one trailing end-of-line removed.
        """
        self.seek(pos)
        i = 0
        data = b""
        while i <= len(target):
            self.fillbuf()
            if i:
                # Mid-match: consume exactly one byte and test it.
                ci = self.buf[self.charpos]
                c = bytes((ci,))
                data += c
                self.charpos += 1
                if (
                    len(target) <= i
                    and c.isspace()
                    or i < len(target)
                    and c == (bytes((target[i],)))
                ):
                    i += 1
                else:
                    i = 0
            else:
                # Not matching yet: jump straight to the next occurrence of
                # the terminator's first byte, or swallow the whole buffer.
                try:
                    j = self.buf.index(target[0], self.charpos)
                    data += self.buf[self.charpos : j + 1]
                    self.charpos = j + 1
                    i = 1
                except ValueError:
                    data += self.buf[self.charpos :]
                    self.charpos = len(self.buf)
        data = data[: -(len(target) + 1)]  # strip the last part
        data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
        return (pos, data)

    def flush(self) -> None:
        """Move everything on the parser stack into the results."""
        self.add_results(*self.popall())

    KEYWORD_BI = KWD(b"BI")
    KEYWORD_ID = KWD(b"ID")
    KEYWORD_EI = KWD(b"EI")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword token, folding inline images into one stream object.

        ``BI`` opens an "inline" context; ``ID`` closes it, turns the collected
        key/value pairs into a dictionary, reads the image bytes and pushes a
        ``PDFStream``. Any other keyword is pushed unchanged.

        :raises PSTypeError: only in strict mode; otherwise a malformed inline
            image dictionary is skipped.
        """
        if token is self.KEYWORD_BI:
            # inline image within a content stream
            self.start_type(pos, "inline")
        elif token is self.KEYWORD_ID:
            try:
                (_, objs) = self.end_type("inline")
                if len(objs) % 2 != 0:
                    error_msg = f"Invalid dictionary construct: {objs!r}"
                    raise PSTypeError(error_msg)
                d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
                eos = b"EI"
                inline_filter = d.get("F", None)
                if inline_filter is not None:
                    if isinstance(inline_filter, PSLiteral):
                        inline_filter = [inline_filter]
                    # Guard against an empty or non-sequence filter value,
                    # which previously raised IndexError / TypeError here.
                    if (
                        isinstance(inline_filter, (list, tuple))
                        and inline_filter
                        and inline_filter[0] in LITERALS_ASCII85_DECODE
                    ):
                        eos = b"~>"
                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
                if eos != b"EI":  # it may be necessary for decoding
                    data += eos
                obj = PDFStream(d, data)
                self.push((pos, obj))
                if eos == b"EI":  # otherwise it is still in the stream
                    self.push((pos, self.KEYWORD_EI))
            except PSTypeError:
                if settings.STRICT:
                    raise
        else:
            self.push((pos, token))


# Element type of the interpreter's argument stack: the generic parser stack
# type specialised with PDFStream.
PDFStackT = PSStackType[PDFStream]
"""Types that may appear on the PDF argument stack."""


class PDFPageInterpreter:
    """Processor for the content of a PDF page

    Reference: PDF Reference, Appendix A, Operator Summary
    """

    def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice) -> None:
        self.rsrcmgr = rsrcmgr
        self.device = device

    def dup(self) -> "PDFPageInterpreter":
        return self.__class__(self.rsrcmgr, self.device)

    def init_resources(self, resources: dict[object, object]) -> None:
        """Build the font, colour-space and XObject tables from ``resources``.

        The tables are always reset first; when ``resources`` is empty they
        stay at their defaults (no fonts, no XObjects, the predefined colour
        spaces only).
        """
        self.resources = resources
        self.fontmap: dict[object, PDFFont] = {}
        self.xobjmap = {}
        self.csmap: dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec: object) -> PDFColorSpace | None:
            # A colour space is either a bare name or an array whose first
            # element is the name and whose second carries the parameters.
            is_array = isinstance(spec, list)
            name = literal_name(spec[0] if is_array else spec)
            has_params = is_array and len(spec) >= 2
            if name == "ICCBased" and has_params:
                return PDFColorSpace(name, stream_value(spec[1])["N"])
            if name == "DeviceN" and has_params:
                return PDFColorSpace(name, len(list_value(spec[1])))
            return PREDEFINED_COLORSPACE.get(name)

        for category, entries in dict_value(resources).items():
            log.debug("Resource: %r: %r", category, entries)
            if category == "XObject":
                for xobjid, xobjstrm in dict_value(entries).items():
                    self.xobjmap[xobjid] = xobjstrm
            elif category == "ProcSet":
                self.rsrcmgr.get_procset(list_value(entries))
            elif category == "ColorSpace":
                for csid, cs_spec in dict_value(entries).items():
                    colorspace = get_colorspace(resolve1(cs_spec))
                    if colorspace is not None:
                        self.csmap[csid] = colorspace
            elif category == "Font":
                for fontid, font_ref in dict_value(entries).items():
                    # Only indirect references carry an object id to cache on.
                    objid = font_ref.objid if isinstance(font_ref, PDFObjRef) else None
                    font_spec = dict_value(font_ref)
                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, font_spec)

    def init_state(self, ctm: Matrix) -> None:
        """Reset all interpreter state ready to render with ``ctm``.

        Clears the graphics-state stack, the current path and the argument
        stack, installs fresh text/graphic states, forwards ``ctm`` to the
        device, and picks the first entry of ``csmap`` (if any) as both the
        stroking and non-stroking colour space.
        """
        # gstack: stack for graphical states.
        self.gstack: list[tuple[Matrix, PDFTextState, PDFGraphicState]] = []
        self.ctm = ctm
        self.device.set_ctm(self.ctm)
        self.textstate = PDFTextState()
        self.graphicstate = PDFGraphicState()
        self.curpath: list[PathSegment] = []
        # argstack: stack for command arguments.
        self.argstack: list[PDFStackT] = []
        # Default colour spaces: first known colour space, or none at all.
        initial_cs = next(iter(self.csmap.values())) if self.csmap else None
        self.scs: PDFColorSpace | None = initial_cs
        self.ncs: PDFColorSpace | None = initial_cs

    def push(self, obj: PDFStackT) -> None:
        self.argstack.append(obj)

    def pop(self, n: int) -> list[PDFStackT]:
        if n == 0:
            return []
        x = self.argstack[-n:]
        self.argstack = self.argstack[:-n]
        return x

    def get_current_state(self) -> tuple[Matrix, PDFTextState, PDFGraphicState]:
        return (self.ctm, self.textstate.copy(), self.graphicstate.copy())

    def set_current_state(
        self,
        state: tuple[Matrix, PDFTextState, PDFGraphicState],
    ) -> None:
        (self.ctm, self.textstate, self.graphicstate) = state
        self.device.set_ctm(self.ctm)

    def do_q(self) -> None:
        """Save graphics state"""
        self.gstack.append(self.get_current_state())

    def do_Q(self) -> None:
        """Restore graphics state"""
        if self.gstack:
            self.set_current_state(self.gstack.pop())

    def do_cm(
        self,
        a1: PDFStackT,
        b1: PDFStackT,
        c1: PDFStackT,
        d1: PDFStackT,
        e1: PDFStackT,
        f1: PDFStackT,
    ) -> None:
        """Pre-multiply the CTM by the given matrix and notify the device.

        When ``safe_matrix`` yields ``None`` for the operands, a warning is
        logged and the CTM is left untouched.
        """
        operands = (a1, b1, c1, d1, e1, f1)
        matrix = safe_matrix(*operands)
        if matrix is not None:
            self.ctm = mult_matrix(matrix, self.ctm)
            self.device.set_ctm(self.ctm)
            return

        log.warning(
            f"Cannot concatenate matrix to current transformation matrix because not all values in {operands!r} can be parsed as floats"
        )

    def do_w(self, linewidth: PDFStackT) -> None:
        """Store the line width on the graphics state.

        When ``safe_float`` yields ``None`` for the operand, a warning is
        logged and the current width is kept.
        """
        width = safe_float(linewidth)
        if width is not None:
            self.graphicstate.linewidth = width
            return
        log.warning(
            f"Cannot set line width because {linewidth!r} is an invalid float value"
        )

    def do_J(self, linecap: PDFStackT) -> None:
        """Set line cap style"""
        self.graphicstate.linecap = linecap

    def do_j(self, linejoin: PDFStackT) -> None:
        """Set line join style"""
        self.graphicstate.linejoin = linejoin

    def do_M(self, miterlimit: PDFStackT) -> None:
        """Set miter limit"""
        self.graphicstate.miterlimit = miterlimit

    def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None:
        """Set line dash pattern"""
        self.graphicstate.dash = (dash, phase)

    def do_ri(self, intent: PDFStackT) -> None:
        """Set color rendering intent"""
        self.graphicstate.intent = intent

    def do_i(self, flatness: PDFStackT) -> None:
        """Set flatness tolerance"""
        self.graphicstate.flatness = flatness

    def do_gs(self, name: PDFStackT) -> None:
        """Set parameters from graphics state parameter dictionary"""
        # to do

    def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
        """Start a new subpath by appending an ``("m", x, y)`` segment.

        When ``safe_float`` yields ``None`` for either coordinate, a warning
        is logged and the path is left unchanged.
        """
        coords = (safe_float(x), safe_float(y))
        if any(value is None for value in coords):
            point = ("m", x, y)
            log.warning(
                f"Cannot start new subpath because not all values in {point!r} can be parsed as floats"
            )
            return
        self.curpath.append(("m", *coords))

    def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
        """Append an ``("l", x, y)`` straight-line segment to the current path.

        When ``safe_float`` yields ``None`` for either coordinate, a warning
        is logged and the path is left unchanged.
        """
        coords = (safe_float(x), safe_float(y))
        if any(value is None for value in coords):
            point = ("l", x, y)
            log.warning(
                f"Cannot append straight line segment to path because not all values in {point!r} can be parsed as floats"
            )
            return
        self.curpath.append(("l", *coords))

    def do_c(
        self,
        x1: PDFStackT,
        y1: PDFStackT,
        x2: PDFStackT,
        y2: PDFStackT,
        x3: PDFStackT,
        y3: PDFStackT,
    ) -> None:
        """Append a ``"c"`` curve segment built from three coordinate pairs.

        When ``safe_float`` yields ``None`` for any operand, a warning is
        logged and the path is left unchanged.
        """
        raw = (x1, y1, x2, y2, x3, y3)
        parsed = tuple(safe_float(value) for value in raw)
        if any(value is None for value in parsed):
            point = ("c", *raw)
            log.warning(
                f"Cannot append curved segment to path because not all values in {point!r} can be parsed as floats"
            )
            return
        self.curpath.append(("c", *parsed))

    def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
        """Append a ``"v"`` curve segment (initial point replicated).

        When ``safe_float`` yields ``None`` for any operand, a warning is
        logged and the path is left unchanged.
        """
        raw = (x2, y2, x3, y3)
        parsed = tuple(safe_float(value) for value in raw)
        if any(value is None for value in parsed):
            point = ("v", *raw)
            log.warning(
                f"Cannot append curved segment to path because not all values in {point!r} can be parsed as floats"
            )
            return
        self.curpath.append(("v", *parsed))

    def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
        """Append a ``"y"`` curve segment (final point replicated).

        When ``safe_float`` yields ``None`` for any operand, a warning is
        logged and the path is left unchanged.
        """
        raw = (x1, y1, x3, y3)
        parsed = tuple(safe_float(value) for value in raw)
        if any(value is None for value in parsed):
            point = ("y", *raw)
            log.warning(
                f"Cannot append curved segment to path because not all values in {point!r} can be parsed as floats"
            )
            return
        self.curpath.append(("y", *parsed))

    def do_h(self) -> None:
        """Close subpath"""
        self.curpath.append(("h",))

    def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
        """Append a closed rectangle (move, three lines, close) to the path.

        When ``safe_float`` yields ``None`` for any operand, a warning is
        logged and the path is left unchanged.
        """
        x_f, y_f, w_f, h_f = (safe_float(value) for value in (x, y, w, h))

        if x_f is None or y_f is None or w_f is None or h_f is None:
            values = (x, y, w, h)
            log.warning(
                f"Cannot append rectangle to path because not all values in {values!r} can be parsed as floats"
            )
            return

        far_x, far_y = x_f + w_f, y_f + h_f
        self.curpath.extend(
            [
                ("m", x_f, y_f),
                ("l", far_x, y_f),
                ("l", far_x, far_y),
                ("l", x_f, far_y),
                ("h",),
            ]
        )

    def do_S(self) -> None:
        """Stroke path"""
        self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
        self.curpath = []

    def do_s(self) -> None:
        """Close and stroke path"""
        self.do_h()
        self.do_S()

    def do_f(self) -> None:
        """Fill path using nonzero winding number rule"""
        self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
        self.curpath = []

    def do_F(self) -> None:
        """Fill path using nonzero winding number rule (obsolete)"""

    def do_f_a(self) -> None:
        """Fill path using even-odd rule"""
        self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
        self.curpath = []

    def do_B(self) -> None:
        """Fill and stroke path using nonzero winding number rule"""
        self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
        self.curpath = []

    def do_B_a(self) -> None:
        """Fill and stroke path using even-odd rule"""
        self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
        self.curpath = []

    def do_b(self) -> None:
        """Close, fill, and stroke path using nonzero winding number rule"""
        self.do_h()
        self.do_B()

    def do_b_a(self) -> None:
        """Close, fill, and stroke path using even-odd rule"""
        self.do_h()
        self.do_B_a()

    def do_n(self) -> None:
        """End path without filling or stroking"""
        self.curpath = []

    def do_W(self) -> None:
        """Set clipping path using nonzero winding number rule"""
        pass

    def do_W_a(self) -> None:
        """Set clipping path using even-odd rule"""
        pass

    def do_CS(self, name: PDFStackT) -> None:
        """Select the stroking colour space by name (PDF 1.1 operator).

        An unknown name leaves the current colour space in place, unless
        strict mode is on.

        :raises PDFInterpreterError: in strict mode when the name is not in
            the colour-space table.
        """
        try:
            colorspace = self.csmap[literal_name(name)]
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
        else:
            self.scs = colorspace

    def do_cs(self, name: PDFStackT) -> None:
        """Select the non-stroking colour space by name.

        An unknown name leaves the current colour space in place, unless
        strict mode is on.

        :raises PDFInterpreterError: in strict mode when the name is not in
            the colour-space table.
        """
        try:
            colorspace = self.csmap[literal_name(name)]
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
        else:
            self.ncs = colorspace

    def do_G(self, gray: PDFStackT) -> None:
        """Set a grey stroking colour and switch the stroking space to DeviceGray.

        When ``safe_float`` yields ``None`` for the operand, a warning is
        logged and nothing changes.
        """
        level = safe_float(gray)
        if level is None:
            log.warning(
                f"Cannot set gray level because {gray!r} is an invalid float value"
            )
            return

        self.graphicstate.scolor = level
        self.scs = self.csmap["DeviceGray"]

    def do_g(self, gray: PDFStackT) -> None:
        """Set a grey non-stroking colour and switch that space to DeviceGray.

        When ``safe_float`` yields ``None`` for the operand, a warning is
        logged and nothing changes.
        """
        level = safe_float(gray)
        if level is None:
            log.warning(
                f"Cannot set gray level because {gray!r} is an invalid float value"
            )
            return

        self.graphicstate.ncolor = level
        self.ncs = self.csmap["DeviceGray"]

    def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
        """Set an RGB stroking colour and switch the stroking space to DeviceRGB.

        When ``safe_rgb`` yields ``None`` for the operands, a warning is
        logged and nothing changes.
        """
        color = safe_rgb(r, g, b)
        if color is None:
            log.warning(
                f"Cannot set RGB stroke color because not all values in {(r, g, b)!r} can be parsed as floats"
            )
            return

        self.graphicstate.scolor = color
        self.scs = self.csmap["DeviceRGB"]

    def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
        """Set an RGB non-stroking colour and switch that space to DeviceRGB.

        When ``safe_rgb`` yields ``None`` for the operands, a warning is
        logged and nothing changes.
        """
        color = safe_rgb(r, g, b)
        if color is None:
            log.warning(
                f"Cannot set RGB non-stroke color because not all values in {(r, g, b)!r} can be parsed as floats"
            )
            return

        self.graphicstate.ncolor = color
        self.ncs = self.csmap["DeviceRGB"]

    def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
        """Set a CMYK stroking colour and switch the stroking space to DeviceCMYK.

        When ``safe_cmyk`` yields ``None`` for the operands, a warning is
        logged and nothing changes.
        """
        color = safe_cmyk(c, m, y, k)
        if color is None:
            log.warning(
                f"Cannot set CMYK stroke color because not all values in {(c, m, y, k)!r} can be parsed as floats"
            )
            return

        self.graphicstate.scolor = color
        self.scs = self.csmap["DeviceCMYK"]

    def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
        """Set a CMYK non-stroking colour and switch that space to DeviceCMYK.

        When ``safe_cmyk`` yields ``None`` for the operands, a warning is
        logged and nothing changes.
        """
        color = safe_cmyk(c, m, y, k)
        if color is None:
            log.warning(
                f"Cannot set CMYK non-stroke color because not all values in {(c, m, y, k)!r} can be parsed as floats"
            )
            return

        self.graphicstate.ncolor = color
        self.ncs = self.csmap["DeviceCMYK"]

    def do_SCN(self) -> None:
        """Set the stroking colour from operands popped off the argument stack.

        The number of operands consumed is the component count of the current
        stroking colour space; one component is assumed when no colour space
        is set and strict mode is off. Only 1, 3 and 4 components are handled;
        for any other count a warning is logged and nothing is popped.

        :raises PDFInterpreterError: in strict mode when no stroking colour
            space is set.
        """
        if self.scs:
            n = self.scs.ncomponents
        elif settings.STRICT:
            raise PDFInterpreterError("No colorspace specified!")
        else:
            n = 1

        if n == 1:
            gray = self.pop(1)[0]
            level = safe_float(gray)
            if level is not None:
                self.graphicstate.scolor = level
            else:
                log.warning(
                    f"Cannot set gray stroke color because {gray!r} is an invalid float value"
                )
            return

        if n == 3:
            values = self.pop(3)
            rgb = safe_rgb(*values)
            if rgb is not None:
                self.graphicstate.scolor = rgb
            else:
                log.warning(
                    f"Cannot set RGB stroke color because not all values in {values!r} can be parsed as floats"
                )
            return

        if n == 4:
            values = self.pop(4)
            cmyk = safe_cmyk(*values)
            if cmyk is not None:
                self.graphicstate.scolor = cmyk
            else:
                log.warning(
                    f"Cannot set CMYK stroke color because not all values in {values!r} can be parsed as floats"
                )
            return

        log.warning(
            f"Cannot set stroke color because {n} components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported"
        )

    def do_scn(self) -> None:
        """Set the non-stroking colour from operands popped off the argument stack.

        The number of operands consumed is the component count of the current
        non-stroking colour space; one component is assumed when no colour
        space is set and strict mode is off. Only 1, 3 and 4 components are
        handled; for any other count a warning is logged and nothing is popped.

        :raises PDFInterpreterError: in strict mode when no non-stroking
            colour space is set.
        """
        if self.ncs:
            n = self.ncs.ncomponents
        elif settings.STRICT:
            raise PDFInterpreterError("No colorspace specified!")
        else:
            n = 1

        if n == 1:
            gray = self.pop(1)[0]
            level = safe_float(gray)
            if level is not None:
                self.graphicstate.ncolor = level
            else:
                log.warning(
                    f"Cannot set gray non-stroke color because {gray!r} is an invalid float value"
                )
            return

        if n == 3:
            values = self.pop(3)
            rgb = safe_rgb(*values)
            if rgb is not None:
                self.graphicstate.ncolor = rgb
            else:
                log.warning(
                    f"Cannot set RGB non-stroke color because not all values in {values!r} can be parsed as floats"
                )
            return

        if n == 4:
            values = self.pop(4)
            cmyk = safe_cmyk(*values)
            if cmyk is not None:
                self.graphicstate.ncolor = cmyk
            else:
                log.warning(
                    f"Cannot set CMYK non-stroke color because not all values in {values!r} can be parsed as floats"
                )
            return

        log.warning(
            f"Cannot set non-stroke color because {n} components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported"
        )

    def do_SC(self) -> None:
        """Set color for stroking operations"""
        self.do_SCN()

    def do_sc(self) -> None:
        """Set color for nonstroking operations"""
        self.do_scn()

    def do_sh(self, name: object) -> None:
        """Paint area defined by shading pattern"""

    def do_BT(self) -> None:
        """Begin text object

        Initializing the text matrix, Tm, and the text line matrix, Tlm, to
        the identity matrix. Text objects cannot be nested; a second BT cannot
        appear before an ET.
        """
        self.textstate.reset()

    def do_ET(self) -> None:
        """End a text object"""

    def do_BX(self) -> None:
        """Begin compatibility section"""

    def do_EX(self) -> None:
        """End compatibility section"""

    def do_MP(self, tag: PDFStackT) -> None:
        """Report a marked-content point to the device.

        A ``tag`` that is not a ``PSLiteral`` is logged and dropped.
        """
        if not isinstance(tag, PSLiteral):
            log.warning(
                f"Cannot define marked-content point because {tag!r} is not a PSLiteral"
            )
            return
        self.device.do_tag(tag)

    def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None:
        """Report a marked-content point with its property list to the device.

        A ``tag`` that is not a ``PSLiteral`` is logged and dropped.
        """
        if not isinstance(tag, PSLiteral):
            log.warning(
                f"Cannot define marked-content point with property list because {tag!r} is not a PSLiteral"
            )
            return
        self.device.do_tag(tag, props)

    def do_BMC(self, tag: PDFStackT) -> None:
        """Tell the device that a marked-content sequence begins.

        A ``tag`` that is not a ``PSLiteral`` is logged and dropped.
        """
        if not isinstance(tag, PSLiteral):
            log.warning(
                f"Cannot begin marked-content sequence because {tag!r} is not a PSLiteral"
            )
            return
        self.device.begin_tag(tag)

    def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None:
        """Tell the device that a marked-content sequence with properties begins.

        A ``tag`` that is not a ``PSLiteral`` is logged and dropped.
        """
        if not isinstance(tag, PSLiteral):
            log.warning(
                f"Cannot begin marked-content sequence with property list because {tag!r} is not a PSLiteral"
            )
            return
        self.device.begin_tag(tag, props)

    def do_EMC(self) -> None:
        """End marked-content sequence"""
        self.device.end_tag()

    def do_Tc(self, space: PDFStackT) -> None:
        """Store the character spacing on the text state.

        Character spacing is used by the Tj, TJ, and ' operators. When
        ``safe_float`` yields ``None`` for the operand, a warning is logged
        and the current value is kept.

        :param space: a number expressed in unscaled text space units.
        """
        value = safe_float(space)
        if value is not None:
            self.textstate.charspace = value
            return
        log.warning(
            f"Could not set character spacing because {space!r} is an invalid float value"
        )

    def do_Tw(self, space: PDFStackT) -> None:
        """Store the word spacing on the text state.

        Word spacing is used by the Tj, TJ, and ' operators. When
        ``safe_float`` yields ``None`` for the operand, a warning is logged
        and the current value is kept.

        :param space: a number expressed in unscaled text space units
        """
        value = safe_float(space)
        if value is not None:
            self.textstate.wordspace = value
            return
        log.warning(
            f"Could not set word spacing becuase {space!r} is an invalid float value"
        )

    def do_Tz(self, scale: PDFStackT) -> None:
        """Store the horizontal scaling on the text state.

        When ``safe_float`` yields ``None`` for the operand, a warning is
        logged and the current value is kept.

        :param scale: is a number specifying the percentage of the normal width
        """
        value = safe_float(scale)
        if value is not None:
            self.textstate.scaling = value
            return
        log.warning(
            f"Could not set horizontal scaling because {scale!r} is an invalid float value"
        )

    def do_TL(self, leading: PDFStackT) -> None:
        """Store the text leading, negated, on the text state.

        Text leading is used only by the T*, ', and " operators. When
        ``safe_float`` yields ``None`` for the operand, a warning is logged
        and the current value is kept.

        :param leading: a number expressed in unscaled text space units
        """
        value = safe_float(leading)
        if value is not None:
            # The text state keeps the leading with its sign flipped.
            self.textstate.leading = -value
            return
        log.warning(
            f"Could not set text leading because {leading!r} is an invalid float value"
        )

    def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
        """Select the current font and font size.

        An unknown font id falls back to a font built from an empty spec
        (``font_id`` is then left as it was), unless strict mode is on. The
        size is only updated when ``safe_float`` can convert it.

        :param fontid: the name of a font resource in the Font subdictionary
            of the current resource dictionary
        :param fontsize: size is a number representing a scale factor.
        :raises PDFInterpreterError: in strict mode when the font id is not
            in the font map.
        """
        try:
            font_name = literal_name(fontid)
            self.textstate.font = self.fontmap[font_name]
            self.textstate.font_id = font_name
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined Font id: %r" % fontid)
            self.textstate.font = self.rsrcmgr.get_font(None, {})

        size = safe_float(fontsize)
        if size is not None:
            self.textstate.fontsize = size
            return
        log.warning(
            f"Could not set text font because {fontsize!r} is an invalid float value"
        )

    def do_Tr(self, render: PDFStackT) -> None:
        """Store the text rendering mode on the text state.

        When ``safe_int`` yields ``None`` for the operand, a warning is
        logged and the current mode is kept.
        """
        mode = safe_int(render)
        if mode is not None:
            self.textstate.render = mode
            return
        log.warning(
            f"Could not set text rendering mode because {render!r} is an invalid int value"
        )

    def do_Ts(self, rise: PDFStackT) -> None:
        """Store the text rise on the text state.

        When ``safe_float`` yields ``None`` for the operand, a warning is
        logged and the current value is kept.

        :param rise: a number expressed in unscaled text space units
        """
        value = safe_float(rise)
        if value is not None:
            self.textstate.rise = value
            return
        log.warning(
            f"Could not set text rise because {rise!r} is an invalid float value"
        )

    def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
        """Translate the text matrix by ``(tx, ty)`` and reset the line matrix.

        The line matrix is reset even when the offset could not be parsed.

        :raises PDFValueError: in strict mode when ``safe_float`` yields
            ``None`` for either operand.
        """
        dx, dy = safe_float(tx), safe_float(ty)
        if dx is None or dy is None:
            if settings.STRICT:
                raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for Td")
        else:
            (a, b, c, d, e, f) = self.textstate.matrix
            # Only the translation components (e, f) change.
            moved_e = dx * a + dy * c + e
            moved_f = dx * b + dy * d + f
            self.textstate.matrix = (a, b, c, d, moved_e, moved_f)

        self.textstate.linematrix = (0, 0)

    def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
        """Move to the start of the next line.

        Offsets the text matrix by ``(tx, ty)`` from the start of the current
        line. As a side effect, this operator also sets the leading parameter
        in the text state to the parsed ``ty``. The line matrix is reset in
        every case.

        :raises PDFValueError: in strict mode when ``safe_float`` yields
            ``None`` for either operand.
        """
        tx_ = safe_float(tx)
        ty_ = safe_float(ty)

        if tx_ is not None and ty_ is not None:
            (a, b, c, d, e, f) = self.textstate.matrix
            # Only the translation components (e, f) change.
            e_new = tx_ * a + ty_ * c + e
            f_new = tx_ * b + ty_ * d + f
            self.textstate.matrix = (a, b, c, d, e_new, f_new)

        elif settings.STRICT:
            # Must be an f-string: the previous plain literal emitted the
            # placeholders "{tx}" / "{ty}" verbatim instead of the operands.
            raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for TD")

        if ty_ is not None:
            self.textstate.leading = ty_

        self.textstate.linematrix = (0, 0)

    def do_Tm(
        self,
        a: PDFStackT,
        b: PDFStackT,
        c: PDFStackT,
        d: PDFStackT,
        e: PDFStackT,
        f: PDFStackT,
    ) -> None:
        """Replace the text matrix and reset the text line matrix.

        When ``safe_matrix`` yields ``None`` for the operands, a warning is
        logged and both matrices are left as they were.
        """
        values = (a, b, c, d, e, f)
        matrix = safe_matrix(*values)
        if matrix is None:
            log.warning(
                f"Could not set text matrix because not all values in {values!r} can be parsed as floats"
            )
            return

        textstate = self.textstate
        textstate.matrix = matrix
        textstate.linematrix = (0, 0)

    def do_T_a(self) -> None:
        """Move to start of next text line

        Shifts the translation part of the text matrix by the current
        leading (scaled by the matrix's ``c``/``d`` components) and resets
        the line matrix to ``(0, 0)``.
        """
        state = self.textstate
        (a, b, c, d, e, f) = state.matrix
        leading = state.leading
        state.matrix = (a, b, c, d, leading * c + e, leading * d + f)
        state.linematrix = (0, 0)

    def do_TJ(self, seq: PDFStackT) -> None:
        """Show text, allowing individual glyph positioning

        Hands the sequence to the device's ``render_string`` together with
        the text state, the current ``ncs`` and a copy of the graphic state.
        Without a selected font nothing is rendered; strict mode raises
        ``PDFInterpreterError`` instead.
        """
        if self.textstate.font is not None:
            assert self.ncs is not None
            text_seq = cast(PDFTextSeq, seq)
            self.device.render_string(
                self.textstate,
                text_seq,
                self.ncs,
                self.graphicstate.copy(),
            )
        elif settings.STRICT:
            raise PDFInterpreterError("No font specified!")

    def do_Tj(self, s: PDFStackT) -> None:
        """Show text

        Delegates to ``do_TJ`` with the string wrapped in a one-element list.
        """
        single_item_seq = [s]
        self.do_TJ(single_item_seq)

    def do__q(self, s: PDFStackT) -> None:
        """Move to next line and show text

        The ' (single quote) operator: first advances to the next text line
        via ``do_T_a``, then shows ``s`` through ``do_TJ``.
        """
        self.do_T_a()
        text_seq = [s]
        self.do_TJ(text_seq)

    def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None:
        """Set word and character spacing, move to next line, and show text

        The " (double quote) operator. It behaves like ``aw Tw``, ``ac Tc``
        followed by the ' (single quote) operator, so the text position is
        advanced to the next line before the string is shown.

        :param aw: word spacing, forwarded to ``do_Tw``
        :param ac: character spacing, forwarded to ``do_Tc``
        :param s: the string to show
        """
        self.do_Tw(aw)
        self.do_Tc(ac)
        # Previously the line advance was skipped, so the string was drawn on
        # the current line instead of the next one.
        self.do_T_a()
        self.do_TJ([s])

    def do_BI(self) -> None:
        """Begin inline image object

        This handler has no body: dispatching the operator to it changes no
        interpreter state.
        """

    def do_ID(self) -> None:
        """Begin inline image data

        This handler has no body: dispatching the operator to it changes no
        interpreter state.
        """

    def do_EI(self, obj: PDFStackT) -> None:
        """End inline image object

        When ``obj`` is a ``PDFStream`` carrying both a ``W`` and an ``H``
        entry, it is rendered as an image inside a unit-square figure whose
        name is derived from ``id(obj)``. Any other operand is ignored.
        """
        if not isinstance(obj, PDFStream):
            return
        if "W" not in obj or "H" not in obj:
            return

        figure_id = str(id(obj))
        self.device.begin_figure(figure_id, (0, 0, 1, 1), MATRIX_IDENTITY)
        self.device.render_image(figure_id, obj)
        self.device.end_figure(figure_id)

    def do_Do(self, xobjid_arg: PDFStackT) -> None:
        """Invoke named XObject

        Looks the name up in ``self.xobjmap``. Form XObjects (with a
        ``BBox``) are rendered recursively through a duplicated interpreter;
        image XObjects (with ``Width`` and ``Height``) are passed to the
        device. Every other subtype is silently ignored. An unknown name is
        ignored too, unless strict mode is on, in which case
        ``PDFInterpreterError`` is raised.
        """
        xobjid = literal_name(xobjid_arg)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            if not settings.STRICT:
                return
            raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)

        log.debug("Processing xobj: %r", xobj)
        subtype = xobj.get("Subtype")

        if subtype is LITERAL_FORM and "BBox" in xobj:
            interpreter = self.dup()
            bbox = cast(Rect, list_value(xobj["BBox"]))
            matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
            # According to PDF reference 1.7 section 4.9.1, XObjects in
            # earlier PDFs (prior to v1.2) use the page's Resources entry
            # instead of having their own Resources entry.
            xobjres = xobj.get("Resources")
            resources = dict_value(xobjres) if xobjres else self.resources.copy()
            self.device.begin_figure(xobjid, bbox, matrix)
            interpreter.render_contents(
                resources,
                [xobj],
                ctm=mult_matrix(matrix, self.ctm),
            )
            self.device.end_figure(xobjid)
            return

        if subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
            self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(xobjid, xobj)
            self.device.end_figure(xobjid)

        # Any other XObject type is unsupported and deliberately skipped.

    def process_page(self, page: PDFPage) -> None:
        """Render one page on the attached device.

        Builds the initial transformation matrix from the page's media box
        and its ``rotate`` value (90, 180, 270; anything else is treated as
        unrotated), then brackets the rendering of the page contents with
        the device's ``begin_page`` / ``end_page`` calls.
        """
        log.debug("Processing page: %r", page)
        (x0, y0, x1, y1) = page.mediabox
        rotation = page.rotate
        if rotation == 90:
            ctm = (0, -1, 1, 0, -y0, x1)
        elif rotation == 180:
            ctm = (-1, 0, 0, -1, x1, y1)
        elif rotation == 270:
            ctm = (0, 1, -1, 0, y1, -x0)
        else:
            ctm = (1, 0, 0, 1, -x0, -y0)

        device = self.device
        device.begin_page(page, ctm)
        self.render_contents(page.resources, page.contents, ctm=ctm)
        device.end_page(page)

    def render_contents(
        self,
        resources: dict[object, object],
        streams: Sequence[object],
        ctm: Matrix = MATRIX_IDENTITY,
    ) -> None:
        """Render the content streams.

        Initializes the resources and the interpreter state for ``ctm``,
        then executes the streams. This method may be called recursively.
        """
        log.debug(
            "render_contents: resources=%r, streams=%r, ctm=%r",
            resources,
            streams,
            ctm,
        )
        self.init_resources(resources)
        self.init_state(ctm)
        stream_list = list_value(streams)
        self.execute(stream_list)

    def execute(self, streams: Sequence[object]) -> None:
        """Parse the content streams and dispatch every operator.

        Non-keyword objects are pushed on the operand stack. A keyword is
        mapped to a ``do_<name>`` method (``*``, ``"`` and ``'`` become
        ``_a``, ``_w`` and ``_q``); its operands are popped according to the
        method's positional argument count. The handler is skipped when
        fewer operands than required are available. Unknown operators are
        ignored unless strict mode is on.
        """
        try:
            parser = PDFContentParser(streams)
        except PSEOF:
            # empty page
            return

        while True:
            try:
                (_, obj) = parser.nextobject()
            except PSEOF:
                break

            if not isinstance(obj, PSKeyword):
                self.push(obj)
                continue

            name = keyword_name(obj)
            method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
                "'",
                "_q",
            )
            if not hasattr(self, method):
                if settings.STRICT:
                    error_msg = "Unknown operator: %r" % name
                    raise PDFInterpreterError(error_msg)
                continue

            func = getattr(self, method)
            # co_argcount includes ``self``, hence the -1.
            nargs = func.__code__.co_argcount - 1
            if not nargs:
                log.debug("exec: %s", name)
                func()
                continue

            args = self.pop(nargs)
            log.debug("exec: %s %r", name, args)
            if len(args) == nargs:
                func(*args)


================================================
FILE: babeldoc/pdfminer/pdfpage.py
================================================
import itertools
import logging
from collections.abc import Container
from collections.abc import Iterator
from typing import Any
from typing import BinaryIO

from babeldoc.pdfminer.pdfdocument import PDFDocument
from babeldoc.pdfminer.pdfdocument import PDFNoPageLabels
from babeldoc.pdfminer.pdfdocument import PDFTextExtractionNotAllowed
from babeldoc.pdfminer.pdfexceptions import PDFObjectNotFound
from babeldoc.pdfminer.pdfexceptions import PDFValueError
from babeldoc.pdfminer.pdfparser import PDFParser
from babeldoc.pdfminer.pdftypes import dict_value, PDFObjRef
from babeldoc.pdfminer.pdftypes import int_value
from babeldoc.pdfminer.pdftypes import list_value
from babeldoc.pdfminer.pdftypes import resolve1
from babeldoc.pdfminer.psparser import LIT
from babeldoc.pdfminer.utils import Rect
from babeldoc.pdfminer.utils import parse_rect
from babeldoc.pdfminer import settings

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Predefined literals compared (by identity) against the "Type" entry of
# page-tree objects in PDFPage.create_pages.
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")


class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes
    ----------
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: a dictionary of resources used by the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page.
      rotate: the page rotation (in degree).
      annots: the page annotations.
      beads: a chain that represents natural reading order.
      label: the page's label (typically, the logical page number).

    """

    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
        attrs: object,
        label: str | None,
    ) -> None:
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.label = label
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        self.resources: dict[object, object] = resolve1(
            self.attrs.get("Resources", dict()),
        )

        # Replace an indirect MediaBox by the object it points to, so that
        # consumers of ``attrs["MediaBox"]`` see the resolved value. Working
        # on the resolved ``self.attrs`` (instead of the raw ``attrs``
        # argument) and checking for the key first avoids logging a
        # traceback for pages that simply have no MediaBox or whose
        # attributes were passed as an indirect reference.
        media_box = self.attrs.get("MediaBox")
        if isinstance(media_box, PDFObjRef):
            try:
                self.attrs["MediaBox"] = resolve1(media_box)
            except Exception:
                log.exception("try to fix mediabox failed: %s", attrs)

        self.mediabox = self._parse_mediabox(self.attrs.get("MediaBox"))
        try:
            self.cropbox = self._parse_cropbox(self.attrs.get("CropBox"), self.mediabox)
        except Exception:
            # Best effort: any failure while reading the CropBox falls back
            # to the MediaBox.
            self.cropbox = self.mediabox
        self.contents = self._parse_contents(self.attrs.get("Contents"))

        # Normalize the rotation into the range [0, 360).
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        self.annots = self.attrs.get("Annots")
        self.beads = self.attrs.get("B")

    def __repr__(self) -> str:
        return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    # Attributes a page-tree node passes down to its children when the child
    # does not define them itself (see create_pages).
    INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}

    @classmethod
    def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
        """Yield a PDFPage for every page found in ``document``.

        The page tree rooted at the catalog's ``Pages`` entry is walked depth
        first. When that yields no page at all, every object listed in the
        document's xrefs is scanned for dictionaries whose ``Type`` is
        ``Page`` instead.
        """

        def depth_first_search(
            obj: Any,
            parent: dict[str, Any],
            visited: set[Any] | None = None,
        ) -> Iterator[tuple[int, dict[Any, dict[Any, Any]]]]:
            if isinstance(obj, int):
                object_id = obj
                object_properties = dict_value(document.getobj(object_id)).copy()
            else:
                # This looks broken. obj.objid means obj could be either
                # PDFObjRef or PDFStream, but neither is valid for dict_value.
                object_id = obj.objid  # type: ignore[attr-defined]
                object_properties = dict_value(obj).copy()

            # Avoid recursion errors by keeping track of visited nodes
            if visited is None:
                visited = set()
            if object_id in visited:
                return
            visited.add(object_id)

            # Inherit attributes the node does not define itself.
            for k, v in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            object_type = object_properties.get("Type")
            if object_type is None and not settings.STRICT:  # See #64
                object_type = object_properties.get("type")

            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                log.debug("Pages: Kids=%r", object_properties["Kids"])
                for child in list_value(object_properties["Kids"]):
                    yield from depth_first_search(child, object_properties, visited)

            elif object_type is LITERAL_PAGE:
                log.debug("Page: %r", object_properties)
                yield (object_id, object_properties)

        try:
            page_labels: Iterator[str | None] = document.get_page_labels()
        except PDFNoPageLabels:
            page_labels = itertools.repeat(None)

        pages = False
        if "Pages" in document.catalog:
            objects = depth_first_search(document.catalog["Pages"], document.catalog)
            for objid, tree in objects:
                yield cls(document, objid, tree, next(page_labels))
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                            yield cls(document, objid, obj, next(page_labels))
                    except PDFObjectNotFound:
                        pass

    @classmethod
    def get_pages(
        cls,
        fp: BinaryIO,
        pagenos: Container[int] | None = None,
        maxpages: int = 0,
        password: str = "",
        caching: bool = True,
        check_extractable: bool = False,
    ) -> Iterator["PDFPage"]:
        """Parse ``fp`` and yield its pages.

        fp: binary file object containing the PDF.
        pagenos: zero-based page numbers to yield; a falsy value yields all.
        maxpages: stop once the page with this one-based position has been
          yielded; 0 disables the limit.
        password: forwarded to PDFDocument.
        caching: forwarded to PDFDocument.
        check_extractable: raise PDFTextExtractionNotAllowed instead of only
          warning when the document reports it is not extractable.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                error_msg = "Text extraction is not allowed: %r" % fp
                raise PDFTextExtractionNotAllowed(error_msg)
            else:
                warning_msg = (
                    "The PDF %r contains a metadata field "
                    "indicating that it should not allow "
                    "text extraction. Ignoring this field "
                    "and proceeding. Use the check_extractable "
                    "if you want to raise an error in this case" % fp
                )
                log.warning(warning_msg)
        # Process each page contained in the document.
        for pageno, page in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno + 1:
                break

    def _parse_mediabox(self, value: Any) -> Rect:
        """Return the MediaBox rectangle, defaulting to US Letter.

        The default is used when the entry is missing, when ``parse_rect``
        rejects it, or when the entry is not an array of numbers at all.
        """
        us_letter = (0.0, 0.0, 612.0, 792.0)

        if value is None:
            log.warning(
                "MediaBox missing from /Page (and not inherited), "
                "defaulting to US Letter"
            )
            return us_letter

        try:
            return parse_rect(resolve1(val) for val in resolve1(value))

        except (PDFValueError, TypeError):
            # TypeError covers a MediaBox that resolves to a non-iterable
            # object, which would otherwise abort the creation of the page.
            log.warning("Invalid MediaBox in /Page, defaulting to US Letter")
            return us_letter

    def _parse_cropbox(self, value: Any, mediabox: Rect) -> Rect:
        """Return the CropBox rectangle, defaulting to ``mediabox``."""
        if value is None:
            # CropBox is optional, and MediaBox is used if not specified.
            return mediabox

        try:
            return parse_rect(resolve1(val) for val in resolve1(value))

        except (PDFValueError, TypeError):
            # Same reasoning as in _parse_mediabox.
            log.warning("Invalid CropBox in /Page, defaulting to MediaBox")
            return mediabox

    def _parse_contents(self, value: Any) -> list[Any]:
        """Return the resolved ``Contents`` entry, always as a list."""
        contents: list[Any] = []
        if value is not None:
            contents = resolve1(value)
            if not isinstance(contents, list):
                contents = [contents]
        return contents


================================================
FILE: babeldoc/pdfminer/pdfparser.py
================================================
import logging
from io import BytesIO
from typing import TYPE_CHECKING
from typing import BinaryIO
from typing import Union

from babeldoc.pdfminer.casting import safe_int
from babeldoc.pdfminer.pdfexceptions import PDFException
from babeldoc.pdfminer.pdftypes import PDFObjRef
from babeldoc.pdfminer.pdftypes import PDFStream
from babeldoc.pdfminer.pdftypes import dict_value
from babeldoc.pdfminer.pdftypes import int_value
from babeldoc.pdfminer.psexceptions import PSEOF
from babeldoc.pdfminer.psparser import KWD
from babeldoc.pdfminer.psparser import PSKeyword
from babeldoc.pdfminer.psparser import PSStackParser
from babeldoc.pdfminer import settings

if TYPE_CHECKING:
    from babeldoc.pdfminer.pdfdocument import PDFDocument

log = logging.getLogger(__name__)


class PDFSyntaxError(PDFException):
    """Syntax problem found while parsing PDF objects.

    Within this module it is only raised when ``settings.STRICT`` is set:
    for a stream without ``/Length``, for an unexpected end of file inside
    a stream, and for ``obj``/``endobj`` keywords inside a content stream.
    """

    pass


# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None
class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
    """PDFParser fetch PDF objects from a file stream.
    It can handle indirect references by referring to
    a PDF document set by set_document method.
    It also reads XRefs at the end of every PDF file.

    Typical usage:
      parser = PDFParser(fp)
      parser.read_xref()
      parser.read_xref(fallback=True) # optional
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    """

    def __init__(self, fp: BinaryIO) -> None:
        PSStackParser.__init__(self, fp)
        # Document used to build PDFObjRef instances and to decipher streams.
        self.doc: PDFDocument | None = None
        # When true, /Length is ignored and stream data is collected line by
        # line up to the "endstream" marker (see do_keyword).
        self.fallback = False

    def set_document(self, doc: "PDFDocument") -> None:
        """Associates the parser with a PDFDocument object."""
        self.doc = doc

    # Keywords that get special treatment in do_keyword.
    KEYWORD_R = KWD(b"R")
    KEYWORD_NULL = KWD(b"null")
    KEYWORD_ENDOBJ = KWD(b"endobj")
    KEYWORD_STREAM = KWD(b"stream")
    KEYWORD_XREF = KWD(b"xref")
    KEYWORD_STARTXREF = KWD(b"startxref")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handles PDF-related keywords."""
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            # Emit the single object preceding the keyword as a result.
            self.add_results(*self.pop(1))

        elif token is self.KEYWORD_ENDOBJ:
            # Emit the four stack entries that precede "endobj".
            self.add_results(*self.pop(4))

        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))

        elif token is self.KEYWORD_R:
            # reference to indirect object
            # Ignored when fewer than two operands are on the stack.
            if len(self.curstack) >= 2:
                (_, _object_id), _ = self.pop(2)
                object_id = safe_int(_object_id)
                # A reference whose object id is not an integer is dropped.
                if object_id is not None:
                    obj = PDFObjRef(self.doc, object_id)
                    self.push((pos, obj))

        elif token is self.KEYWORD_STREAM:
            # stream object
            ((_, dic),) = self.pop(1)
            dic = dict_value(dic)
            objlen = 0
            if not self.fallback:
                try:
                    objlen = int_value(dic["Length"])
                except KeyError:
                    if settings.STRICT:
                        raise PDFSyntaxError("/Length is undefined: %r" % dic)
            # Re-read the line holding the keyword to find where data starts.
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF:
                if settings.STRICT:
                    raise PDFSyntaxError("Unexpected EOF")
                return
            pos += len(line)
            self.fp.seek(pos)
            # Read the declared number of bytes (none in fallback mode).
            data = bytearray(self.fp.read(objlen))
            self.seek(pos + objlen)
            # Scan forward for "endstream"; objlen grows by everything that
            # is skipped, and in fallback mode the skipped bytes become the
            # stream data.
            while 1:
                try:
                    (linepos, line) = self.nextline()
                except PSEOF:
                    if settings.STRICT:
                        raise PDFSyntaxError("Unexpected EOF")
                    break
                if b"endstream" in line:
                    i = line.index(b"endstream")
                    objlen += i
                    if self.fallback:
                        data += line[:i]
                    break
                objlen += len(line)
                if self.fallback:
                    data += line
            self.seek(pos + objlen)
            # XXX limit objlen not to exceed object boundary
            log.debug(
                "Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
                pos,
                objlen,
                dic,
                data[:10],
            )
            assert self.doc is not None
            stream = PDFStream(dic, bytes(data), self.doc.decipher)
            self.push((pos, stream))

        else:
            # others
            self.push((pos, token))


class PDFStreamParser(PDFParser):
    """PDFStreamParser is used to parse PDF content streams
    that is contained in each page and has instructions
    for rendering the page. A reference to a PDF document is
    needed because a PDF content stream can also have
    indirect references to other objects in the same document.
    """

    def __init__(self, data: bytes) -> None:
        PDFParser.__init__(self, BytesIO(data))

    def flush(self) -> None:
        """Emit everything left on the stack as results."""
        self.add_results(*self.popall())

    KEYWORD_OBJ = KWD(b"obj")

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle keywords found in a stream.

        ``R`` builds an indirect reference, ``obj``/``endobj`` are dropped
        (``PDFSyntaxError`` in strict mode) and every other keyword is pushed
        on the stack unchanged.
        """
        if token is self.KEYWORD_R:
            # reference to indirect object
            # Same guard as PDFParser.do_keyword: a stray "R" without its two
            # operands is ignored instead of failing on tuple unpacking.
            if len(self.curstack) >= 2:
                (_, _object_id), _ = self.pop(2)
                object_id = safe_int(_object_id)
                if object_id is not None:
                    obj = PDFObjRef(self.doc, object_id)
                    self.push((pos, obj))
            return

        elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
            if settings.STRICT:
                # See PDF Spec 3.4.6: Only the object values are stored in the
                # stream; the obj and endobj keywords are not used.
                raise PDFSyntaxError("Keyword endobj found in stream")
            return

        # others
        self.push((pos, token))


================================================
FILE: babeldoc/pdfminer/pdftypes.py
================================================
import io
import logging
import zlib
from collections.abc import Iterable
from typing import TYPE_CHECKING
from typing import Any
from typing import Optional
from typing import Protocol
from typing import cast
from warnings import warn

from babeldoc.pdfminer.ascii85 import ascii85decode
from babeldoc.pdfminer.ascii85 import asciihexdecode
from babeldoc.pdfminer.ccitt import ccittfaxdecode
from babeldoc.pdfminer.lzw import lzwdecode
from babeldoc.pdfminer.psparser import LIT
from babeldoc.pdfminer.psparser import PSObject
from babeldoc.pdfminer.runlength import rldecode
from babeldoc.pdfminer.utils import apply_png_predictor
from babeldoc.pdfminer import pdfexceptions
from babeldoc.pdfminer import settings

if TYPE_CHECKING:
    from babeldoc.pdfminer.pdfdocument import PDFDocument

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Filter name for which PDFStream.decode raises PDFNotImplementedError.
LITERAL_CRYPT = LIT("Crypt")

# Filter names recognized by PDFStream.decode. Each tuple holds the full
# name followed, where one exists, by its abbreviation.
# Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
LITERALS_FLATE_DECODE = (LIT("FlateDecode"), LIT("Fl"))
LITERALS_LZW_DECODE = (LIT("LZWDecode"), LIT("LZW"))
LITERALS_ASCII85_DECODE = (LIT("ASCII85Decode"), LIT("A85"))
LITERALS_ASCIIHEX_DECODE = (LIT("ASCIIHexDecode"), LIT("AHx"))
LITERALS_RUNLENGTH_DECODE = (LIT("RunLengthDecode"), LIT("RL"))
LITERALS_CCITTFAX_DECODE = (LIT("CCITTFaxDecode"), LIT("CCF"))
LITERALS_DCT_DECODE = (LIT("DCTDecode"), LIT("DCT"))
LITERALS_JBIG2_DECODE = (LIT("JBIG2Decode"),)
LITERALS_JPX_DECODE = (LIT("JPXDecode"),)


class DecipherCallable(Protocol):
    """Fully typed a decipher callback, with optional parameter.

    Within this module the callback is invoked by ``decipher_all`` with
    three arguments and by ``PDFStream.decode`` with the stream's attribute
    dictionary as a fourth one.
    """

    def __call__(
        self,
        objid: int,
        genno: int,
        data: bytes,
        attrs: dict[str, Any] | None = None,
    ) -> bytes:
        # Protocol stub: implementations supply the actual behavior.
        raise NotImplementedError


class PDFObject(PSObject):
    """Common base class of PDFObjRef and PDFStream; adds nothing to PSObject."""

    pass


# Adding aliases for these exceptions for backwards compatibility
# (the classes themselves are defined in the pdfexceptions module).
PDFException = pdfexceptions.PDFException
PDFTypeError = pdfexceptions.PDFTypeError
PDFValueError = pdfexceptions.PDFValueError
PDFObjectNotFound = pdfexceptions.PDFObjectNotFound
PDFNotImplementedError = pdfexceptions.PDFNotImplementedError

# Sentinel used by PDFObjRef.__init__ to detect whether its unused third
# argument was supplied.
_DEFAULT = object()


class PDFObjRef(PDFObject):
    """Indirect reference to an object of a PDF document."""

    def __init__(
        self,
        doc: Optional["PDFDocument"],
        objid: int,
        _: Any = _DEFAULT,
    ) -> None:
        """Reference to a PDF object.

        :param doc: The PDF document.
        :param objid: The object number.
        :param _: Unused argument for backwards compatibility.
        :raises PDFValueError: in strict mode, when ``objid`` is 0.
        """
        if _ is not _DEFAULT:
            # stacklevel=2 attributes the warning to the caller that passed
            # the deprecated argument rather than to this module.
            warn(
                "The third argument of PDFObjRef is unused and will be removed after "
                "2024",
                DeprecationWarning,
                stacklevel=2,
            )

        if objid == 0 and settings.STRICT:
            raise PDFValueError("PDF object id cannot be 0.")

        self.doc = doc
        self.objid = objid

    def __repr__(self) -> str:
        return "<PDFObjRef:%d>" % (self.objid)

    def resolve(self, default: object = None) -> Any:
        """Return the referenced object, or ``default`` if it is not found."""
        assert self.doc is not None
        try:
            return self.doc.getobj(self.objid)
        except PDFObjectNotFound:
            return default


def resolve1(x: object, default: object = None) -> Any:
    """Follow indirect references until a direct object is reached.

    Only the outermost object is resolved: an array or dictionary that is
    returned may still contain indirect objects inside. ``default`` is
    handed to every ``PDFObjRef.resolve`` call along the way.
    """
    current = x
    while True:
        if not isinstance(current, PDFObjRef):
            return current
        current = current.resolve(default=default)


def resolve_all(x: object, default: object = None) -> Any:
    """Resolve the object and, recursively, everything nested inside it.

    Lists are rebuilt as new lists while dictionaries are updated in place.
    The result contains no indirect reference anymore; this procedure might
    be slow.
    """
    resolved = resolve1(x, default=default)
    if isinstance(resolved, list):
        return [resolve_all(item, default=default) for item in resolved]
    if isinstance(resolved, dict):
        for key, value in resolved.items():
            resolved[key] = resolve_all(value, default=default)
    return resolved


def decipher_all(decipher: DecipherCallable, objid: int, genno: int, x: object) -> Any:
    """Apply ``decipher`` to every byte string found in ``x``.

    Non-empty ``bytes`` are passed through ``decipher(objid, genno, value)``;
    empty ones are returned as they are. Lists are rebuilt, dictionaries are
    updated in place, and any other object is returned unchanged.
    """
    if isinstance(x, bytes):
        return decipher(objid, genno, x) if x else x
    if isinstance(x, dict):
        for key, value in x.items():
            x[key] = decipher_all(decipher, objid, genno, value)
        return x
    if isinstance(x, list):
        return [decipher_all(decipher, objid, genno, item) for item in x]
    return x


def int_value(x: object) -> int:
    """Resolve ``x`` and return it if it is an ``int``.

    Otherwise 0 is returned, or ``PDFTypeError`` is raised in strict mode.
    """
    value = resolve1(x)
    if isinstance(value, int):
        return value
    if settings.STRICT:
        raise PDFTypeError("Integer required: %r" % value)
    return 0


def float_value(x: object) -> float:
    """Resolve ``x`` and return it if it is a ``float``.

    Integers are not accepted. For anything else 0.0 is returned, or
    ``PDFTypeError`` is raised in strict mode.
    """
    value = resolve1(x)
    if isinstance(value, float):
        return value
    if settings.STRICT:
        raise PDFTypeError("Float required: %r" % value)
    return 0.0


def num_value(x: object) -> float:
    """Resolve ``x`` and return it if it is an ``int`` or a ``float``.

    Otherwise 0 is returned, or ``PDFTypeError`` is raised in strict mode.
    """
    value = resolve1(x)
    if isinstance(value, (int, float)):  # == utils.isnumber(x)
        return value
    if settings.STRICT:
        raise PDFTypeError("Int or Float required: %r" % value)
    return 0


def uint_value(x: object, n_bits: int) -> int:
    """Resolve number and interpret it as a two's-complement unsigned number

    :param x: object resolved through ``int_value``.
    :param n_bits: width, in bits, of the two's-complement representation.
    :return: the value itself when it is non-negative, otherwise the value
        shifted by ``2**n_bits``.
    """
    xi = int_value(x)
    # Zero is already a valid unsigned value; only negative numbers must be
    # wrapped (the previous ``xi > 0`` test mapped 0 to 2**n_bits, which does
    # not even fit in n_bits).
    if xi >= 0:
        return xi
    return xi + cast(int, 2**n_bits)


def str_value(x: object) -> bytes:
    """Resolve ``x`` and return it if it is a ``bytes`` object.

    Otherwise ``b""`` is returned, or ``PDFTypeError`` is raised in strict
    mode.
    """
    value = resolve1(x)
    if isinstance(value, bytes):
        return value
    if settings.STRICT:
        raise PDFTypeError("String required: %r" % value)
    return b""


def list_value(x: object) -> list[Any] | tuple[Any, ...]:
    """Resolve ``x`` and return it if it is a ``list`` or a ``tuple``.

    Otherwise an empty list is returned, or ``PDFTypeError`` is raised in
    strict mode.
    """
    value = resolve1(x)
    if isinstance(value, (list, tuple)):
        return value
    if settings.STRICT:
        raise PDFTypeError("List required: %r" % value)
    return []


def dict_value(x: object) -> dict[Any, Any]:
    """Resolve ``x`` and return it if it is a ``dict``.

    Otherwise an empty dict is returned; in strict mode the problem is
    logged and ``PDFTypeError`` is raised instead.
    """
    value = resolve1(x)
    if isinstance(value, dict):
        return value
    if settings.STRICT:
        logger.error("PDFTypeError : Dict required: %r", value)
        raise PDFTypeError("Dict required: %r" % value)
    return {}


def stream_value(x: object) -> "PDFStream":
    """Resolve ``x`` and return it if it is a ``PDFStream``.

    Otherwise an empty stream is returned, or ``PDFTypeError`` is raised in
    strict mode.
    """
    value = resolve1(x)
    if isinstance(value, PDFStream):
        return value
    if settings.STRICT:
        raise PDFTypeError("PDFStream required: %r" % value)
    return PDFStream({}, b"")


def decompress_corrupted(data: bytes) -> bytes:
    """Called on some data that can't be properly decoded because of CRC checksum
    error. Attempt to decode it skipping the CRC.

    The input is fed to a zlib decompressor one byte at a time so that
    everything decoded before the first error is kept.

    :param data: the zlib-compressed bytes.
    :return: whatever could be decompressed before an error occurred (all of
        it when no error occurs).
    """
    decompressor = zlib.decompressobj()
    # A bytearray keeps the accumulation linear; repeated ``bytes +=`` copies
    # the whole buffer on every step.
    recovered = bytearray()
    offset = 0
    try:
        for offset in range(len(data)):
            recovered += decompressor.decompress(data[offset : offset + 1])
    except zlib.error:
        # An error within the last three bytes is taken to be the checksum
        # failing and is ignored; an earlier one means data was lost, which
        # is reported but not raised.
        if offset < len(data) - 3:
            logger.warning("Data-loss while decompressing corrupted data")
    return bytes(recovered)


class PDFStream(PDFObject):
    """A PDF stream: an attribute dictionary plus a byte payload.

    The payload is kept undecoded in ``rawdata`` until ``decode`` (or
    ``get_data``) is called; afterwards the decoded bytes live in ``data``
    and ``rawdata`` is ``None``.
    """

    def __init__(
        self,
        attrs: dict[str, Any],
        rawdata: bytes,
        decipher: DecipherCallable | None = None,
    ) -> None:
        assert isinstance(attrs, dict), str(type(attrs))
        self.attrs = attrs
        self.rawdata: bytes | None = rawdata
        self.decipher = decipher
        self.data: bytes | None = None
        self.objid: int | None = None
        self.genno: int | None = None

    def set_objid(self, objid: int, genno: int) -> None:
        """Record the object and generation numbers of this stream."""
        self.objid, self.genno = objid, genno

    def __repr__(self) -> str:
        if self.data is not None:
            return "<PDFStream(%r): len=%d, %r>" % (
                self.objid,
                len(self.data),
                self.attrs,
            )
        assert self.rawdata is not None
        return "<PDFStream(%r): raw=%d, %r>" % (
            self.objid,
            len(self.rawdata),
            self.attrs,
        )

    def __contains__(self, name: object) -> bool:
        return name in self.attrs

    def __getitem__(self, name: str) -> Any:
        return self.attrs[name]

    def get(self, name: str, default: object = None) -> Any:
        """Return attribute ``name``, or ``default`` when it is absent."""
        return self.attrs.get(name, default)

    def get_any(self, names: Iterable[str], default: object = None) -> Any:
        """Return the first attribute among ``names`` that is present."""
        present = (self.attrs[name] for name in names if name in self.attrs)
        return next(present, default)

    def get_filters(self) -> list[tuple[Any, Any]]:
        """Return the resolved ``(filter, parameters)`` pairs of the stream."""
        filters = resolve1(self.get_any(("F", "Filter"), []))
        params = resolve1(self.get_any(("DP", "DecodeParms", "FDecodeParms"), {}))
        if not filters:
            return []
        if not isinstance(filters, list):
            filters = [filters]
        if not isinstance(params, list):
            # Make sure the parameters list is the same as filters.
            params = [params] * len(filters)
        if settings.STRICT and len(params) != len(filters):
            raise PDFException("Parameters len filter mismatch")

        return list(
            zip(
                [resolve1(f) for f in filters],
                [resolve1(param) for param in params],
                strict=False,
            )
        )

    def decode(self) -> None:
        """Decipher and decode ``rawdata`` into ``data``.

        Filters are applied in order, each one followed by its predictor if
        the parameters declare one. Must only be called while the stream is
        still undecoded.
        """
        assert self.data is None and self.rawdata is not None, str(
            (self.data, self.rawdata),
        )
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            assert self.objid is not None
            assert self.genno is not None
            data = self.decipher(self.objid, self.genno, data, self.attrs)

        # Without any filter the loop is skipped and the (deciphered) raw
        # bytes are stored as they are.
        for f, params in self.get_filters():
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    data = zlib.decompress(data)

                except zlib.error as e:
                    if settings.STRICT:
                        error_msg = f"Invalid zlib bytes: {e!r}, {data!r}"
                        raise PDFException(error_msg)

                    try:
                        data = decompress_corrupted(data)
                    except zlib.error:
                        data = b""

            elif f in LITERALS_LZW_DECODE:
                data = lzwdecode(data)
            elif f in LITERALS_ASCII85_DECODE:
                data = ascii85decode(data)
            elif f in LITERALS_ASCIIHEX_DECODE:
                data = asciihexdecode(data)
            elif f in LITERALS_RUNLENGTH_DECODE:
                data = rldecode(data)
            elif f in LITERALS_CCITTFAX_DECODE:
                data = ccittfaxdecode(data, params)
            elif (
                f in LITERALS_DCT_DECODE
                or f in LITERALS_JBIG2_DECODE
                or f in LITERALS_JPX_DECODE
            ):
                # Image codecs (e.g. a JPG stream): the bytes are handed
                # back to the user without being decoded here.
                pass
            elif f == LITERAL_CRYPT:
                # not yet..
                raise PDFNotImplementedError("/Crypt filter is unsupported")
            else:
                raise PDFNotImplementedError("Unsupported filter: %r" % f)

            # apply predictors
            if not (params and "Predictor" in params):
                continue
            pred = int_value(params["Predictor"])
            if pred == 1:
                # no predictor
                continue
            if pred < 10:
                error_msg = "Unsupported predictor: %r" % pred
                raise PDFNotImplementedError(error_msg)
            # PNG predictor
            colors = int_value(params.get("Colors", 1))
            columns = int_value(params.get("Columns", 1))
            raw_bits_per_component = params.get("BitsPerComponent", 8)
            bitspercomponent = int_value(raw_bits_per_component)
            data = apply_png_predictor(
                pred,
                colors,
                columns,
                bitspercomponent,
                data,
            )

        self.data = data
        self.rawdata = None

    def get_data(self) -> bytes:
        """Return the decoded payload, decoding it on first use."""
        if self.data is None:
            self.decode()
            assert self.data is not None
        return self.data

    def get_rawdata(self) -> bytes | None:
        """Return the undecoded payload (``None`` once decoded)."""
        return self.rawdata


================================================
FILE: babeldoc/pdfminer/psexceptions.py
================================================
class PSException(Exception):
    """Root of the PS* exception hierarchy declared in this module."""


class PSEOF(PSException):
    """Signals that the end of the input was reached."""


class PSSyntaxError(PSException):
    """Signals syntactically invalid input."""


class PSTypeError(PSException):
    """Signals that an object of an unexpected type was encountered."""


class PSValueError(PSException):
    """PS-specific error for an unacceptable value."""


================================================
FILE: babeldoc/pdfminer/psparser.py
================================================
#!/usr/bin/env python3
import io
import logging
import re
from collections.abc import Iterator
from typing import Any
from typing import BinaryIO
from typing import Generic
from typing import TypeVar
from typing import Union

from babeldoc.pdfminer.utils import choplist
from babeldoc.pdfminer import psexceptions
from babeldoc.pdfminer import settings

log = logging.getLogger(__name__)


# Backwards compatibility: re-export the exception classes defined in
# psexceptions under this module's namespace, so that code referring to
# e.g. ``psparser.PSEOF`` keeps working.
PSException = psexceptions.PSException
PSEOF = psexceptions.PSEOF
PSSyntaxError = psexceptions.PSSyntaxError
PSTypeError = psexceptions.PSTypeError
PSValueError = psexceptions.PSValueError


class PSObject:
    """Common ancestor of the PS/PDF-related data types."""


class PSLiteral(PSObject):
    """A PostScript literal (a name).

    Literals act as identifiers: variable names, property names and
    dictionary keys. They are case sensitive and are written with a leading
    slash (e.g. "/Name").

    Note: never instantiate PSLiteral directly; obtain instances through
    PSLiteralTable.intern() so that equal names share one object.
    """

    # A literal's name is kept as str when possible, otherwise as raw bytes.
    NameType = Union[str, bytes]

    def __init__(self, name: NameType) -> None:
        """Store ``name`` as the literal's identifier."""
        self.name = name

    def __repr__(self) -> str:
        """Render as a slash followed by the repr of the name."""
        return "/%r" % self.name


class PSKeyword(PSObject):
    """A PostScript keyword.

    Keywords are a small set of predefined words that express commands and
    directives, and that also mark content boundaries.

    Note: never instantiate PSKeyword directly; obtain instances through
    PSKeywordTable.intern() so that equal names share one object.
    """

    def __init__(self, name: bytes) -> None:
        """Store the raw bytes of the keyword."""
        self.name = name

    def __repr__(self) -> str:
        """Render as a slash followed by the repr of the name."""
        return "/%r" % self.name


# Type variable constrained to the two symbol classes (PSLiteral, PSKeyword).
_SymbolT = TypeVar("_SymbolT", PSLiteral, PSKeyword)


class PSSymbolTable(Generic[_SymbolT]):
    """Intern table for PSLiteral/PSKeyword objects.

    Each distinct name maps to exactly one object, so interned symbols can be
    compared by identity with the "is" operator.
    """

    def __init__(self, klass: type[_SymbolT]) -> None:
        """Create an empty table whose entries are instances of ``klass``."""
        self.dict: dict[PSLiteral.NameType, _SymbolT] = {}
        self.klass: type[_SymbolT] = klass

    def intern(self, name: PSLiteral.NameType) -> _SymbolT:
        """Return the unique symbol for ``name``, creating it on first use."""
        try:
            return self.dict[name]
        except KeyError:
            pass
        # Type confusion issue: PSKeyword always takes bytes as name
        #                       PSLiteral uses either str or bytes
        symbol = self.klass(name)  # type: ignore[arg-type]
        self.dict[name] = symbol
        return symbol


# Module-wide intern tables: one for literals (names), one for keywords.
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
# Short aliases for interning a literal / a keyword.
LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern
# Pre-interned delimiter keywords for procedures, arrays and dictionaries.
KEYWORD_PROC_BEGIN = KWD(b"{")
KEYWORD_PROC_END = KWD(b"}")
KEYWORD_ARRAY_BEGIN = KWD(b"[")
KEYWORD_ARRAY_END = KWD(b"]")
KEYWORD_DICT_BEGIN = KWD(b"<<")
KEYWORD_DICT_END = KWD(b">>")


def literal_name(x: Any) -> str:
    """Return the name of a PSLiteral as a str.

    A bytes name is decoded as UTF-8; if that fails, ``str()`` of the bytes
    object is returned instead. For a non-literal argument, PSTypeError is
    raised when ``settings.STRICT`` is set, otherwise ``str(x)`` is returned.
    """
    if not isinstance(x, PSLiteral):
        if settings.STRICT:
            raise PSTypeError(f"Literal required: {x!r}")
        return str(x)

    name = x.name
    if isinstance(name, str):
        return name
    try:
        return str(name, "utf-8")
    except UnicodeDecodeError:
        return str(name)


def keyword_name(x: Any) -> Any:
    """Return the name of a PSKeyword decoded as UTF-8 (errors ignored).

    For a non-keyword argument, PSTypeError is raised when
    ``settings.STRICT`` is set; otherwise the argument is returned unchanged.
    """
    if isinstance(x, PSKeyword):
        return str(x.name, "utf-8", "ignore")
    if settings.STRICT:
        raise PSTypeError("Keyword required: %r" % x)
    return x


# Precompiled byte patterns used by the tokenizer (PSBaseParser).
EOL = re.compile(rb"[\r\n]")  # a CR or LF byte
SPC = re.compile(rb"\s")  # one whitespace byte
NONSPC = re.compile(rb"\S")  # one non-whitespace byte
HEX = re.compile(rb"[0-9a-fA-F]")  # one hexadecimal digit
# First byte that ends the plain part of a /Name: a delimiter, whitespace,
# or '#', which introduces a hex escape inside the name.
END_LITERAL = re.compile(rb"[#/%\[\]()<>{}\s]")
# First byte that is neither whitespace nor a hex digit (ends a <...> string).
END_HEX_STRING = re.compile(rb"[^\s0-9a-fA-F]")
# Two hex digits, or failing that any single byte.
HEX_PAIR = re.compile(rb"[0-9a-fA-F]{2}|.")
END_NUMBER = re.compile(rb"[^0-9]")  # first non-digit byte
END_KEYWORD = re.compile(rb"[#/%\[\]()<>{}\s]")  # first byte that ends a keyword
END_STRING = re.compile(rb"[()\134]")  # '(', ')' or backslash (octal 134)
OCT_STRING = re.compile(rb"[0-7]")  # one octal digit
# Byte value produced by each single-character backslash escape in a string.
ESC_STRING = {
    b"b": 8,
    b"t": 9,
    b"n": 10,
    b"f": 12,
    b"r": 13,
    b"(": 40,
    b")": 41,
    b"\\": 92,
}


# Payload types of the tokens produced by PSBaseParser. Integer tokens are
# produced as well; for type checking, ``float`` also admits ``int``.
PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes]


class PSBaseParser:
    """Most basic PostScript parser that performs only tokenization.

    Input is pulled from ``fp`` in chunks of ``BUFSIZ`` bytes. Tokenization is
    driven by ``self._parse1``, which always refers to the handler for the
    token currently being scanned. Every handler takes the current buffer and
    an index, consumes what it can, and returns the index at which scanning
    resumes; finished tokens are queued in ``self._tokens``.
    """

    BUFSIZ = 4096

    def __init__(self, fp: BinaryIO) -> None:
        """Wrap the binary file object ``fp`` and position it at offset 0."""
        self.fp = fp
        self.eof = False
        self.seek(0)

    def __repr__(self) -> str:
        return "<%s: %r, bufpos=%d>" % (self.__class__.__name__, self.fp, self.bufpos)

    def flush(self) -> None:
        """Hook for subclasses; does nothing here."""
        pass

    def close(self) -> None:
        """Finish parsing; only calls flush()."""
        self.flush()

    def tell(self) -> int:
        """Return the absolute offset of the next byte to be scanned."""
        return self.bufpos + self.charpos

    def poll(self, pos: int | None = None, n: int = 80) -> None:
        """Log (at debug level) ``n`` bytes starting at ``pos``.

        ``pos`` defaults to the current parse position. The file position of
        ``fp`` is restored afterwards.
        """
        pos0 = self.fp.tell()
        # Compare against None explicitly so that an explicit pos=0 is honoured.
        if pos is None:
            pos = self.bufpos + self.charpos
        self.fp.seek(pos)
        log.debug("poll(%d): %r", pos, self.fp.read(n))
        self.fp.seek(pos0)

    def seek(self, pos: int) -> None:
        """Seeks the parser to the given position."""
        log.debug("seek: %r", pos)
        self.fp.seek(pos)
        # reset the status for nextline()
        self.bufpos = pos
        self.buf = b""
        self.charpos = 0
        # reset the status for nexttoken()
        self._parse1 = self._parse_main
        self._curtoken = b""
        self._curtokenpos = 0
        self._tokens: list[tuple[int, PSBaseParserToken]] = []
        self.eof = False

    def fillbuf(self) -> None:
        """Load the next chunk from ``fp`` once the buffer is exhausted.

        :raises PSEOF: if no more data can be read.
        """
        if self.charpos < len(self.buf):
            return
        # fetch next chunk.
        self.bufpos = self.fp.tell()
        self.buf = self.fp.read(self.BUFSIZ)
        if not self.buf:
            raise PSEOF("Unexpected EOF")
        self.charpos = 0

    def nextline(self) -> tuple[int, bytes]:
        """Fetches a next line that ends either with \\r or \\n.

        :return: ``(position, line)`` where the line keeps its terminator.
        :raises PSEOF: propagated from fillbuf() when the input runs out.
        """
        linebuf = b""
        linepos = self.bufpos + self.charpos
        eol = False
        while 1:
            self.fillbuf()
            if eol:
                c = self.buf[self.charpos : self.charpos + 1]
                # handle b'\r\n'
                if c == b"\n":
                    linebuf += c
                    self.charpos += 1
                break
            m = EOL.search(self.buf, self.charpos)
            if m:
                linebuf += self.buf[self.charpos : m.end(0)]
                self.charpos = m.end(0)
                if linebuf[-1:] == b"\r":
                    # A following b'\n' may still belong to this line; it could
                    # sit in the next chunk, so check on the next iteration.
                    eol = True
                else:
                    break
            else:
                linebuf += self.buf[self.charpos :]
                self.charpos = len(self.buf)
        log.debug("nextline: %r, %r", linepos, linebuf)

        return (linepos, linebuf)

    def revreadlines(self) -> Iterator[bytes]:
        """Fetches lines backward, starting from the end of the file.

        This is used to locate the trailers at the end of a file.
        """
        self.fp.seek(0, io.SEEK_END)
        pos = self.fp.tell()
        buf = b""
        while pos > 0:
            prevpos = pos
            pos = max(0, pos - self.BUFSIZ)
            self.fp.seek(pos)
            s = self.fp.read(prevpos - pos)
            if not s:
                break
            while 1:
                n = max(s.rfind(b"\r"), s.rfind(b"\n"))
                if n == -1:
                    # No terminator left in this chunk: keep the fragment and
                    # prepend the preceding chunk to it.
                    buf = s + buf
                    break
                yield s[n:] + buf
                s = s[:n]
                buf = b""

    def _parse_main(self, s: bytes, i: int) -> int:
        """Skip whitespace and select the handler for the next token.

        The first non-whitespace byte decides which ``_parse_*`` handler
        becomes ``self._parse1``.
        """
        m = NONSPC.search(s, i)
        if not m:
            return len(s)
        j = m.start(0)
        c = s[j : j + 1]
        self._curtokenpos = self.bufpos + j
        if c == b"%":
            self._curtoken = b"%"
            self._parse1 = self._parse_comment
            return j + 1
        elif c == b"/":
            self._curtoken = b""
            self._parse1 = self._parse_literal
            return j + 1
        elif c in b"-+" or c.isdigit():
            self._curtoken = c
            self._parse1 = self._parse_number
            return j + 1
        elif c == b".":
            self._curtoken = c
            self._parse1 = self._parse_float
            return j + 1
        elif c.isalpha():
            self._curtoken = c
            self._parse1 = self._parse_keyword
            return j + 1
        elif c == b"(":
            self._curtoken = b""
            self.paren = 1
            self._parse1 = self._parse_string
            return j + 1
        elif c == b"<":
            self._curtoken = b""
            self._parse1 = self._parse_wopen
            return j + 1
        elif c == b">":
            self._curtoken = b""
            self._parse1 = self._parse_wclose
            return j + 1
        elif c == b"\x00":
            # NUL bytes are skipped.
            return j + 1
        else:
            # Any other single byte becomes a keyword of its own.
            self._add_token(KWD(c))
            return j + 1

    def _add_token(self, obj: PSBaseParserToken) -> None:
        """Queue ``obj`` together with the position where its token started."""
        self._tokens.append((self._curtokenpos, obj))

    def _parse_comment(self, s: bytes, i: int) -> int:
        """Consume a ``%`` comment up to (not including) the line terminator."""
        m = EOL.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        self._parse1 = self._parse_main
        # We ignore comments.
        # self._tokens.append(self._curtoken)
        return j

    def _parse_literal(self, s: bytes, i: int) -> int:
        """Scan a ``/Name`` and queue it as an interned literal.

        The name is stored as str when it decodes as UTF-8, otherwise as bytes.
        """
        m = END_LITERAL.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j : j + 1]
        if c == b"#":
            # '#' introduces a hex-escaped byte inside the name.
            self.hex = b""
            self._parse1 = self._parse_literal_hex
            return j + 1
        try:
            name: str | bytes = str(self._curtoken, "utf-8")
        except Exception:
            name = self._curtoken
        self._add_token(LIT(name))
        self._parse1 = self._parse_main
        return j

    def _parse_literal_hex(self, s: bytes, i: int) -> int:
        """Collect up to two hex digits after ``#`` and append the byte."""
        c = s[i : i + 1]
        if HEX.match(c) and len(self.hex) < 2:
            self.hex += c
            return i + 1
        if self.hex:
            self._curtoken += bytes((int(self.hex, 16),))
        self._parse1 = self._parse_literal
        return i

    def _parse_number(self, s: bytes, i: int) -> int:
        """Scan an integer; a ``.`` hands over to _parse_float."""
        m = END_NUMBER.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j : j + 1]
        if c == b".":
            self._curtoken += c
            self._parse1 = self._parse_float
            return j + 1
        try:
            self._add_token(int(self._curtoken))
        except ValueError:
            # Not a valid integer (e.g. a lone sign): drop it without a token.
            pass
        self._parse1 = self._parse_main
        return j

    def _parse_float(self, s: bytes, i: int) -> int:
        """Scan the fractional digits of a real number and queue a float."""
        m = END_NUMBER.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        try:
            self._add_token(float(self._curtoken))
        except ValueError:
            # Not a valid real number (e.g. a lone "."): drop it without a token.
            pass
        self._parse1 = self._parse_main
        return j

    def _parse_keyword(self, s: bytes, i: int) -> int:
        """Scan a keyword; ``true``/``false`` are queued as Python bools."""
        m = END_KEYWORD.search(s, i)
        if m:
            j = m.start(0)
            self._curtoken += s[i:j]
        else:
            self._curtoken += s[i:]
            return len(s)
        if self._curtoken == b"true":
            token: bool | PSKeyword = True
        elif self._curtoken == b"false":
            token = False
        else:
            token = KWD(self._curtoken)
        self._add_token(token)
        self._parse1 = self._parse_main
        return j

    def _parse_string(self, s: bytes, i: int) -> int:
        """Scan a ``(...)`` literal string, tracking nested parentheses."""
        m = END_STRING.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        c = s[j : j + 1]
        if c == b"\\":
            self.oct = b""
            self._parse1 = self._parse_string_1
            return j + 1
        if c == b"(":
            self.paren += 1
            self._curtoken += c
            return j + 1
        if c == b")":
            self.paren -= 1
            if self.paren:
                # WTF, they said balanced parens need no special treatment.
                self._curtoken += c
                return j + 1
        self._add_token(self._curtoken)
        self._parse1 = self._parse_main
        return j + 1

    def _parse_string_1(self, s: bytes, i: int) -> int:
        """Parse the escape sequence that follows a backslash in a string.

        PDF Reference 3.2.3
        """
        c = s[i : i + 1]
        if OCT_STRING.match(c) and len(self.oct) < 3:
            self.oct += c
            return i + 1

        elif self.oct:
            # Three octal digits can encode up to 0o777. The PDF specification
            # says high-order overflow is ignored, so keep the low 8 bits
            # rather than failing on file-supplied data.
            chrcode = int(self.oct, 8) & 0xFF
            self._curtoken += bytes((chrcode,))
            self._parse1 = self._parse_string
            return i

        elif c in ESC_STRING:
            self._curtoken += bytes((ESC_STRING[c],))

        elif c == b"\r" and len(s) > i + 1 and s[i + 1 : i + 2] == b"\n":
            # If current and next character is \r\n skip both because enters
            # after a \ are ignored
            i += 1

        # default action
        self._parse1 = self._parse_string
        return i + 1

    def _parse_wopen(self, s: bytes, i: int) -> int:
        """After ``<``: a second ``<`` opens a dictionary, else a hex string."""
        c = s[i : i + 1]
        if c == b"<":
            self._add_token(KEYWORD_DICT_BEGIN)
            self._parse1 = self._parse_main
            i += 1
        else:
            self._parse1 = self._parse_hexstring
        return i

    def _parse_wclose(self, s: bytes, i: int) -> int:
        """After ``>``: a second ``>`` closes a dictionary; a lone ``>`` is dropped."""
        c = s[i : i + 1]
        if c == b">":
            self._add_token(KEYWORD_DICT_END)
            i += 1
        self._parse1 = self._parse_main
        return i

    def _parse_hexstring(self, s: bytes, i: int) -> int:
        """Scan a ``<...>`` hex string and queue the decoded bytes.

        Whitespace is removed before decoding. A trailing unpaired digit is
        converted on its own (e.g. ``5`` becomes byte 0x05).
        """
        m = END_HEX_STRING.search(s, i)
        if not m:
            self._curtoken += s[i:]
            return len(s)
        j = m.start(0)
        self._curtoken += s[i:j]
        token = HEX_PAIR.sub(
            lambda m: bytes((int(m.group(0), 16),)),
            SPC.sub(b"", self._curtoken),
        )
        self._add_token(token)
        self._parse1 = self._parse_main
        return j

    def nexttoken(self) -> tuple[int, PSBaseParserToken]:
        """Return the next ``(position, token)`` pair.

        :raises PSEOF: when the input is exhausted and no token is pending.
        """
        if self.eof:
            # It's not really unexpected, come on now...
            raise PSEOF("Unexpected EOF")
        while not self._tokens:
            try:
                self.fillbuf()
                self.charpos = self._parse1(self.buf, self.charpos)
            except PSEOF:
                # If we hit EOF in the middle of a token, try to parse
                # it by tacking on whitespace, and delay raising PSEOF
                # until next time around
                self.charpos = self._parse1(b"\n", 0)
                self.eof = True
                # Oh, so there wasn't actually a token there? OK.
                if not self._tokens:
                    raise
        token = self._tokens.pop(0)
        log.debug("nexttoken: %r", token)
        return token


# Stack slots may be occupied by any of:
#  * the name of a literal
#  * the PSBaseParserToken types
#  * list (via KEYWORD_ARRAY)
#  * dict (via KEYWORD_DICT)
#  * subclass-specific extensions (e.g. PDFStream, PDFObjRef) via ExtraT
ExtraT = TypeVar("ExtraT")
PSStackType = Union[str, float, bool, PSLiteral, bytes, list, dict, ExtraT]
# A stack entry pairs the position where the object started with the object.
PSStackEntry = tuple[int, PSStackType[ExtraT]]


class PSStackParser(PSBaseParser, Generic[ExtraT]):
    """Tokenizer plus an operand stack that assembles composite objects.

    Plain tokens are pushed on ``curstack``. Array, dictionary and procedure
    delimiters open or close a nested stack (tracked in ``context``), and the
    finished container is pushed on the enclosing stack. Keywords are handed
    to do_keyword(), which subclasses override; objects meant for the caller
    are published through add_results().
    """

    def __init__(self, fp: BinaryIO) -> None:
        PSBaseParser.__init__(self, fp)
        self.reset()

    def reset(self) -> None:
        """Discard all stack state and pending results."""
        # Saved (start position, container type, stack) of enclosing containers.
        self.context: list[tuple[int, str | None, list[PSStackEntry[ExtraT]]]] = []
        # Type code of the container being built ("a", "d" or "p"), or None.
        self.curtype: str | None = None
        self.curstack: list[PSStackEntry[ExtraT]] = []
        self.results: list[PSStackEntry[ExtraT]] = []

    def seek(self, pos: int) -> None:
        """Seek the tokenizer to ``pos`` and reset the stack state."""
        PSBaseParser.seek(self, pos)
        self.reset()

    def push(self, *objs: PSStackEntry[ExtraT]) -> None:
        """Append entries to the current stack."""
        self.curstack.extend(objs)

    def pop(self, n: int) -> list[PSStackEntry[ExtraT]]:
        """Remove and return the top ``n`` entries, oldest first.

        Fewer entries are returned when the stack holds fewer than ``n``.
        """
        # ``curstack[-0:]`` is the whole list, so a non-positive count must be
        # handled separately or it would pop the entire stack.
        if n <= 0:
            return []
        objs = self.curstack[-n:]
        self.curstack[-n:] = []
        return objs

    def popall(self) -> list[PSStackEntry[ExtraT]]:
        """Remove and return every entry of the current stack."""
        objs = self.curstack
        self.curstack = []
        return objs

    def add_results(self, *objs: PSStackEntry[ExtraT]) -> None:
        """Queue entries to be handed out by nextobject()."""
        try:
            log.debug("add_results: %r", objs)
        except Exception:
            # repr() of an object may fail; that must not break parsing.
            log.debug("add_results: (unprintable object)")
        self.results.extend(objs)

    def start_type(self, pos: int, type: str) -> None:
        """Open a nested container of the given type code starting at ``pos``."""
        self.context.append((pos, self.curtype, self.curstack))
        (self.curtype, self.curstack) = (type, [])
        log.debug("start_type: pos=%r, type=%r", pos, type)

    def end_type(self, type: str) -> tuple[int, list[PSStackType[ExtraT]]]:
        """Close the current container and restore the enclosing one.

        :return: ``(start position, collected objects)`` of the container.
        :raises PSTypeError: if the open container is not of type ``type``.
        """
        if self.curtype != type:
            raise PSTypeError(f"Type mismatch: {self.curtype!r} != {type!r}")
        objs = [obj for (_, obj) in self.curstack]
        (pos, self.curtype, self.curstack) = self.context.pop()
        log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs)
        return (pos, objs)

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword token; a no-op here, meant to be overridden."""
        pass

    def nextobject(self) -> PSStackEntry[ExtraT]:
        """Return the next ``(position, object)`` entry from the results queue.

        Tokens are consumed until at least one result is available. Arrays and
        dictionaries are represented as Python lists and dictionaries.

        :return: keywords, literals, strings, numbers, arrays and dictionaries.
        :raises PSEOF: propagated from nexttoken() when the input runs out.
        """
        while not self.results:
            (pos, token) = self.nexttoken()
            if isinstance(token, (int, float, bool, str, bytes, PSLiteral)):
                # normal token
                self.push((pos, token))
            elif token == KEYWORD_ARRAY_BEGIN:
                # begin array
                self.start_type(pos, "a")
            elif token == KEYWORD_ARRAY_END:
                # end array
                try:
                    self.push(self.end_type("a"))
                except PSTypeError:
                    # Unbalanced delimiter: tolerated unless in strict mode.
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_DICT_BEGIN:
                # begin dictionary
                self.start_type(pos, "d")
            elif token == KEYWORD_DICT_END:
                # end dictionary
                try:
                    (pos, objs) = self.end_type("d")
                    if len(objs) % 2 != 0:
                        error_msg = "Invalid dictionary construct: %r" % objs
                        raise PSSyntaxError(error_msg)
                    # Pair up keys and values; entries whose value is None
                    # are left out.
                    d = {
                        literal_name(k): v
                        for (k, v) in choplist(2, objs)
                        if v is not None
                    }
                    self.push((pos, d))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif token == KEYWORD_PROC_BEGIN:
                # begin proc
                self.start_type(pos, "p")
            elif token == KEYWORD_PROC_END:
                # end proc
                try:
                    self.push(self.end_type("p"))
                except PSTypeError:
                    if settings.STRICT:
                        raise
            elif isinstance(token, PSKeyword):
                log.debug(
                    "do_keyword: pos=%r, token=%r, stack=%r",
                    pos,
                    token,
                    self.curstack,
                )
                self.do_keyword(pos, token)
            else:
                log.error(
                    "unknown token: pos=%r, token=%r, stack=%r",
                    pos,
                    token,
                    self.curstack,
                )
                self.do_keyword(pos, token)
                raise PSException
            if self.context:
                # Still inside a container: keep reading tokens.
                continue
            else:
                self.flush()
        obj = self.results.pop(0)
        try:
            log.debug("nextobject: %r", obj)
        except Exception:
            log.debug("nextobject: (unprintable object)")
        return obj


================================================
FILE: babeldoc/pdfminer/py.typed
================================================


================================================
FILE: babeldoc/pdfminer/runlength.py
================================================
#
# RunLength decoder (Adobe version) implementation based on PDF Reference
# version 1.4 section 3.3.4.
#
#  * public domain *
#


def rldecode(data: bytes) -> bytes:
    """RunLength decoder (Adobe version) implementation based on PDF Reference
    version 1.4 section 3.3.4:
        The RunLengthDecode filter decodes data that has been encoded in a
        simple byte-oriented format based on run length. The encoded data
        is a sequence of runs, where each run consists of a length byte
        followed by 1 to 128 bytes of data. If the length byte is in the
        range 0 to 127, the following length + 1 (1 to 128) bytes are
        copied literally during decompression. If length is in the range
        129 to 255, the following single byte is to be copied 257 - length
        (2 to 128) times during decompression. A length value of 128
        denotes EOD.

    Truncated input is tolerated: a literal run that is cut short yields the
    bytes that are present, and a repeat run without its data byte is dropped.
    Decoding also stops at the end of the data when no EOD marker is present.

    :param data: the run-length encoded bytes.
    :return: the decoded bytes.
    """
    data = bytes(data)
    end = len(data)
    decoded = bytearray()
    pos = 0

    while pos < end:
        length = data[pos]
        pos += 1
        if length == 128:
            # EOD marker: anything after it is ignored.
            break

        if length < 128:
            # Literal run: copy the next length + 1 bytes (slicing clamps at
            # the end of the data, so a short final run is copied as far as
            # it goes).
            run_end = pos + length + 1
            decoded += data[pos:run_end]
            pos = run_end
        else:
            # Repeat run: the next byte is repeated 257 - length times.
            if pos >= end:
                break
            decoded += data[pos : pos + 1] * (257 - length)
            pos += 1
    return bytes(decoded)


================================================
FILE: babeldoc/pdfminer/settings.py
================================================
# Global strictness switch. Code that consults it (e.g. via ``settings.STRICT``)
# raises on malformed input only when this is True; the default is lenient.
STRICT = False


================================================
FILE: babeldoc/pdfminer/utils.py
================================================
"""Miscellaneous Routines."""

import io
import pathlib
import string
from collections.abc import Callable
from collections.abc import Iterable
from collections.abc import Iterator
from html import escape
from typing import TYPE_CHECKING
from typing import Any
from typing import BinaryIO
from typing import Generic
from typing import TextIO
from typing import TypeVar
from typing import Union
from typing import cast

from babeldoc.pdfminer.pdfexceptions import PDFTypeError
from babeldoc.pdfminer.pdfexceptions import PDFValueError

if TYPE_CHECKING:
    from babeldoc.pdfminer.layout import LTComponent

import charset_normalizer  # For str encoding detection

# ``from sys import maxint as INF`` no longer works under Python 3, but PDF
# still uses 32-bit ints, so the largest signed 32-bit value serves as
# "infinity".
INF = (1 << 31) - 1


# Accepted by open_filename: a path (str or PurePath) or a file object.
FileOrName = Union[pathlib.PurePath, str, io.IOBase]
# A file object opened in either text or binary mode.
AnyIO = Union[TextIO, BinaryIO]


class open_filename:
    """Context manager that accepts either a file name or a file object.

    A name (str or pathlib.PurePath) is opened with ``open`` and closed again
    on exit; a file-like object is passed through and left open.
    """

    def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any) -> None:
        """Open ``filename`` (extra arguments go to ``open``) or adopt it.

        :raises PDFTypeError: if ``filename`` is neither a path nor an
            ``io.IOBase`` instance.
        """
        target = str(filename) if isinstance(filename, pathlib.PurePath) else filename
        if isinstance(target, str):
            self.file_handler: AnyIO = open(target, *args, **kwargs)
            # We opened it, so we are responsible for closing it.
            self.closing = True
            return
        if isinstance(target, io.IOBase):
            self.file_handler = cast(AnyIO, target)
            # The caller owns this object; leave it open on exit.
            self.closing = False
            return
        raise PDFTypeError("Unsupported input type: %s" % type(target))

    def __enter__(self) -> AnyIO:
        return self.file_handler

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        if not self.closing:
            return
        self.file_handler.close()


def make_compat_bytes(in_str: str) -> bytes:
    """Encode a str to bytes with the default codec (UTF-8).

    Non-str input trips the assertion.
    """
    assert isinstance(in_str, str), str(type(in_str))
    encoded = in_str.encode()
    return encoded


def make_compat_str(o: object) -> str:
    """Converts everything to string, if bytes guessing the encoding.

    For bytes, the encoding is guessed with ``charset_normalizer.detect``. If
    no encoding is detected, the detected codec is unknown to Python, or
    decoding fails, ``str(o)`` (the bytes repr) is returned instead.

    :param o: any object.
    :return: a str representation of ``o``.
    """
    if not isinstance(o, bytes):
        return str(o)

    encoding = charset_normalizer.detect(o)["encoding"]
    if encoding is None:
        # Detection found nothing; bytes.decode(None) would raise TypeError.
        return str(o)
    try:
        return o.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        return str(o)


def shorten_str(s: str, size: int) -> str:
    """Abbreviate ``s`` to roughly ``size`` characters.

    For ``size`` below 7 the string is simply cut to ``size`` characters.
    Otherwise a string longer than ``size`` keeps its head and tail, joined
    by " ... "; shorter strings are returned unchanged.
    """
    if size < 7:
        return s[:size]
    if len(s) <= size:
        return s
    # 5 characters are taken up by the " ... " separator.
    half = (size - 5) // 2
    head, tail = s[:half], s[-half:]
    return f"{head} ... {tail}"


def compatible_encode_method(
    bytesorstring: bytes | str,
    encoding: str = "utf-8",
    erraction: str = "ignore",
) -> str:
    """When Py2 str.encode is called, it often means bytes.encode in Py3.

    This does either.
    """
    if isinstance(bytesorstring, str):
        return bytesorstring
    assert isinstance(bytesorstring, bytes), str(type(bytesorstring))
    return bytesorstring.decode(encoding, erraction)


def paeth_predictor(left: int, above: int, upper_left: int) -> int:
    """Return whichever neighbour is closest to ``left + above - upper_left``.

    Ties are broken in the order left, above, upper-left.
    See http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
    """
    estimate = left + above - upper_left
    dist_left = abs(estimate - left)
    dist_above = abs(estimate - above)
    dist_upper_left = abs(estimate - upper_left)

    if dist_left <= dist_above and dist_left <= dist_upper_left:
        return left
    return above if dist_above <= dist_upper_left else upper_left


def apply_png_predictor(
    pred: int,
    colors: int,
    columns: int,
    bitspercomponent: int,
    data: bytes,
) -> bytes:
    """Reverse the effect of the PNG predictor

    Documentation: http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
    """
    if bitspercomponent not in [8, 1]:
        msg = "Unsupported `bitspercomponent': %d" % bitspercomponent
        raise PDFValueError(msg)

    nbytes = colors * columns * bitspercomponent // 8
    bpp = colors * bitspercomponent // 8  # number of bytes per complete pixel
    buf = []
    line_above = list(b"\x00" * columns)
    for scanline_i in range(0, len(data), nbytes + 1):
        filter_type = data[scanline_i]
        line_encoded = data[scanline_i + 1 : scanline_i + 1 + nbytes]
        raw = []

        if filter_type == 0:
            # Filter type 0: None
            raw = list(line_encoded)

        elif filter_type == 1:
            # Filter type 1: Sub
            # To reverse the effect of the Sub() filter after decompression,
            # output the following value:
            #   Raw(x) = Sub(x) + Raw(x - bpp)
            # (computed mod 256), where Raw() refers to the bytes already
            #  decoded.
            for j, sub_x in enumerate(line_encoded):
                if j - bpp < 0:
                    raw_x_bpp = 0
                else:
                    raw_x_bpp = int(raw[j - bpp])
                raw_x = (sub_x + raw_x_bpp) & 255
                raw.append(raw_x)

        elif filter_type == 2:
            # Filter type 2: Up
            # To reverse the effect of the Up() filter after decompression,
            # output the following value:
            #   Raw(x) = Up(x) + Prior(x)
            # (computed mod 256), where Prior() refers to the decoded bytes of
            # the prior scanline.
            for up_x, prior_x in zip(line_encoded, line_above, strict=False):
                raw_x = (up_x + prior_x) & 255
                raw.append(raw_x)

        elif filter_type == 3:
            # Filter type 3: Average
            # To reverse the effect of the Average() filter after
            # decompression, output the following value:
            #    Raw(x) = Average(x) + floor((Raw(x-bpp)+Prior(x))/2)
            # where the result is computed mod 256, but the prediction is
            # calculated in the same way as for encoding. Raw() refers to the
            # bytes already decoded, and Prior() refers to the decoded bytes of
            # the prior scanline.
            for j, average_x in enumerate(line_encoded):
                if j - bpp < 0:
                    raw_x_bpp = 0
                else:
                    raw_x_bpp = int(raw[j - bpp])
                prior_x = int(line_above[j])
                raw_x = (average_x + (raw_x_bpp + prior_x) // 2) & 255
                raw.append(raw_x)

        elif filter_type == 4:
            # Filter type 4: Paeth
            # To reverse the effect of the Paeth() filter after decompression,
            # output the following value:
            #    Raw(x) = Paeth(x)
            #             + PaethPredictor(Raw(x-bpp), Prior(x), Prior(x-bpp))
            # (computed mod 256), where Raw() and Prior() refer to bytes
            # already decoded. Exactly the same PaethPredictor() function is
            # used by both encoder and decoder.
            for j, paeth_x in enumerate(line_encoded):
                if j - bpp < 0:
                    raw_x_bpp = 0
                    prior_x_bpp = 0
                else:
                    raw_x_bpp = int(raw[j - bpp])
                    prior_x_bpp = int(line_above[j - bpp])
                prior_x = int(line_above[j])
                paeth = paeth_predictor(raw_x_bpp, prior_x, prior_x_bpp)
                raw_x = (paeth_x + paeth) & 255
                raw.append(raw_x)

        else:
            raise PDFValueError("Unsupported predictor value: %d" % filter_type)

        buf.extend(raw)
        line_above = raw
    return bytes(buf)


# Geometry type aliases.
Point = tuple[float, float]  # (x, y)
Rect = tuple[float, float, float, float]  # (x0, y0, x1, y1)
Matrix = tuple[float, float, float, float, float, float]  # (a, b, c, d, e, f)
# An operator letter followed by its numeric operands (presumably one step
# of a path; the letters are noted next to each variant).
PathSegment = Union[
    tuple[str],  # Literal['h']
    tuple[str, float, float],  # Literal['m', 'l']
    tuple[str, float, float, float, float],  # Literal['v', 'y']
    tuple[str, float, float, float, float, float, float],  # Literal['c']
]

#  Matrix operations
# The identity transformation: maps every point to itself.
MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0)


def parse_rect(o: Any) -> Rect:
    """Convert a four-element sequence into a tuple of four floats.

    :raises PDFValueError: if unpacking or float conversion raises ValueError.
    """
    try:
        left, bottom, right, top = o
        rect = (float(left), float(bottom), float(right), float(top))
    except ValueError:
        raise PDFValueError("Could not parse rectangle")
    return rect


def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix:
    """Returns the multiplication of two matrices.

    The result is the transformation that applies ``m1`` first and ``m0``
    second.
    """
    (a1, b1, c1, d1, e1, f1) = m1
    (a0, b0, c0, d0, e0, f0) = m0
    return (
        a0 * a1 + c0 * b1,
        b0 * a1 + d0 * b1,
        a0 * c1 + c0 * d1,
        b0 * c1 + d0 * d1,
        a0 * e1 + c0 * f1 + e0,
        b0 * e1 + d0 * f1 + f0,
    )


def translate_matrix(m: Matrix, v: Point) -> Matrix:
    """Return ``m`` with its offset shifted by ``v`` mapped through ``m``.

    The scaling/rotation components (a, b, c, d) are returned unchanged.
    """
    a, b, c, d, e, f = m
    dx, dy = v
    shifted_e = dx * a + dy * c + e
    shifted_f = dx * b + dy * d + f
    return a, b, c, d, shifted_e, shifted_f


def apply_matrix_pt(m: Matrix, v: Point) -> Point:
    """Applies a matrix to a point.

    With ``m = (a, b, c, d, e, f)`` the point ``(x, y)`` maps to
    ``(a*x + c*y + e, b*x + d*y + f)``.
    """
    (a, b, c, d, e, f) = m
    (x, y) = v
    return a * x + c * y + e, b * x + d * y + f


def apply_matrix_norm(m: Matrix, v: Point) -> Point:
    """Apply only the linear part of ``m`` to ``v``, ignoring the offset.

    Same as apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0)).
    """
    a, b, c, d, _e, _f = m
    p, q = v
    transformed_x = a * p + c * q
    transformed_y = b * p + d * q
    return transformed_x, transformed_y


#  Utility functions


def isnumber(x: object) -> bool:
    """Tell whether ``x`` is an int or a float (bool counts, being an int)."""
    return isinstance(x, int) or isinstance(x, float)


# Generic element type used in the signatures of the helpers in this module.
_T = TypeVar("_T")


def uniq(objs: Iterable[_T]) -> Iterator[_T]:
    """Yield each distinct element once, in order of first appearance.

    Elements must be hashable.
    """
    seen = set()
    for item in objs:
        if item not in seen:
            seen.add(item)
            yield item


def fsplit(pred: Callable[[_T], bool], objs: Iterable[_T]) -> tuple[list[_T], list[_T]]:
    """Partition ``objs`` by ``pred``.

    Returns a pair ``(matching, rest)``: elements for which ``pred`` is
    truthy, then the remaining ones. Input order is kept inside each list
    and ``pred`` is called exactly once per element.
    """
    matching = []
    rest = []
    for item in objs:
        bucket = matching if pred(item) else rest
        bucket.append(item)
    return matching, rest


def drange(v0: float, v1: float, d: int) -> range:
    """Returns a discrete range.

    The range holds the indices of the size-``d`` cells spanned from ``v0``
    to ``v1``; both bounds are truncated with ``int()`` before the floor
    division.
    """
    first_cell = int(v0) // d
    end_cell = int(v1 + d) // d  # exclusive stop, one past the cell of v1
    return range(first_cell, end_cell)


def get_bound(pts: Iterable[Point]) -> Rect:
    """Compute a minimal rectangle that covers all the points.

    Returns ``(x0, y0, x1, y1)``. With no points the initial sentinel
    ``(INF, INF, -INF, -INF)`` is returned unchanged.
    """
    min_x = min_y = INF
    max_x = max_y = -INF
    for px, py in pts:
        if px < min_x:
            min_x = px
        if py < min_y:
            min_y = py
        if px > max_x:
            max_x = px
        if py > max_y:
            max_y = py
    return min_x, min_y, max_x, max_y


def pick(
    seq: Iterable[_T],
    func: Callable[[_T], float],
    maxobj: _T | None = None,
) -> _T | None:
    """Return the element of ``seq`` with the highest ``func`` score.

    ``func`` is evaluated once per element. On ties the earliest element
    wins because only a strictly larger score replaces the current best.
    ``maxobj`` is returned as-is when ``seq`` is empty.
    """
    best_score = None
    for candidate in seq:
        candidate_score = func(candidate)
        if best_score is not None and not (best_score < candidate_score):
            continue
        best_score = candidate_score
        maxobj = candidate
    return maxobj


def choplist(n: int, seq: Iterable[_T]) -> Iterator[tuple[_T, ...]]:
    """Yield consecutive ``n``-tuples taken from ``seq``.

    A trailing group with fewer than ``n`` elements is dropped.
    """
    group = []
    for item in seq:
        group.append(item)
        if len(group) != n:
            continue
        yield tuple(group)
        group = []


def nunpack(s: bytes, default: int = 0) -> int:
    """Decode ``s`` as a big-endian unsigned integer of any length.

    An empty ``s`` yields ``default`` instead.
    """
    if s:
        return int.from_bytes(s, byteorder="big", signed=False)
    return default


# 256-character lookup table: PDFDocEncoding[byte] is the Unicode character
# that the byte decodes to. Runs where the byte maps to the code point of the
# same value are written as ranges; every other slot is listed explicitly.
# NOTE(review): byte 0x16 maps to U+0017 (same as byte 0x17), exactly as in
# the previous explicit table - confirm against the PDF specification.
PDFDocEncoding = "".join(
    map(
        chr,
        (
            *range(0x0000, 0x0016),  # bytes 0x00-0x15
            0x0017,  # byte 0x16
            0x0017,  # byte 0x17
            # bytes 0x18-0x1F
            0x02D8,
            0x02C7,
            0x02C6,
            0x02D9,
            0x02DD,
            0x02DB,
            0x02DA,
            0x02DC,
            *range(0x0020, 0x007F),  # bytes 0x20-0x7E
            0x0000,  # byte 0x7F
            # bytes 0x80-0x9E
            0x2022,
            0x2020,
            0x2021,
            0x2026,
            0x2014,
            0x2013,
            0x0192,
            0x2044,
            0x2039,
            0x203A,
            0x2212,
            0x2030,
            0x201E,
            0x201C,
            0x201D,
            0x2018,
            0x2019,
            0x201A,
            0x2122,
            0xFB01,
            0xFB02,
            0x0141,
            0x0152,
            0x0160,
            0x0178,
            0x017D,
            0x0131,
            0x0142,
            0x0153,
            0x0161,
            0x017E,
            0x0000,  # byte 0x9F
            0x20AC,  # byte 0xA0
            *range(0x00A1, 0x00AD),  # bytes 0xA1-0xAC
            0x0000,  # byte 0xAD
            *range(0x00AE, 0x0100),  # bytes 0xAE-0xFF
        ),
    )
)


def decode_text(s: bytes) -> str:
    """Decode a PDF text string to Unicode.

    Input starting with the byte-order mark ``FE FF`` is decoded as UTF-16
    big-endian (undecodable bytes are ignored); anything else is mapped
    byte by byte through the ``PDFDocEncoding`` table.
    """
    if not s.startswith(b"\xfe\xff"):
        return "".join(PDFDocEncoding[byte] for byte in s)
    return s[2:].decode("utf-16be", "ignore")


def enc(x: str) -> str:
    """Escape ``x`` for embedding in SGML/XML/HTML output.

    A ``bytes`` argument is not escaped; it yields an empty string.
    """
    return "" if isinstance(x, bytes) else escape(x)


def bbox2str(bbox: Rect) -> str:
    """Render a rectangle as ``x0,y0,x1,y1`` with three decimals each."""
    x0, y0, x1, y1 = bbox
    return ",".join(f"{coord:.3f}" for coord in (x0, y0, x1, y1))


def matrix2str(m: Matrix) -> str:
    """Render a matrix as ``[a,b,c,d, (e,f)]`` with two decimals each."""
    a, b, c, d, e, f = m
    linear_part = ",".join(f"{coef:.2f}" for coef in (a, b, c, d))
    offset_part = ",".join(f"{coef:.2f}" for coef in (e, f))
    return f"[{linear_part}, ({offset_part})]"


def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point:
    """A distance function between two TextBoxes.

    Consider the bounding rectangle for obj1 and obj2.
    Return the vector between the two boxes' boundaries if they don't
    overlap, otherwise return the vector between the boxes' centers.

             +------+..........+ (x1, y1)
             | obj1 |          :
             +------+www+------+
             :          | obj2 |
    (x0, y0) +..........+------+
    """
    left = min(obj1.x0, obj2.x0)
    bottom = min(obj1.y0, obj2.y0)
    right = max(obj1.x1, obj2.x1)
    top = max(obj1.y1, obj2.y1)
    # Free space left inside the joint bounding rectangle along each axis;
    # a negative value means the boxes overlap on that axis.
    gap_w = (right - left) - obj1.width - obj2.width
    gap_h = (top - bottom) - obj1.height - obj2.height
    if not (gap_w < 0 and gap_h < 0):
        return max(0, gap_w), max(0, gap_h)
    # Overlap on both axes: fall back to the center-to-center vector.
    center1_x = (obj1.x0 + obj1.x1) / 2
    center1_y = (obj1.y0 + obj1.y1) / 2
    center2_x = (obj2.x0 + obj2.x1) / 2
    center2_y = (obj2.y0 + obj2.y1) / 2
    return center1_x - center2_x, center1_y - center2_y


LTComponentT = TypeVar("LTComponentT", bound="LTComponent")


class Plane(Generic[LTComponentT]):
    """A set-like data structure for objects placed on a plane.

    Can efficiently find objects in a certain rectangular area.
    It maintains two parallel lists of objects, each of
    which is sorted by its x or y coordinate.
    """

    def __init__(self, bbox: Rect, gridsize: int = 50) -> None:
        self._seq: list[LTComponentT] = []  # preserve the object order.
        self._objs: set[LTComponentT] = set()
        self._grid: dict[Point, list[LTComponentT]] = {}
        self.gridsize = gridsize
        (self.x0, self.y0, self.x1, self.y1) = bbox

    def __repr__(self) -> str:
        return "<Plane objs=%r>" % list(self)

    def __iter__(self) -> Iterator[LTComponentT]:
        return (obj for obj in self._seq if obj in self._objs)

    def __len__(self) -> int:
        return len(self._objs)

    def __contains__(self, obj: object) -> bool:
        return obj in self._objs

    def _getrange(self, bbox: Rect) -> Iterator[Point]:
        (x0, y0, x1, y1) = bbox
        if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0:
            return
        x0 = max(self.x0, x0)
        y0 = max(self.y0, y0)
        x1 = min(self.x1, x1)
        y1 = min(self.y1, y1)
        for grid_y in drange(y0, y1, self.gridsize):
            for grid_x in drange(x0, x1, self.gridsize):
                yield (grid_x, grid_y)

    def extend(self, objs: Iterable[LTComponentT]) -> None:
        for obj in objs:
            self.add(obj)

    def add(self, obj: LTComponentT) -> None:
        """Place an object."""
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            if k not in self._grid:
                r: list[LTComponentT] = []
                self._grid[k] = r
            else:
                r = self._grid[k]
            r.append(obj)
        self._seq.append(obj)
        self._objs.add(obj)

    def remove(self, obj: LTComponentT) -> None:
        """Displace an object."""
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            try:
                self._grid[k].remove(obj)
            except (KeyError, ValueError):
                pass
        self._objs.remove(obj)

    def find(self, bbox: Rect) -> Iterator[LTComponentT]:
        """Finds objects that are in a certain area."""
        (x0, y0, x1, y1) = bbox
        done = set()
        for k in self._getrange(bbox):
            if k not in self._grid:
                continue
            for obj in self._grid[k]:
                if obj in done:
                    continue
                done.add(obj)
                if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0:
                    continue
                yield obj


# Numeral letters per decimal place (units, tens, hundreds, thousands):
# the "one" symbol and the "five" symbol of each place.
ROMAN_ONES = ["i", "x", "c", "m"]
ROMAN_FIVES = ["v", "l", "d"]


def format_int_roman(value: int) -> str:
    """Format a number as lowercase Roman numerals.

    ``value`` must satisfy ``0 < value < 4000``. The number is processed one
    decimal digit at a time, least significant first, and the per-digit
    numerals are joined most significant first.
    """
    assert 0 < value < 4000
    numerals: list[str] = []
    place = 0

    while value != 0:
        value, digit = divmod(value, 10)
        one = ROMAN_ONES[place]
        if digit == 9:
            # One below the next place, e.g. "ix", "xc", "cm".
            numeral = one + ROMAN_ONES[place + 1]
        elif digit == 4:
            # One below the five symbol, e.g. "iv", "xl", "cd".
            numeral = one + ROMAN_FIVES[place]
        elif digit >= 5:
            numeral = ROMAN_FIVES[place] + one * (digit - 5)
        else:
            numeral = one * digit
        numerals.append(numeral)
        place += 1

    return "".join(reversed(numerals))


def format_int_alpha(value: int) -> str:
    """Format a number as lowercase letters a-z, aa-zz, aaa-zzz, etc.

    This follows the PDF page-label letter style: the first 26 values map
    to ``a``..``z``, the next 26 to ``aa``..``zz`` (the same letter doubled),
    then ``aaa``..``zzz`` and so on. The previous implementation produced
    bijective base-26 instead (``28 -> "ab"``), which does not match the
    "aa to zz for the next 26" rule.

    Args:
        value: A positive integer.

    Returns:
        One letter repeated ``(value - 1) // 26 + 1`` times.
    """
    assert value > 0
    alphabet = string.ascii_lowercase
    extra_repeats, letter_index = divmod(value - 1, len(alphabet))
    return alphabet[letter_index] * (extra_repeats + 1)


================================================
FILE: babeldoc/progress_monitor.py
================================================
import asyncio
import logging
import threading
import time
from asyncio import CancelledError
from collections.abc import Callable
from typing import Optional

logger = logging.getLogger(__name__)


class ProgressMonitor:
    def __init__(
        self,
        stages: list[tuple[str, float]],
        progress_change_callback: Callable | None = None,
        finish_callback: Callable | None = None,
        report_interval: float = 0.1,
        finish_event: asyncio.Event | None = None,
        cancel_event: threading.Event | None = None,
        loop: asyncio.AbstractEventLoop | None = None,
        parent_monitor: Optional["ProgressMonitor"] = None,
        part_index: int | None = 0,
        total_parts: int | None = 1,
    ):
        self.lock = threading.Lock()
        self.parent_monitor = parent_monitor
        self.part_index = part_index
        self.total_parts = total_parts
        self.raw_stages = stages
        self.part_results = {}

        # Convert stages list to dict with name and weight
        self.stage = {}
        total_weight = sum(weight for _, weight in stages)
        for name, weight in stages:
            normalized_weight = weight / total_weight
            self.stage[name] = TranslationStage(
                name,
                0,
                self,
                normalized_weight,
                self.lock,
            )

        self.progress_change_callback = progress_change_callback
        self.finish_callback = finish_callback
        self.report_interval = report_interval
        logger.debug(f"report_interval: {self.report_interval}")
        self.last_report_time = 0
        self.finish_stage_count = 0
        self.finish_event = finish_event
        self.cancel_event = cancel_event
        self.loop = loop
        self.disable = False
        if finish_event and not loop:
            raise ValueError("finish_event requires a loop")
        if self.progress_change_callback:
            self.progress_change_callback(
                type="stage_summary",
                stages=[
                    {
                        "name": name,
                        "percent": self.stage[name].weight,
                    }
                    for name, _ in stages
                ],
                part_index=self.part_index,
                total_parts=self.total_parts,
            )

    def create_part_monitor(
        self, part_index: int, total_parts: int
    ) -> "ProgressMonitor":
        """Create a new progress monitor for a document part"""
        return ProgressMonitor(
            stages=self.raw_stages,
            progress_change_callback=self._handle_part_progress,
            finish_callback=self._handle_part_finish,
            report_interval=self.report_interval,
            cancel_event=self.cancel_event,
            loop=self.loop,
            parent_monitor=self,
            part_index=part_index,
            total_parts=total_parts,
        )

    def _handle_part_progress(self, **kwargs):
        """Handle progress updates from part monitors"""
        if self.progress_change_callback and not self.disable:
            # Add part information to progress update
            kwargs["part_index"] = kwargs.get("part_index")
            kwargs["total_parts"] = kwargs.get("total_parts")
            self.progress_change_callback(**kwargs)

    def _handle_part_finish(self, **kwargs):
        """Handle completion of a part translation"""
        if kwargs["type"] == "error":
            logger.info(f"progress_monitor handle_part_finish: {kwargs['error']}")
            self.finish_callback(type="error", error=kwargs["error"])
            return
        if "translate_result" in kwargs:
            part_index = kwargs.get("part_index")
            if part_index is not None:
                self.part_results[part_index] = kwargs["translate_result"]

        # if self.finish_callback and not self.disable:
        #     self.finish_callback(**kwargs)

    def stage_start(self, stage_name: str, total: int):
        if self.disable or self.parent_monitor and self.parent_monitor.disable:
            return DummyTranslationStage(stage_name, total, self, 0)
        stage = self.stage[stage_name]
        stage.run_time += 1
        stage.name = stage_name
        stage.display_name = f"{stage_name}" if stage.run_time > 1 else stage_name
        stage.current = 0
        stage.total = total
        if self.progress_change_callback:
            self.progress_change_callback(
                type="progress_start",
                stage=stage.display_name,
                stage_progress=0.0,
                stage_current=0,
                stage_total=total,
                overall_progress=self.calculate_current_progress(),
                part_index=self.part_index + 1,
                total_parts=self.total_parts,
            )
        self.last_report_time = 0.0
        return stage

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        logger.debug("ProgressMonitor __exit__")

    def on_finish(self):
        if self.disable or self.parent_monitor and self.parent_monitor.disable:
            return
        if self.cancel_event:
            self.cancel_event.set()
        if self.finish_event and self.loop:
            self.loop.call_soon_threadsafe(self.finish_event.set)
        if self.cancel_event and self.cancel_event.is_set():
            self.finish_callback(type="error", error=CancelledError)

    def stage_done(self, stage):
        if self.disable or self.parent_monitor and self.parent_monitor.disable:
            return
        self.last_report_time = 0.0
        self.finish_stage_count += 1
        if (
            stage.current != stage.total
            and self.cancel_event is not None
            and not self.cancel_event.is_set()
        ):
            logger.warning(
                f"Stage {stage.name} completed with {stage.current}/{stage.total} items",
            )
            return
        if self.progress_change_callback:
            self.progress_change_callback(
                type="progress_end",
                stage=stage.display_name,
                stage_progress=100.0,
                stage_current=stage.total,
                stage_total=stage.total,
                overall_progress=self.calculate_current_progress(),
                part_index=self.part_index + 1,
                total_parts=self.total_parts,
            )

    def calculate_current_progress(self, stage=None):
        if self.disable or self.parent_monitor and self.parent_monitor.disable:
            return 100
        part_weight = 1 / self.total_parts
        if self.parent_monitor:
            part_offset = self.part_index * part_weight
        else:
            part_offset = len(self.part_results) * part_weight
        part_offset *= 100
        progress = self._calculate_current_progress(stage) * part_weight + part_offset
        return progress

    def _calculate_current_progress(self, stage=None):
        """Calculate overall progress including part progress"""
        # Count completed stages
        completed_stages = sum(
            1 for s in self.stage.values() if s.run_time > 0 and s.current == s.total
        )

        # If all stages are complete, return exactly 100
        if completed_stages == len(self.stage):
            return 100

        # Calculate progress based on weights
        progress = sum(
            s.weight * 100
            for s in self.stage.values()
            if s.run_time > 0 and s.current == s.total
        )
        if stage is not None and 0 < stage.total != stage.current:
            progress += stage.weight * stage.current * 100 / stage.total

        # If this is a part monitor (has parent_monitor), return the progress as is
        if hasattr(self, "parent_monitor") and self.parent_monitor:
            return progress

        # Otherwise return the standard progress
        return progress

    def stage_update(self, stage, n: int):
        if self.disable or self.parent_monitor and self.parent_monitor.disable:
            return
        report_time_delta = time.time() - self.last_report_time
        if report_time_delta < self.report_interval and stage.total > 3:
            return
        if self.progress_change_callback:
            if stage.total != 0:
                stage_progress = stage.current * 100 / stage.total
            else:
                stage_progress = 100
            self.progress_change_callback(
                type="progress_update",
                stage=stage.display_name,
                stage_progress=stage_progress,
                stage_current=stage.current,
                stage_total=stage.total,
                overall_progress=self.calculate_current_progress(stage),
                part_index=self.part_index + 1,
                total_parts=self.total_parts,
            )
            self.last_report_time = time.time()

    def translate_done(self, translate_result):
        if self.disable or self.parent_monitor and self.parent_monitor.disable:
            return
        if self.finish_callback:
            self.finish_callback(type="finish", translate_result=translate_result)

    def translate_error(self, error):
        if self.disable or self.parent_monitor and self.parent_monitor.disable:
            return
        if self.finish_callback:
            logger.info(f"progress_monitor handle translate_error: {error}")
            self.finish_callback(type="error", error=error)

    def raise_if_cancelled(self):
        if self.cancel_event and self.cancel_event.is_set():
            raise asyncio.CancelledError

    def cancel(self):
        if self.disable or self.parent_monitor and self.parent_monitor.disable:
            return
        if self.cancel_event:
            logger.info("Translation canceled")
            self.cancel_event.set()


class TranslationStage:
    """Progress counter for one stage, usable as a context manager.

    Every change is reported to the owning monitor ``pm`` while holding
    ``lock``. Leaving the ``with`` block marks the stage as complete.
    """

    def __init__(
        self,
        name: str,
        total: int,
        pm: ProgressMonitor,
        weight: float,
        lock: threading.Lock,
    ):
        self.pm = pm
        self.lock = lock
        self.weight = weight
        self.name = name
        self.display_name = name
        self.total = total
        self.current = 0
        self.run_time = 0  # incremented by ProgressMonitor.stage_start

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        with self.lock:
            remaining = self.total - self.current
            if remaining > 0:
                logger.info(
                    f"Stage {self.name} completed with {self.current}/{self.total} items"
                )
            # Report the outstanding items first, then force completion.
            self.pm.stage_update(self, remaining)
            self.current = self.total
            self.pm.stage_done(self)

    def advance(self, n: int = 1):
        """Add ``n`` finished items and notify the monitor."""
        with self.lock:
            self.current = self.current + n
            self.pm.stage_update(self, n)


class DummyTranslationStage:
    """No-op stand-in for ``TranslationStage``.

    It stores the same descriptive attributes but never counts or reports
    anything; ``weight`` is accepted and ignored.
    """

    def __init__(self, name: str, total: int, pm: ProgressMonitor, weight: float):
        self.pm = pm
        self.total = total
        self.current = 0
        self.name = self.display_name = name

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        return None

    def advance(self, n: int = 1):
        return None


================================================
FILE: babeldoc/tools/generate_cmap_metadata.py
================================================
"""
This script is used to automatically generate the following file:
https://github.com/funstory-ai/BabelDOC-Assets/blob/main/cmap_metadata.json
"""

import argparse
import hashlib
import logging
from pathlib import Path

import orjson
from rich.logging import RichHandler

logger = logging.getLogger(__name__)


def _calc_sha3_256(path: Path) -> str:
    """Calculate sha3-256 for a given file path."""
    hash_ = hashlib.sha3_256()
    with path.open("rb") as f:
        # Read the file in chunks to handle large files efficiently
        while True:
            chunk = f.read(1024 * 1024)
            if not chunk:
                break
            hash_.update(chunk)
    return hash_.hexdigest()


def main() -> None:
    """Generate ``cmap_metadata.json`` for a BabelDOC-Assets checkout.

    Every ``*.json`` file below ``<repo>/cmap`` is recorded with its file
    name, SHA3-256 digest and size. The result is printed and written to
    ``<repo>/cmap_metadata.json``.

    Exits with a usage error when the given path does not look like the
    assets repository (missing path, ``README.md`` or ``cmap`` folder).
    """
    logging.basicConfig(level=logging.INFO, handlers=[RichHandler()])
    parser = argparse.ArgumentParser(description="Generate cmap metadata.")
    parser.add_argument(
        "assets_repo_path",
        type=str,
        help="Path to the BabelDOC-Assets repository.",
    )
    args = parser.parse_args()
    repo_path = Path(args.assets_repo_path)
    cmap_dir = repo_path / "cmap"

    # Validate explicitly: assert statements are stripped under ``python -O``
    # and would let a wrong path through.
    if not repo_path.exists():
        parser.error(f"Assets repo path {repo_path} does not exist.")
    if not (repo_path / "README.md").exists():
        parser.error(
            f"Assets repo path {repo_path} does not contain a README.md file."
        )
    if not cmap_dir.exists():
        parser.error(f"Assets repo path {repo_path} does not contain a cmap folder.")
    logger.info(f"Getting cmap metadata for {repo_path}")

    metadatas: dict[str, dict[str, object]] = {}
    for cmap_path in sorted(cmap_dir.glob("**/*.json")):
        if not cmap_path.is_file():
            continue
        logger.info(f"Getting cmap metadata for {cmap_path}")
        if cmap_path.name in metadatas:
            # Entries are keyed by bare file name, so a same-named file in
            # another subfolder replaces the earlier entry.
            logger.warning(f"Duplicate cmap file name, overwriting: {cmap_path}")
        metadatas[cmap_path.name] = {
            "file_name": cmap_path.name,
            "sha3_256": _calc_sha3_256(cmap_path),
            "size": cmap_path.stat().st_size,
        }

    metadatas_json = orjson.dumps(
        metadatas,
        option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
    ).decode()
    print(f"CMAP METADATA: {metadatas_json}")
    # orjson emits UTF-8; write it back as UTF-8 regardless of the locale.
    with (repo_path / "cmap_metadata.json").open("w", encoding="utf-8") as f:
        f.write(metadatas_json)

if __name__ == "__main__":
    main()


================================================
FILE: babeldoc/tools/generate_font_metadata.py
================================================
# This script is used to automatically generate the following files:
# https://github.com/funstory-ai/BabelDOC-Assets/blob/main/font_metadata.json


import argparse
import hashlib
import io
import logging
import re
from pathlib import Path

import babeldoc.format.pdf.high_level
import babeldoc.format.pdf.translation_config
import orjson
import pymupdf
from babeldoc.format.pdf.document_il import PdfFont
from rich.logging import RichHandler

logger = logging.getLogger(__name__)

# Substrings that mark a font name as serif; main() searches for them
# case-insensitively through serif_regex.
# NOTE(review): "serif" is also a substring of "sans-serif" style names -
# confirm no such font name is expected.
serif_keywords = [
    "serif",
]
# Substrings associated with sans-serif font names.
sans_serif_keywords = ["sans", "GoNotoKurrent"]
# Alternation patterns ("a|b|...") built from the keyword lists above.
serif_regex = "|".join(serif_keywords)
sans_serif_regex = "|".join(sans_serif_keywords)


def get_font_metadata(font_path) -> PdfFont:
    """Extract the ``PdfFont`` description of the font file at ``font_path``.

    The font is inserted into a throwaway one-page PDF, that PDF is run
    through BabelDOC's IL parser, and the first font entry of the first
    page of the resulting IL document is returned.
    """
    # Build an empty in-memory document with a single 1000x1000 page and
    # register the font on it under the name "test_font".
    doc = pymupdf.open()
    page = doc.new_page(width=1000, height=1000)
    page.insert_font("test_font", font_path)
    # The four leading positional arguments are passed as None.
    # NOTE(review): presumably acceptable because no translation is run
    # here - confirm against TranslationConfig's signature.
    translation_config = babeldoc.format.pdf.translation_config.TranslationConfig(
        *[None for _ in range(4)], doc_layout_model=1
    )
    translation_config.progress_monitor = (
        babeldoc.format.pdf.high_level.ProgressMonitor(
            babeldoc.format.pdf.high_level.get_translation_stage(translation_config)
        )
    )
    translation_config.font = font_path
    il_creater = babeldoc.format.pdf.high_level.ILCreater(translation_config)
    il_creater.mupdf = doc
    # Serialise the document into memory; the parser reads from this buffer.
    buffer = io.BytesIO()
    doc.save(buffer)
    babeldoc.format.pdf.high_level.start_parse_il(
        buffer,
        doc_zh=doc,
        resfont="test_font",
        il_creater=il_creater,
        translation_config=translation_config,
    )

    # Only the first font of the first page is of interest.
    il = il_creater.create_il()
    il_page = il.page[0]
    font_metadata = il_page.pdf_font[0]
    return font_metadata


def main():
    """Generate ``font_metadata.json`` for a BabelDOC-Assets checkout.

    Every ``*.ttf`` file below ``<repo>/fonts`` is recorded with the
    properties extracted by ``get_font_metadata`` plus its SHA3-256 digest
    and size. The result is printed and written to
    ``<repo>/font_metadata.json``.

    Exits with a usage error when the given path does not look like the
    assets repository (missing path, ``README.md`` or ``fonts`` folder).
    """
    logging.basicConfig(level=logging.INFO, handlers=[RichHandler()])
    parser = argparse.ArgumentParser(description="Get font metadata.")
    # The argument is the repository root, not a single font file.
    parser.add_argument(
        "assets_repo_path",
        type=str,
        help="Path to the BabelDOC-Assets repository.",
    )
    args = parser.parse_args()
    repo_path = Path(args.assets_repo_path)
    fonts_dir = repo_path / "fonts"

    # Validate explicitly: assert statements are stripped under ``python -O``
    # and would let a wrong path through.
    if not repo_path.exists():
        parser.error(f"Assets repo path {repo_path} does not exist.")
    if not (repo_path / "README.md").exists():
        parser.error(
            f"Assets repo path {repo_path} does not contain a README.md file."
        )
    if not fonts_dir.exists():
        parser.error(f"Assets repo path {repo_path} does not contain a fonts folder.")
    logger.info(f"Getting font metadata for {repo_path}")

    metadatas = {}
    for font_path in fonts_dir.glob("**/*.ttf"):
        logger.info(f"Getting font metadata for {font_path}")
        # Read the file in chunks to handle large files efficiently
        hash_ = hashlib.sha3_256()
        with font_path.open("rb") as f:
            while chunk := f.read(1024 * 1024):
                hash_.update(chunk)
        extracted_metadata = get_font_metadata(font_path)

        is_serif = re.search(serif_regex, extracted_metadata.name, re.IGNORECASE)
        metadatas[font_path.name] = {
            "file_name": font_path.name,
            "font_name": extracted_metadata.name,
            "encoding_length": extracted_metadata.encoding_length,
            "bold": extracted_metadata.bold,
            "italic": extracted_metadata.italic,
            "monospace": extracted_metadata.monospace,
            "serif": 1 if is_serif else 0,
            "ascent": extracted_metadata.ascent,
            "descent": extracted_metadata.descent,
            "sha3_256": hash_.hexdigest(),
            "size": font_path.stat().st_size,
        }

    metadatas_json = orjson.dumps(
        metadatas,
        option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
    ).decode()
    print(f"FONT METADATA: {metadatas_json}")
    # orjson emits UTF-8; write it back as UTF-8 regardless of the locale.
    with (repo_path / "font_metadata.json").open("w", encoding="utf-8") as f:
        f.write(metadatas_json)

if __name__ == "__main__":
    main()


================================================
FILE: babeldoc/tools/italic_assistance.py
================================================
import argparse
import json
import re
from pathlib import Path

import orjson
from babeldoc.const import CACHE_FOLDER
from babeldoc.format.pdf.document_il.utils.formular_helper import is_formulas_font
from babeldoc.format.pdf.translation_config import TranslationConfig
from rich.console import Console
from rich.table import Table

# Directory whose immediate subfolders find_latest_il_json() scans for
# il_translated.json files.
WORKING_FOLDER = Path(CACHE_FOLDER) / "working"


def find_latest_il_json() -> Path | None:
    """
    Find the newest il_translated.json in the subdirectories of WORKING_FOLDER.

    Only direct subdirectories are searched (pattern ``*/il_translated.json``).

    Returns:
        Path to the most recently modified il_translated.json file, or None if not found.
    """
    candidates = Path(WORKING_FOLDER).glob("*/il_translated.json")
    # max() keeps the first of several equally-new files, which is what a
    # stable newest-first sort followed by [0] yields as well.
    return max(candidates, key=lambda found: found.stat().st_mtime, default=None)


def extract_fonts_from_paragraph(
    paragraph: dict, page_font_map: dict[str, tuple[str, str]]
) -> set[tuple[str, str]]:
    """
    Extract all font_ids and names used in a paragraph.

    Args:
        paragraph: The paragraph dictionary
        page_font_map: Dictionary mapping font_id to (font_id, name) tuples

    Returns:
        Set of (font_id, name) tuples
    """
    fonts = set()

    # Check if paragraph has a pdfStyle with font_id
    if (
        "pdf_style" in paragraph
        and paragraph["pdf_style"]
        and "font_id" in paragraph["pdf_style"]
    ):
        font_id = paragraph["pdf_style"]["font_id"]
        if font_id in page_font_map:
            fonts.add(page_font_map[font_id])

    # Process paragraph compositions if present
    if "pdf_paragraph_composition" in paragraph:
        for comp in paragraph["pdf_paragraph_composition"]:
            # Check different composition types that might contain font information

            # Direct pdfCharacter in composition
            if "pdf_character" in comp and comp["pdf_character"]:
                char = comp["pdf_character"]
                if "pdf_style" in char and "font_id" in char["pdf_style"]:
                    font_id = char["pdf_style"]["font_id"]
                    if font_id in page_font_map:
                        fonts.add(page_font_map[font_id])

            # PdfLine in composition
            elif "pdf_line" in comp and comp["pdf_line"]:
                line = comp["pdf_line"]
                if "pdf_character" in line:
                    for char in line["pdf_character"]:
                        if "pdf_style" in char and "font_id" in char["pdf_style"]:
                            font_id = char["pdf_style"]["font_id"]
                            if font_id in page_font_map:
                                fonts.add(page_font_map[font_id])

            # PdfFormula in composition
            elif "pdf_formula" in comp and comp["pdf_formula"]:
                formula = comp["pdf_formula"]
                if "pdf_character" in formula:
                    for char in formula["pdf_character"]:
                        if "pdf_style" in char and "font_id" in char["pdf_style"]:
                            font_id = char["pdf_style"]["font_id"]
                            if font_id in page_font_map:
                                fonts.add(page_font_map[font_id])

            # PdfSameStyleCharacters in composition
            elif (
                "pdf_same_style_characters" in comp
                and comp["pdf_same_style_characters"]
            ):
                same_style = comp["pdf_same_style_characters"]
                if "pdf_style" in same_style and "font_id" in same_style["pdf_style"]:
                    font_id = same_style["pdf_style"]["font_id"]
                    if font_id in page_font_map:
                        fonts.add(page_font_map[font_id])

            # PdfSameStyleUnicodeCharacters in composition
            elif (
                "pdf_same_style_unicode_characters" in comp
                and comp["pdf_same_style_unicode_characters"]
            ):
                same_style_unicode = comp["pdf_same_style_unicode_characters"]
                if (
                    "pdf_style" in same_style_unicode
                    and same_style_unicode["pdf_style"] is not None
                    and "font_id" in same_style_unicode["pdf_style"]
                ):
                    font_id = same_style_unicode["pdf_style"]["font_id"]
                    if font_id in page_font_map:
                        fonts.add(page_font_map[font_id])

    return fonts


def find_fonts_by_debug_id(json_path: Path, debug_id_regex: str) -> dict[str, str]:
    """
    Collect the fonts used by every paragraph whose debug_id matches a pattern.

    Args:
        json_path: Path to the il_translated.json file
        debug_id_regex: Regular expression searched (case-insensitively) in each
            paragraph's debug_id; surrounding spaces and quote characters are
            stripped before compiling

    Returns:
        Dictionary mapping font_ids to font names
    """
    doc_data = orjson.loads(json_path.read_bytes())

    matcher = re.compile(debug_id_regex.strip(" \"'"), re.IGNORECASE)

    collected = set()
    for page in doc_data.get("page", []):
        # Fonts are declared per page, so the id -> (id, name) table is per page too
        page_font_map = {
            font["font_id"]: (font["font_id"], font["name"])
            for font in page.get("pdf_font", [])
            if "font_id" in font and "name" in font
        }

        for paragraph in page.get("pdf_paragraph", []):
            debug_id = paragraph.get("debug_id")
            # Skip paragraphs without a debug_id or whose debug_id does not match
            if not debug_id or not matcher.search(debug_id):
                continue
            collected.update(extract_fonts_from_paragraph(paragraph, page_font_map))

    # Each element is a (font_id, name) pair, so dict() yields font_id -> name
    return dict(collected)


def _collect_first_seen_fonts(pdf_data: dict) -> list[tuple]:
    """Return one (page_index, font_name, paragraph_index, debug_id) row per distinct font name.

    Each font name is reported for the first paragraph (in document order) that
    has a debug_id and uses it.
    """
    rows = []
    seen_names = set()
    for page_index, page in enumerate(pdf_data.get("page") or []):
        # The font table is per page, so build it once rather than per paragraph
        page_font_map = {
            font["font_id"]: (font["font_id"], font["name"])
            for font in page.get("pdf_font") or []
            if "font_id" in font and "name" in font
        }
        for paragraph_index, paragraph in enumerate(page.get("pdf_paragraph") or []):
            debug_id = paragraph.get("debug_id")
            if not debug_id:
                continue
            names = {
                font_name
                for _font_id, font_name in extract_fonts_from_paragraph(
                    paragraph, page_font_map
                )
            }
            # Two font ids can share one name; the set plus seen_names guarantee a
            # single row per name. Sorting keeps row order stable between runs.
            for font_name in sorted(names - seen_names, key=str):
                seen_names.add(font_name)
                rows.append((page_index, font_name, paragraph_index, debug_id))
    return rows


def main():
    """
    Command-line entry point: report the fonts used by matching paragraphs.

    Prints the fonts used by paragraphs whose debug_id matches any of the given
    patterns, then prints a table classifying every distinct font name found in
    the document as formula or non-formula font via is_formulas_font().

    Returns:
        0 on success, 1 when the working folder or the JSON file cannot be found.
    """
    parser = argparse.ArgumentParser(
        description="Extract fonts from paragraphs with matching debug_id"
    )
    parser.add_argument(
        "debug_id_regex", nargs="+", help="Regular expression to match debug_id values"
    )
    parser.add_argument(
        "--json-path",
        help="Path to il_translated.json (if not provided, will use the latest file)",
    )
    parser.add_argument(
        "--working-folder",
        help="Path to the working folder containing il_translated.json files",
    )

    args = parser.parse_args()

    if args.working_folder:
        # find_latest_il_json() reads this module-level global
        global WORKING_FOLDER
        WORKING_FOLDER = Path(args.working_folder)
        if not WORKING_FOLDER.exists():
            print(f"Error: Working folder does not exist: {WORKING_FOLDER}")
            return 1

    # Determine JSON file path
    if args.json_path:
        json_path = Path(args.json_path)
        if not json_path.exists():
            print(f"Error: File not found: {json_path}")
            return 1
    else:
        json_path = find_latest_il_json()
        if not json_path:
            print("Error: Could not find any il_translated.json file")
            return 1

    print(f"Using JSON file: {json_path}")

    # Several patterns on the command line are OR-ed into a single regex
    matched_fonts = find_fonts_by_debug_id(json_path, "|".join(args.debug_id_regex))

    # Output the results
    if matched_fonts:
        print(
            f"Found {len(matched_fonts)} fonts in paragraphs matching debug_id pattern: {args.debug_id_regex}"
        )
        print(json.dumps(matched_fonts, indent=2, ensure_ascii=False))
    else:
        print(
            f"No fonts found for paragraphs matching debug_id pattern: {args.debug_id_regex}"
        )

    # Read intermediate representation
    with json_path.open(encoding="utf-8") as f:
        pdf_data = json.load(f)

    font_rows = _collect_first_seen_fonts(pdf_data)

    # NOTE(review): the config object is not used below; the construction is kept
    # in case it has side effects this tool relies on — confirm and remove.
    translation_config = TranslationConfig(  # noqa: F841
        *[None for _ in range(3)], lang_out="zh_cn", doc_layout_model=1
    )

    # Create table
    table = Table(title="Font Recognition Results")
    table.add_column("Page #", justify="center", style="cyan")
    table.add_column("Paragraph #", justify="center", style="cyan")
    table.add_column("DEBUG_ID", justify="center", style="cyan")
    table.add_column("Font Name", style="magenta")
    table.add_column("Recognition Result", justify="center")

    # Output results
    for page_index, font_name, paragraph_index, font_debug_id in font_rows:
        if is_formulas_font(font_name, None):
            verdict = "[bold red]Formula Font[/bold red]"
        else:
            verdict = "[bold blue]Non-Formula Font[/bold blue]"
        table.add_row(
            str(page_index),
            str(paragraph_index),
            str(font_debug_id),
            font_name,
            verdict,
        )

    # Print table
    Console().print(table)

    return 0


if __name__ == "__main__":
    # Use SystemExit directly: the exit() builtin is injected by the site module
    # and is not available when Python runs without it (e.g. python -S).
    raise SystemExit(main())


================================================
FILE: babeldoc/tools/italic_recognize_tool.py
================================================
# Identify non-formula italic fonts that were incorrectly classified as formula fonts in BabelDOC translation results (intermediate representation)

import json

import babeldoc.tools.italic_assistance as italic_assistance
from babeldoc.format.pdf.document_il.midend.styles_and_formulas import StylesAndFormulas
from babeldoc.format.pdf.translation_config import TranslationConfig
from rich.console import Console
from rich.table import Table

console = Console()

json_path = italic_assistance.find_latest_il_json()
if json_path is None:
    # find_latest_il_json() returns None when no file exists; stop with a
    # readable message instead of failing on None.open() below.
    raise SystemExit("Error: Could not find any il_translated.json file")

# One (page_index, font_name, paragraph_index, debug_id) row per distinct font name
fonts = []
seen_font_names = set()

# Read intermediate representation
with json_path.open(encoding="utf-8") as f:
    pdf_data = json.load(f)

for page_index, page in enumerate(pdf_data["page"]):
    # The font table is per page, so build it once rather than per paragraph
    page_font_map = {
        font["font_id"]: (font["font_id"], font["name"])
        for font in page.get("pdf_font") or []
        if "font_id" in font and "name" in font
    }
    for paragraph_index, paragraph_content in enumerate(page["pdf_paragraph"]):
        font_debug_id = paragraph_content["debug_id"]
        if not font_debug_id:
            continue

        # Extract fonts from paragraph
        paragraph_fonts = italic_assistance.extract_fonts_from_paragraph(
            paragraph_content, page_font_map
        )
        name_set = {font_name for _font_id, font_name in paragraph_fonts}

        # Two font ids can share one name; record each name only once, for the
        # first paragraph that uses it. Sorting keeps row order stable between runs.
        for each_name in sorted(name_set - seen_font_names, key=str):
            seen_font_names.add(each_name)
            fonts.append((page_index, each_name, paragraph_index, font_debug_id))

# Initialize checker
translation_config = TranslationConfig(
    *[None for _ in range(3)], lang_out="zh_cn", doc_layout_model=1
)
checker = StylesAndFormulas(translation_config)

# Create table
table = Table(title="Font Recognition Results")
table.add_column("Page #", justify="center", style="cyan")
table.add_column("Paragraph #", justify="center", style="cyan")
table.add_column("DEBUG_ID", justify="center", style="cyan")
table.add_column("Font Name", style="magenta")
table.add_column("Recognition Result", justify="center")

# Output results
for each_font in fonts:
    page_index, font_name, paragraph_index, font_debug_id = each_font

    if checker.is_formulas_font(font_name):
        recognition_result = "[bold red]Formula Font[/bold red]"
    else:
        recognition_result = "[bold blue]Non-Formula Font[/bold blue]"
    table.add_row(
        str(page_index),
        str(paragraph_index),
        str(font_debug_id),
        font_name,
        recognition_result,
    )

# Print table
console.print(table)


================================================
FILE: babeldoc/translator/__init__.py
================================================


================================================
FILE: babeldoc/translator/cache.py
================================================
import json
import logging
import random
import threading
from pathlib import Path

import peewee
from peewee import SQL
from peewee import AutoField
from peewee import CharField
from peewee import Model
from peewee import SqliteDatabase
from peewee import TextField
from peewee import fn  # For aggregation functions

from babeldoc.const import CACHE_FOLDER

logger = logging.getLogger(__name__)

# Deferred database handle: created without a path here; init_db() supplies
# the database file through db.init().
db = SqliteDatabase(None)

# Cleanup configuration
CLEAN_PROBABILITY = 0.001  # 0.1% chance to trigger cleanup
MAX_CACHE_ROWS = 50_000  # Size of the id window that _cleanup() keeps

# Thread-level mutex to ensure only one cleanup runs at a time within the process
_cleanup_lock = threading.Lock()


class _TranslationCache(Model):
    """Peewee model for one cached translation.

    A row is keyed by (translate_engine, translate_engine_params,
    original_text); the table constraint below makes an insert with an
    existing key replace the stored row.
    """

    # Auto-incrementing primary key; TranslationCache._cleanup() uses it to
    # decide which rows are old.
    id = AutoField()
    # Engine name, declared with max_length=20.
    translate_engine = CharField(max_length=20)
    # JSON text of the engine parameters, as built by TranslationCache.replace_params().
    translate_engine_params = TextField()
    original_text = TextField()
    translation = TextField()

    class Meta:
        database = db
        constraints = [
            SQL(
                """
            UNIQUE (
                translate_engine,
                translate_engine_params,
                original_text
                )
            ON CONFLICT REPLACE
            """,
            ),
        ]


class TranslationCache:
    @staticmethod
    def _sort_dict_recursively(obj):
        if isinstance(obj, dict):
            return {
                k: TranslationCache._sort_dict_recursively(v)
                for k in sorted(obj.keys())
                for v in [obj[k]]
            }
        elif isinstance(obj, list):
            return [TranslationCache._sort_dict_recursively(item) for item in obj]
        return obj

    def __init__(self, translate_engine: str, translate_engine_params: dict = None):
        self.translate_engine = translate_engine
        self.replace_params(translate_engine_params)

    # The program typically starts multi-threaded translation
    # only after cache parameters are fully configured,
    # so thread safety doesn't need to be considered here.
    def replace_params(self, params: dict = None):
        if params is None:
            params = {}
        self.params = params
        params = self._sort_dict_recursively(params)
        self.translate_engine_params = json.dumps(params)

    def update_params(self, params: dict = None):
        if params is None:
            params = {}
        self.params.update(params)
        self.replace_params(self.params)

    def add_params(self, k: str, v):
        self.params[k] = v
        self.replace_params(self.params)

    # Since peewee and the underlying sqlite are thread-safe,
    # get and set operations don't need locks.
    def get(self, original_text: str) -> str | None:
        try:
            result = _TranslationCache.get_or_none(
                translate_engine=self.translate_engine,
                translate_engine_params=self.translate_engine_params,
                original_text=original_text,
            )
            # Trigger cache cleanup with a small probability.
            if result and random.random() < CLEAN_PROBABILITY:  # noqa: S311
                self._cleanup()
            return result.translation if result else None
        except peewee.OperationalError as e:
            if "database is locked" in str(e):
                logger.debug("Cache is locked")
                return None
            else:
                raise

    def set(self, original_text: str, translation: str):
        try:
            _TranslationCache.create(
                translate_engine=self.translate_engine,
                translate_engine_params=self.translate_engine_params,
                original_text=original_text,
                translation=translation,
            )
            # Trigger cache cleanup with a small probability.
            if random.random() < CLEAN_PROBABILITY:  # noqa: S311
                self._cleanup()
        except peewee.OperationalError as e:
            if "database is locked" in str(e):
                logger.debug("Cache is locked")
            else:
                raise

    def _cleanup(self) -> None:
        """Remove old cache entries, keeping only the latest MAX_CACHE_ROWS records."""
        # Quick exit if another thread is already performing cleanup.
        if not _cleanup_lock.acquire(blocking=False):
            return
        try:
            logger.info("Cleaning up translation cache...")
            max_id = _TranslationCache.select(fn.MAX(_TranslationCache.id)).scalar()
            # Nothing to do if table is empty or below threshold
            if not max_id or max_id <= MAX_CACHE_ROWS:
                return
            threshold = max_id - MAX_CACHE_ROWS
            # Delete rows with id *less than or equal* to threshold so that at most MAX_CACHE_ROWS remain.
            _TranslationCache.delete().where(
                _TranslationCache.id <= threshold
            ).execute()
        finally:
            _cleanup_lock.release()


def init_db(remove_exists=False):
    """Point the module-level database at the on-disk cache file and create its table.

    :param remove_exists: when true, delete an existing cache database
        (together with its "-wal" and "-shm" companion files) before opening it
    """
    CACHE_FOLDER.mkdir(parents=True, exist_ok=True)
    # The current version does not support database migration, so add the version number to the file name.
    cache_db_path = CACHE_FOLDER / "cache.v1.db"
    logger.info(f"Initializing cache database at {cache_db_path}")
    if remove_exists:
        # The database runs in WAL mode, so remove the companion files as well
        # (same set clean_test_db() removes); a leftover write-ahead log must
        # not be paired with a freshly created database file.
        for suffix in ("", "-wal", "-shm"):
            Path(str(cache_db_path) + suffix).unlink(missing_ok=True)
    db.init(
        cache_db_path,
        pragmas={
            "journal_mode": "wal",
            "busy_timeout": 1000,
        },
    )
    db.create_tables([_TranslationCache], safe=True)


def init_test_db():
    """Create a temporary SQLite database, bind the cache model to it and return it.

    The database file is not removed automatically; pass the returned object
    to clean_test_db() when done.
    """
    import tempfile

    # Only the unique file name is needed; the handle is closed on leaving the block.
    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as reserved:
        cache_db_path = reserved.name

    pragmas = {
        "journal_mode": "wal",
        "busy_timeout": 1000,
    }
    test_db = SqliteDatabase(cache_db_path, pragmas=pragmas)
    models = [_TranslationCache]
    test_db.bind(models, bind_refs=False, bind_backrefs=False)
    test_db.connect()
    test_db.create_tables(models, safe=True)
    return test_db


def clean_test_db(test_db):
    """Drop the cache table, close test_db and delete its database files.

    Besides the main file, the "-wal" and "-shm" companions are removed when
    they exist.
    """
    test_db.drop_tables([_TranslationCache])
    test_db.close()

    main_file = Path(test_db.database)
    for suffix in ("", "-wal", "-shm"):
        leftover = Path(str(main_file) + suffix)
        if leftover.exists():
            leftover.unlink()


# Import-time side effect: importing this module opens (and creates if needed)
# the default cache database.
init_db()


================================================
FILE: babeldoc/translator/translator.py
================================================
import contextlib
import logging
import threading
import time
import unicodedata
from abc import ABC
from abc import abstractmethod

import httpx
import openai
from tenacity import before_sleep_log
from tenacity import retry
from tenacity import retry_if_exception_type
from tenacity import stop_after_attempt
from tenacity import wait_exponential

from babeldoc.babeldoc_exception.BabelDOCException import ContentFilterError
from babeldoc.translator.cache import TranslationCache
from babeldoc.utils.atomic_integer import AtomicInteger

logger = logging.getLogger(__name__)


def remove_control_characters(s):
    """Return s without the characters whose Unicode general category starts with "C"."""
    kept = [ch for ch in s if not unicodedata.category(ch).startswith("C")]
    return "".join(kept)


class RateLimiter:
    """
    Spaces calls to wait() at least 1 / max_qps seconds apart.

    All state is guarded by a single lock, and time is read from the monotonic
    clock so adjustments of the system clock do not affect the pacing.
    """

    @staticmethod
    def _interval_for(max_qps):
        """Validate max_qps and return the minimum spacing between requests in seconds."""
        if max_qps <= 0:
            raise ValueError("max_qps must be a positive number")
        return 1.0 / max_qps

    def __init__(self, max_qps: int):
        interval = self._interval_for(max_qps)
        self.max_qps = max_qps
        self.min_interval = interval
        self.lock = threading.Lock()
        # Monotonic timestamp before which the next request must not start
        self.next_request_time = time.monotonic()

    def wait(self, _rate_limit_params: dict = None):
        """
        Block the caller until its request slot is reached, then reserve the following slot.
        """
        with self.lock:
            remaining = self.next_request_time - time.monotonic()
            if remaining > 0:
                time.sleep(remaining)

            # After an idle period the reserved slot lies in the past, so the
            # next slot is counted from the current time instead.
            slot_start = max(self.next_request_time, time.monotonic())
            self.next_request_time = slot_start + self.min_interval

    def set_max_qps(self, max_qps: int):
        """
        Change the allowed queries per second while holding the limiter lock.
        """
        interval = self._interval_for(max_qps)
        with self.lock:
            self.max_qps = max_qps
            self.min_interval = interval


# Process-wide limiter shared by every translator in this module; starts at 5
# requests per second.
_translate_rate_limiter = RateLimiter(5)


def set_translate_rate_limiter(max_qps):
    """Change the request rate of the shared limiter.

    Raises ValueError (from RateLimiter.set_max_qps) when max_qps is not positive.
    """
    _translate_rate_limiter.set_max_qps(max_qps)


class BaseTranslator(ABC):
    # Due to cache limitations, name should be within 20 characters.
    # cache.py: translate_engine = CharField(max_length=20)
    name = "base"
    lang_map = {}

    def __init__(self, lang_in, lang_out, ignore_cache):
        self.ignore_cache = ignore_cache
        lang_in = self.lang_map.get(lang_in.lower(), lang_in)
        lang_out = self.lang_map.get(lang_out.lower(), lang_out)
        self.lang_in = lang_in
        self.lang_out = lang_out

        self.cache = TranslationCache(
            self.name,
            {
                "lang_in": lang_in,
                "lang_out": lang_out,
            },
        )

        self.translate_call_count = 0
        self.translate_cache_call_count = 0

    def __del__(self):
        with contextlib.suppress(Exception):
            logger.info(
                f"{self.name} translate call count: {self.translate_call_count}"
            )
            logger.info(
                f"{self.name} translate cache call count: {self.translate_cache_call_count}",
            )

    def add_cache_impact_parameters(self, k: str, v):
        """
        Add parameters that affect the translation quality to distinguish the translation effects under different parameters.
        :param k: key
        :param v: value
        """
        self.cache.add_params(k, v)

    def translate(self, text, ignore_cache=False, rate_limit_params: dict = None):
        """
        Translate the text, and the other part should call this method.
        :param text: text to translate
        :return: translated text
        """
        self.translate_call_count += 1
        if not (self.ignore_cache or ignore_cache):
            try:
                cache = self.cache.get(text)
                if cache is not None:
                    self.translate_cache_call_count += 1
                    return cache
            except Exception as e:
                logger.debug(f"try get cache failed, ignore it: {e}")
        _translate_rate_limiter.wait()
        translation = self.do_translate(text, rate_limit_params)
        if not (self.ignore_cache or ignore_cache):
            self.cache.set(text, translation)
        return translation

    def llm_translate(self, text, ignore_cache=False, rate_limit_params: dict = None):
        """
        Translate the text, and the other part should call this method.
        :param text: text to translate
        :return: translated text
        """
        self.translate_call_count += 1
        if not (self.ignore_cache or ignore_cache):
            try:
                cache = self.cache.get(text)
                if cache is not None:
                    self.translate_cache_call_count += 1
                    return cache
            except Exception as e:
                logger.debug(f"try get cache failed, ignore it: {e}")
        _translate_rate_limiter.wait()
        translation = self.do_llm_translate(text, rate_limit_params)
        if not (self.ignore_cache or ignore_cache):
            try:
                self.cache.set(text, translation)
            except Exception as e:
                logger.debug(
                    f"try set cache failed, ignore it: {e}, text: {text}, translation: {translation}"
                )
        return translation

    @abstractmethod
    def do_llm_translate(self, text, rate_limit_params: dict = None):
        """
        Actual translate text, override this method
        :param text: text to translate
        :return: translated text
        """
        raise NotImplementedError

    @abstractmethod
    def do_translate(self, text, rate_limit_params: dict = None):
        """
        Actual translate text, override this method
        :param text: text to translate
        :return: translated text
        """
        logger.critical(
            f"Do not call BaseTranslator.do_translate. "
            f"Translator: {self}. "
            f"Text: {text}. ",
        )
        raise NotImplementedError

    def __str__(self):
        return f"{self.name} {self.lang_in} {self.lang_out} {self.model}"

    def get_rich_text_left_placeholder(self, placeholder_id: int | str):
        return f"<b{placeholder_id}>"

    def get_rich_text_right_placeholder(self, placeholder_id: int | str):
        return f"</b{placeholder_id}>"

    def get_formular_placeholder(self, placeholder_id: int | str):
        return self.get_rich_text_left_placeholder(placeholder_id)


class OpenAITranslator(BaseTranslator):
    """Translator that sends requests through the openai client's chat completions API.

    Model, prompt, temperature (when sent), reasoning effort and JSON mode are
    registered as cache-impact parameters, so results obtained under different
    settings are cached separately.
    """

    # https://github.com/openai/openai-python
    name = "openai"

    def __init__(
        self,
        lang_in,
        lang_out,
        model,
        base_url=None,
        api_key=None,
        ignore_cache=False,
        enable_json_mode_if_requested=False,
        send_dashscope_header=False,
        send_temperature=True,
        reasoning=None,
    ):
        """
        :param lang_in: source language
        :param lang_out: target language
        :param model: model name sent with every request
        :param base_url: passed to openai.OpenAI
        :param api_key: passed to openai.OpenAI
        :param ignore_cache: when true, the translation cache is bypassed
        :param enable_json_mode_if_requested: allow do_llm_translate to request a
            JSON object response when rate_limit_params["request_json_mode"] is set
        :param send_dashscope_header: add the X-DashScope-DataInspection header
            to do_llm_translate requests
        :param send_temperature: include the temperature option in requests
        :param reasoning: when set, sent as extra_body["reasoning"]["effort"]
        """
        super().__init__(lang_in, lang_out, ignore_cache)
        self.options = {"temperature": 0}  # Random sampling may break formula markers
        self.extra_body = {}
        # if 'gpt-5' in model and 'gpt-5-chat' not in model:
        #     self.extra_body['reasoning'] = {
        #         "effort": "minimal"
        #     }
        #     self.add_cache_impact_parameters("reasoning-effort", 'minimal')
        self.reasoning = reasoning
        self.client = openai.OpenAI(
            base_url=base_url,
            api_key=api_key,
            http_client=httpx.Client(
                limits=httpx.Limits(
                    max_connections=None, max_keepalive_connections=None
                ),
                timeout=60,  # Set a reasonable timeout
            ),
        )
        if send_temperature:
            self.add_cache_impact_parameters("temperature", self.options["temperature"])
        self.model = model
        self.enable_json_mode_if_requested = enable_json_mode_if_requested
        self.send_dashscope_header = send_dashscope_header
        self.send_temperature = send_temperature
        self.add_cache_impact_parameters("model", self.model)
        # The prompt template (rendered with empty text) is part of the cache key
        self.add_cache_impact_parameters("prompt", self.prompt(""))
        if self.reasoning:
            self.extra_body["reasoning"] = {"effort": self.reasoning}
            self.add_cache_impact_parameters("reasoning", self.reasoning)
        if self.enable_json_mode_if_requested:
            self.add_cache_impact_parameters(
                "enable_json_mode_if_requested", self.enable_json_mode_if_requested
            )
        self.token_count = AtomicInteger()
        self.prompt_token_count = AtomicInteger()
        self.completion_token_count = AtomicInteger()
        self.cache_hit_prompt_token_count = AtomicInteger()

    @retry(
        retry=retry_if_exception_type(openai.RateLimitError),
        stop=stop_after_attempt(100),
        wait=wait_exponential(multiplier=1, min=1, max=15),
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
    def do_translate(self, text, rate_limit_params: dict = None) -> str:
        """Translate text with the built-in prompt; retried on openai.RateLimitError.

        :param text: text to translate
        :param rate_limit_params: unused by this implementation
        :return: stripped content of the first choice
        """
        options = {}
        if self.send_temperature:
            options.update(self.options)

        response = self.client.chat.completions.create(
            model=self.model,
            **options,
            messages=self.prompt(text),
            extra_body=self.extra_body,
        )
        self.update_token_count(response)
        return response.choices[0].message.content.strip()

    def prompt(self, text):
        """Build the system and user messages used by do_translate."""
        return [
            {
                "role": "system",
                "content": "You are a professional,authentic machine translation engine.",
            },
            {
                "role": "user",
                "content": f";; Treat next line as plain text input and translate it into {self.lang_out}, output translation ONLY. If translation is unnecessary (e.g. proper nouns, codes, {'{{1}}, etc. '}), return the original text. NO explanations. NO notes. Input:\n\n{text}",
            },
        ]

    @retry(
        retry=retry_if_exception_type(openai.RateLimitError),
        stop=stop_after_attempt(100),
        wait=wait_exponential(multiplier=1, min=1, max=15),
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
    def do_llm_translate(self, text, rate_limit_params: dict = None):
        """Send text as a single user message; retried on openai.RateLimitError.

        :param text: complete prompt to send; None is returned unchanged
        :param rate_limit_params: optional dict; a truthy "request_json_mode"
            entry requests a JSON object response when JSON mode is enabled
        :return: stripped content of the first choice
        :raises ContentFilterError: when the request is rejected with the
            provider's content-inspection message
        """
        if text is None:
            return None

        options = {}
        if self.send_temperature:
            options.update(self.options)
        # rate_limit_params defaults to None, so guard before calling .get()
        if self.enable_json_mode_if_requested and (rate_limit_params or {}).get(
            "request_json_mode", False
        ):
            options["response_format"] = {"type": "json_object"}

        extra_headers = {}
        if self.send_dashscope_header:
            extra_headers["X-DashScope-DataInspection"] = (
                '{"input": "disable", "output": "disable"}'
            )
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                **options,
                max_tokens=2048,
                messages=[
                    {
                        "role": "user",
                        "content": text,
                    },
                ],
                extra_headers=extra_headers,
                extra_body=self.extra_body,
            )
            self.update_token_count(response)
            return response.choices[0].message.content.strip()
        except openai.BadRequestError as e:
            # Map the provider's content-inspection rejection to a dedicated error
            if (
                "系统检测到输入或生成内容可能包含不安全或敏感内容，请您避免输入易产生敏感内容的提示语，感谢您的配合。"
                in e.message
            ):
                raise ContentFilterError(e.message) from e
            else:
                raise

    def update_token_count(self, response):
        """Add the usage numbers of response to the token counters.

        Failures are logged and never propagate to the translation call.
        """
        try:
            if response.usage and response.usage.total_tokens:
                self.token_count.inc(response.usage.total_tokens)
            if response.usage and response.usage.prompt_tokens:
                self.prompt_token_count.inc(response.usage.prompt_tokens)
            if response.usage and response.usage.completion_tokens:
                self.completion_token_count.inc(response.usage.completion_tokens)
            # Support both response.usage.prompt_cache_hit_tokens and response.prompt_tokens_details.cached_tokens
            # NOTE(review): only the top-level response.prompt_tokens_details is
            # inspected, not response.usage.prompt_tokens_details — confirm
            # which field the targeted providers populate.
            hit_count = 0
            if response.usage and hasattr(response.usage, "prompt_cache_hit_tokens"):
                # The attribute may exist but be None; treat that as zero
                hit_count = getattr(response.usage, "prompt_cache_hit_tokens", 0) or 0
            if hasattr(response, "prompt_tokens_details") and getattr(
                response.prompt_tokens_details, "cached_tokens", 0
            ):
                hit_count += getattr(response.prompt_tokens_details, "cached_tokens", 0)
            if hit_count:
                self.cache_hit_prompt_token_count.inc(hit_count)
        except Exception:
            logger.exception("Error updating token count")

    def get_formular_placeholder(self, placeholder_id: int | str):
        """Return (placeholder text, regex matching it with optional whitespace)."""
        return "{v" + str(placeholder_id) + "}", f"{{\\s*v\\s*{placeholder_id}\\s*}}"

    def get_rich_text_left_placeholder(self, placeholder_id: int | str):
        """Return (opening style tag, regex matching it with optional whitespace)."""
        return (
            f"<style id='{placeholder_id}'>",
            f"<\\s*style\\s*id\\s*=\\s*'\\s*{placeholder_id}\\s*'\\s*>",
        )

    def get_rich_text_right_placeholder(self, placeholder_id: int | str):
        """Return (closing style tag, regex matching it with optional whitespace)."""
        return "</style>", r"<\s*\/\s*style\s*>"


================================================
FILE: babeldoc/utils/__init__.py
================================================


================================================
FILE: babeldoc/utils/atomic_integer.py
================================================
import threading


class AtomicInteger:
    """Integer counter whose reads and updates are guarded by a lock."""

    def __init__(self, value=0):
        self._lock = threading.Lock()
        self._value = int(value)

    def inc(self, d=1):
        """Add ``d`` (coerced with ``int``) and return the updated total."""
        with self._lock:
            updated = self._value + int(d)
            self._value = updated
        return updated

    def dec(self, d=1):
        """Subtract ``d`` by delegating to :meth:`inc` with the negated amount."""
        return self.inc(-d)

    @property
    def value(self):
        """Current value, read while holding the lock."""
        with self._lock:
            current = self._value
        return current

    @value.setter
    def value(self, v):
        # Coerce to int so the stored value always stays an integer.
        with self._lock:
            self._value = int(v)
            return self._value


================================================
FILE: babeldoc/utils/memory.py
================================================
import os
import sys
import time
from pathlib import Path

try:
    import psutil
except ImportError:
    psutil = None


def _parse_pss_from_smaps_rollup(pid: int) -> int | None:
    """
    Try to read PSS from /proc/<pid>/smaps_rollup.
    Returns PSS in bytes, or None if not available/readable.
    """
    try:
        smaps_rollup_path = Path(f"/proc/{pid}/smaps_rollup")
        with smaps_rollup_path.open() as f:
            for line in f:
                if line.startswith("Pss:"):
                    # Format: "Pss:            1234 kB"
                    parts = line.split()
                    if len(parts) >= 2:
                        pss_kb = int(parts[1])
                        return pss_kb * 1024  # Convert to bytes
        return None
    except (FileNotFoundError, PermissionError, ValueError, OSError):
        return None


def _parse_pss_from_smaps(pid: int) -> int | None:
    """
    Try to read PSS from /proc/<pid>/smaps and sum all Pss entries.
    Returns PSS in bytes, or None if not available/readable.
    """
    try:
        smaps_path = Path(f"/proc/{pid}/smaps")
        total_pss_kb = 0
        with smaps_path.open() as f:
            for line in f:
                if line.startswith("Pss:"):
                    # Format: "Pss:            1234 kB"
                    parts = line.split()
                    if len(parts) >= 2:
                        total_pss_kb += int(parts[1])
        if total_pss_kb > 0:
            return total_pss_kb * 1024  # Convert to bytes
        return None
    except (FileNotFoundError, PermissionError, ValueError, OSError):
        return None


def _get_pss_linux(pid: int) -> int | None:
    """Obtain the PSS of a process from the Linux ``/proc`` filesystem.

    The lightweight ``smaps_rollup`` reader is tried first; only when it yields
    nothing is the heavier per-mapping ``smaps`` reader consulted.

    Args:
        pid: Process ID to inspect.

    Returns:
        PSS in bytes, or None if neither source provides a value.
    """
    rollup_pss = _parse_pss_from_smaps_rollup(pid)
    if rollup_pss is None:
        # Rollup unavailable: fall back to summing the full smaps listing.
        return _parse_pss_from_smaps(pid)
    return rollup_pss


def _get_rss_psutil(pid: int) -> int | None:
    """Look up the RSS of one process through psutil.

    Args:
        pid: Process ID to inspect.

    Returns:
        RSS in bytes, or None when psutil is not installed or the lookup fails
        with NoSuchProcess, AccessDenied or TimeoutExpired.
    """
    if psutil is not None:
        try:
            return psutil.Process(pid).memory_info().rss
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.TimeoutExpired):
            pass
    return None


def _get_single_process_memory(
    pid: int, prefer_pss: bool = True, use_smaps_rollup_only: bool = False
) -> int | None:
    """Measure the memory of exactly one process, ignoring its children.

    Args:
        pid: Process ID to inspect.
        prefer_pss: On Linux, try a PSS reading before falling back to RSS.
        use_smaps_rollup_only: When PSS is preferred, restrict the PSS attempt
            to smaps_rollup (skipping the heavier smaps scan).

    Returns:
        Memory usage in bytes, or None if every method fails.
    """
    if sys.platform == "linux" and prefer_pss:
        # Pick the PSS reader once, then share the "use it if it worked" logic.
        read_pss = (
            _parse_pss_from_smaps_rollup if use_smaps_rollup_only else _get_pss_linux
        )
        pss_bytes = read_pss(pid)
        if pss_bytes is not None:
            return pss_bytes

    # No PSS reading available (or not requested): report RSS instead.
    return _get_rss_psutil(pid)


def get_memory_usage_bytes(
    pid: int | None = None,
    include_children: bool = True,
    prefer_pss: bool = True,
) -> int:
    """
    Get memory usage of a process (and optionally its children).

    On Linux with prefer_pss=True:
      - Tries /proc/<pid>/smaps_rollup first (lightweight)
      - Falls back to /proc/<pid>/smaps if smaps_rollup unavailable (heavier)
      - Falls back to psutil RSS if smaps unavailable

    On non-Linux systems or prefer_pss=False:
      - Uses psutil RSS

    Whether the parent's smaps_rollup was readable decides how children are
    measured: if it was, children only try smaps_rollup before falling back to
    RSS; otherwise they go through the full PSS chain.

    Args:
        pid: Process ID to monitor. If None, uses current process.
        include_children: If True, also includes memory of child processes.
            Children can only be enumerated when psutil is installed.
        prefer_pss: If True on Linux, attempts to use PSS; otherwise uses RSS.

    Returns:
        Total memory usage in bytes (guaranteed non-negative). Processes whose
        memory cannot be read contribute nothing to the total.
    """
    if pid is None:
        pid = os.getpid()

    # Probe smaps_rollup once for the parent. The reading doubles as the
    # parent's measurement, so the file is not parsed a second time.
    memory = None
    use_smaps_rollup_only = False
    if sys.platform == "linux" and prefer_pss:
        memory = _parse_pss_from_smaps_rollup(pid)
        use_smaps_rollup_only = memory is not None

    if memory is None:
        # No rollup reading: run the regular chain (PSS via smaps, then RSS).
        memory = _get_single_process_memory(
            pid, prefer_pss=prefer_pss, use_smaps_rollup_only=use_smaps_rollup_only
        )

    total_memory = memory if memory is not None else 0

    # Get children memory if requested
    if include_children:
        if psutil is None:
            # Cannot get children without psutil
            return total_memory

        try:
            parent_process = psutil.Process(pid)
            children = parent_process.children(recursive=True)
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # Parent process not found or no permission
            return total_memory

        for child in children:
            try:
                child_memory = _get_single_process_memory(
                    child.pid,
                    prefer_pss=prefer_pss,
                    use_smaps_rollup_only=use_smaps_rollup_only,
                )
                if child_memory is not None:
                    total_memory += child_memory
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # Child process died or no permission; skip it
                pass

    return max(0, total_memory)


def get_memory_usage_with_throttle(
    pid: int | None = None,
    include_children: bool = True,
    prefer_pss: bool = True,
    last_pss_check_time: float | None = None,
    pss_throttle_seconds: float = 2.0,
) -> tuple[int, float | None]:
    """
    Get memory usage with throttling for PSS checks on Linux.

    PSS readings can be expensive (notably when /proc/<pid>/smaps has to be
    scanned), so on Linux with prefer_pss=True a PSS-based measurement is taken
    at most once per pss_throttle_seconds. Calls arriving sooner than that
    return an RSS-only estimate instead.

    Args:
        pid: Process ID. If None, uses current process.
        include_children: If True, includes child process memory.
        prefer_pss: If True on Linux, attempts to use PSS.
        last_pss_check_time: Timestamp (as returned by a previous call) of the
            last full check. None disables throttling for this call.
        pss_throttle_seconds: Minimum interval (seconds) between full checks.

    Returns:
        Tuple of (memory_bytes, check_time).
        If throttled, memory_bytes is the RSS-based estimate and check_time is
        the unchanged last_pss_check_time; otherwise memory_bytes comes from a
        full check and check_time is the current time.
    """
    current_time = time.time()

    throttled = (
        prefer_pss
        and sys.platform == "linux"
        and last_pss_check_time is not None
        and (current_time - last_pss_check_time) < pss_throttle_seconds
    )
    if throttled:
        # Fast path: the RSS-only measurement is exactly what
        # get_memory_usage_bytes does when PSS is not preferred, so reuse it
        # rather than duplicating the parent/children traversal here.
        memory = get_memory_usage_bytes(
            pid=pid, include_children=include_children, prefer_pss=False
        )
        return memory, last_pss_check_time

    # Not throttled: do full check
    memory = get_memory_usage_bytes(
        pid=pid, include_children=include_children, prefer_pss=prefer_pss
    )
    return memory, current_time


================================================
FILE: babeldoc/utils/priority_thread_pool_executor.py
================================================
# thanks to:
# https://github.com/oleglpts/PriorityThreadPoolExecutor/blob/master/PriorityThreadPoolExecutor/__init__.py
# https://github.com/oleglpts/PriorityThreadPoolExecutor/issues/4

import atexit
import itertools
import logging
import queue
import random
import sys
import threading
import weakref
from concurrent.futures import _base
from concurrent.futures.thread import BrokenThreadPool
from concurrent.futures.thread import ThreadPoolExecutor
from concurrent.futures.thread import _python_exit
from concurrent.futures.thread import _threads_queues
from concurrent.futures.thread import _WorkItem
from heapq import heappop
from heapq import heappush

logger = logging.getLogger(__name__)

########################################################################################################################
#                                                Global variables                                                      #
########################################################################################################################

# Queue item that python_exit() pushes onto every worker queue at interpreter
# exit. Its priority is sys.maxsize, so it sorts after any smaller priority.
NULL_ENTRY = (sys.maxsize, _WorkItem(None, None, (), {}))
# Module-wide "interpreter is exiting" flag: set by python_exit(), read by
# _worker() and by PriorityThreadPoolExecutor.submit().
_shutdown = False

########################################################################################################################
#                                           Before system exit procedure                                               #
########################################################################################################################


def python_exit():
    """Interpreter-exit hook: flag shutdown, wake every worker, wait for them.

    Sets the module-level ``_shutdown`` flag, pushes ``NULL_ENTRY`` onto the
    work queue registered for each worker thread, then joins every thread.
    """
    global _shutdown
    _shutdown = True
    # Snapshot the registry first so both passes see the same thread/queue pairs.
    registered = list(_threads_queues.items())
    for _thread, work_queue in registered:
        work_queue.put(NULL_ENTRY)
    for thread, _work_queue in registered:
        thread.join()


# Swap the default concurrent.futures exit hook for python_exit() so that the
# queues tracked in _threads_queues receive NULL_ENTRY at interpreter exit.
# NOTE(review): newer CPython versions appear to register _python_exit through
# threading._register_atexit rather than atexit, in which case this unregister
# call would have no effect - confirm against the supported Python versions.


atexit.unregister(_python_exit)
atexit.register(python_exit)


class PriorityQueue(queue.Queue):
    """Queue variant that hands out live entries in priority order (lowest first).

    Callers put ``(priority, task)`` tuples. Internally each one is stored as a
    ``[priority, sequence, task]`` list, where ``sequence`` is a monotonically
    increasing counter that breaks ties between equal priorities. ``get``
    returns that internal list (or None if only removed entries were left).
    """

    REMOVED = "<removed-task>"
    DEFAULT_PRIORITY = 100

    def _init(self, maxsize):
        # heap of [priority, sequence, task] lists
        self.queue = []
        # task -> its live heap entry, used to supersede a task that is re-put
        self.entry_finder = {}
        self.counter = itertools.count()

    def _qsize(self):
        # Counts heap slots, including entries already marked as removed.
        return len(self.queue)

    def _put(self, item):
        try:
            task = item[1]
            if task in self.entry_finder:
                # Same task queued again: invalidate the older entry.
                self.remove(task)
            sequence = next(self.counter)
            heap_entry = [item[0], sequence, task]
            self.entry_finder[task] = heap_entry
            heappush(self.queue, heap_entry)
        except TypeError:  # handle item==None
            self._put((self.DEFAULT_PRIORITY, None))

    def remove(self, task):
        """
        Mark the entry of ``task`` as removed.

        The entry stays in the heap with its payload replaced by ``REMOVED``;
        ``_get`` discards it when it reaches the top.
        """
        stale_entry = self.entry_finder.pop(task)
        stale_entry[-1] = self.REMOVED

    def _get(self):
        while self.queue:
            heap_entry = heappop(self.queue)
            if heap_entry[2] is self.REMOVED:
                # Tombstone left behind by remove(); skip it.
                continue
            del self.entry_finder[heap_entry[2]]
            return heap_entry
        return None


def _worker(executor_reference, work_queue, initializer, initargs):
    """Worker-thread main loop: pull entries from ``work_queue`` and run them.

    Args:
        executor_reference: Callable (a weak reference) returning the owning
            executor, or None once it has been collected.
        work_queue: Queue the entries are taken from; each entry is indexed
            as ``entry[2]`` to reach the work item (the layout produced by
            ``PriorityQueue._get``).
        initializer: Optional callable run once before the loop starts.
        initargs: Positional arguments passed to ``initializer``.

    An entry whose payload is None is a wake-up sentinel: the worker then
    exits if the interpreter or the executor is shutting down (or the
    executor is gone), otherwise it keeps waiting for work. Any exception
    escaping the loop is logged as critical and ends the thread.
    """
    if initializer is not None:
        try:
            initializer(*initargs)
        except BaseException:
            _base.LOGGER.critical("Exception in initializer:", exc_info=True)
            executor = executor_reference()
            if executor is not None:
                executor._initializer_failed()
            return
    try:
        while True:
            work_item = work_queue.get(block=True)
            try:
                # work_item[2] is the payload; None marks a wake-up sentinel.
                if work_item[2] is not None:
                    work_item[2].run()
                    # Delete references to object. See issue16284
                    del work_item

                    # attempt to increment idle count
                    executor = executor_reference()
                    if executor is not None:
                        executor._idle_semaphore.release()
                    del executor
                    continue

                executor = executor_reference()
                # Exit if:
                #   - The interpreter is shutting down OR
                #   - The executor that owns the worker has been collected OR
                #   - The executor that owns the worker has been shutdown.
                if _shutdown or executor is None or executor._shutdown:
                    # Flag the executor as shutting down as early as possible if it
                    # is not gc-ed yet.
                    if executor is not None:
                        executor._shutdown = True
                    # Notify other workers by re-queueing the sentinel
                    work_queue.put(None)
                    return
                del executor
            finally:
                # Runs for every fetched entry (work or sentinel) so that
                # work_queue.join() can account for it.
                work_queue.task_done()
    except BaseException:
        _base.LOGGER.critical("Exception in worker", exc_info=True)


class PriorityThreadPoolExecutor(ThreadPoolExecutor):
    """
    Thread pool executor backed by a priority queue (lowest priority value runs first).

    Work is ordered by the ``priority`` keyword given to :meth:`submit`; the
    queue breaks ties between equal priorities by insertion order.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # change work queue type to queue.PriorityQueue
        self._work_queue: PriorityQueue = PriorityQueue()
        # Every future ever submitted, inspected in __del__ to surface errors.
        # NOTE(review): this list is never trimmed, so futures (and their
        # results) stay referenced for the executor's whole lifetime.
        self._all_future = []

    def submit(self, fn, *args, **kwargs):
        """

        Sending the function to the execution queue

        :param fn: function being executed
        :type fn: callable
        :param args: function's positional arguments
        :param kwargs: function's keywords arguments
        :return: future instance
        :rtype: _base.Future

        Added keyword:

        - priority (integer below sys.maxsize; lower values run first). It is
          consumed here and not forwarded to ``fn``. When omitted, a random
          priority is used.

        :raises BrokenThreadPool: if the pool is marked broken
        :raises RuntimeError: if the executor or the interpreter is shutting down

        """
        with self._shutdown_lock:
            if self._broken:
                raise BrokenThreadPool(self._broken)

            if self._shutdown:
                raise RuntimeError("cannot schedule new futures after shutdown")
            if _shutdown:
                raise RuntimeError(
                    "cannot schedule new futures after interpreter shutdown"
                )

            # Strip our own keyword so it is not passed on to fn.
            priority = kwargs.pop("priority", random.randint(0, sys.maxsize - 1))  # noqa: S311

            f = _base.Future()
            w = _WorkItem(f, fn, args, kwargs)

            self._work_queue.put((priority, w))
            self._adjust_thread_count()
            self._all_future.append(f)
            return f

    def _adjust_thread_count(self):
        """Start one more worker thread unless an idle one can take the work."""
        # if idle threads are available, don't spin new threads
        if self._idle_semaphore.acquire(timeout=0):
            return

        # When the executor gets lost, the weakref callback will wake up
        # the worker threads.
        def weakref_cb(_, q=self._work_queue):
            q.put(None)

        num_threads = len(self._threads)
        if num_threads < self._max_workers:
            thread_name = f"{self._thread_name_prefix or self}_{num_threads:d}"
            t = threading.Thread(
                name=thread_name,
                target=_worker,
                args=(
                    weakref.ref(self, weakref_cb),
                    self._work_queue,
                    self._initializer,
                    self._initargs,
                ),
            )
            t.start()
            self._threads.add(t)
            _threads_queues[t] = self._work_queue

    def shutdown(self, wait=True, *, cancel_futures=False):
        """Stop accepting work and optionally wait for / cancel pending tasks.

        :param wait: if True, first block until every queued task has been
            processed, then join all worker threads
        :param cancel_futures: if True, drain whatever is still queued and
            cancel the associated futures
        """
        logger.debug("Shutting down executor %s", self._thread_name_prefix or self)
        if wait:
            logger.debug(
                "Waiting for all tasks done %s", self._thread_name_prefix or self
            )
            self._work_queue.join()
            logger.debug("All tasks done %s", self._thread_name_prefix or self)

        with self._shutdown_lock:
            self._shutdown = True
            if cancel_futures:
                # Drain all work items from the queue, and then cancel their
                # associated futures.
                while True:
                    try:
                        entry = self._work_queue.get_nowait()
                    except queue.Empty:
                        break
                    # Balance the get() so a later join() is not left waiting
                    # for an entry no worker will ever process.
                    self._work_queue.task_done()
                    # The queue yields [priority, count, work_item] lists (or
                    # None); the payload itself is None for wake-up sentinels.
                    work_item = entry[2] if entry is not None else None
                    if work_item is not None:
                        work_item.future.cancel()

            # Send a wake-up to prevent threads calling
            # _work_queue.get(block=True) from permanently blocking.
            self._work_queue.put(None)
        if wait:
            logger.debug(
                "Waiting for all thread done %s", self._thread_name_prefix or self
            )
            for t in self._threads:
                self._work_queue.put(None)
                t.join()
        logger.debug("shutdown finish %s", self._thread_name_prefix or self)

    def __del__(self):
        # Surface exceptions of finished futures nobody may have looked at.
        # getattr guards against __init__ having failed before the list existed.
        for f in getattr(self, "_all_future", ()):
            if f.done() and not f.cancelled():
                try:
                    f.result()
                except Exception as e:
                    logger.warning("Exception in future %s: %s", f, e, exc_info=True)


================================================
FILE: docs/CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
  overall community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or
  advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
  address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
aw@funstory.ai.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series
of actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior,  harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within
the community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.

Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see the FAQ at
https://www.contributor-covenant.org/faq. Translations are available at
https://www.contributor-covenant.org/translations.


================================================
FILE: docs/CONTRIBUTING.md
================================================
# Contributing to BabelDOC

## How to contribute to BabelDOC

### **About Language**

- Issues can be in Chinese or English
- PRs are limited to English
- All documents are provided in English only

### **Did you find a bug?**

- **Ensure the bug was not already reported** by searching on GitHub under [Issues](https://github.com/funstory-ai/BabelDOC/issues).

Please pay special attention to:

1. Known compatibility issues with pdf2zh - see [#20](https://github.com/funstory-ai/BabelDOC/issues/20) for details
2. Reported edge cases and limitations from downstream applications - see [#23](https://github.com/funstory-ai/BabelDOC/issues/23) for discussion

- If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/funstory-ai/BabelDOC/issues/new?template=bug_report.md). Be sure to include a **title and clear description**, as much relevant information as possible.

### **If you wish to request changes or new features**

- Suggest your change in the [Issues](https://github.com/funstory-ai/BabelDOC/issues/new?template=feature_request.md) section.

### **If you wish to add more translators**

- This project is not intended for direct end-user use, and the supported translators are mainly for debugging purposes. Unless it clearly helps with development and debugging, PRs for directly adding translators will not be accepted.
- You can directly use [PDFMathTranslate](https://github.com/Byaidu/PDFMathTranslate) to get support for more translators.

### **If you want to add new accelerator support for the layout model**

- This project only plans to support various accelerators through onnxruntime. Please submit your accelerator support directly to onnxruntime.

- Additionally, [translation_config.py](https://github.com/funstory-ai/BabelDOC/blob/9e5be3a05c15ecae98024ba695e4a2db1412c062/babeldoc/translation_config.py#L41) shows that the layout model implementation actually used in this project is passed in from outside. You can implement a layout model class according to the relevant interface, and then pass it through this parameter at runtime.

### **If you wish to contribute to BabelDOC**

> [!TIP]
>
> If you have any questions about the source code or related matters, please contact the maintainer at aw@funstory.ai .
> 
> You can also raise questions in [Issues](https://github.com/funstory-ai/BabelDOC/issues).
> 
> You can contact the maintainers in the pdf2zh discussion group.
> 
> Due to the current high rate of code changes, this project only accepts small PRs. If you would like to suggest a change and you include a patch as a proof-of-concept, that would be great. However, please do not be offended if we rewrite your patch from scratch.
>
> In addition, we do not accept PRs involving the following changes:
> 1. PRs that modify prompts.
> 2. Adding GUI or other features directly targeting end users to this project. (Exceptions granted by maintainers in issues are excluded.)
> 3. PRs that do not comply with this specification.
> 4. Other PRs that maintainers deem inappropriate.
>
> **This project cannot accept all PRs. We recommend that you discuss with the maintainers via [Issue](https://github.com/funstory-ai/BabelDOC/issues) before submitting a PR.**

[//]: # (> We welcome pull requests and will review your contributions.)


1. Fork this repository and clone it locally.
2. Use `doc/deploy.sh` to set up the development environment.
3. Create a new branch and make code changes on that branch. `git checkout -b feature/<feature-name>`
4. Perform development and ensure the code meets the requirements.

5. Commit your changes to your new branch.

```
git add .

git commit -m "<semantic commit message>"
```

6. Push to your repository: `git push origin feature/<feature-name>`.

7. Create a PR on GitHub and provide a detailed description.

8. Ensure all automated checks pass.

#### Basic Requirements

##### Workflow

1. Please create a fork on the main branch and develop on the forked branch.

- When submitting a Pull Request (PR), please provide detailed descriptions of the changes.

- If the PR fails automated checks (showing checks failed and red cross marks), please review the corresponding details and modify the submission to ensure the new PR passes automated checks.

2. Development and Testing

- Use the `uv run BabelDOC` command for development and testing.

- When you need to print logs, use `log.debug()`. **DO NOT USE `print()`**

- Code formatting

3. Dependency Updates

- If new dependencies are introduced, please update the dependency list in pyproject.toml accordingly.

- It is recommended to use the `uv add` command for adding dependencies.

4. Documentation Updates

- If new command-line options are added, please update the command-line options list in README.md accordingly.

5. Commit Messages

- Use [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/), for example: feat(translator): add openai.

6. Coding Style

- Please ensure submitted code follows basic coding style guidelines.
- Use pep8-naming.
- Comments should be in English.
- Follow these specific Python coding style guidelines:

  a. Naming Conventions:

  - Class names should use CapWords (PascalCase): `class TranslatorConfig`
  - Function and variable names should use snake_case: `def process_text()`, `word_count = 0`
  - Constants should be UPPER_CASE: `MAX_RETRY_COUNT = 3`
  - Private attributes should start with underscore: `_internal_state`

  b. Code Layout:

  - Use 4 spaces for indentation (no tabs)
  - Maximum line length is 88 characters (compatible with black formatter)
  - Add 2 blank lines before top-level classes and functions
  - Add 1 blank line before class methods
  - No trailing whitespace

  c. Imports:

  - Imports should be on separate lines: `import os\nimport sys`
  - Imports should be grouped in the following order:
    1.  Standard library imports
    2.  Related third party imports
    3.  Local application/library specific imports
  - Use absolute imports over relative imports

  d. String Formatting:

  - Prefer f-strings for string formatting: `f"Count: {count}"`
  - Use double quotes for docstrings

  e. Type Hints:

  - Use type hints for function arguments and return values
  - Example: `def translate_text(text: str) -> str:`

  f. Documentation:

  - All public functions and classes must have docstrings
  - Use Google style for docstrings
  - Example:

    ```python
    def function_name(arg1: str, arg2: int) -> bool:
        """Short description of function.

        Args:
            arg1: Description of arg1
            arg2: Description of arg2

        Returns:
            Description of return value

        Raises:
            ValueError: Description of when this error occurs
        """
    ```

The existing codebase does not comply with the above specifications in some aspects. Contributions for modifications are welcome.

#### How to modify the intermediate representation

The intermediate representation is described by [il_version_1.rnc](https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/format/pdf/document_il/il_version_1.rnc). Corresponding Python data classes are generated using [xsdata](https://xsdata.readthedocs.io/en/latest/). The files `il_version_1.rng`, `il_version_1.xsd`, and `il_version_1.py` are auto-generated and must not be manually modified.

##### Format RNC file

```bash
trang babeldoc/format/pdf/document_il/il_version_1.rnc babeldoc/format/pdf/document_il/il_version_1.rnc
```

##### Generate RNG, XSD and Python classes

```bash
# Generate RNG from RNC
trang babeldoc/format/pdf/document_il/il_version_1.rnc babeldoc/format/pdf/document_il/il_version_1.rng

# Generate XSD from RNC
trang babeldoc/format/pdf/document_il/il_version_1.rnc babeldoc/format/pdf/document_il/il_version_1.xsd

# Generate Python classes from XSD
xsdata generate babeldoc/format/pdf/document_il/il_version_1.xsd --package babeldoc.format.pdf.document_il
```

##### Profile memory usage

```bash
uv run memray run --native --aggregate babeldoc/main.py -c yadt.toml
```

================================================
FILE: docs/CONTRIBUTOR_REWARD.md
================================================
# BabelDOC/PDFMathTranslate/OneAIFW 贡献者奖励规则

## 月度活跃贡献者奖励规则

### 一、资格标准
#### **贡献类型要求**
   - 需提交 **至少 1 个有效 PR**（Pull Request），或进行 **PR 审核、文档编写** 等贡献。
   - 有效贡献定义：
     - 非简单的文档错别字修复
     - 非简单的代码格式化调整（如仅调整缩进、空格等）
     - 需做出实质性贡献（如功能开发、Bug 修复、性能优化、架构调整、技术文档编写、PR 审核等）
   - 示例合格贡献：新增功能模块、修复逻辑错误、优化算法效率、编写技术文档等

#### **时间范围**
   - 每月 1 日至月末最后一天合并的 PR 计入当月统计

### 二、申请流程
#### **申请条件**
   - PR 需被成功合并至以下几个仓库：
     1. [funstory-ai/BabelDOC](https://github.com/funstory-ai/BabelDOC/pulls) 仓库
     2. [PDFMathTranslate-next/PDFMathTranslate-next](https://github.com/PDFMathTranslate-next/PDFMathTranslate-next) 的主分支。
     3. [guaguastandup/zotero-pdf2zh](https://github.com/guaguastandup/zotero-pdf2zh) 的主分支
     4. [funstory-ai/aifw](https://github.com/funstory-ai/aifw) 的主分支
   - 若目标为 [funstory-ai/BabelDOC](https://github.com/funstory-ai/BabelDOC/pulls) 的 PR 未被合并，但被维护者认定为有价值的概念验证，同样符合条件。
   - 审核 PR、撰写 wiki 等贡献也必须是针对以上几个仓库。
   - 同一贡献者每月仅可申请一次（无论提交 PR 数量）
   - 同一贡献者每月最多可以获得 1 个兑换码
   - 对于 PR，只有发起者可以申请兑换码
   - 仅可使用当月的贡献申请兑换码（特殊情况请联系 aw@funstory.ai 说明）

#### **申请方式**
   - 发送邮件至 **aw@funstory.ai**
   - 邮件标题格式：`[贡献者会员兑换码申请] GitHub用户名-月份`（例：`[贡献者会员兑换码申请] awwaawwa-2024-07`）
   - 邮件正文需包含：
     - GitHub 用户名
     - 合并 PR 的完整链接
   - 附件要求：
     - PR 页面完整截图（需包含合并状态、仓库名称及点击头像后弹出来的侧边栏，如下图所示）

> [!IMPORTANT]
>
> 不满足上述格式要求的邮件会被直接忽略！

![附件示例](https://s.immersivetranslate.com/assets/r2-uploads/images/babeldoc-contributor_reward_example.png)

#### **奖励说明**
   - 奖励内容：[沉浸式翻译（Immersive Translate）](https://immersivetranslate.com/zh-Hans/pricing/)月度会员兑换码
   - 兑换码使用：在[沉浸式翻译官网兑换页](https://immersivetranslate.com/zh-Hans/exchange)输入即可激活
   - 会员权益：沉浸式翻译 Pro 会员一个月（详见[官网价格页](https://immersivetranslate.com/zh-Hans/pricing/)说明）
   - 兑换码为专属福利，不可转让

### 三、审核与发放
#### **审核周期**
   - 我们会尽力在收到申请邮件后 1 个工作日内完成审核
   - 审核时间可能因申请数量、审核复杂度等因素有所延长
   - 审核通过后，兑换码将通过邮件方式发送
   - 若审核未通过，我们会通过邮件说明原因

#### **兑换码规则**
   - 使用方式：[官网兑换页](https://immersivetranslate.com/zh-Hans/exchange)输入兑换码激活
   - 权益内容：月度会员（具体权益见[官网价格页](https://immersivetranslate.com/zh-Hans/pricing/)说明）
   - 不可转让

### 四、注意事项
#### **禁止行为**
   - 将完整功能拆分为多个无关 PR 提交
   - 提交质量不合格或具有潜在危害的代码
   - 提供虚假或误导性的申请材料

#### **特别说明**
   - funstory.ai 保留对贡献价值的评估权、规则的最终解释权等所有必要权利
   - 规则如有实质性更新（格式调整等除外），将提前 1 天在 [BabelDOC GitHub PR](https://github.com/funstory-ai/BabelDOC/pulls) 公告
   - 过期未使用的兑换码不予补发
   - 自 2025 年 2 月 1 日起的贡献可以申请兑换码
   - 为了确认您是 Pull Request (PR) 的发起者，防止他人冒领，我们可能会要求您使用发起者账号在 PR 下方留言指定的随机数字。

## 常见问题解答（FAQ）

**Q：如何判断文档翻译贡献是否有效？**

A：系统性的人工翻译（如完整章节的翻译并经过人工校对）视为有效贡献。零散段落翻译或仅依赖机器翻译的内容不计入有效贡献。

**Q：兑换码过期了可以补发吗？**

   A：为确保公平性，过期的兑换码将不予补发，请在有效期内及时使用。

**Q：为什么这个文档是中文的？**

A：因为目前应该是中文贡献者多吧，所以就先写中文的。后面再撰写英文版的。

---
**规则公示**：本规则文档存放于 BabelDOC 仓库 [CONTRIBUTOR_REWARD.md](https://github.com/funstory-ai/BabelDOC/blob/main/docs/CONTRIBUTOR_REWARD.md)，并在 [Contributor Reward - BabelDOC](https://funstory-ai.github.io/BabelDOC/CONTRIBUTOR_REWARD/) 展示。


================================================
FILE: docs/ImplementationDetails/AsyncTranslate/AsyncTranslate.md
================================================
# Async Translation API

> [!NOTE]
> This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
>
> - [GitHub Issues](https://github.com/funstory-ai/yadt/issues)
> - Community contribution (PRs welcome!)

## Overview

The `yadt.high_level.async_translate` function provides an asynchronous interface for translating PDF files with real-time progress reporting. This function yields progress events that can be used to update progress bars or other UI elements.

## Usage

```python linenums="1"
async def translate_with_progress():
    config = TranslationConfig(
        input_file="example.pdf",
        translator=your_translator,
        # ... other configuration options
    )
    
    try:
        async for event in async_translate(config):
            if event["type"] == "progress_update":
                print(f"Progress: {event['overall_progress']}%")
            elif event["type"] == "finish":
                result = event["translate_result"]
                print(f"Translation completed: {result.original_pdf_path}")
            elif event["type"] == "error":
                print(f"Error occurred: {event['error']}")
                break
    except asyncio.CancelledError:
        print("Translation was cancelled")
    except KeyboardInterrupt:
        print("Translation was interrupted")
```

## Event Types

The function yields different types of events during the translation process:

### 1. Progress Start Event

Emitted when a translation stage begins:

```python
{
    "type": "progress_start",
    "stage": str,              # Name of the current stage
    "stage_progress": float,   # Always 0.0
    "stage_current": int,      # Current progress count (0)
    "stage_total": int         # Total items to process in this stage
}
```

### 2. Progress Update Event

Emitted periodically during translation (controlled by `report_interval`, default 0.1s):

```python
{
    "type": "progress_update",
    "stage": str,              # Name of the current stage
    "stage_progress": float,   # Progress percentage of current stage (0-100)
    "stage_current": int,      # Current items processed in this stage
    "stage_total": int,        # Total items to process in this stage
    "overall_progress": float  # Overall translation progress (0-100)
}
```

### 3. Progress End Event

Emitted when a stage completes:

```python
{
    "type": "progress_end",
    "stage": str,              # Name of the completed stage
    "stage_progress": float,   # Always 100.0
    "stage_current": int,      # Equal to stage_total
    "stage_total": int,        # Total items processed in this stage
    "overall_progress": float  # Overall translation progress (0-100)
}
```

### 4. Finish Event

Emitted when translation completes successfully:

```python
{
    "type": "finish",
    "translate_result": TranslateResult  # Contains paths to translated files and timing info
}
```

### 5. Error Event

Emitted if an error occurs during translation:

```python
{
    "type": "error",
    "error": str  # Error message
}
```

## Translation Stages

The translation process goes through the following stages in order:

1. ILCreater
2. LayoutParser
3. ParagraphFinder
4. StylesAndFormulas
5. ILTranslator
6. Typesetting
7. FontMapper
8. PDFCreater

Each stage will emit its own set of progress events.

## Cancellation

The translation process can be cancelled in several ways:

1. By raising a `CancelledError` (e.g., when using `asyncio.Task.cancel()`)
2. Through `KeyboardInterrupt` (e.g., when user presses Ctrl+C)
3. By calling `translation_config.cancel_translation()` method

Example of programmatic cancellation:

```python linenums="1"
async def translate_with_cancellation():
    config = TranslationConfig(
        input_file="example.pdf",
        translator=your_translator,
        # ... other configuration options
    )
    
    try:
        # Start translation in another task
        translation_task = asyncio.create_task(process_translation(config))
        
        # Simulate some condition that requires cancellation
        await asyncio.sleep(5)
        config.cancel_translation()  # This will trigger cancellation
        
        await translation_task  # Wait for the task to finish
    except asyncio.CancelledError:
        print("Translation was cancelled")

async def process_translation(config):
    async for event in async_translate(config):
        if event["type"] == "error":
            if isinstance(event["error"], asyncio.CancelledError):
                print("Translation was cancelled")
                break
            print(f"Error occurred: {event['error']}")
            break
        # ... handle other events ...
```

When cancelled:
- The function will log the cancellation reason
- All resources will be cleaned up properly
- Any ongoing translation tasks will be stopped
- A final error event with `CancelledError` will be emitted
- The function will exit gracefully

## Error Handling

When an error occurs during translation:
1. It is logged with full traceback (if debug mode is enabled)
2. It is reported through an error event
3. The event stream stops after the error event
4. Resources are cleaned up properly before exiting

It's recommended to handle these events appropriately in your application to provide feedback to users. The example in the Usage section shows a basic error handling pattern. 

================================================
FILE: docs/ImplementationDetails/ILTranslator/ILTranslator.md
================================================
# Intermediate Layer Translator

> [!NOTE]
> This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
>
> - [GitHub Issues](https://github.com/funstory-ai/BabelDOC/issues)
> - Community contribution (PRs welcome!)

## Background

After formula and style processing, we need to translate the document while preserving all formatting, formulas, and styles. The intermediate layer translator handles this complex task by using placeholders and style preservation techniques.

## Goal

1. Translate text while preserving document structure
2. Maintain formulas and special formatting
3. Handle rich text with different styles
4. Support concurrent translation for better performance

## Specific Implementation

The translation process consists of several key steps:

### Step 1: Translation Preparation

1. Process paragraphs:
   - Skip vertical text
   - Handle single-component paragraphs directly
   - Process multi-component paragraphs with placeholders

2. Create placeholders:
   - Formula placeholders for mathematical expressions
   - Rich text placeholders for styled text
   - Ensure placeholder uniqueness within each paragraph

### Step 2: Translation Input Creation

1. Analyze paragraph components:
   - Regular text components
   - Formula components
   - Styled text components

2. Handle special cases:
   - Skip pure formula paragraphs
   - Preserve original text when style matches base style
   - Handle font mapping cases

### Step 3: Translation Execution

1. Concurrent translation:
   - Use thread pool for parallel processing
   - Control QPS (Queries Per Second)
   - Track translation progress

2. Translation tracking:
   - Record original text
   - Record translated text
   - Save tracking information for debugging

### Step 4: Translation Output Processing

1. Parse translated text:
   - Extract text between placeholders
   - Restore formulas at placeholder positions
   - Restore rich text with original styles

2. Create new paragraph components:
   - Maintain style information
   - Preserve formula positioning
   - Handle empty text segments

## Additional Features

1. Style preservation:
   - Maintains original text styles
   - Handles font size variations
   - Preserves formatting attributes

2. Formula handling:
   - Preserves formula integrity
   - Maintains formula positioning
   - Supports complex mathematical expressions

3. Debug support:
   - Translation tracking
   - JSON output for debugging
   - Detailed logging

## Limitations

1. Vertical text is not supported

2. Complex nested styles might not be perfectly preserved

3. Placeholder conflicts could occur in rare cases

4. Translation quality depends on external translation engine

## Configuration Options

The translation process can be customized through `TranslationConfig`:

1. `qps`: Maximum queries per second for translation
2. `debug`: Enable/disable debug mode and tracking
3. Translation engine specific settings 

================================================
FILE: docs/ImplementationDetails/PDFCreation/PDFCreation.md
================================================
# PDF Creation

> [!NOTE]
> This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
>
> - [GitHub Issues](https://github.com/funstory-ai/BabelDOC/issues)
> - Community contribution (PRs welcome!)

## Background

After translation and typesetting, we need to create the final PDF document that preserves all the formatting, styles, and layout of the original document while containing the translated text. The PDF creation process handles this final step.

## Goal

1. Create a new PDF document with translated content
2. Preserve all original formatting and styles
3. Support both monolingual and dual-language output
4. Maintain font consistency and character encoding
5. Optimize the output file size and performance

## Specific Implementation

The PDF creation process consists of several key steps:

### Step 1: Font Management

1. Font initialization:
   - Add required fonts to the document
   - Map font identifiers
   - Handle font encoding lengths

2. Font availability checking:
   - Check available fonts for each page
   - Handle XObject font requirements
   - Manage font resources

3. Font subsetting:
   - Optimize font usage
   - Reduce file size
   - Maintain character support

### Step 2: Content Rendering

1. Character processing:
   - Handle individual characters
   - Process character encodings
   - Manage character positioning

2. Graphics state handling:
   - Process color spaces
   - Handle transparency
   - Manage graphic state instructions

3. XObject management:
   - Process form XObjects
   - Handle drawing operations
   - Maintain XObject hierarchy

### Step 3: Document Assembly

1. Page construction:
   - Build page content
   - Process page resources
   - Handle page boundaries

2. Content stream creation:
   - Generate drawing operations
   - Handle text positioning
   - Manage content streams

3. Resource management:
   - Handle font resources
   - Manage XObject resources
   - Process graphic states

### Step 4: Output Generation

1. Monolingual output:
   - Create translated-only PDF
   - Optimize file size
   - Apply compression

2. Dual-language output:
   - Combine original and translated pages
   - Handle page ordering
   - Maintain document structure

3. File optimization:
   - Apply garbage collection
   - Enable compression
   - Optimize for linear reading

## Additional Features

1. Font handling:
   - Support for CID fonts
   - Font subsetting
   - Font resource management

2. Document optimization:
   - File size reduction
   - Performance optimization
   - Resource cleanup

3. Debug support:
   - Decompressed output
   - Debug information
   - Progress tracking

## Limitations

1. Font support:
   - Limited to available font formats
   - Font subsetting restrictions
   - Character encoding constraints

2. File size:
   - Dual-language output increases size
   - Font embedding impact
   - Resource duplication

3. Performance considerations:
   - Processing time for large documents
   - Memory usage during creation
   - Optimization overhead

## Configuration Options

The PDF creation process can be customized through `TranslationConfig`:

1. Output options:
   - `no_mono`: Disable monolingual output
   - `no_dual`: Disable dual-language output
   - Output file naming patterns

2. Optimization settings:
   - Compression options
   - Garbage collection
   - Font subsetting

3. Debug options:
   - Debug mode
   - Decompressed output
   - Progress tracking 

================================================
FILE: docs/ImplementationDetails/PDFParsing/PDFParsing.md
================================================
# PDF Parsing and Intermediate Layer Creation

> [!NOTE]
> This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
>
> - [GitHub Issues](https://github.com/funstory-ai/BabelDOC/issues)
> - Community contribution (PRs welcome!)

## Background

The first step in the translation process is to parse the PDF document and create an intermediate layer (IL) representation. This step involves extracting text, styles, formulas, and layout information from the PDF while maintaining their relationships and properties.

## Goal

1. Extract text content while preserving character-level information
2. Maintain font and style information
3. Preserve document structure and layout
4. Handle special elements like XObjects and graphics
5. Create a structured intermediate representation for later processing

## Specific Implementation

The parsing process consists of several key components working together:

### Step 1: PDF Interpreter (PDFPageInterpreterEx)

1. Page content processing:
   - Parse PDF operators and their parameters
   - Handle graphics state operations
   - Process text and font operations
   - Manage XObject rendering

2. Graphics filtering:
   - Filter non-formula lines
   - Handle color space operations
   - Process stroke and fill operations

3. XObject handling:
   - Process form XObjects
   - Handle image XObjects
   - Maintain XObject hierarchy

### Step 2: PDF Converter (PDFConverterEx)

1. Character processing:
   - Extract character information
   - Maintain character positions
   - Preserve style attributes

2. Layout management:
   - Handle page boundaries
   - Process figure elements
   - Manage coordinate systems

3. Font handling:
   - Map font identifiers
   - Process font metadata
   - Handle CID fonts

### Step 3: Intermediate Layer Creator (ILCreater)

1. Document structure creation:
   - Build page hierarchy
   - Create character objects
   - Maintain font registry

2. Resource management:
   - Process font resources
   - Handle color spaces
   - Manage graphic states

3. XObject tracking:
   - Track XObject hierarchy
   - Maintain XObject states
   - Process form content

### Step 4: High-level Coordination

1. Process management:
   - Initialize resources
   - Coordinate component interactions
   - Handle progress tracking

2. Resource initialization:
   - Set up font management
   - Initialize graphics resources
   - Prepare document structure

3. Error handling:
   - Handle malformed content
   - Manage resource errors
   - Provide debug information

## Additional Features

1. Font management:
   - Support for CID fonts
   - Font metadata extraction
   - Font mapping capabilities

2. Graphics state tracking:
   - Color space management
   - Line style preservation
   - Transparency handling

3. Coordinate system handling:
   - Support for transformations
   - Bounding box calculations
   - Position normalization

4. Debug support:
   - Detailed logging
   - Intermediate file generation
   - Progress tracking

## Limitations

1. Complex PDF features:
   - Limited support for some PDF extensions
   - Simplified graphics model
   - Basic transparency support

2. Font handling:
   - Limited support for some font formats
   - Simplified font metrics
   - Basic font feature support

3. Performance considerations:
   - Memory usage for large documents
   - Processing time for complex layouts
   - Resource management overhead

## Configuration Options

The parsing process can be customized through `TranslationConfig`:

1. `debug`: Enable/disable debug mode and intermediate file generation
2. Font-related settings:
   - Font mapping configurations
   - CID font handling options
3. Layout processing options:
   - Page selection
   - Content filtering rules 

================================================
FILE: docs/ImplementationDetails/ParagraphFinding/ParagraphFinding.md
================================================
# Paragraph Finding

> [!NOTE]
> This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
>
> - [GitHub Issues](https://github.com/funstory-ai/BabelDOC/issues)
> - Community contribution (PRs welcome!)

## Background

After PDF analysis, we need to identify paragraphs from individual characters. This is a crucial step before translation and typesetting, as it helps maintain the logical structure of the document.

## Goal

1. Group characters into meaningful paragraphs while preserving the document's logical structure
2. Handle special cases like table of contents, short lines, and multi-line paragraphs
3. Maintain layout information for later typesetting

## Specific Implementation

The paragraph finding process consists of four main steps:

### Step 1: Create Initial Paragraphs

1. Group characters into lines based on their spatial relationships
2. Create paragraphs based on layout information and XObject IDs
3. Characters that don't belong to text layouts are skipped

### Step 2: Process Paragraph Spacing

1. Remove completely empty lines
2. Handle trailing spaces within lines
3. Update paragraph bounding boxes and metadata

### Step 3: Calculate Line Width Statistics

1. Calculate the median width of all lines
2. This information is used for identifying potential paragraph breaks

### Step 4: Process Independent Paragraphs

1. Analyze paragraphs with multiple lines
2. Split paragraphs in two cases:
   - When encountering table of contents entries (identified by consecutive dots)
   - When finding lines significantly shorter than the median width (configurable via `short_line_split_factor`)

## Additional Features

1. Layout-aware processing:
   - Respects different layout types (plain text, title, figure caption, etc.)
   - Maintains layout priority order for overlapping regions

2. First line indent detection:
   - Automatically detects and marks paragraphs with first line indentation

3. Flexible character position detection:
   - Uses multiple position detection modes (middle, topleft, bottomright)
   - Special handling for characters with unreliable height information

## Limitations

1. The current implementation assumes left-to-right text direction

2. May not perfectly handle complex layouts with overlapping regions

3. Table of contents detection relies on consecutive dots pattern

4. Short line splitting might occasionally create incorrect paragraph breaks

## Configuration Options

The paragraph finding behavior can be customized through `TranslationConfig`:

1. `split_short_lines`: Enable/disable splitting paragraphs at short lines
2. `short_line_split_factor`: Threshold factor for short line detection (relative to median width) 

================================================
FILE: docs/ImplementationDetails/README.md
================================================
# Implementation Details

> [!NOTE]
> This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
>
> - [GitHub Issues](https://github.com/funstory-ai/BabelDOC/issues)
> - Community contribution (PRs welcome!)

## Core Processing Flow

Main processing stages in order of actual execution and corresponding documentation:

1. [PDFParser.md](PDFParsing/PDFParsing.md): **PDF Parsing and Intermediate Layer Creation**

2. [LayoutParser](https://github.com/funstory-ai/yadt/blob/main/yadt/document_il/midend/layout_parser.py): **Layout OCR**

3. [ParagraphFinding.md](ParagraphFinding/ParagraphFinding.md): **Paragraph Recognition**

4. [StylesAndFormulas.md](StylesAndFormulas/StylesAndFormulas.md): **Style and Formula Processing**

5. [ILTranslator.md](ILTranslator/ILTranslator.md): **Intermediate Layer Translation**

6. [Typesetting.md](Typesetting/Typesetting.md): **Typesetting Processing**

7. [FontMapper](https://github.com/funstory-ai/yadt/blob/main/yadt/document_il/utils/fontmap.py): **Font Mapping**

8. [PDFCreation.md](PDFCreation/PDFCreation.md): **PDF Generation**

## API

1. [Async Translation API](AsyncTranslate/AsyncTranslate.md): **Async Translation API**

> [!TIP]
>
> Click on document links to view detailed implementation principles and configuration options


================================================
FILE: docs/ImplementationDetails/StylesAndFormulas/StylesAndFormulas.md
================================================
# Styles and Formulas Processing

> [!NOTE]
> This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
>
> - [GitHub Issues](https://github.com/funstory-ai/BabelDOC/issues)
> - Community contribution (PRs welcome!)

## Background

After paragraph finding, we need to identify formulas and text styles within each paragraph. This step is crucial for maintaining mathematical expressions and text formatting during translation.

## Goal

1. Identify and preserve mathematical formulas
2. Detect and maintain consistent text styles
3. Handle special cases like subscripts and superscripts
4. Calculate proper offsets for formula positioning

## Specific Implementation

The processing consists of several main steps:

### Step 1: Formula Detection

1. Identify formula characters based on:
   - Formula-specific fonts
   - Special Unicode characters
   - Vertical text
   - Corner marks (subscripts/superscripts)

2. Group consecutive formula characters into formula units

### Step 2: Formula Processing

1. Process comma-containing formulas:
   - Split complex formulas at commas when appropriate
   - Preserve brackets and their contents
   - Convert simple number-only formulas to regular text

2. Merge overlapping formulas:
   - Handle cases where subscripts/superscripts are detected as separate formulas
   - Maintain proper character ordering

### Step 3: Style Analysis

1. Calculate base style for each paragraph:
   - Find common style attributes across all text
   - Handle font variations
   - Process graphic states

2. Group characters with identical styles:
   - Font properties
   - Size properties
   - Graphic state properties

### Step 4: Position Calculation

1. Calculate formula offsets:
   - Compute x-offset relative to surrounding text
   - Compute y-offset for proper vertical alignment
   - Handle line spacing variations

## Additional Features

1. Font mapping:
   - Maps different fonts to standard ones
   - Special handling for formula fonts

2. Style inheritance:
   - Maintains style hierarchy
   - Handles partial style overrides

3. Formula classification:
   - Distinguishes between translatable and non-translatable formulas
   - Special handling for numeric formulas with commas

## Limitations

1. Formula detection relies on font and character patterns

2. May not handle all types of mathematical notations

3. Complex subscript/superscript combinations might be misidentified

4. Limited support for vertical formulas

## Configuration Options

The formula and style processing can be customized through `TranslationConfig`:

1. `formular_font_pattern`: Regex pattern for identifying formula fonts
2. `formular_char_pattern`: Regex pattern for identifying formula characters 

================================================
FILE: docs/ImplementationDetails/Typesetting/Typesetting.md
================================================
# Typography

> [!NOTE]
> This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
>
> - [GitHub Issues](https://github.com/funstory-ai/BabelDOC/issues)
> - Community contribution (PRs welcome!)

## Background

After translation, text needs to be typeset before placing into PDF.

Translated paragraphs can contain any combination of the following types:

1. PDF formulas

2. Single PDF original character

3. PDF original string with same style

4. Translated Unicode string with same style

Let's discuss different cases:

The following 3 types can be passed through unchanged ("transparently transmitted") to their new positions:

1. PDF formulas

2. Single PDF original character

3. PDF original string with same style

Only "translated Unicode string with same style" needs typesetting operation, as this step loses original layout information. However, since paragraphs may contain other components that need transparent transmission, their positions may also change and need to participate in typesetting.

## Goal

Try to fit all components within the original paragraph bounding box. If impossible, try to expand the bounding box in writing direction.

## Specific Implementation

First perform reflow judgment to determine if the paragraph needs reflow. If all elements can be transmitted transparently, no reflow is needed. Then, if reflow is needed, execute Algorithm 1:

1. Convert all elements to typesetting unit type, which records length and width information.

2. Start from top-left of original paragraph bounding box, place elements sequentially.

3. If current line cannot fit next element, wrap to next line.

4. Repeat 2-3 until all elements are placed or exceed original bounding box.

Algorithm 1 works normally when translated text is shorter than original. When translated text is longer, Algorithm 2 needs to be added:

1. Initialize element scaling factor as 1.0.

2. Initialize line spacing as 1.5.

3. Try typesetting using Algorithm 1.

4. If it cannot fit all elements:

   - First try to reduce line spacing by 0.1 step until reaching minimum line spacing (1.4)
   - If still cannot fit:
     - When scale > 0.6, reduce element scaling by 0.05
     - When scale <= 0.6, reduce element scaling by 0.1
     - Reset line spacing to 1.5
   - When scale becomes less than 0.7, adjust minimum line spacing to 1.1

5. Report error if element scaling is less than 0.1.

Algorithm 2 can fit translations of almost all languages in original position.

However, for special cases like "图 1" translated to "Figure 1", even with the above algorithms some text may still overflow. So Algorithm 3:

1. Before reducing scale, first try to expand the bounding box in writing direction.

2. Calculate paragraph's right whitespace by:

   - Using 90% of page crop box width as maximum limit
   - Checking for overlapping paragraphs on the right
   - Checking for overlapping figures on the right

3. Expand paragraph bounding box based on available whitespace.

4. If still cannot fit all elements, continue with scale reduction as in Algorithm 2.

## Additional Features

1. Mixed Chinese-English text handling:
   - Adds 0.5 character width spacing between Chinese and English text transitions
   - Excludes certain punctuation marks from this spacing rule

2. First line indent:
   - Adds an indent two Chinese characters wide to the first line when specified

3. Hanging punctuation:
   - Allows certain punctuation marks to extend beyond the right margin
   - Helps maintain better visual alignment

## Limitations

1. Currently, we use PDFPlumber for PDF analysis. Typesetting is only implemented for paragraphs and only handles left-to-right writing.

2. Cannot handle table of contents alignment by dots.

3. Poor performance, needs optimization.

4. No global page information consideration, inconsistent text sizes.

5. No advanced typography features, poor reading experience.

## Related Resources

[UTR #59: East Asian Spacing](https://www.unicode.org/reports/tr59/) specifies which characters need spacing between them.


================================================
FILE: docs/README.md
================================================
YADT Spec
===

## YADT Document Intermediate Language

[il_version_1.rnc](https://github.com/funstory-ai/yadt/blob/main/yadt/document_il/il_version_1.rnc): The definition of the intermediate language used between PDF parsing and rendering stages.

For other implementation details, please refer to [Implementation Details](ImplementationDetails/README.md).

================================================
FILE: docs/deploy.sh
================================================
#!/bin/bash
set -e

# Succeed (exit status 0) when the given command name resolves in the current
# shell. All lookup output is discarded, so only the exit status is observable.
command_exists() {
  local cmd="$1"
  command -v "$cmd" >/dev/null 2>&1
}

# Bootstrap flow: make sure `uv` is available, installing it when it is
# missing, then run the project once and register the pre-commit hooks.
echo "check uv installed ……"
if command_exists uv; then
  # uv is already present: nothing to install, so the script stops here.
  # The verification at the bottom only runs after a fresh installation.
  echo "uv installed !"
  exit 0
fi

echo "uv not installed, start installing ……"

OS=$(uname -s)
case "$OS" in
  Linux)
    # Prefer curl and fall back to wget; both pipe the installer script to sh.
    if command_exists curl; then
      curl -LsSf https://astral.sh/uv/install.sh | sh
    elif command_exists wget; then
      wget -qO- https://astral.sh/uv/install.sh | sh
    else
      echo "curl or wget not found. uv install failed." >&2
      exit 1
    fi
    # The standalone installer places uv in a per-user bin directory that the
    # running shell may not have on PATH yet. Prepend the usual locations so
    # the check below can see the freshly installed binary.
    export PATH="${XDG_BIN_HOME:-$HOME/.local/bin}:$HOME/.cargo/bin:$PATH"
    ;;
  Darwin)
    if command_exists brew; then
      brew install uv
    else
      echo "Homebrew not installed, please install uv manually." >&2
      exit 1
    fi
    ;;
  *)
    echo "unsupported OS: $OS" >&2
    exit 1
    ;;
esac

if command_exists uv; then
  uv run babeldoc --version
  pre-commit install
else
  # Explain the failure instead of exiting silently with status 1.
  echo "uv was not found on PATH after installation." >&2
  exit 1
fi


================================================
FILE: docs/example/demo_glossary.csv
================================================
source,target,tgt_lng
AutoML,自动ML,zh-CN
"a,a",a,zh-CN
"""","""",zh-CN

================================================
FILE: docs/index.md
================================================

{!README.md!}


================================================
FILE: docs/intro-to-pdf-object.md
================================================
An Introduction to PDF Object Definitions in dpml
===

## 1. Understanding PDF Structure
A PDF file is fundamentally an indexed collection of objects, where each object represents a structured data unit. The file structure consists of four main components:

1. A header
2. Object definitions
3. A cross-reference table
4. A trailer

The cross-reference table serves as a lookup directory, mapping each numbered object to its byte offset location within the file. The trailer contains critical metadata, including the location of the root object (document catalog), which serves as the entry point for PDF interpretation. The file concludes with a byte offset pointing to the cross-reference table.

Here's an illustrative example of a PDF file structure:

```pdf
%PDF-2.0
1 0 obj
<<
  /Pages 2 0 R
  /Type /Catalog
>>
endobj
2 0 obj
<<
  /Count 1
  /Kids [
    3 0 R
  ]
  /Type /Pages
>>
endobj
3 0 obj
<<
  /Contents 4 0 R
  /MediaBox [ 0 0 612 792 ]
  /Parent 2 0 R
  /Resources <<
    /Font << /F1 5 0 R >>
  >>
  /Type /Page
>>
endobj
4 0 obj
<<
  /Length 44
>>
stream
BT
  /F1 24 Tf
  72 720 Td
  (Potato) Tj
ET
endstream
endobj
5 0 obj
<<
  /BaseFont /Helvetica
  /Encoding /WinAnsiEncoding
  /Subtype /Type1
  /Type /Font
>>
endobj

xref
0 6
0000000000 65535 f 
0000000009 00000 n 
0000000062 00000 n 
0000000133 00000 n 
0000000277 00000 n 
0000000372 00000 n 
trailer <<
  /Root 1 0 R
  /Size 6
  /ID [<42841c13bbf709d79a200fa1691836f8><b1d8b5838eeafe16125317aa78e666aa>]
>>
startxref
478
%%EOF
```

### PDF File Interpretation
When a PDF viewer processes a file, it follows these steps:

1. Starts at the file's end to locate the cross-reference table offset
2. Accesses the cross-reference table to find object locations
3. Reads the trailer dictionary to identify the document catalog
4. Uses the document catalog to access various document components:
   - Pages
   - Outlines
   - Thumbnails
   - Annotations
   - Other PDF elements

The pages tree root is particularly crucial as it enables navigation to specific pages within the document.

### Example Interpretation Flow
Let's trace through our example:

1. The cross-reference table begins at byte offset 478 (indicated after `startxref`)
2. The trailer identifies object 1 as the document catalog (`/Root 1 0 R`)
3. Object 1 is located at byte offset 9
4. The document catalog points to object 2 as the pages tree root
5. Object 2 is found at byte offset 62
6. The pages tree identifies object 3 as the first page
7. Object 3 is positioned at byte offset 133
8. Object 3 defines the page properties and links to object 4 for content
9. Object 4, at byte offset 277, contains the drawing instructions for rendering "Potato"

This structure enables efficient random access to any part of the PDF document.

## 2. PDF Objects

Earlier, we discussed PDF objects and introduced the concept of dictionaries. At the top level of a PDF file, objects are identified by two numbers followed by the keyword "obj". The first number serves as the object number, while the second—known as the generation number—is typically 0. Everything between these identifiers and the "endobj" keyword constitutes the object's body.

The PDF specification provides a mechanism for modifying files by appending object updates and cross-reference table entries. When an object's contents are completely replaced (rather than modified), its generation number can be incremented. This allows object numbers to be reused while preventing old indirect references from resolving to new objects. However, such files are rare in practice, and generation numbers can generally be disregarded. Modern PDF specifications using object streams have even eliminated generation numbers entirely.

PDF objects share similarities with data structures found in JSON, YAML, and modern programming languages, though PDF includes some unique object types. Here are the available PDF object types:

- String: A text sequence enclosed in parentheses, e.g., (potato). Note that PDF strings typically don't support full Unicode encoding, though there are specific cases where this is possible. (A detailed discussion of character encoding is beyond our current scope.)

- Number: Both integers and floating-point numbers (e.g., 12, 3.14159). While the PDF specification distinguishes between integers and real numbers, they're often interchangeable in practice—integers can be used where real numbers are expected, and viewers typically handle real numbers appropriately when integers are required.

- Boolean: Simple true/false values

- Null: Represented by the keyword "null"

- Name: A keyword or dictionary key identifier starting with a forward slash (/), e.g., /Type

- Array: An ordered collection of objects enclosed in square brackets, with no separators between items. Arrays support nested structures, including other arrays and dictionaries. Example: `[1 (two) 3.14 false]`

- Dictionary: A collection of key-value pairs where keys are Names and values can be any object type. Dictionaries are enclosed in << and >> with no separators between entries. Example: `<< /A 1 /B [2 3 <</Four 4>> ] >>`

- Indirect object reference: A reference to a numbered object in the file, consisting of two numbers (object and generation) followed by 'R', e.g., 1 0 R. While some objects must be direct per the PDF specification, most can be defined at the top level and referenced indirectly.

- Stream: A container for binary data, structured as a dictionary (containing at least a /Length key and other format-specific entries) followed by the specified number of bytes between "stream" and "endstream" keywords. 🔍 The stream length can be specified as an indirect object, enabling single-pass PDF generation where the stream length isn't known in advance—a common practice in PDF creation.

## 3. PDF Object Definitions In dpml

### Coordinate system definition

The positive x-axis extends horizontally to the right, while the positive y-axis extends vertically upward, following
standard mathematical conventions. The unit length along both the x and y axes is defined as 1/72 inch (or 1 point).

## 4. Useful Information

- [PDF32000_2008](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf) page 111: Table 51 - Operator Categories

================================================
FILE: docs/requirements.txt
================================================
sphinx>=8.2.0
sphinx-click>=5.1.0
furo>=2024.1.29
myst-parser[linkify,html_meta,html_admonition]>=2.0.0 

================================================
FILE: docs/supported_languages.md
================================================
# Supported Languages

For languages in the table below that do not rely on ligature support, BabelDOC provides good support. For languages
that partially rely on ligatures, BabelDOC's translation results are generally good enough for personal reading. For languages
that completely rely on ligatures (such as some Indian languages), BabelDOC does not currently provide support.

We are working hard to develop support for ligatures as soon as possible.
<!-- | Kazakh (Cyrillic)    | kk            | None                | -->

| Language                        | Language Code | Ligature Dependency |
|:--------------------------------|:--------------|:--------------------|
| English                         | EN            | None                |
| Simplified Chinese              | zh-CN         | None                |
| Traditional Chinese - Hong Kong | zh-HK         | None                |
| Traditional Chinese - Taiwan    | zh-TW         | None                |
| Japanese                        | JA            | None                |
| Korean                          | KO            | None                |
| Polish                          | PL            | Partial             |
| Russian                         | RU            | None                |
| Spanish                         | es            | None                |
| Portuguese                      | pt            | None                |
| French                          | fr            | Partial             |
| Malay                           | ms            | None                |
| Indonesian                      | id            | None                |
| Turkmen                         | tk            | None                |
| Filipino (Tagalog)              | tl            | None                |
| Vietnamese                      | vi            | None                |
| Kazakh (Latin)                  | kk            | None                |
| German                          | de            | None                |
| Dutch                           | nl            | None                |
| Irish                           | ga            | None                |
| Italian                         | it            | None                |
| Greek                           | el            | None                |
| Swedish                         | sv            | None                |
| Danish                          | da            | None                |
| Norwegian                       | no            | None                |
| Icelandic                       | is            | None                |
| Finnish                         | fi            | None                |
| Ukrainian                       | uk            | None                |
| Czech                           | cs            | None                |
| Romanian                        | ro            | None                |
| Hungarian                       | hu            | None                |
| Slovak                          | sk            | None                |
| Croatian                        | hr            | None                |
| Estonian                        | et            | None                |
| Latvian                         | lv            | None                |
| Lithuanian                      | lt            | None                |
| Belarusian                      | be            | None                |
| Macedonian                      | mk            | None                |
| Albanian                        | sq            | None                |
| Serbian (Cyrillic)              | sr            | Partial             |
| Serbian (Latin)                 | sr            | Partial             |
| Slovenian                       | sl            | None                |
| Catalan                         | ca            | None                |
| Bulgarian                       | bg            | None                |
| Maltese                         | mt            | None                |
| Swahili                         | sw            | None                |
| Amharic                         | am            | None                |
| Oromo                           | om            | None                |
| Tigrinya                        | ti            | None                |
| Haitian Creole                  | ht            | None                |
| Latin                           | la            | None                |
| Lao                             | lo            | None                |
| Malayalam                       | ml            | None                |
| Gujarati                        | gu            | None                |
| Thai                            | th            | None                |
| Burmese                         | my            | Partial             |
| Tamil                           | ta            | None                |
| Telugu                          | te            | None                |
| Oriya                           | or            | Partial             |
| Armenian                        | hy            | None                |
| Mongolian (Cyrillic)            | mn            | None                |
| Georgian                        | ka            | None                |
| Khmer                           | km            | None                |
| Bosnian                         | bs            | None                |
| Luxembourgish                   | lb            | None                |
| Moldovan                        | ro            | None                |
| Moldovan (Cyrillic)             | ro            | None                |
| Romansh                         | rm            | None                |
| Turkish                         | tr            | None                |
| Sinhala                         | si            | None                |
| Uzbek                           | uz            | None                |
| Kyrgyz                          | ky            | None                |
| Tajik                           | tg            | None                |
| Abkhazian                       | ab            | None                |
| Afar                            | aa            | None                |
| Afrikaans                       | af            | None                |
| Akan                            | ak            | None                |
| Aragonese                       | an            | None                |
| Avaric                          | av            | None                |
| Ewe                             | ee            | None                |
| Aymara                          | ay            | None                |
| Ojibwa                          | oj            | None                |
| Occitan                         | oc            | None                |
| Oriya                           | or            | Partial             |
| Ossetian                        | os            | None                |
| Pali                            | pi            | None                |
| Bashkir                         | ba            | None                |
| Basque                          | eu            | None                |
| Breton                          | br            | None                |
| Chamorro                        | ch            | None                |
| Chechen                         | ce            | None                |
| Chuvash                         | cv            | None                |
| Tswana                          | tn            | None                |
| Ndebele, South                  | nr            | None                |
| Ndonga                          | ng            | None                |
| Faroese                         | fo            | None                |
| Fijian                          | fj            | None                |
| Frisian, Western                | fy            | None                |
| Ganda                           | lg            | None                |
| Kongo                           | kg            | None                |
| Kalaallisut                     | kl            | None                |
| Church Slavic                   | cu            | None                |
| Guarani                         | gn            | None                |
| Interlingua                     | ia            | None                |
| Herero                          | hz            | None                |
| Kikuyu                          | ki            | None                |
| Rundi                           | rn            | None                |
| Kinyarwanda                     | rw            | None                |
| Kirghiz                         | ky            | None                |
| Galician                        | gl            | None                |
| Kanuri                          | kr            | None                |
| Cornish                         | kw            | None                |
| Komi                            | kv            | None                |
| Xhosa                           | xh            | None                |
| Corsican                        | co            | None                |
| Cree                            | cr            | None                |
| Croatian                        | hr            | None                |
| Quechua                         | qu            | None                |
| Kurdish (Latin)                 | ku            | None                |
| Kuanyama                        | kj            | None                |
| Limburgan                       | li            | None                |
| Lingala                         | ln            | None                |
| Manx                            | gv            | None                |
| Malagasy                        | mg            | None                |
| Marshallese                     | mh            | None                |
| Maori                           | mi            | None                |
| Navajo                          | nv            | None                |
| Nauru                           | na            | None                |
| Nyanja                          | ny            | None                |
| Norwegian Nynorsk               | nn            | None                |
| Sardinian                       | sc            | None                |
| Northern Sami                   | se            | None                |
| Samoan                          | sm            | None                |
| Sango                           | sg            | None                |
| Shona                           | sn            | None                |
| Esperanto                       | eo            | None                |
| Scottish Gaelic                 | gd            | None                |
| Somali                          | so            | None                |
| Southern Sotho                  | st            | None                |
| Tagalog                         | tl            | None                |
| Tatar                           | tt            | None                |
| Tahitian                        | ty            | None                |
| Tongan                          | to            | None                |
| Twi                             | tw            | None                |
| Walloon                         | wa            | None                |
| Welsh                           | cy            | None                |
| Venda                           | ve            | None                |
| Volapük                         | vo            | None                |
| Interlingue                     | ie            | None                |
| Hiri Motu                       | ho            | None                |
| Igbo                            | ig            | None                |
| Ido                             | io            | None                |
| Inuktitut                       | iu            | None                |
| Inupiaq                         | ik            | None                |
| Sichuan Yi                      | ii            | None                |
| Yoruba                          | yo            | None                |
| Zhuang                          | za            | None                |
| Tsonga                          | ts            | None                |
| Zulu                            | zu            | None                |
| Brazilian Portuguese            | pt-BR         | None                |


================================================
FILE: mkdocs.yml
================================================
# Copyright (c) 2016-2025 Martin Donath <martin.donath@squidfunk.com>

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

# Project information
site_name: BabelDOC
site_url: https://squidfunk.github.io/mkdocs-material/
site_author: funstory.ai
site_description: >-
  Yet Another Document Translator

# Repository
repo_name: funstory-ai/BabelDOC
repo_url: https://github.com/funstory-ai/BabelDOC
edit_uri: edit/main/docs/

# Copyright
copyright: Copyright &copy; 2025 funstory.ai

# Configuration
theme:
  name: material
  # custom_dir: material/overrides
  features:
    - announce.dismiss
    - content.action.edit
    - content.action.view
    - content.code.annotate
    - content.code.copy
    - content.code.select
    # - content.footnote.tooltips
    # - content.tabs.link
    - content.tooltips
    # - header.autohide
    # - navigation.expand
    - navigation.footer
    - navigation.indexes
    # - navigation.instant
    # - navigation.instant.prefetch
    # - navigation.instant.progress
    # - navigation.prune
    - navigation.sections
    - navigation.tabs
    # - navigation.tabs.sticky
    - navigation.top
    - navigation.tracking
    - search.highlight
    - search.share
    - search.suggest
    - toc.follow
    # - toc.integrate
  palette:
    - media: "(prefers-color-scheme)"
      toggle:
        icon: material/brightness-auto
        name: Switch to light mode
    - media: "(prefers-color-scheme: light)"
      scheme: default
      primary: white
      accent: indigo
      toggle:
        icon: material/brightness-7
        name: Switch to dark mode
    - media: "(prefers-color-scheme: dark)"
      scheme: slate
      primary: black
      accent: indigo
      toggle:
        icon: material/brightness-4
        name: Switch to system preference
  font:
    text: Roboto
    code: Roboto Mono
  # favicon: assets/favicon.png
  favicon: images/babeldoc-small-logo-with-transparent-background.svg
  logo: images/babeldoc-small-logo-with-transparent-background.svg

# Plugins
plugins:
  - search:
      separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])'
  - minify:
      minify_html: true
  - git-authors
  - git-revision-date-localized:
      enable_creation_date: true
# Additional configuration
extra:
  status:
    new: Recently added
    deprecated: Deprecated
  social:
    - icon: fontawesome/brands/github
      link: https://github.com/funstory-ai/BabelDOC
    - icon: fontawesome/brands/python
      link: https://pypi.org/project/BabelDOC/

# Extensions
markdown_extensions:
  - github-callouts
  - markdown_include.include
  - pymdownx.highlight:
      anchor_linenums: true
      line_spans: __span
      pygments_lang_class: true
  - pymdownx.inlinehilite
  - pymdownx.snippets
  - pymdownx.superfences
  - def_list
  - pymdownx.tasklist:
      custom_checkbox: true
not_in_nav: |
  /tutorials/**/*.md

# Page tree
nav:
  - Home: index.md
  - Supported Languages: supported_languages.md
  - API:
    - Async Translation API: ImplementationDetails/AsyncTranslate/AsyncTranslate.md
  - Implementation Details:
    - ImplementationDetails/README.md
    - PDF Parsing: ImplementationDetails/PDFParsing/PDFParsing.md
    - Layout Parser(.py): https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/format/pdf/document_il/midend/layout_parser.py
    - Paragraph Finding: ImplementationDetails/ParagraphFinding/ParagraphFinding.md
    - Styles and Formulas: ImplementationDetails/StylesAndFormulas/StylesAndFormulas.md
    - IL Translator: ImplementationDetails/ILTranslator/ILTranslator.md
    - Typesetting: ImplementationDetails/Typesetting/Typesetting.md
    - Font Mapper(.py): https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/format/pdf/document_il/utils/fontmap.py
    - PDF Creation: ImplementationDetails/PDFCreation/PDFCreation.md
    - Intro To PDF Object: intro-to-pdf-object.md
  - Community:
    - Code of Conduct: CODE_OF_CONDUCT.md
    - Contributing:
      - Contributing: CONTRIBUTING.md
      - Contributor Reward: CONTRIBUTOR_REWARD.md

================================================
FILE: pyproject.toml
================================================
[project]
name = "BabelDOC"
version = "0.5.23"
description = "Yet Another Document Translator"
license = "AGPL-3.0"
readme = "README.md"
requires-python = ">=3.10,<3.14"
authors = [
    { name = "awwaawwa", email = "aw@funstory.ai" }
]
maintainers = [
    { name = "awwaawwa", email = "aw@funstory.ai" }
]
classifiers = [
    "Programming Language :: Python :: 3",
    "Operating System :: OS Independent",
]
keywords = ["PDF"]
dependencies = [
    "bitstring>=4.3.0",
    "configargparse>=1.7",
    "httpx[socks]>=0.27.0",
    "huggingface-hub>=0.27.0",
    "numpy>=2.0.2",
    "onnx>=1.18.0",
    "onnxruntime>=1.16.1",
    "openai>=1.59.3",
    "orjson>=3.10.14",
    "charset-normalizer >= 2.0.0",
    "cryptography >= 36.0.0",
    #    "pdfminer-six==20250416",
    "peewee>=3.17.8",
    "psutil>=7.0.0",
    "pymupdf>=1.25.1",
    "rich>=13.9.4",
    "toml>=0.10.2",
    "tqdm>=4.67.1",
    "xsdata[cli,lxml,soap]>=24.12",
    "msgpack>=1.1.0",
    "pydantic>=2.10.6",
    "tenacity>=9.0.0",
    "scikit-image>=0.25.2",
    "freetype-py>=2.5.1",
    "tiktoken>=0.9.0",
    "Levenshtein>=0.27.1",
    "opencv-python-headless>=4.10.0.84",
    "rapidocr-onnxruntime>=1.4.4",
    "pyzstd>=0.17.0",
    "hyperscan>=0.7.13",
    "rtree>=1.4.0",
    "chardet>=5.2.0",
    "scipy>=1.15.3",
    "uharfbuzz>=0.50.2",
    "scikit-learn>=1.7.1",
]

[project.optional-dependencies]
directml = ["onnxruntime-directml>=1.16.1"]
cuda = ["onnxruntime-gpu>=1.16.1"]
memray = ["memray>=1.17.1"]

[project.urls]
Homepage = "https://github.com/funstory-ai/BabelDOC"
Issues = "https://github.com/funstory-ai/BabelDOC/issues"

[project.scripts]
babeldoc = "babeldoc.main:cli"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.flake8]
ignore = ["E203", "E261", "E501", "W503", "E741", "E501"]
max-line-length = 88

[tool.ruff]
src = ["babeldoc"]
target-version = "py310"
show-fixes = true

[tool.ruff.format]
# Enable reformatting of code snippets in docstrings.
docstring-code-format = true

[tool.ruff.lint]
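ignore = [
    "E203",   # whitespace before ':'
    "E261",   # at least two spaces before an inline comment
    "E501",   # line too long
    "E741",   # ambiguous variable name
    "F841",   # unused local variable
    "C901",   # function is too complex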
    "S101",   # use assert
    "SIM",    # flake8-simplify
    "ARG002", # unused argument
    "S110",   # `try`-`except`-`pass` detected, consider logging the exception
    "B024",   # abstract class without abstract methods
    "S112",   # `try`-`except`-`continue` detected, consider logging the exception
    "COM812", # missing-trailing-comma

]
select = [
    "E",   # pycodestyle errors
    "F",   # Pyflakes
    "N",   # PEP 8 naming
    "B",   # flake8-bugbear
    "I",   # isort
    "C",   # mccabe
    "UP",  # pyupgrade
    "S",   # flake8-bandit
    "A",   # flake8-builtins
    "COM", # flake8-commas
    "ARG", # flake8-unused-arguments
    "PTH", # use pathlib
]

[tool.ruff.lint.flake8-quotes]
docstring-quotes = "double"

[tool.ruff.lint.flake8-annotations]
suppress-none-returning = true

[tool.ruff.lint.isort]
force-single-line = true

[tool.ruff.lint.pydocstyle]
convention = "google"

# Rule-specific configuration
[tool.ruff.lint.mccabe]
max-complexity = 10 # cyclomatic complexity threshold per function

[tool.ruff.lint.per-file-ignores]
"babeldoc/babeldoc_exception/BabelDOCException.py" = ["N999"]
"babeldoc/format/pdf/pdfinterp.py" = ["N"] # ignore naming conventions
"tests/*" = ["S101"]            # allow assert in test files
"**/__init__.py" = ["F401"]     # allow unused imports
# Ignore the S311 warning because the usage is intentional
"babeldoc/format/pdf/document_il/midend/paragraph_finder.py" = ["S311"]
"docs/*" = ["A001"]
"babeldoc/pdfminer/*" =["A","F", "I", "N", "S", "B", "C", "COM", "ARG", "PTH", "UP"]
[dependency-groups]
dev = [
    "bumpver>=2024.1130",
    "markdown-callouts>=0.4.0",
    "markdown-include>=0.8.1",
    "mkdocs-git-authors-plugin>=0.9.2",
    "mkdocs-git-committers-plugin-2>=2.5.0",
    "mkdocs-git-revision-date-localized-plugin>=1.3.0",
    "mkdocs-material[recommended]>=9.6.4",
    "pre-commit>=4.1.0",
    "pygments>=2.19.1",
    "ruff>=0.9.2",
    "pytest>=8.3.4",
    "pylance>=0.29.0",
    "py-spy>=0.4.0",
]

[tool.pytest.ini_options]
pythonpath = [".", "src"]
testpaths = ["tests"]

[bumpver]
current_version = "0.5.23"
version_pattern = "MAJOR.MINOR.PATCH[.PYTAGNUM]"

[bumpver.file_patterns]
"pyproject.toml" = [
    'current_version = "{version}"',
    'version = "{version}"'
]
"babeldoc/__init__.py" = [
    '__version__ = "{version}"'
]
"babeldoc/main.py" = [
    '__version__ = "{version}"'
]
"babeldoc/const.py" = [
    '__version__ = "{version}"'
]

[tool.uv.sources]
yadt = { path = ".", editable = true }

[tool.pyright]
pythonVersion = "3.10"
# typeCheckingMode = "off"
reportGeneralTypeIssues = false
reportUnknownVariableType = false
reportMissingParameterType = false
reportUnknownParameterType = false


================================================
FILE: tests/test_translation_cache_cleanup.py
================================================
from concurrent.futures import ThreadPoolExecutor

from babeldoc.translator.cache import TranslationCache
from babeldoc.translator.cache import _TranslationCache
from babeldoc.translator.cache import clean_test_db
from babeldoc.translator.cache import init_test_db


def _prepare_records(cache: TranslationCache, num_records: int) -> None:
    """Populate *cache* with *num_records* distinct entries.

    Entry ``i`` stores ``translation_{i}`` under ``text_{i}`` through
    ``cache.set``, for ``i`` from 0 up to (but excluding) *num_records*.
    """
    entries = ((f"text_{idx}", f"translation_{idx}") for idx in range(num_records))
    for source_text, translated_text in entries:
        cache.set(source_text, translated_text)


def test_cleanup_under_limit(monkeypatch):
    """Cleanup must leave the table untouched while it is below the row cap."""
    # Work against a throwaway database so other tests are unaffected.
    db_handle = init_test_db()
    try:
        translation_cache = TranslationCache("dummy")
        # Force the cleanup probability to 1.0 so the outcome is deterministic.
        monkeypatch.setattr("babeldoc.translator.cache.CLEAN_PROBABILITY", 1.0)
        # A small row cap keeps the number of inserts (and the runtime) low.
        monkeypatch.setattr("babeldoc.translator.cache.MAX_CACHE_ROWS", 1000)

        _prepare_records(translation_cache, 900)
        # One more write, intended to trigger a cleanup pass.
        translation_cache.set("extra", "extra")

        row_count = _TranslationCache.select().count()
        # 900 prepared rows + 1 extra row, none removed.
        assert row_count == 901
    finally:
        clean_test_db(db_handle)


def test_cleanup_over_limit(monkeypatch):
    """Cleanup must trim the table back to the cap once it has been exceeded."""
    db_handle = init_test_db()
    try:
        translation_cache = TranslationCache("dummy")
        # Force the cleanup probability to 1.0 and use a low cap of 500 rows.
        monkeypatch.setattr("babeldoc.translator.cache.CLEAN_PROBABILITY", 1.0)
        monkeypatch.setattr("babeldoc.translator.cache.MAX_CACHE_ROWS", 500)

        # Insert more rows than the cap allows, then perform one more write.
        _prepare_records(translation_cache, 750)
        translation_cache.set("extra", "extra")

        remaining_rows = _TranslationCache.select().count()
        # The table must not hold more rows than the configured cap.
        assert remaining_rows <= 500
    finally:
        clean_test_db(db_handle)


def test_cleanup_thread_safety(monkeypatch):
    """Concurrent writes that trigger cleanup must neither raise nor exceed the cap.

    Ten worker threads perform 600 ``cache.set`` calls against an isolated test
    database with the cleanup probability forced to 1.0 and the row cap lowered
    to 500. The test fails if any worker raises or if more than 500 rows
    remain afterwards.
    """
    test_db = init_test_db()
    try:
        cache = TranslationCache("dummy")
        monkeypatch.setattr("babeldoc.translator.cache.CLEAN_PROBABILITY", 1.0)
        monkeypatch.setattr("babeldoc.translator.cache.MAX_CACHE_ROWS", 500)

        def task(n):
            cache.set(f"text_{n}", f"translation_{n}")

        # Use a pool of threads to stress cleanup
        with ThreadPoolExecutor(max_workers=10) as executor:
            # Executor.map re-raises a worker's exception only when its result
            # is retrieved from the returned iterator. Drain it so that any
            # failure inside cache.set fails the test instead of being
            # silently discarded.
            list(executor.map(task, range(600)))

        # After all threads complete, ensure table size is capped
        assert _TranslationCache.select().count() <= 500
    finally:
        clean_test_db(test_db)